Merge 'test: cluster: util: sleep for 0.01s between writes in do_writes' from Patryk Jędrzejczak

Tests use `start_writes` as a simple write workload to test that writes succeed when they should (e.g., there is no availability loss), but not to test performance. There is no reason to overload the CPU, which can lead to test failures.

I suspect this function is the cause of SCYLLADB-929, where the failures of `test_raft_recovery_user_data` (which creates multiple write workloads with `start_writes`) indicated that the machine was overloaded. The relevant observations:
- two runs failed at the same time in debug mode,
- there were many reactor stalls and RPC timeouts in the logs (leading to unexpected events like servers marking each other down and group0 leader changes).

I didn't prove that `start_writes` really caused this, but adding this sleep should be a good change, even if I'm wrong. The number of writes performed by the test decreases by a factor of 30-50 with the sleep.

Note that some other util functions like `start_writes_to_cdc_table` have such a sleep.

This PR also contains some minor updates to `test_raft_recovery_user_data`.

Fixes SCYLLADB-929

No backport:
- the failures were observed only in master CI,
- no proof that the change fixes the issue, so backports could be a waste of time.

Closes scylladb/scylladb#28917

* github.com:scylladb/scylladb:
  test: test_raft_recovery_user_data: replace asyncio.gather with gather_safely
  test: test_raft_recovery_user_data: use the exclude_node API
  test: test_raft_recovery_user_data: drop tablet_load_stats_cfg
  test: cluster: util: sleep for 0.01s between writes in do_writes
2026-03-09 12:12:04 +02:00
parent 47e8206482 c8c57850d9
commit 6bba4f7ca1
2 changed files with 6 additions and 11 deletions
--- a/test/cluster/test_raft_recovery_user_data.py
+++ b/test/cluster/test_raft_recovery_user_data.py
@@ -15,7 +15,7 @@ from test.pylib.internal_types import ServerInfo
 from test.pylib.manager_client import ManagerClient
 from test.pylib.rest_client import read_barrier
 from test.pylib.scylla_cluster import ReplaceConfig
-from test.pylib.util import unique_name, wait_for_cql_and_get_hosts
+from test.pylib.util import gather_safely, unique_name, wait_for_cql_and_get_hosts
 from test.cluster.conftest import cluster_con
 from test.cluster.util import check_system_topology_and_cdc_generations_v3_consistency, \
        check_token_ring_and_group0_consistency, delete_discovery_state_and_group0_id, delete_raft_group_data, \
@@ -50,13 +50,10 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
    rf_rack_cfg = {'rf_rack_valid_keyspaces': False}
    # Workaround for flakiness from https://github.com/scylladb/scylladb/issues/23565.
    hints_cfg = {'hinted_handoff_enabled': False}
-    # Workaround for https://github.com/scylladb/scylladb/issues/25163.
-    # It makes the test ~170 s faster with remove_dead_nodes_with == "replace".
-    tablet_load_stats_cfg = {'tablet_load_stats_refresh_interval_in_seconds': 1}
    cfg = {
        'endpoint_snitch': 'GossipingPropertyFileSnitch',
        'tablets_mode_for_new_keyspaces': 'enabled',
-    } | rf_rack_cfg | hints_cfg | tablet_load_stats_cfg
+    } | rf_rack_cfg | hints_cfg

    property_file_dc1 = {'dc': 'dc1', 'rack': 'rack1'}
    property_file_dc2 = {'dc': 'dc2', 'rack': 'rack2'}
@@ -71,6 +68,7 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
    cql, _ = await manager.get_ready_cql(live_servers + dead_servers)
    hosts = await wait_for_cql_and_get_hosts(cql, live_servers, time.time() + 60)
    dead_hosts = await wait_for_cql_and_get_hosts(cql, dead_servers, time.time() + 60)
+    dead_host_ids = await gather_safely(*(manager.get_host_id(srv.server_id) for srv in dead_servers))

    # When table audit is enabled, Scylla creates the "audit" keyspace with
    # NetworkTopologyStrategy. During remove_node, streaming fails for the audit keyspace
@@ -99,7 +97,7 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
    await asyncio.sleep(1)

    logging.info(f'Killing {dead_servers}')
-    await asyncio.gather(*(manager.server_stop(server_id=srv.server_id) for srv in dead_servers))
+    await gather_safely(*(manager.server_stop(server_id=srv.server_id) for srv in dead_servers))

    logging.info('Checking that group 0 has no majority')
    with pytest.raises(Exception, match="raft operation \\[read_barrier\\] timed out"):
@@ -149,12 +147,8 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
    if remove_dead_nodes_with == "remove":
        # We must mark dead nodes as permanently dead so that they are ignored in topology commands. Without this step,
        # ALTER KEYSPACE below would fail on the global token metadata barrier.
-        # For now, we do not have a specific API to mark nodes as dead, so we use a workaround.
-        # FIXME: use the specific API once scylladb/scylladb#21281 is fixed.
        logging.info(f'Marking {dead_servers} as permanently dead')
-        await manager.remove_node(live_servers[0].server_id, dead_servers[0].server_id,
-                                  [dead_srv.ip_addr for dead_srv in dead_servers[1:]],
-                                  expected_error='Removenode failed')
+        await manager.api.exclude_node(live_servers[0].ip_addr, dead_host_ids)

        logging.info(f'Decreasing RF of {ks_name} to 0 in dc2')
        for i in range(1, rf + 1):
--- a/test/cluster/util.py
+++ b/test/cluster/util.py
@@ -359,6 +359,7 @@ async def start_writes(cql: Session, rf: int, cl: ConsistencyLevel, concurrency:
            except Exception as e:
                logging.error(f"Write started {time.time() - start_time}s ago failed: {e}")
                raise
+            await asyncio.sleep(0.01)
        logging.info(f"Worker #{worker_id} did {write_count} successful writes")

    tasks = [asyncio.create_task(do_writes(worker_id)) for worker_id in range(concurrency)]