Merge 'test: cluster: util: sleep for 0.01s between writes in do_writes' from Patryk Jędrzejczak

Tests use `start_writes` as a simple write workload to test that writes
succeed when they should (e.g., there is no availability loss), but not to
test performance. There is no reason to overload the CPU, which can lead to
test failures.

I suspect this function to be the cause of SCYLLADB-929, where the failures
of `test_raft_recovery_user_data` (that creates multiple write workloads
with `start_writes`) indicated that the machine was overloaded.
The relevant observations:
- two runs failed at the same time in debug mode,
- there were many reactor stalls and RPC timeouts in the logs (leading to
  unexpected events like servers marking each other down and group0
  leader changes).

I didn't prove that `start_writes` really caused this, but adding this sleep
should be a good change, even if I'm wrong.

The number of writes performed by the test decreases 30-50 times with the
sleep.

Note that some other util functions like `start_writes_to_cdc_table` have
such a sleep.

This PR also contains some minor updates to `test_raft_recovery_user_data`.

Fixes SCYLLADB-929

No backport:
- the failures were observed only in master CI,
- no proof that the change fixes the issue, so backports could be a waste
  of time.

Closes scylladb/scylladb#28917

* github.com:scylladb/scylladb:
  test: test_raft_recovery_user_data: replace asyncio.gather with gather_safely
  test: test_raft_recovery_user_data: use the exclude_node API
  test: test_raft_recovery_user_data: drop tablet_load_stats_cfg
  test: cluster: util: sleep for 0.01s between writes in do_writes
This commit is contained in:
Botond Dénes
2026-03-09 12:12:04 +02:00
2 changed files with 6 additions and 11 deletions

View File

@@ -15,7 +15,7 @@ from test.pylib.internal_types import ServerInfo
from test.pylib.manager_client import ManagerClient
from test.pylib.rest_client import read_barrier
from test.pylib.scylla_cluster import ReplaceConfig
from test.pylib.util import unique_name, wait_for_cql_and_get_hosts
from test.pylib.util import gather_safely, unique_name, wait_for_cql_and_get_hosts
from test.cluster.conftest import cluster_con
from test.cluster.util import check_system_topology_and_cdc_generations_v3_consistency, \
check_token_ring_and_group0_consistency, delete_discovery_state_and_group0_id, delete_raft_group_data, \
@@ -50,13 +50,10 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
rf_rack_cfg = {'rf_rack_valid_keyspaces': False}
# Workaround for flakiness from https://github.com/scylladb/scylladb/issues/23565.
hints_cfg = {'hinted_handoff_enabled': False}
# Workaround for https://github.com/scylladb/scylladb/issues/25163.
# It makes the test ~170 s faster with remove_dead_nodes_with == "replace".
tablet_load_stats_cfg = {'tablet_load_stats_refresh_interval_in_seconds': 1}
cfg = {
'endpoint_snitch': 'GossipingPropertyFileSnitch',
'tablets_mode_for_new_keyspaces': 'enabled',
} | rf_rack_cfg | hints_cfg | tablet_load_stats_cfg
} | rf_rack_cfg | hints_cfg
property_file_dc1 = {'dc': 'dc1', 'rack': 'rack1'}
property_file_dc2 = {'dc': 'dc2', 'rack': 'rack2'}
@@ -71,6 +68,7 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
cql, _ = await manager.get_ready_cql(live_servers + dead_servers)
hosts = await wait_for_cql_and_get_hosts(cql, live_servers, time.time() + 60)
dead_hosts = await wait_for_cql_and_get_hosts(cql, dead_servers, time.time() + 60)
dead_host_ids = await gather_safely(*(manager.get_host_id(srv.server_id) for srv in dead_servers))
# When table audit is enabled, Scylla creates the "audit" keyspace with
# NetworkTopologyStrategy. During remove_node, streaming fails for the audit keyspace
@@ -99,7 +97,7 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
await asyncio.sleep(1)
logging.info(f'Killing {dead_servers}')
await asyncio.gather(*(manager.server_stop(server_id=srv.server_id) for srv in dead_servers))
await gather_safely(*(manager.server_stop(server_id=srv.server_id) for srv in dead_servers))
logging.info('Checking that group 0 has no majority')
with pytest.raises(Exception, match="raft operation \\[read_barrier\\] timed out"):
@@ -149,12 +147,8 @@ async def test_raft_recovery_user_data(manager: ManagerClient, remove_dead_nodes
if remove_dead_nodes_with == "remove":
# We must mark dead nodes as permanently dead so that they are ignored in topology commands. Without this step,
# ALTER KEYSPACE below would fail on the global token metadata barrier.
# For now, we do not have a specific API to mark nodes as dead, so we use a workaround.
# FIXME: use the specific API once scylladb/scylladb#21281 is fixed.
logging.info(f'Marking {dead_servers} as permanently dead')
await manager.remove_node(live_servers[0].server_id, dead_servers[0].server_id,
[dead_srv.ip_addr for dead_srv in dead_servers[1:]],
expected_error='Removenode failed')
await manager.api.exclude_node(live_servers[0].ip_addr, dead_host_ids)
logging.info(f'Decreasing RF of {ks_name} to 0 in dc2')
for i in range(1, rf + 1):

View File

@@ -359,6 +359,7 @@ async def start_writes(cql: Session, rf: int, cl: ConsistencyLevel, concurrency:
except Exception as e:
logging.error(f"Write started {time.time() - start_time}s ago failed: {e}")
raise
await asyncio.sleep(0.01)
logging.info(f"Worker #{worker_id} did {write_count} successful writes")
tasks = [asyncio.create_task(do_writes(worker_id)) for worker_id in range(concurrency)]