test: test_raft_no_quorum: decrease group0_raft_op_timeout_in_ms after quorum loss

`test_raft_no_quorum.py::test_cannot_add_new_node` is currently flaky in dev mode. The bootstrap of the first node can fail due to `add_entry()` timing out (with the 1s timeout set by the test case). Other test cases in this test file could fail in the same way as well, so we need a general fix. We don't want to increase the timeout in dev mode, as it would slow down the test. The solution is to keep the timeout unchanged, but set it only after quorum is lost. This prevents unexpected timeouts of group0 operations with almost no impact on the test running time. A note about the new `update_group0_raft_op_timeout` function: waiting for the log seems to be necessary only for `test_quorum_lost_during_node_join_response_handler`, but let's do it for all test cases just in case (including `test_can_restart` that shouldn't be flaky currently). Fixes https://scylladb.atlassian.net/browse/SCYLLADB-913 Closes scylladb/scylladb#28998 (cherry picked from commit 526e5986fe) Closes scylladb/scylladb#29068 Closes scylladb/scylladb#29097
2026-03-11 13:11:40 +01:00
parent 0c786045ff
commit 3863dfbc0a
1 changed files with 30 additions and 9 deletions
--- a/test/cluster/test_raft_no_quorum.py
+++ b/test/cluster/test_raft_no_quorum.py
@@ -7,6 +7,7 @@ import logging

 import pytest
 import asyncio
+from test.pylib.internal_types import ServerNum
 from test.pylib.manager_client import ManagerClient
 from test.cluster.conftest import skip_mode
 from test.pylib.rest_client import inject_error_one_shot, InjectionHandler, read_barrier
@@ -20,6 +21,20 @@ def fixture_raft_op_timeout(build_mode):
    return 10000 if build_mode == 'debug' else 1000


+async def update_group0_raft_op_timeout(server_id: ServerNum, manager: ManagerClient, timeout: int) -> None:
+    logger.info(f"Updating group0_raft_op_timeout_in_ms on server {server_id} to {timeout}")
+    alive_ids = [running.server_id for running in await manager.running_servers()]
+    if server_id not in alive_ids:
+        await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
+        return
+    # The node is running: server_update_config merely delivers SIGHUP to the Scylla process, so its completion does
+    # not prove the new value is in effect. Wait for the config-reload log line that appears after the mark instead.
+    server_log = await manager.server_open_log(server_id)
+    start_mark = await server_log.mark()
+    await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
+    await server_log.wait_for("completed re-reading configuration file", from_mark=start_mark, timeout=60)
+
+
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
@skip_mode('debug', 'aarch64/debug is unpredictably slow', platform_key='aarch64')
@@ -42,7 +57,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)

    config = {
        'direct_failure_detector_ping_timeout_in_ms': 300,
-        'group0_raft_op_timeout_in_ms': raft_op_timeout,
        'error_injections_at_startup': [
            {
                'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -64,6 +78,10 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
                         manager.server_stop_gracefully(servers[3].server_id),
                         manager.server_stop_gracefully(servers[4].server_id))

+    # Do it here to prevent unexpected timeouts before quorum loss.
+    await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout)
+                           for srv in servers[:2]))
+
    logger.info("starting a sixth node with no quorum")
    await manager.server_add(expected_error="raft operation [read_barrier] timed out, there is no raft quorum",
                             timeout=60)
@@ -76,7 +94,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
@skip_mode('debug', 'aarch64/debug is unpredictably slow', platform_key='aarch64')
 async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_timeout: int) -> None:
    config = {
-        'group0_raft_op_timeout_in_ms': raft_op_timeout,
        'error_injections_at_startup': [
            {
                'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -107,6 +124,9 @@ async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_time
    await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
                         manager.server_stop_gracefully(servers[2].server_id))

+    # Do it here to prevent unexpected timeouts before quorum loss.
+    await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
+
    logger.info("release join-node-before-add-entry injection")
    await injection_handler.message()

@@ -126,7 +146,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli

    logger.info("adding a fourth node")
    servers += [await manager.server_add(config={
-        'group0_raft_op_timeout_in_ms': raft_op_timeout,
        'error_injections_at_startup': [
            {
                'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -153,6 +172,9 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
    await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
                         manager.server_stop_gracefully(servers[2].server_id))

+    # Do it here to prevent unexpected timeouts before quorum loss.
+    await update_group0_raft_op_timeout(servers[3].server_id, manager, raft_op_timeout)
+
    logger.info("release join-node-response_handler-before-read-barrier injection")
    injection_handler = InjectionHandler(manager.api,
                                         'join-node-response_handler-before-read-barrier',
@@ -169,7 +191,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
 async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: int) -> None:
    logger.info("starting a first node (the leader)")
    servers = [await manager.server_add(config={
-        'group0_raft_op_timeout_in_ms': raft_op_timeout,
        'error_injections_at_startup': [
            {
                'name': 'raft-group-registry-fd-threshold-in-ms',
@@ -189,6 +210,9 @@ async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: in
    await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
                         manager.server_stop_gracefully(servers[2].server_id))

+    # Do it here to prevent unexpected timeouts before quorum loss.
+    await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
+
    logger.info("attempting removenode for the second node")
    await manager.remove_node(servers[0].server_id, servers[1].server_id,
                            expected_error="raft operation [read_barrier] timed out, there is no raft quorum",
@@ -232,9 +256,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
    await asyncio.gather(*(manager.server_stop(srv.server_id) for srv in servers))

    # This ensures the read barriers below fail quickly without group 0 quorum.
-    logger.info(f"Decreasing group0_raft_op_timeout_in_ms on {servers}")
-    await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', raft_op_timeout)
-                           for srv in servers))
+    await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout) for srv in servers))

    logger.info(f"Restarting {servers[:2]} with no group 0 quorum")
    for idx, srv in enumerate(servers[:2]):
@@ -246,8 +268,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None

    # Increase the timeout back to 300s to ensure the new group 0 leader is elected before the first read barrier below
    # times out.
-    await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', 300000)
-                           for srv in servers))
+    await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, 300000) for srv in servers))

    logger.info(f"Restarting {servers[2:]} with group 0 quorum")
    for srv in servers[2:]: