test: test_raft_no_quorum: decrease group0_raft_op_timeout_in_ms after quorum loss
`test_raft_no_quorum.py::test_cannot_add_new_node` is currently flaky in dev
mode. The bootstrap of the first node can fail due to `add_entry()` timing
out (with the 1s timeout set by the test case).
Other test cases in this test file could fail in the same way as well, so we
need a general fix. We don't want to increase the timeout in dev mode, as it
would slow down the test. The solution is to keep the timeout unchanged, but
set it only after quorum is lost. This prevents unexpected timeouts of group0
operations with almost no impact on the test running time.
A note about the new `update_group0_raft_op_timeout` function: waiting for
the "completed re-reading configuration file" log message seems to be
necessary only for `test_quorum_lost_during_node_join_response_handler`,
but let's do it for all test cases just in case (including
`test_can_restart`, which shouldn't be flaky currently).
Fixes https://scylladb.atlassian.net/browse/SCYLLADB-913
Closes scylladb/scylladb#28998
(cherry picked from commit 526e5986fe)
Closes scylladb/scylladb#29068
Closes scylladb/scylladb#29097
This commit is contained in:
@@ -7,6 +7,7 @@ import logging
|
||||
|
||||
import pytest
|
||||
import asyncio
|
||||
from test.pylib.internal_types import ServerNum
|
||||
from test.pylib.manager_client import ManagerClient
|
||||
from test.cluster.conftest import skip_mode
|
||||
from test.pylib.rest_client import inject_error_one_shot, InjectionHandler, read_barrier
|
||||
@@ -20,6 +21,20 @@ def fixture_raft_op_timeout(build_mode):
|
||||
return 10000 if build_mode == 'debug' else 1000
|
||||
|
||||
|
||||
async def update_group0_raft_op_timeout(server_id: ServerNum, manager: ManagerClient, timeout: int) -> None:
    """Set ``group0_raft_op_timeout_in_ms`` to ``timeout`` on one server.

    For a server that is not currently running, the config value is simply
    updated. For a running server, the update is followed by a wait (up to
    60 seconds) for the "completed re-reading configuration file" message to
    appear in the server log after the point where the update was issued.

    :param server_id: id of the server whose config is updated.
    :param manager: manager client used to query and reconfigure the server.
    :param timeout: new value of ``group0_raft_op_timeout_in_ms``.
    """
    logger.info(f"Updating group0_raft_op_timeout_in_ms on server {server_id} to {timeout}")

    running = await manager.running_servers()
    is_running = any(info.server_id == server_id for info in running)

    if not is_running:
        await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
        return

    # For a live node, server_update_config only sends SIGHUP to the Scylla
    # process, so awaiting it does not guarantee that the new config file is
    # already active. Work around this by watching the log: remember the
    # current log position first, then wait for the re-read confirmation that
    # shows up after it.
    server_log = await manager.server_open_log(server_id)
    position = await server_log.mark()
    await manager.server_update_config(server_id, 'group0_raft_op_timeout_in_ms', timeout)
    await server_log.wait_for("completed re-reading configuration file", from_mark=position, timeout=60)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
@skip_mode('release', 'error injections are not supported in release mode')
|
||||
@skip_mode('debug', 'aarch64/debug is unpredictably slow', platform_key='aarch64')
|
||||
@@ -42,7 +57,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
|
||||
|
||||
config = {
|
||||
'direct_failure_detector_ping_timeout_in_ms': 300,
|
||||
'group0_raft_op_timeout_in_ms': raft_op_timeout,
|
||||
'error_injections_at_startup': [
|
||||
{
|
||||
'name': 'raft-group-registry-fd-threshold-in-ms',
|
||||
@@ -64,6 +78,10 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
|
||||
manager.server_stop_gracefully(servers[3].server_id),
|
||||
manager.server_stop_gracefully(servers[4].server_id))
|
||||
|
||||
# Do it here to prevent unexpected timeouts before quorum loss.
|
||||
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout)
|
||||
for srv in servers[:2]))
|
||||
|
||||
logger.info("starting a sixth node with no quorum")
|
||||
await manager.server_add(expected_error="raft operation [read_barrier] timed out, there is no raft quorum",
|
||||
timeout=60)
|
||||
@@ -76,7 +94,6 @@ async def test_cannot_add_new_node(manager: ManagerClient, raft_op_timeout: int)
|
||||
@skip_mode('debug', 'aarch64/debug is unpredictably slow', platform_key='aarch64')
|
||||
async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_timeout: int) -> None:
|
||||
config = {
|
||||
'group0_raft_op_timeout_in_ms': raft_op_timeout,
|
||||
'error_injections_at_startup': [
|
||||
{
|
||||
'name': 'raft-group-registry-fd-threshold-in-ms',
|
||||
@@ -107,6 +124,9 @@ async def test_quorum_lost_during_node_join(manager: ManagerClient, raft_op_time
|
||||
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
|
||||
manager.server_stop_gracefully(servers[2].server_id))
|
||||
|
||||
# Do it here to prevent unexpected timeouts before quorum loss.
|
||||
await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
|
||||
|
||||
logger.info("release join-node-before-add-entry injection")
|
||||
await injection_handler.message()
|
||||
|
||||
@@ -126,7 +146,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
|
||||
|
||||
logger.info("adding a fourth node")
|
||||
servers += [await manager.server_add(config={
|
||||
'group0_raft_op_timeout_in_ms': raft_op_timeout,
|
||||
'error_injections_at_startup': [
|
||||
{
|
||||
'name': 'raft-group-registry-fd-threshold-in-ms',
|
||||
@@ -153,6 +172,9 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
|
||||
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
|
||||
manager.server_stop_gracefully(servers[2].server_id))
|
||||
|
||||
# Do it here to prevent unexpected timeouts before quorum loss.
|
||||
await update_group0_raft_op_timeout(servers[3].server_id, manager, raft_op_timeout)
|
||||
|
||||
logger.info("release join-node-response_handler-before-read-barrier injection")
|
||||
injection_handler = InjectionHandler(manager.api,
|
||||
'join-node-response_handler-before-read-barrier',
|
||||
@@ -169,7 +191,6 @@ async def test_quorum_lost_during_node_join_response_handler(manager: ManagerCli
|
||||
async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: int) -> None:
|
||||
logger.info("starting a first node (the leader)")
|
||||
servers = [await manager.server_add(config={
|
||||
'group0_raft_op_timeout_in_ms': raft_op_timeout,
|
||||
'error_injections_at_startup': [
|
||||
{
|
||||
'name': 'raft-group-registry-fd-threshold-in-ms',
|
||||
@@ -189,6 +210,9 @@ async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: in
|
||||
await asyncio.gather(manager.server_stop_gracefully(servers[1].server_id),
|
||||
manager.server_stop_gracefully(servers[2].server_id))
|
||||
|
||||
# Do it here to prevent unexpected timeouts before quorum loss.
|
||||
await update_group0_raft_op_timeout(servers[0].server_id, manager, raft_op_timeout)
|
||||
|
||||
logger.info("attempting removenode for the second node")
|
||||
await manager.remove_node(servers[0].server_id, servers[1].server_id,
|
||||
expected_error="raft operation [read_barrier] timed out, there is no raft quorum",
|
||||
@@ -232,9 +256,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
|
||||
await asyncio.gather(*(manager.server_stop(srv.server_id) for srv in servers))
|
||||
|
||||
# This ensures the read barriers below fail quickly without group 0 quorum.
|
||||
logger.info(f"Decreasing group0_raft_op_timeout_in_ms on {servers}")
|
||||
await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', raft_op_timeout)
|
||||
for srv in servers))
|
||||
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, raft_op_timeout) for srv in servers))
|
||||
|
||||
logger.info(f"Restarting {servers[:2]} with no group 0 quorum")
|
||||
for idx, srv in enumerate(servers[:2]):
|
||||
@@ -246,8 +268,7 @@ async def test_can_restart(manager: ManagerClient, raft_op_timeout: int) -> None
|
||||
|
||||
# Increase the timeout back to 300s to ensure the new group 0 leader is elected before the first read barrier below
|
||||
# times out.
|
||||
await asyncio.gather(*(manager.server_update_config(srv.server_id, 'group0_raft_op_timeout_in_ms', 300000)
|
||||
for srv in servers))
|
||||
await asyncio.gather(*(update_group0_raft_op_timeout(srv.server_id, manager, 300000) for srv in servers))
|
||||
|
||||
logger.info(f"Restarting {servers[2:]} with group 0 quorum")
|
||||
for srv in servers[2:]:
|
||||
|
||||
Reference in New Issue
Block a user