From 39339b9f703999ed98d751b1541c69fbaea3d367 Mon Sep 17 00:00:00 2001 From: Kamil Braun Date: Tue, 30 Jan 2024 15:36:29 +0100 Subject: [PATCH 1/2] test: topology/util: update comment for `reconnect_driver` The issues mentioned in the comment before are already fixed. Unfortunately, there is another, opposite issue which this function can be used for. The previous issue was about the existing driver session not reconnecting. The current issue is about the existing driver session reconnecting too much... (and in the middle of queries.) --- test/topology/util.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/test/topology/util.py b/test/topology/util.py index 8dba66069c..157b777f79 100644 --- a/test/topology/util.py +++ b/test/topology/util.py @@ -21,12 +21,23 @@ logger = logging.getLogger(__name__) async def reconnect_driver(manager: ManagerClient) -> Session: - """Workaround for scylladb/python-driver#170 and scylladb/python-driver#230: - the existing driver session may not reconnect, create a new one. + """Can be used as a workaround for scylladb/python-driver#295. + + When restarting a node, a pre-existing session connected to the cluster + may reconnect to the restarted node multiple times. Even if we verify + that the session can perform a query on that node (e.g. like `wait_for_cql`, + which tries to select from system.local), the driver may again reconnect + after that, and following queries may fail. + + The new session created by this function *should* not have this problem, + although (if I remember correctly) there is no 100% guarantee; still, + the chance of this problem appearing should be significantly decreased + with the new session. """ logging.info(f"Reconnecting driver") manager.driver_close() await manager.driver_connect() + logging.info(f"Driver reconnected") cql = manager.cql assert(cql) return cql From 74bf60a8ca9fd2d6d0797e8ed67ad233b01b53ee Mon Sep 17 00:00:00 2001 From: Kamil Braun Date: Tue, 30 Jan 2024 15:40:33 +0100 Subject: [PATCH 2/2] test_raft_snapshot_request: fix flakiness Add workaround for scylladb/python-driver#295. Also an assert made at the end of the test was false, it is fixed with appropriate comment added. --- test/topology_custom/test_raft_snapshot_request.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/test/topology_custom/test_raft_snapshot_request.py b/test/topology_custom/test_raft_snapshot_request.py index 92b37fb825..7ba6fcafe0 100644 --- a/test/topology_custom/test_raft_snapshot_request.py +++ b/test/topology_custom/test_raft_snapshot_request.py @@ -11,6 +11,7 @@ import logging from test.pylib.manager_client import ManagerClient from test.pylib.util import wait_for, wait_for_cql_and_get_hosts, read_barrier +from test.topology.util import reconnect_driver logger = logging.getLogger(__name__) @@ -32,7 +33,10 @@ async def trigger_snapshot(manager: ManagerClient, group0_id: str, ip_addr) -> N @pytest.mark.asyncio async def test_raft_snapshot_request(manager: ManagerClient): - servers = await manager.servers_add(3) + cmdline = [ + '--logger-log-level', 'raft=trace', + ] + servers = await manager.servers_add(3, cmdline=cmdline) cql = manager.get_cql() s1 = servers[0] @@ -68,6 +72,7 @@ async def test_raft_snapshot_request(manager: ManagerClient): # Restarting the two servers will cause a newly elected leader to append a dummy command. await asyncio.gather(*(manager.server_restart(s.server_id) for s in servers[:2])) logger.info(f"Restarted {servers[:2]}") + cql = await reconnect_driver(manager) # Wait for one server to append the command and do a read_barrier on the other # to make sure both appended async def appended_command() -> int | None: @@ -86,9 +91,10 @@ async def test_raft_snapshot_request(manager: ManagerClient): await trigger_snapshot(manager, group0_id, s.ip_addr) h = (await wait_for_cql_and_get_hosts(cql, [s], time.time() + 60))[0] snap = await get_raft_snap_id(cql, h) - logger.info("New snapshot ID on {s}: {snap}") + logger.info(f"New snapshot ID on {s}: {snap}") await manager.server_start(s2.server_id) logger.info(f"Server {s2} restarted") + cql = await reconnect_driver(manager) await wait_for_cql_and_get_hosts(cql, [s2], time.time() + 60) async def received_snapshot() -> str | None: new_s2_snap_id = await get_raft_snap_id(cql, h2) @@ -98,4 +104,6 @@ async def test_raft_snapshot_request(manager: ManagerClient): new_s2_snap_id = await wait_for(received_snapshot, time.time() + 60) logger.info(f"{s2} received new snapshot: {new_s2_snap_id}") new_s2_log_size = await get_raft_log_size(cql, h2) - assert new_s2_log_size == 0 + # Log size may be 1 because topology coordinator fiber may start and commit an entry + # after that last snapshot was created (the two events race with each other). + assert new_s2_log_size <= 1