Merge 'Fix flakiness in test_raft_snapshot_request' from Kamil Braun

Add workaround for scylladb/python-driver#295. Also an assert made at the end of the test was false, it is fixed with appropriate comment added. Closes scylladb/scylladb#17071 * github.com:scylladb/scylladb: test_raft_snapshot_request: fix flakiness test: topology/util: update comment for `reconnect_driver`
2024-01-31 09:30:26 +02:00
parent 84ddc37130 74bf60a8ca
commit f8d3070559
2 changed files with 24 additions and 5 deletions
--- a/test/topology/util.py
+++ b/test/topology/util.py
@@ -21,12 +21,23 @@ logger = logging.getLogger(__name__)


 async def reconnect_driver(manager: ManagerClient) -> Session:
-    """Workaround for scylladb/python-driver#170 and scylladb/python-driver#230:
-       the existing driver session may not reconnect, create a new one.
+    """Can be used as a workaround for scylladb/python-driver#295.
+
+       When restarting a node, a pre-existing session connected to the cluster
+       may reconnect to the restarted node multiple times. Even if we verify
+       that the session can perform a query on that node (e.g. like `wait_for_cql`,
+       which tries to select from system.local), the driver may again reconnect
+       after that, and following queries may fail.
+
+       The new session created by this function *should* not have this problem,
+       although (if I remember correctly) there is no 100% guarantee; still,
+       the chance of this problem appearing should be significantly decreased
+       with the new session.
    """
    logging.info(f"Reconnecting driver")
    manager.driver_close()
    await manager.driver_connect()
+    logging.info(f"Driver reconnected")
    cql = manager.cql
    assert(cql)
    return cql
--- a/test/topology_custom/test_raft_snapshot_request.py
+++ b/test/topology_custom/test_raft_snapshot_request.py
@@ -11,6 +11,7 @@ import logging

 from test.pylib.manager_client import ManagerClient
 from test.pylib.util import wait_for, wait_for_cql_and_get_hosts, read_barrier
+from test.topology.util import reconnect_driver


 logger = logging.getLogger(__name__)
@@ -32,7 +33,10 @@ async def trigger_snapshot(manager: ManagerClient, group0_id: str, ip_addr) -> N

@pytest.mark.asyncio
 async def test_raft_snapshot_request(manager: ManagerClient):
-    servers = await manager.servers_add(3)
+    cmdline = [
+        '--logger-log-level', 'raft=trace',
+        ]
+    servers = await manager.servers_add(3, cmdline=cmdline)
    cql = manager.get_cql()

    s1 = servers[0]
@@ -68,6 +72,7 @@ async def test_raft_snapshot_request(manager: ManagerClient):
    # Restarting the two servers will cause a newly elected leader to append a dummy command.
    await asyncio.gather(*(manager.server_restart(s.server_id) for s in servers[:2]))
    logger.info(f"Restarted {servers[:2]}")
+    cql = await reconnect_driver(manager)
    # Wait for one server to append the command and do a read_barrier on the other
    # to make sure both appended
    async def appended_command() -> int | None:
@@ -86,9 +91,10 @@ async def test_raft_snapshot_request(manager: ManagerClient):
        await trigger_snapshot(manager, group0_id, s.ip_addr)
        h = (await wait_for_cql_and_get_hosts(cql, [s], time.time() + 60))[0]
        snap = await get_raft_snap_id(cql, h)
-        logger.info("New snapshot ID on {s}: {snap}")
+        logger.info(f"New snapshot ID on {s}: {snap}")
    await manager.server_start(s2.server_id)
    logger.info(f"Server {s2} restarted")
+    cql = await reconnect_driver(manager)
    await wait_for_cql_and_get_hosts(cql, [s2], time.time() + 60)
    async def received_snapshot() -> str | None:
        new_s2_snap_id = await get_raft_snap_id(cql, h2)
@@ -98,4 +104,6 @@ async def test_raft_snapshot_request(manager: ManagerClient):
    new_s2_snap_id = await wait_for(received_snapshot, time.time() + 60)
    logger.info(f"{s2} received new snapshot: {new_s2_snap_id}")
    new_s2_log_size = await get_raft_log_size(cql, h2)
-    assert new_s2_log_size == 0
+    # Log size may be 1 because topology coordinator fiber may start and commit an entry
+    # after that last snapshot was created (the two events race with each other).
+    assert new_s2_log_size <= 1