From 57a4e5594d1a332f0db6137a5b29976a0d181087 Mon Sep 17 00:00:00 2001 From: Asias He Date: Tue, 23 Jan 2024 11:03:25 +0800 Subject: [PATCH 1/3] test: Check repair status in ScyllaRESTAPIClient Raise an exception in case the repair is not successful. --- test/pylib/rest_client.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/pylib/rest_client.py b/test/pylib/rest_client.py index 93e239d11d..67bfced5bb 100644 --- a/test/pylib/rest_client.py +++ b/test/pylib/rest_client.py @@ -292,7 +292,8 @@ class ScyllaRESTAPIClient(): """Repair the given table and wait for it to complete""" sequence_number = await self.client.post_json(f"/storage_service/repair_async/{keyspace}", host=node_ip, params={"columnFamilies": table}) status = await self.client.get_json(f"/storage_service/repair_status", host=node_ip, params={"id": str(sequence_number)}) - return status + if status != 'SUCCESSFUL': + raise Exception(f"Repair id {sequence_number} on node {node_ip} for table {keyspace}.{table} failed: status={status}") class ScyllaMetrics: def __init__(self, lines: list[str]): From 7c230f17ccd103684e3c8fcfe632f634c970643a Mon Sep 17 00:00:00 2001 From: Asias He Date: Tue, 23 Jan 2024 11:04:38 +0800 Subject: [PATCH 2/3] test: Wait for nodes to be up before repair If a node is not UP yet, repair in the test will be a partial repair. Check that nodes see each other as UP before repair. 
Fixes #16859 --- test/topology_experimental_raft/test_tablets.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test/topology_experimental_raft/test_tablets.py b/test/topology_experimental_raft/test_tablets.py index cbcd18de27..0ad172b365 100644 --- a/test/topology_experimental_raft/test_tablets.py +++ b/test/topology_experimental_raft/test_tablets.py @@ -66,6 +66,16 @@ async def get_tablet_replica(manager: ManagerClient, server: ServerInfo, keyspac replicas = await get_tablet_replicas(manager, server, keyspace_name, table_name, token) return replicas[0] +async def repair_on_node(manager: ManagerClient, server: ServerInfo, servers: list[ServerInfo]): + node = server.ip_addr + await manager.servers_see_each_other(servers) + live_nodes_wanted = [s.ip_addr for s in servers] + live_nodes = await manager.api.get_alive_endpoints(node) + live_nodes_wanted.sort() + live_nodes.sort() + assert live_nodes == live_nodes_wanted + logger.info(f"Repair table on node {node} live_nodes={live_nodes} live_nodes_wanted={live_nodes_wanted}") + await manager.api.repair(node, "test", "test") @pytest.mark.asyncio async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(manager: ManagerClient): @@ -396,8 +406,7 @@ async def test_tablet_repair(manager: ManagerClient): keys = range(256) await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) - logger.info("Repair table") - await manager.api.repair(servers[0].ip_addr, "test", "test") + await repair_on_node(manager, servers[0], servers) async def check(): logger.info("Checking table") @@ -440,8 +449,8 @@ async def test_tablet_missing_data_repair(manager: ManagerClient): logger.info(f"Started server {idx}"); await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - logger.info("Repair table") - await manager.api.repair(servers[0].ip_addr, "test", "test") + + await repair_on_node(manager, servers[0], servers) async def check(): 
logger.info("Checking table") From 99e3d2ce7210eef489a2fa02544fee73465c800c Mon Sep 17 00:00:00 2001 From: Asias He Date: Tue, 23 Jan 2024 11:10:34 +0800 Subject: [PATCH 3/3] test: Enable test_tablet_missing_data_repair again Fixes #16859 --- test/topology_experimental_raft/test_tablets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/topology_experimental_raft/test_tablets.py b/test/topology_experimental_raft/test_tablets.py index 0ad172b365..60f704a9ff 100644 --- a/test/topology_experimental_raft/test_tablets.py +++ b/test/topology_experimental_raft/test_tablets.py @@ -419,7 +419,6 @@ async def test_tablet_repair(manager: ManagerClient): await cql.run_async("DROP KEYSPACE test;") -@pytest.mark.skip(reason="failing a lot, see https://github.com/scylladb/scylladb/issues/16859") @pytest.mark.repair @pytest.mark.asyncio async def test_tablet_missing_data_repair(manager: ManagerClient):