Merge 'Fix test_tablet_missing_data_repair' from Asias He

This PR fixes test_tablet_missing_data_repair and enables the test again.

If a node is not UP yet, the repair in the test will be a partial repair. A partial repair does not repair all the data, which causes the check of rows after repair to fail. To avoid this, check that the nodes see each other as UP before running repair.

Closes scylladb/scylladb#16930

* github.com:scylladb/scylladb:
  test: Enable test_tablet_missing_data_repair again
  test: Wait for nodes to be up when repair
  test: Check repair status in ScyllaRESTAPIClient
This commit is contained in:
Botond Dénes
2024-01-23 10:38:13 +02:00
2 changed files with 15 additions and 6 deletions

View File

@@ -292,7 +292,8 @@ class ScyllaRESTAPIClient():
"""Repair the given table and wait for it to complete"""
sequence_number = await self.client.post_json(f"/storage_service/repair_async/{keyspace}", host=node_ip, params={"columnFamilies": table})
status = await self.client.get_json(f"/storage_service/repair_status", host=node_ip, params={"id": str(sequence_number)})
return status
if status != 'SUCCESSFUL':
raise Exception(f"Repair id {sequence_number} on node {node_ip} for table {keyspace}.{table} failed: status={status}")
class ScyllaMetrics:
def __init__(self, lines: list[str]):

View File

@@ -66,6 +66,16 @@ async def get_tablet_replica(manager: ManagerClient, server: ServerInfo, keyspac
replicas = await get_tablet_replicas(manager, server, keyspace_name, table_name, token)
return replicas[0]
async def repair_on_node(manager: ManagerClient, server: ServerInfo, servers: list[ServerInfo],
                         keyspace: str = "test", table: str = "test"):
    """Repair ``keyspace.table`` on ``server`` after checking that all ``servers`` are alive.

    A repair started while some node is not yet seen as UP would be a partial
    repair that does not cover all the data, so the set of alive endpoints
    reported by ``server`` is verified before the repair is requested.

    Args:
        manager: client used to reach the cluster and its REST API.
        server: node on which the repair is started.
        servers: every node expected to be alive from ``server``'s point of view.
        keyspace: keyspace to repair; defaults to ``"test"``.
        table: table to repair; defaults to ``"test"``.

    Raises:
        AssertionError: if the alive endpoints reported by ``server`` differ
            from the addresses of ``servers``.
    """
    node = server.ip_addr
    await manager.servers_see_each_other(servers)
    # Compare sorted copies so ordering differences do not matter and the
    # list returned by the API is left untouched.
    live_nodes_wanted = sorted(s.ip_addr for s in servers)
    live_nodes = sorted(await manager.api.get_alive_endpoints(node))
    assert live_nodes == live_nodes_wanted, \
        f"Node {node} does not see the expected live nodes: live_nodes={live_nodes} live_nodes_wanted={live_nodes_wanted}"
    logger.info(f"Repair table on node {node} live_nodes={live_nodes} live_nodes_wanted={live_nodes_wanted}")
    await manager.api.repair(node, keyspace, table)
@pytest.mark.asyncio
async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(manager: ManagerClient):
@@ -396,8 +406,7 @@ async def test_tablet_repair(manager: ManagerClient):
keys = range(256)
await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys])
logger.info("Repair table")
await manager.api.repair(servers[0].ip_addr, "test", "test")
await repair_on_node(manager, servers[0], servers)
async def check():
logger.info("Checking table")
@@ -410,7 +419,6 @@ async def test_tablet_repair(manager: ManagerClient):
await cql.run_async("DROP KEYSPACE test;")
@pytest.mark.skip(reason="failing a lot, see https://github.com/scylladb/scylladb/issues/16859")
@pytest.mark.repair
@pytest.mark.asyncio
async def test_tablet_missing_data_repair(manager: ManagerClient):
@@ -440,8 +448,8 @@ async def test_tablet_missing_data_repair(manager: ManagerClient):
logger.info(f"Started server {idx}");
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
logger.info("Repair table")
await manager.api.repair(servers[0].ip_addr, "test", "test")
await repair_on_node(manager, servers[0], servers)
async def check():
logger.info("Checking table")