From 3b14c5b84a5b356c59c45c70dff47992bfd5ff6a Mon Sep 17 00:00:00 2001 From: "Raphael S. Carvalho" Date: Tue, 23 Jan 2024 21:29:40 -0300 Subject: [PATCH] test/topology_experimental_raft: Add tablet split test Signed-off-by: Raphael S. Carvalho --- .../test_tablets.py | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/test/topology_experimental_raft/test_tablets.py b/test/topology_experimental_raft/test_tablets.py index 61e18955c0..7046425de3 100644 --- a/test/topology_experimental_raft/test_tablets.py +++ b/test/topology_experimental_raft/test_tablets.py @@ -551,3 +551,73 @@ async def test_tablet_cleanup(manager: ManagerClient): # Bonus: check that commitlog_cleanups doesn't have any garbage after restart. assert 0 == (await cql.run_async("SELECT COUNT(*) FROM system.commitlog_cleanups", host=hosts[0]))[0].count + +async def get_tablet_count(manager: ManagerClient, server: ServerInfo, keyspace_name: str, table_name: str): + host = manager.cql.cluster.metadata.get_host(server.ip_addr) + + # read_barrier is needed to ensure that local tablet metadata on the queried node + # reflects the finalized tablet movement. + await read_barrier(manager.cql, host) + + table_id = await manager.get_table_id(keyspace_name, table_name) + rows = await manager.cql.run_async(f"SELECT tablet_count FROM system.tablets where " + f"table_id = {table_id}", host=host) + return rows[0].tablet_count + +@pytest.mark.asyncio +@skip_mode('release', 'error injections are not supported in release mode') +async def test_tablet_split(manager: ManagerClient): + logger.info("Bootstrapping cluster") + cmdline = [ + '--logger-log-level', 'storage_service=debug', + '--target-tablet-size-in-bytes', '1024', + ] + servers = [await manager.server_add(cmdline=cmdline)] + + await manager.api.disable_tablet_balancing(servers[0].ip_addr) + + cql = manager.get_cql() + await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") + await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + + # enough to trigger multiple splits with max size of 1024 bytes. + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + + async def check(): + logger.info("Checking table") + cql = manager.get_cql() + rows = await cql.run_async("SELECT * FROM test.test;") + assert len(rows) == len(keys) + for r in rows: + assert r.c == r.pk + + await check() + + await manager.api.flush_keyspace(servers[0].ip_addr, "test") + + tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') + assert tablet_count == 1 + + logger.info("Adding new server") + servers.append(await manager.server_add(cmdline=cmdline)) + + # Increases the chance of tablet migration concurrent with split + await inject_error_one_shot_on(manager, "tablet_allocator_shuffle", servers) + await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers) + + s1_log = await manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() + + # Now there's a split and migration need, so they'll potentially run concurrently. + await manager.api.enable_tablet_balancing(servers[0].ip_addr) + + await check() + time.sleep(5) # Give load balancer some time to do work + + await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) + + await check() + + tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') + assert tablet_count > 1