From 50ce0aaf1c5fd2fa7ae52f7f372e5e84a3759f5b Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 09:30:44 +0200 Subject: [PATCH 01/56] test/topology/util: new_test_keyspace: accept ManagerClient Following patch will convert topology tests to use new_test_keyspace and friends. Some tests restart server and reset the driver connection so we cannot use the original cql Session for dropping the created keyspace in the `finally` block. Pass the ManagerClient instead to get a new cql session for dropping the keyspace. Signed-off-by: Benny Halevy --- test/topology/test_aggregation.py | 4 ++-- test/topology/test_mv.py | 14 +++++++------- test/topology/util.py | 20 ++++++++++---------- test/topology_custom/test_tombstone_gc.py | 14 +++++++------- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/test/topology/test_aggregation.py b/test/topology/test_aggregation.py index 97c6cf917d..61a1122aa9 100644 --- a/test/topology/test_aggregation.py +++ b/test/topology/test_aggregation.py @@ -39,8 +39,8 @@ async def test_cancel_mapreduce(manager: ManagerClient): [host1] = filter(lambda host: host.address == s1.ip_addr, hosts) host_id2 = await manager.get_host_id(s2.server_id) - async with new_test_keyspace(cql, "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks: - async with new_test_table(cql, ks, "pk int PRIMARY KEY, v int") as t: + async with new_test_keyspace(manager, "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks: + async with new_test_table(manager, ks, "pk int PRIMARY KEY, v int") as t: # Distribute data across the nodes. for _ in range(250): # Note: CQL int is a 32-bit integer. diff --git a/test/topology/test_mv.py b/test/topology/test_mv.py index 40953308ac..8666a1cae0 100644 --- a/test/topology/test_mv.py +++ b/test/topology/test_mv.py @@ -34,14 +34,14 @@ async def test_mv_tombstone_gc_setting(manager): be here and not in the single-node cqlpy. 
""" cql = manager.cql - async with new_test_keyspace(cql, ksdef) as keyspace: - async with new_test_table(cql, keyspace, "p int primary key, x int") as table: + async with new_test_keyspace(manager, ksdef) as keyspace: + async with new_test_table(manager, keyspace, "p int primary key, x int") as table: # Adding "WITH tombstone_gc = ..." In the CREATE MATERIALIZED VIEW: - async with new_materialized_view(cql, table, "*", "p, x", "p is not null and x is not null", "WITH tombstone_gc = {'mode': 'repair'}") as mv: + async with new_materialized_view(manager, table, "*", "p, x", "p is not null and x is not null", "WITH tombstone_gc = {'mode': 'repair'}") as mv: s = list(cql.execute(f"DESC {mv}"))[0].create_statement assert "'mode': 'repair'" in s # Adding "WITH tombstone_gc = ..." In the ALTER MATERIALIZED VIEW: - async with new_materialized_view(cql, table, "*", "p, x", "p is not null and x is not null") as mv: + async with new_materialized_view(manager, table, "*", "p, x", "p is not null and x is not null") as mv: s = list(cql.execute(f"DESC {mv}"))[0].create_statement assert not "'mode': 'repair'" in s await cql.run_async("ALTER MATERIALIZED VIEW " + mv + " WITH tombstone_gc = {'mode': 'repair'}") @@ -57,11 +57,11 @@ async def test_mv_tombstone_gc_not_inherited(manager): demonstrates the existing behavior. 
""" cql = manager.cql - async with new_test_keyspace(cql, ksdef) as keyspace: - async with new_test_table(cql, keyspace, "p int primary key, x int", "WITH tombstone_gc = {'mode': 'repair'}") as table: + async with new_test_keyspace(manager, ksdef) as keyspace: + async with new_test_table(manager, keyspace, "p int primary key, x int", "WITH tombstone_gc = {'mode': 'repair'}") as table: s = list(cql.execute(f"DESC {table}"))[0].create_statement assert "'mode': 'repair'" in s - async with new_materialized_view(cql, table, "*", "p, x", "p is not null and x is not null") as mv: + async with new_materialized_view(manager, table, "*", "p, x", "p is not null and x is not null") as mv: s = list(cql.execute(f"DESC {mv}"))[0].create_statement # Base's setting is NOT inherited to the view: assert not "'mode': 'repair'" in s diff --git a/test/topology/util.py b/test/topology/util.py index 6b55d9c000..c315845753 100644 --- a/test/topology/util.py +++ b/test/topology/util.py @@ -470,22 +470,22 @@ async def wait_new_coordinator_elected(manager: ManagerClient, expected_num_of_e await wait_for(new_coordinator_elected, deadline=deadline) @asynccontextmanager -async def new_test_keyspace(cql, opts, host=None): +async def new_test_keyspace(manager: ManagerClient, opts, host=None): """ A utility function for creating a new temporary keyspace with given options. 
It can be used in a "async with", as: - async with new_test_keyspace(cql, '...') as keyspace: + async with new_test_keyspace(ManagerClient, '...') as keyspace: """ keyspace = unique_name() - await cql.run_async("CREATE KEYSPACE " + keyspace + " " + opts, host=host) + await manager.get_cql().run_async("CREATE KEYSPACE " + keyspace + " " + opts, host=host) try: yield keyspace finally: - await cql.run_async("DROP KEYSPACE " + keyspace, host=host) + await manager.get_cql().run_async("DROP KEYSPACE " + keyspace, host=host) previously_used_table_names = [] @asynccontextmanager -async def new_test_table(cql, keyspace, schema, extra="", host=None, reuse_tables=True): +async def new_test_table(manager: ManagerClient, keyspace, schema, extra="", host=None, reuse_tables=True): """ A utility function for creating a new temporary table with a given schema. Because Scylla becomes slower when a huge number of uniquely-named tables @@ -503,27 +503,27 @@ async def new_test_table(cql, keyspace, schema, extra="", host=None, reuse_table else: table_name = unique_name() table = keyspace + "." + table_name - await cql.run_async("CREATE TABLE " + table + "(" + schema + ")" + extra, host=host) + await manager.get_cql().run_async("CREATE TABLE " + table + "(" + schema + ")" + extra, host=host) try: yield table finally: - await cql.run_async("DROP TABLE " + table, host=host) + await manager.get_cql().run_async("DROP TABLE " + table, host=host) if reuse_tables: previously_used_table_names.append(table_name) @asynccontextmanager -async def new_materialized_view(cql, table, select, pk, where, extra=""): +async def new_materialized_view(manager: ManagerClient, table, select, pk, where, extra=""): """ A utility function for creating a new temporary materialized view in an existing table. """ keyspace = table.split('.')[0] mv = keyspace + "." 
+ unique_name() - await cql.run_async(f"CREATE MATERIALIZED VIEW {mv} AS SELECT {select} FROM {table} WHERE {where} PRIMARY KEY ({pk}) {extra}") + await manager.get_cql().run_async(f"CREATE MATERIALIZED VIEW {mv} AS SELECT {select} FROM {table} WHERE {where} PRIMARY KEY ({pk}) {extra}") try: yield mv finally: - await cql.run_async(f"DROP MATERIALIZED VIEW {mv}") + await manager.get_cql().run_async(f"DROP MATERIALIZED VIEW {mv}") async def get_raft_log_size(cql, host) -> int: diff --git a/test/topology_custom/test_tombstone_gc.py b/test/topology_custom/test_tombstone_gc.py index fda85ddcc7..3d20679a23 100644 --- a/test/topology_custom/test_tombstone_gc.py +++ b/test/topology_custom/test_tombstone_gc.py @@ -35,8 +35,8 @@ async def test_default_tombstone_gc(manager: ManagerClient, rf: int, tablets: bo _ = [await manager.server_add() for _ in range(2)] cql = manager.get_cql() tablets_enabled = "true" if tablets else "false" - async with new_test_keyspace(cql, f"with replication = {{ 'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} and tablets = {{ 'enabled': {tablets_enabled} }}") as keyspace: - async with new_test_table(cql, keyspace, "p int primary key, x int") as table: + async with new_test_keyspace(manager, f"with replication = {{ 'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} and tablets = {{ 'enabled': {tablets_enabled} }}") as keyspace: + async with new_test_table(manager, keyspace, "p int primary key, x int") as table: check_tombstone_gc_mode(cql, table, get_expected_tombstone_gc_mode(rf, tablets)) @@ -47,8 +47,8 @@ async def test_default_tombstone_gc_does_not_override(manager: ManagerClient, rf _ = [await manager.server_add() for _ in range(2)] cql = manager.get_cql() tablets_enabled = "true" if tablets else "false" - async with new_test_keyspace(cql, f"with replication = {{ 'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} and tablets = {{ 'enabled': {tablets_enabled} }}") as keyspace: - async with 
new_test_table(cql, keyspace, "p int primary key, x int", " with tombstone_gc = {'mode': 'disabled'}") as table: + async with new_test_keyspace(manager, f"with replication = {{ 'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} and tablets = {{ 'enabled': {tablets_enabled} }}") as keyspace: + async with new_test_table(manager, keyspace, "p int primary key, x int", " with tombstone_gc = {'mode': 'disabled'}") as table: await cql.run_async(f"ALTER TABLE {table} add y int") check_tombstone_gc_mode(cql, table, "disabled") @@ -103,12 +103,12 @@ async def test_group0_tombstone_gc(manager: ManagerClient): # create/alter/drop a few tables async def alter_system_schema(keyspace=None, table_count=3): if not keyspace: - async with new_test_keyspace(cql, "with replication = { 'class': 'NetworkTopologyStrategy', 'replication_factor': 2}", host=host_primary) as keyspace: + async with new_test_keyspace(manager, "with replication = { 'class': 'NetworkTopologyStrategy', 'replication_factor': 2}", host=host_primary) as keyspace: alter_system_schema(keyspace, table_count) return for _ in range(table_count): - async with new_test_table(cql, keyspace, "p int primary key, x int", host=host_primary, reuse_tables=False) as table: + async with new_test_table(manager, keyspace, "p int primary key, x int", host=host_primary, reuse_tables=False) as table: await cql.run_async(f"ALTER TABLE {table} add y int") def get_tombstone(row): @@ -164,7 +164,7 @@ async def test_group0_tombstone_gc(manager: ManagerClient): await wait_for(partial(tombstone_gc_completed, tombstone_mark), deadline) with disable_schema_agreement_wait(cql): - async with new_test_keyspace(cql, "with replication = { 'class': 'NetworkTopologyStrategy', 'replication_factor': 2}", host=host_primary) as keyspace: + async with new_test_keyspace(manager, "with replication = { 'class': 'NetworkTopologyStrategy', 'replication_factor': 2}", host=host_primary) as keyspace: await alter_system_schema(keyspace) tombstone_mark = 
datetime.now(timezone.utc) From 5d448f721e7fed65acb74c5aaa4532662bd6bb40 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 07:37:41 +0200 Subject: [PATCH 02/56] test/topology/util: CREATE KEYSPACE IF NOT EXISTS Work around spurious keyspace creation errors due to retries caused by https://github.com/scylladb/python-driver/issues/317. This is safe since the function uses a unique_name for the keyspace so it should never exist by mistake. Signed-off-by: Benny Halevy --- test/topology/util.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/test/topology/util.py b/test/topology/util.py index c315845753..27fde95ff3 100644 --- a/test/topology/util.py +++ b/test/topology/util.py @@ -286,7 +286,7 @@ async def start_writes(cql: Session, rf: int, cl: ConsistencyLevel, concurrency: stop_event = asyncio.Event() ks_name = unique_name() - await cql.run_async(f"CREATE KEYSPACE {ks_name} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}}") + await cql.run_async(f"CREATE KEYSPACE IF NOT EXISTS {ks_name} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}}") await cql.run_async(f"USE {ks_name}") await cql.run_async(f"CREATE TABLE tbl (pk int PRIMARY KEY, v int)") @@ -322,7 +322,7 @@ async def start_writes_to_cdc_table(cql: Session, concurrency: int = 3): stop_event = asyncio.Event() ks_name = unique_name() - await cql.run_async(f"CREATE KEYSPACE {ks_name} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 3}} AND tablets = {{ 'enabled': false }}") + await cql.run_async(f"CREATE KEYSPACE IF NOT EXISTS {ks_name} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 3}} AND tablets = {{ 'enabled': false }}") await cql.run_async(f"CREATE TABLE {ks_name}.tbl (pk int PRIMARY KEY, v int) WITH cdc = {{'enabled':true}}") stmt = cql.prepare(f"INSERT INTO {ks_name}.tbl (pk, v) VALUES (?, 0)") @@ -477,7 +477,9 @@ async def 
new_test_keyspace(manager: ManagerClient, opts, host=None): async with new_test_keyspace(ManagerClient, '...') as keyspace: """ keyspace = unique_name() - await manager.get_cql().run_async("CREATE KEYSPACE " + keyspace + " " + opts, host=host) + # Use CREATE KEYSPACE IF NOT EXISTS as a workaround for + # https://github.com/scylladb/python-driver/issues/317 + await manager.get_cql().run_async(f"CREATE KEYSPACE IF NOT EXISTS {keyspace} {opts}", host=host) try: yield keyspace finally: From f9463023697e564722f6d0d9d6e66b74ea76917c Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 17:56:02 +0200 Subject: [PATCH 03/56] test/topology/util: refactor new_test_keyspace Define create_new_test_keyspace, which can be used in cases where we cannot automatically drop the newly created keyspace due to, e.g., loss of raft majority at the end of the test. Signed-off-by: Benny Halevy --- test/topology/util.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/test/topology/util.py b/test/topology/util.py index 27fde95ff3..f717e0a3db 100644 --- a/test/topology/util.py +++ b/test/topology/util.py @@ -469,6 +469,17 @@ async def wait_new_coordinator_elected(manager: ManagerClient, expected_num_of_e await wait_for(new_coordinator_elected, deadline=deadline) +async def create_new_test_keyspace(cql: Session, opts, host=None): + """ + A utility function for creating a new temporary keyspace with given + options. + """ + keyspace = unique_name() + # Use CREATE KEYSPACE IF NOT EXISTS as a workaround for + # https://github.com/scylladb/python-driver/issues/317 + await cql.run_async(f"CREATE KEYSPACE IF NOT EXISTS {keyspace} {opts}", host=host) + return keyspace + @asynccontextmanager async def new_test_keyspace(manager: ManagerClient, opts, host=None): """ @@ -476,10 +487,7 @@ async def new_test_keyspace(manager: ManagerClient, opts, host=None): options. 
It can be used in a "async with", as: async with new_test_keyspace(ManagerClient, '...') as keyspace: """ - keyspace = unique_name() - # Use CREATE KEYSPACE IF NOT EXISTS as a workaround for - # https://github.com/scylladb/python-driver/issues/317 - await manager.get_cql().run_async(f"CREATE KEYSPACE IF NOT EXISTS {keyspace} {opts}", host=host) + keyspace = await create_new_test_keyspace(manager.get_cql(), opts, host) try: yield keyspace finally: From 0fd1b846fec64e62a6c8d3503cb1a7ed22eae2f7 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 18:45:19 +0200 Subject: [PATCH 04/56] test/topology/util: new_test_keyspace: drop keyspace only on success When the test fails with an exception, keep the keyspace intact for post-mortem analysis. Signed-off-by: Benny Halevy --- test/topology/util.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/topology/util.py b/test/topology/util.py index f717e0a3db..5837945a33 100644 --- a/test/topology/util.py +++ b/test/topology/util.py @@ -490,7 +490,10 @@ async def new_test_keyspace(manager: ManagerClient, opts, host=None): keyspace = await create_new_test_keyspace(manager.get_cql(), opts, host) try: yield keyspace - finally: + except: + logger.info(f"Error happened while using keyspace '{keyspace}', the keyspace is left in place for investigation") + raise + else: await manager.get_cql().run_async("DROP KEYSPACE " + keyspace, host=host) previously_used_table_names = [] From a66ddb7c04af220f5deebd78663dbde0ffc209e6 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 07:48:02 +0200 Subject: [PATCH 05/56] topology/test_tls: test_upgrade_to_ssl: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology/test_tls.py | 125 +++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 63 deletions(-) diff --git a/test/topology/test_tls.py b/test/topology/test_tls.py index 417f5649bf..e67671947e 100644 --- a/test/topology/test_tls.py +++ b/test/topology/test_tls.py @@ 
-6,6 +6,7 @@ from test.pylib.manager_client import ManagerClient from cassandra.connection import ConnectionShutdown +from test.topology.util import new_test_keyspace import asyncio import logging @@ -24,79 +25,77 @@ async def test_upgrade_to_ssl(manager: ManagerClient) -> None: "all": [7001], } - ks = 'ks' cf = 'cf' servers = await manager.running_servers() cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 3}}") - await cql.run_async(f"CREATE TABLE {ks}.{cf} (pk int PRIMARY KEY) WITH tombstone_gc = {{'mode': 'immediate'}}") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.{cf} (pk int PRIMARY KEY) WITH tombstone_gc = {{'mode': 'immediate'}}") - async def update_config_and_restart(mode): - for srv in servers: - # get the log and current pos - log = await manager.server_open_log(srv.server_id) - mark = await log.mark() - # stop one server - await manager.server_stop_gracefully(srv.server_id) - # change internode encryption - seo = (await manager.server_get_config(srv.server_id))['server_encryption_options'] - seo['internode_encryption'] = mode - await manager.server_update_config(srv.server_id, "server_encryption_options", seo) - # restart - await manager.server_start(srv.server_id) - # now check we get the expected messaging server listening lines in log - expected_ports = mode2ports[mode] - pattern = "|".join(["port " + str(port) for port in expected_ports]) - res = await log.grep(pattern, from_mark=mark) - assert len(res) == len(expected_ports), \ - f"The listened ports are not same as expected! 
" \ - f"Expected ports: {expected_ports}\nReal listened ports: {res}" + async def update_config_and_restart(mode): + for srv in servers: + # get the log and current pos + log = await manager.server_open_log(srv.server_id) + mark = await log.mark() + # stop one server + await manager.server_stop_gracefully(srv.server_id) + # change internode encryption + seo = (await manager.server_get_config(srv.server_id))['server_encryption_options'] + seo['internode_encryption'] = mode + await manager.server_update_config(srv.server_id, "server_encryption_options", seo) + # restart + await manager.server_start(srv.server_id) + # now check we get the expected messaging server listening lines in log + expected_ports = mode2ports[mode] + pattern = "|".join(["port " + str(port) for port in expected_ports]) + res = await log.grep(pattern, from_mark=mark) + assert len(res) == len(expected_ports), \ + f"The listened ports are not same as expected! " \ + f"Expected ports: {expected_ports}\nReal listened ports: {res}" - async def reconnect(): - manager.driver_close() - await manager.driver_connect() - return manager.get_cql() + async def reconnect(): + manager.driver_close() + await manager.driver_connect() + return manager.get_cql() - async def run_retry_async(stmt : str): - lcql = cql - while True: - try: - await lcql.run_async(stmt) - return - except ConnectionShutdown: - lcql = await reconnect(); + async def run_retry_async(stmt : str): + lcql = cql + while True: + try: + await lcql.run_async(stmt) + return + except ConnectionShutdown: + lcql = await reconnect(); - # iterate from none to all and back - for mode in ["none", "transitional", "all", "transitional", "none"]: - # run a bunch of inserts in background. TODO: have something akin to cassandra-stress - # we can run in separate thread/process to really guarantee parallelism. 
- go_on = True + # iterate from none to all and back + for mode in ["none", "transitional", "all", "transitional", "none"]: + # run a bunch of inserts in background. TODO: have something akin to cassandra-stress + # we can run in separate thread/process to really guarantee parallelism. + go_on = True - async def write_in_background(): - count = 0; - while go_on: - await run_retry_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({count});") - count = count + 1 - return count + async def write_in_background(): + count = 0; + while go_on: + await run_retry_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({count});") + count = count + 1 + return count -# f = asyncio.gather( -# *[run_retry_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k});") for k in range(count)] -# ) - f = write_in_background() + # f = asyncio.gather( + # *[run_retry_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k});") for k in range(count)] + # ) + f = write_in_background() - # do a rolling restart, updating the internode_encryption mode - await update_config_and_restart(mode) - go_on = False - # wait for the writes to finish - count = await f - cql = await reconnect() - # check writes completed even though we are so very rolling - await cql.run_async(f"SELECT COUNT(*) FROM {ks}.{cf}") - assert count == (await cql.run_async(f"SELECT COUNT(*) FROM {ks}.{cf}"))[0].count - # and drop data - await cql.run_async(f"TRUNCATE {ks}.{cf}") + # do a rolling restart, updating the internode_encryption mode + await update_config_and_restart(mode) + go_on = False + # wait for the writes to finish + count = await f + cql = await reconnect() + # check writes completed even though we are so very rolling + await cql.run_async(f"SELECT COUNT(*) FROM {ks}.{cf}") + assert count == (await cql.run_async(f"SELECT COUNT(*) FROM {ks}.{cf}"))[0].count + # and drop data + await cql.run_async(f"TRUNCATE {ks}.{cf}") - await cql.run_async(f"DROP TABLE {ks}.{cf};") - await cql.run_async(f"DROP KEYSPACE {ks};") + await cql.run_async(f"DROP TABLE 
{ks}.{cf};") From df84097a4b11119a12266e812ab4f19a65302b40 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 06/56] topology_custom/test_change_replication_factor_1_to_0: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_change_replication_factor_1_to_0.py | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/test/topology_custom/test_change_replication_factor_1_to_0.py b/test/topology_custom/test_change_replication_factor_1_to_0.py index d714c150e9..96beb3165d 100644 --- a/test/topology_custom/test_change_replication_factor_1_to_0.py +++ b/test/topology_custom/test_change_replication_factor_1_to_0.py @@ -12,7 +12,7 @@ from cassandra import ConsistencyLevel # type: ignore from cassandra.query import SimpleStatement # type: ignore from test.pylib.manager_client import ManagerClient from test.pylib.util import wait_for_cql_and_get_hosts -from test.topology.util import check_token_ring_and_group0_consistency +from test.topology.util import check_token_ring_and_group0_consistency, new_test_keyspace from test.pylib.util import wait_for logger = logging.getLogger(__name__) @@ -35,37 +35,37 @@ async def test_change_replication_factor_1_to_0(request: pytest.FixtureRequest, property_file={'dc': f'dc{i}', 'rack': f'myrack{i}'}) cql = manager.get_cql() - await cql.run_async("create keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc0': 1, 'dc1': 1}") - await cql.run_async("create table ks.t (pk int primary key)") + async with new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'dc0': 1, 'dc1': 1}") as ks: + await cql.run_async(f"create table {ks}.t (pk int primary key)") - srvs = await manager.running_servers() - await wait_for_cql_and_get_hosts(cql, srvs, time.time() + 60) + srvs = await manager.running_servers() + await wait_for_cql_and_get_hosts(cql, srvs, time.time() + 60) - stmt = cql.prepare(f"SELECT * FROM ks.t where pk = 
?") - stmt.consistency_level = ConsistencyLevel.LOCAL_QUORUM + stmt = cql.prepare(f"SELECT * FROM {ks}.t where pk = ?") + stmt.consistency_level = ConsistencyLevel.LOCAL_QUORUM - stop_event = asyncio.Event() + stop_event = asyncio.Event() - async def do_reads() -> None: - iteration = 0 - while not stop_event.is_set(): - start_time = time.time() - try: - await cql.run_async(stmt, [0]) - except Exception as e: - logger.error(f"Read started {time.time() - start_time}s ago failed: {e}") - raise - iteration += 1 - await asyncio.sleep(0.01) - logger.info(f"Finishing with iter {iteration}") + async def do_reads() -> None: + iteration = 0 + while not stop_event.is_set(): + start_time = time.time() + try: + await cql.run_async(stmt, [0]) + except Exception as e: + logger.error(f"Read started {time.time() - start_time}s ago failed: {e}") + raise + iteration += 1 + await asyncio.sleep(0.01) + logger.info(f"Finishing with iter {iteration}") - tasks = [asyncio.create_task(do_reads()) for _ in range(3)] + tasks = [asyncio.create_task(do_reads()) for _ in range(3)] - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc0': 1, 'dc1': 0}") + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc0': 1, 'dc1': 0}}") - await asyncio.sleep(1) - stop_event.set() - await asyncio.gather(*tasks) + await asyncio.sleep(1) + stop_event.set() + await asyncio.gather(*tasks) # Tests #22688 - we should be able to both do further alter:s of a keyspace # even after removing replication factor fully from a dc and decommission of said @@ -87,27 +87,27 @@ async def test_change_replication_factor_1_to_0_and_decommission(request: pytest property_file={'dc': f'dc{i}', 'rack': 'myrack'}) cql = manager.get_cql() - await cql.run_async("create keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc0': 1, 'dc1': 1}") - await cql.run_async("create table ks.t (pk int primary key)") + async with 
new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'dc0': 1, 'dc1': 1}") as ks: + await cql.run_async(f"create table {ks}.t (pk int primary key)") - srvs = await manager.running_servers() - sorted(srvs, key=lambda si: si.datacenter) - assert(srvs[1].datacenter == "dc1") + srvs = await manager.running_servers() + sorted(srvs, key=lambda si: si.datacenter) + assert(srvs[1].datacenter == "dc1") - await wait_for_cql_and_get_hosts(cql, srvs, time.time() + 60) + await wait_for_cql_and_get_hosts(cql, srvs, time.time() + 60) - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO ks.t (pk) VALUES ({k});") for k in keys]) + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.t (pk) VALUES ({k});") for k in keys]) - # dc1 = 0 -> remove me from said dc - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc0': 1, 'dc1': 0}") + # dc1 = 0 -> remove me from said dc + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc0': 1, 'dc1': 0}}") - logger.info(f"Decommissioning node {srvs[1]}") - - # decommission dc1 - await manager.decommission_node(srvs[1].server_id) - await check_token_ring_and_group0_consistency(manager) + logger.info(f"Decommissioning node {srvs[1]}") + + # decommission dc1 + await manager.decommission_node(srvs[1].server_id) + await check_token_ring_and_group0_consistency(manager) - # ensure this no-op alter still works - async with asyncio.timeout(30): - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc0': 1}") + # ensure this no-op alter still works + async with asyncio.timeout(30): + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc0': 1}}") From 59687c25e0ae67da46fec2c02d5102d9c63202f8 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 07/56] 
topology_custom/test_commitlog_segment_data_resurrection: use new_test_keyspace Signed-off-by: Benny Halevy --- ...est_commitlog_segment_data_resurrection.py | 115 +++++++++--------- 1 file changed, 59 insertions(+), 56 deletions(-) diff --git a/test/topology_custom/test_commitlog_segment_data_resurrection.py b/test/topology_custom/test_commitlog_segment_data_resurrection.py index acafe673ae..62ab7e21c4 100644 --- a/test/topology_custom/test_commitlog_segment_data_resurrection.py +++ b/test/topology_custom/test_commitlog_segment_data_resurrection.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 # from test.pylib.manager_client import ManagerClient +from test.topology.util import new_test_keyspace import pytest import os @@ -52,78 +53,80 @@ async def test_pinned_cl_segment_doesnt_resurrect_data(manager: ManagerClient): def get_cl_segments(): return {os.path.basename(s) for s in glob.glob(os.path.join(cl_path, "CommitLog-*"))} - await cql.run_async("create keyspace ks1 with replication = {'class': 'SimpleStrategy', 'replication_factor': 1}") - await cql.run_async("create keyspace ks2 with replication = {'class': 'SimpleStrategy', 'replication_factor': 1}") - await cql.run_async("create table ks1.tbl1 (pk int, ck int, primary key(pk, ck))") - await cql.run_async("create table ks2.tbl2 (pk int, ck int, v text, primary key(pk, ck)) WITH gc_grace_seconds = 0") + async with new_test_keyspace(manager, "with replication = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks1, \ + new_test_keyspace(manager, "with replication = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks2: + tbl1 = f"{ks1}.tbl1" + tbl2 = f"{ks2}.tbl2" + await cql.run_async(f"create table {tbl1} (pk int, ck int, primary key(pk, ck))") + await cql.run_async(f"create table {tbl2} (pk int, ck int, v text, primary key(pk, ck)) WITH gc_grace_seconds = 0") - cl_path = commitlog_path() - segments_before_writes = await get_segments_num() - 
segments_after_writes = segments_before_writes + cl_path = commitlog_path() + segments_before_writes = await get_segments_num() + segments_after_writes = segments_before_writes - logger.debug(f"Have {segments_after_writes} segments before writing data") + logger.debug(f"Have {segments_after_writes} segments before writing data") - insert_id_tbl1 = cql.prepare("INSERT INTO ks1.tbl1 (pk, ck) VALUES (?, ?)") - insert_id_tbl2 = cql.prepare("INSERT INTO ks2.tbl2 (pk, ck, v) VALUES (?, ?, ?)") - pk1 = 0 - pk2 = 1 - ck = 0 - value = "v" * 1024 + insert_id_tbl1 = cql.prepare(f"INSERT INTO {tbl1} (pk, ck) VALUES (?, ?)") + insert_id_tbl2 = cql.prepare(f"INSERT INTO {tbl2} (pk, ck, v) VALUES (?, ?, ?)") + pk1 = 0 + pk2 = 1 + ck = 0 + value = "v" * 1024 - logger.debug(f"Filling segment with mixed data from ks1.tbl1 and ks2.tbl2") + logger.debug(f"Filling segment with mixed data from {tbl1} and {tbl2}") - # Ensure at least one segment with writes from both tables - while segments_after_writes < segments_before_writes + 1: - cql.execute(insert_id_tbl1, (pk1, ck)) - cql.execute(insert_id_tbl2, (pk1, ck, value)) - ck = ck + 1 - segments_after_writes = await get_segments_num() + # Ensure at least one segment with writes from both tables + while segments_after_writes < segments_before_writes + 1: + cql.execute(insert_id_tbl1, (pk1, ck)) + cql.execute(insert_id_tbl2, (pk1, ck, value)) + ck = ck + 1 + segments_after_writes = await get_segments_num() - logger.debug(f"Filling segment(s) with ks2.tbl2 only") + logger.debug(f"Filling segment(s) with {tbl2} only") - while segments_after_writes < segments_before_writes + 3: - cql.execute(insert_id_tbl2, (pk1, ck, value)) - ck = ck + 1 - segments_after_writes = await get_segments_num() + while segments_after_writes < segments_before_writes + 3: + cql.execute(insert_id_tbl2, (pk1, ck, value)) + ck = ck + 1 + segments_after_writes = await get_segments_num() - cql.execute(f"DELETE FROM ks2.tbl2 WHERE pk = {pk1}") + cql.execute(f"DELETE FROM 
{tbl2} WHERE pk = {pk1}") - # We need to make sure the segment in which the above delete landed in - # is full, otherwise the memtable flush will not be able to destroy it. - logger.debug(f"Filling another segment with ks2.tbl2 (pk={pk2})") + # We need to make sure the segment in which the above delete landed in + # is full, otherwise the memtable flush will not be able to destroy it. + logger.debug(f"Filling another segment with {tbl2} (pk={pk2})") - while segments_after_writes < segments_before_writes + 4: - cql.execute(insert_id_tbl2, (pk2, ck, value)) - ck = ck + 1 - segments_after_writes = await get_segments_num() + while segments_after_writes < segments_before_writes + 4: + cql.execute(insert_id_tbl2, (pk2, ck, value)) + ck = ck + 1 + segments_after_writes = await get_segments_num() - segments_before = get_cl_segments() - logger.debug(f"Wrote {ck} rows, now have {segments_after_writes} segments ({segments_before}") + segments_before = get_cl_segments() + logger.debug(f"Wrote {ck} rows, now have {segments_after_writes} segments ({segments_before}") - logger.debug("Flush ks2.tbl2") - await manager.api.keyspace_flush(node_ip=server.ip_addr, keyspace="ks2", table="tbl2") - await manager.api.keyspace_compaction(node_ip=server.ip_addr, keyspace="ks2", table="tbl2") + logger.debug(f"Flush {tbl2}") + await manager.api.keyspace_flush(node_ip=server.ip_addr, keyspace=ks2, table="tbl2") + await manager.api.keyspace_compaction(node_ip=server.ip_addr, keyspace=ks2, table="tbl2") - segments_after = get_cl_segments() - logger.debug(f"After flush+compact, now have {await get_segments_num()} segments ({segments_after})") + segments_after = get_cl_segments() + logger.debug(f"After flush+compact, now have {await get_segments_num()} segments ({segments_after})") - assert len(list(cql.execute(f"SELECT * FROM ks1.tbl1 WHERE pk = {pk1}"))) > 0 - assert len(list(cql.execute(f"SELECT * FROM ks2.tbl2 WHERE pk = {pk1}"))) == 0 + assert len(list(cql.execute(f"SELECT * FROM {tbl1} WHERE 
pk = {pk1}"))) > 0 + assert len(list(cql.execute(f"SELECT * FROM {tbl2} WHERE pk = {pk1}"))) == 0 - # Need to ensure at least one segment was freed. - # We assume the last segment, containing the tombstone, was among the freed ones. - logger.debug(f"before seg {segments_before}, after seg {segments_after}") - removed_segments = segments_before - segments_after - assert len(removed_segments) > 0 + # Need to ensure at least one segment was freed. + # We assume the last segment, containing the tombstone, was among the freed ones. + logger.debug(f"before seg {segments_before}, after seg {segments_after}") + removed_segments = segments_before - segments_after + assert len(removed_segments) > 0 - logger.debug(f"The following segments were removed: {removed_segments}") + logger.debug(f"The following segments were removed: {removed_segments}") - logger.debug("Kill + restart the node") - await manager.server_stop(server.server_id) - await manager.server_start(server.server_id) + logger.debug("Kill + restart the node") + await manager.server_stop(server.server_id) + await manager.server_start(server.server_id) - manager.driver_close() - await manager.driver_connect() - cql = manager.cql + manager.driver_close() + await manager.driver_connect() + cql = manager.cql - assert len(list(cql.execute(f"SELECT * FROM ks2.tbl2 WHERE pk = {pk1}"))) == 0 + assert len(list(cql.execute(f"SELECT * FROM {tbl2} WHERE pk = {pk1}"))) == 0 From fdb339bf282dd3b72ed53aced945deef123de07b Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 08/56] topology_custom/test_compacting_reader_tombstone_gc_with_data_in_memtable: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_compacting_reader_tombstone_gc.py | 46 ++++++++++--------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/test/topology_custom/test_compacting_reader_tombstone_gc.py b/test/topology_custom/test_compacting_reader_tombstone_gc.py index 5c8597b1ed..a9d77c8df5 
100644 --- a/test/topology_custom/test_compacting_reader_tombstone_gc.py +++ b/test/topology_custom/test_compacting_reader_tombstone_gc.py @@ -5,6 +5,7 @@ # from test.pylib.manager_client import ManagerClient +from test.topology.util import new_test_keyspace import pytest import asyncio @@ -24,35 +25,36 @@ async def test_compacting_reader_tombstone_gc_with_data_in_memtable(manager: Man servers = [await manager.server_add(cmdline=cmdline)] cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int) WITH gc_grace_seconds = 0;") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};") as ks: + table = f"{ks}.test" + await cql.run_async(f"CREATE TABLE {table} (pk int PRIMARY KEY, c int) WITH gc_grace_seconds = 0;") - await manager.api.disable_autocompaction(servers[0].ip_addr, "test") + await manager.api.disable_autocompaction(servers[0].ip_addr, ks) - key = 7 # Whatever + key = 7 # Whatever - # Simulates scenario where node missed tombstone and has it written to sstable directly - # after repair, whereas the deleted data remains on memtable due to low write activity. + # Simulates scenario where node missed tombstone and has it written to sstable directly + # after repair, whereas the deleted data remains on memtable due to low write activity. - # write a expiring tombstone into a sstable (flushed below) - await cql.run_async(f'DELETE FROM test.test USING timestamp 10 WHERE pk = {key}') + # write a expiring tombstone into a sstable (flushed below) + await cql.run_async(f'DELETE FROM {table} USING timestamp 10 WHERE pk = {key}') - # waits for tombstone to expire - time.sleep(1) + # waits for tombstone to expire + time.sleep(1) - # system-wide flush to prevent CL segment from blocking tombstone GC in the read path. 
- await manager.api.flush_all_keyspaces(servers[0].ip_addr) + # system-wide flush to prevent CL segment from blocking tombstone GC in the read path. + await manager.api.flush_all_keyspaces(servers[0].ip_addr) - # write into memtable data shadowed by the tombstone now living in the sstable - await cql.run_async(f'INSERT INTO test.test (pk, c) VALUES ({key}, 0) USING timestamp 9') + # write into memtable data shadowed by the tombstone now living in the sstable + await cql.run_async(f'INSERT INTO {table} (pk, c) VALUES ({key}, 0) USING timestamp 9') - await manager.api.drop_sstable_caches(servers[0].ip_addr) + await manager.api.drop_sstable_caches(servers[0].ip_addr) - # Without cache, the compacting reader is bypassed; Verify that the data in memtable is discarded - bypass_cache_rows = cql.execute(f'SELECT pk, c FROM test.test WHERE pk = {key} BYPASS CACHE;') - assert len(list(bypass_cache_rows)) == 0 + # Without cache, the compacting reader is bypassed; Verify that the data in memtable is discarded + bypass_cache_rows = cql.execute(f'SELECT pk, c FROM {table} WHERE pk = {key} BYPASS CACHE;') + assert len(list(bypass_cache_rows)) == 0 - # With the cache, the compacting reader is involved; - # Verify that the tombstone is not purged, allowing it to shadow the data in memtable - through_cache_rows = cql.execute(f'SELECT pk, c FROM test.test WHERE pk = {key};') - assert len(list(through_cache_rows)) == 0 + # With the cache, the compacting reader is involved; + # Verify that the tombstone is not purged, allowing it to shadow the data in memtable + through_cache_rows = cql.execute(f'SELECT pk, c FROM {table} WHERE pk = {key};') + assert len(list(through_cache_rows)) == 0 From 205ed113dd65e8c5842e33b26a03e1074dc64e30 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 09/56] topology_custom/test_read_repair: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_read_repair.py | 267 
+++++++++++++---------- 1 file changed, 157 insertions(+), 110 deletions(-) diff --git a/test/topology_custom/test_read_repair.py b/test/topology_custom/test_read_repair.py index c7d992075a..a45cb6875c 100644 --- a/test/topology_custom/test_read_repair.py +++ b/test/topology_custom/test_read_repair.py @@ -24,6 +24,8 @@ from cassandra.murmur3 import murmur3 # type: ignore from test.pylib.util import wait_for_cql_and_get_hosts from test.pylib.internal_types import ServerInfo +from test.pylib.manager_client import ManagerClient +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) @@ -36,18 +38,55 @@ def serialize_int(i: int) -> str: def serialize_key(i: int) -> str: return struct.pack(">hl", 4, i).hex() +class DataClass: + @classmethod + def get_column_spec(self) -> str: + raise NotImplementedError() -class row_tombstone_data: + @classmethod + def get_unique_key(self) -> str: + raise NotImplementedError() + + @classmethod + def get_select_query(self, ks) -> str: + raise NotImplementedError() + + @classmethod + def generate_sstable(self, total_rows: int, live_rows: set[int], dead_timestamp: int, live_timestamp: int, + deletion_time: datetime.datetime) -> list[dict[str, Any]]: + raise NotImplementedError() + + @classmethod + def check_mutation_row(self, row, expected_live_rows: set[int]) -> tuple | None: + raise NotImplementedError() + + @classmethod + def check_page_count(self, page_count) -> None: + raise NotImplementedError() + + @classmethod + def check_result_row(self, i: int, row) -> None: + raise NotImplementedError() + +class row_tombstone_data(DataClass): pk = 0 v = 1 - column_spec = "pk int, ck int, v int, PRIMARY KEY (pk, ck)" - select_query = f"SELECT * FROM ks.tbl WHERE pk = {pk}" - unique_key = 'ck' + @classmethod + def get_column_spec(self) -> str: + return "pk int, ck int, v int, PRIMARY KEY (pk, ck)" + + @classmethod + def get_unique_key(self) -> str: + return 'ck' + + @classmethod + def get_select_query(self, ks) -> 
str: + return f"SELECT * FROM {ks}.tbl WHERE pk = {self.pk}" @classmethod def generate_sstable(cls, total_rows: int, live_rows: set[int], dead_timestamp: int, live_timestamp: int, - deletion_time: datetime.datetime): + deletion_time: datetime.datetime) -> list[dict[str, Any]]: rows = [] formatted_deletion_time = deletion_time.strftime("%Y-%m-%d %H:%M:%S") serialized_value = serialize_int(cls.v) @@ -94,7 +133,7 @@ class row_tombstone_data: return row.ck, is_live @classmethod - def check_page_count(cls, page_count): + def check_page_count(cls, page_count) -> None: assert page_count > 1 @classmethod @@ -104,12 +143,16 @@ class row_tombstone_data: assert row.v == cls.v -class partition_tombstone_data: +class partition_tombstone_data(DataClass): v = 1 - column_spec = "pk int PRIMARY KEY, v int" - select_query = "SELECT * FROM ks.tbl" - unique_key = 'pk' + @classmethod + def get_column_spec(self) -> str: + return "pk int PRIMARY KEY, v int" + + @classmethod + def get_unique_key(self) -> str: + return 'pk' partition_tombstone_timestamp = None partition_live = False @@ -123,9 +166,13 @@ class partition_tombstone_data: def __lt__(self, o): return self.token < o.token + @classmethod + def get_select_query(self, ks): + return f"SELECT * FROM {ks}.tbl" + @classmethod def generate_sstable(cls, total_rows: int, live_rows: set[int], dead_timestamp: int, live_timestamp: int, - deletion_time: datetime.datetime): + deletion_time: datetime.datetime) -> list[dict[str, Any]]: partitions = [] formatted_deletion_time = deletion_time.strftime("%Y-%m-%d %H:%M:%S") serialized_value = serialize_int(cls.v) @@ -181,7 +228,7 @@ class partition_tombstone_data: return None @classmethod - def check_page_count(cls, page_count): + def check_page_count(cls, page_count) -> None: # We cannot reliably generate partitions such that they trigger short pages # So we allow for a single page too. 
pass @@ -204,7 +251,7 @@ def workdir(): @pytest.mark.parametrize("data_class", incremental_repair_test_data) @pytest.mark.asyncio -async def test_incremental_read_repair(data_class, workdir, manager): +async def test_incremental_read_repair(data_class: DataClass, workdir: str, manager: ManagerClient): """Stress the incremental read repair logic Write a long stream of row tombstones, with a live row before and after. @@ -225,109 +272,109 @@ async def test_incremental_read_repair(data_class, workdir, manager): # The test generates and uploads sstables, assuming their specific # contents. These assumptions are not held with tablets, which # distribute data among sstables differently than vnodes. - cql.execute("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = { 'enabled': false }") - table_schema = f"CREATE TABLE ks.tbl ({data_class.column_spec}) WITH speculative_retry = 'NONE'" - cql.execute(table_schema) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = { 'enabled': false }") as ks: + table_schema = f"CREATE TABLE {ks}.tbl ({data_class.get_column_spec()}) WITH speculative_retry = 'NONE'" + cql.execute(table_schema) - schema_file_path = os.path.join(workdir, "schema.cql") - with open(schema_file_path, "w") as schema_file: - schema_file.write(table_schema) + schema_file_path = os.path.join(workdir, "schema.cql") + with open(schema_file_path, "w") as schema_file: + schema_file.write(table_schema) - dead_timestamp = int(time.time() * 1000) - live_timestamp = dead_timestamp + 1 + dead_timestamp = int(time.time() * 1000) + live_timestamp = dead_timestamp + 1 - total_rows = 100 - max_live_rows = 8 - deletion_time = datetime.datetime.now() + total_rows = 100 + max_live_rows = 8 + deletion_time = datetime.datetime.now() - row_set: TypeAlias = set[int] + row_set: TypeAlias = set[int] - async def generate_and_upload_sstable(node: 
ServerInfo, node_row: int) -> row_set: - live_rows = {random.randint(0, total_rows - 1) for _ in range(random.randint(0, max_live_rows))} - live_rows.add(node_row) + async def generate_and_upload_sstable(node: ServerInfo, node_row: int) -> row_set: + live_rows = {random.randint(0, total_rows - 1) for _ in range(random.randint(0, max_live_rows))} + live_rows.add(node_row) - sstable = data_class.generate_sstable(total_rows, live_rows, dead_timestamp, live_timestamp, deletion_time) - scylla_exe = await manager.server_get_exe(node.server_id) - node_workdir = await manager.server_get_workdir(node.server_id) - table_upload_dir = glob.glob(os.path.join(node_workdir, "data", "ks", "tbl-*", "upload"))[0] + sstable = data_class.generate_sstable(total_rows, live_rows, dead_timestamp, live_timestamp, deletion_time) + scylla_exe = await manager.server_get_exe(node.server_id) + node_workdir = await manager.server_get_workdir(node.server_id) + table_upload_dir = glob.glob(os.path.join(node_workdir, "data", ks, "tbl-*", "upload"))[0] - input_file_path = os.path.join(workdir, f"node{node.server_id}.sstable.json") - with open(input_file_path, "w") as f: - json.dump(sstable, f, indent=4) + input_file_path = os.path.join(workdir, f"node{node.server_id}.sstable.json") + with open(input_file_path, "w") as f: + json.dump(sstable, f, indent=4) - subprocess.check_call([ - scylla_exe, "sstable", "write", - "--schema-file", schema_file_path, - "--input-file", input_file_path, - "--output-dir", table_upload_dir, - "--generation", "1"]) + subprocess.check_call([ + scylla_exe, "sstable", "write", + "--schema-file", schema_file_path, + "--input-file", input_file_path, + "--output-dir", table_upload_dir, + "--generation", "1"]) - await manager.api.load_new_sstables(node.ip_addr, "ks", "tbl") + await manager.api.load_new_sstables(node.ip_addr, ks, "tbl") - return live_rows + return live_rows - node1_rows = await generate_and_upload_sstable(node1, 0) - node2_rows = await 
generate_and_upload_sstable(node2, total_rows - 1) - all_rows = node1_rows | node2_rows - assert len(all_rows) >= 2 + node1_rows = await generate_and_upload_sstable(node1, 0) + node2_rows = await generate_and_upload_sstable(node2, total_rows - 1) + all_rows = node1_rows | node2_rows + assert len(all_rows) >= 2 - logger.info(f"node1_rows: {len(node1_rows)} rows, row ids: {node1_rows}") - logger.info(f"node2_rows: {len(node2_rows)} rows, row ids: {node2_rows}") - logger.info(f"all_rows: {len(all_rows)} rows, row ids: {all_rows}") + logger.info(f"node1_rows: {len(node1_rows)} rows, row ids: {node1_rows}") + logger.info(f"node2_rows: {len(node2_rows)} rows, row ids: {node2_rows}") + logger.info(f"all_rows: {len(all_rows)} rows, row ids: {all_rows}") - def check_rows(cql: Session, host: Host, expected_live_rows: row_set) -> None: - actual_live_rows = set() - actual_dead_rows = set() - for row in cql.execute("SELECT * FROM MUTATION_FRAGMENTS(ks.tbl)", host=host): - res = data_class.check_mutation_row(row, expected_live_rows) - if res is None: - continue - row_id, is_live = res - if is_live: - actual_live_rows.add(row_id) + def check_rows(cql: Session, host: Host, expected_live_rows: row_set) -> None: + actual_live_rows = set() + actual_dead_rows = set() + for row in cql.execute(f"SELECT * FROM MUTATION_FRAGMENTS({ks}.tbl)", host=host): + res = data_class.check_mutation_row(row, expected_live_rows) + if res is None: + continue + row_id, is_live = res + if is_live: + actual_live_rows.add(row_id) + else: + actual_dead_rows.add(row_id) + + # Account rows that have a tombstone but are live only once. 
+ actual_dead_rows -= actual_live_rows + + assert actual_live_rows == expected_live_rows + assert len(actual_live_rows) + len(actual_dead_rows) == total_rows + + logger.info("Check rows with CL=ONE before read-repair") + check_rows(cql, host1, node1_rows) + check_rows(cql, host2, node2_rows) + + logger.info("Run read-repair") + res = cql.execute(SimpleStatement(data_class.get_select_query(ks), consistency_level=ConsistencyLevel.ALL)) + res_rows = [] + pages = [] + while True: + res_rows.extend(list(res.current_rows)) + pages.append(list(res.current_rows)) + if res.has_more_pages: + res.fetch_next_page() else: - actual_dead_rows.add(row_id) + break - # Account rows that have a tombstone but are live only once. - actual_dead_rows -= actual_live_rows + logger.debug(f"repair: {len(pages)} pages: {pages}") + data_class.check_page_count(len(pages)) + assert len(res_rows) == len(all_rows) + actual_row_ids = set() + for res_row in res_rows: + row_id = getattr(res_row, data_class.get_unique_key()) + actual_row_ids.add(row_id) + assert row_id in all_rows + data_class.check_result_row(row_id, res_row) + assert actual_row_ids == all_rows - assert actual_live_rows == expected_live_rows - assert len(actual_live_rows) + len(actual_dead_rows) == total_rows + for node in (node1, node2): + await manager.api.keyspace_flush(node.ip_addr, ks) + await manager.api.keyspace_compaction(node.ip_addr, ks) - logger.info("Check rows with CL=ONE before read-repair") - check_rows(cql, host1, node1_rows) - check_rows(cql, host2, node2_rows) - - logger.info("Run read-repair") - res = cql.execute(SimpleStatement(data_class.select_query, consistency_level=ConsistencyLevel.ALL)) - res_rows = [] - pages = [] - while True: - res_rows.extend(list(res.current_rows)) - pages.append(list(res.current_rows)) - if res.has_more_pages: - res.fetch_next_page() - else: - break - - logger.debug(f"repair: {len(pages)} pages: {pages}") - data_class.check_page_count(len(pages)) - assert len(res_rows) == len(all_rows) 
- actual_row_ids = set() - for res_row in res_rows: - row_id = getattr(res_row, data_class.unique_key) - actual_row_ids.add(row_id) - assert row_id in all_rows - data_class.check_result_row(row_id, res_row) - assert actual_row_ids == all_rows - - for node in (node1, node2): - await manager.api.keyspace_flush(node.ip_addr, "ks") - await manager.api.keyspace_compaction(node.ip_addr, "ks") - - logger.info("Check rows with CL=ONE after read-repair") - check_rows(cql, host1, all_rows) - check_rows(cql, host2, all_rows) + logger.info("Check rows with CL=ONE after read-repair") + check_rows(cql, host1, all_rows) + check_rows(cql, host2, all_rows) @pytest.mark.asyncio @@ -342,18 +389,18 @@ async def test_read_repair_with_trace_logging(request, manager): srvs = await manager.running_servers() await wait_for_cql_and_get_hosts(cql, srvs, time.time() + 60) - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2};") - await cql.run_async("CREATE TABLE ks.t (pk bigint PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2};") as ks: + await cql.run_async(f"CREATE TABLE {ks}.t (pk bigint PRIMARY KEY, c int);") - await cql.run_async("INSERT INTO ks.t (pk, c) VALUES (0, 0)") + await cql.run_async(f"INSERT INTO {ks}.t (pk, c) VALUES (0, 0)") - await manager.server_stop(srvs[0].server_id) - prepared = cql.prepare("INSERT INTO ks.t (pk, c) VALUES (0, 1)") - prepared.consistency_level = ConsistencyLevel.ONE - await cql.run_async(prepared) + await manager.server_stop(srvs[0].server_id) + prepared = cql.prepare(f"INSERT INTO {ks}.t (pk, c) VALUES (0, 1)") + prepared.consistency_level = ConsistencyLevel.ONE + await cql.run_async(prepared) - await manager.server_start(srvs[0].server_id) + await manager.server_start(srvs[0].server_id) - prepared = cql.prepare("SELECT * FROM ks.t WHERE pk = 0") - prepared.consistency_level = 
ConsistencyLevel.ALL - await cql.run_async(prepared) + prepared = cql.prepare(f"SELECT * FROM {ks}.t WHERE pk = 0") + prepared.consistency_level = ConsistencyLevel.ALL + await cql.run_async(prepared) From 57faab9ffa9c14b3d726275bfcd27591e41a8d99 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 10/56] topology_custom/test_read_repair_with_conflicting_hash_keys: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_conflicting_keys_read_repair.py | 58 ++++++++++--------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/test/topology_custom/test_conflicting_keys_read_repair.py b/test/topology_custom/test_conflicting_keys_read_repair.py index 6d01e0f2eb..a078100994 100644 --- a/test/topology_custom/test_conflicting_keys_read_repair.py +++ b/test/topology_custom/test_conflicting_keys_read_repair.py @@ -12,6 +12,7 @@ from cassandra import ConsistencyLevel # type: ignore from cassandra.query import SimpleStatement # type: ignore from test.pylib.manager_client import ManagerClient from test.pylib.util import wait_for_cql_and_get_hosts +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) @@ -44,38 +45,39 @@ async def test_read_repair_with_conflicting_hash_keys(request: pytest.FixtureReq srvs = await manager.servers_add(3) cql, _ = await manager.get_ready_cql(srvs) - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3};") - await cql.run_async("CREATE TABLE ks.t (pk bigint PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3};") as ks: + table = f"{ks}.t" + await cql.run_async(f"CREATE TABLE {table} (pk bigint PRIMARY KEY, c int);") - # Stop one of the nodes. - await manager.server_stop_gracefully(srvs[0].server_id) + # Stop one of the nodes. 
+ await manager.server_stop_gracefully(srvs[0].server_id) - # Add rows with partition kays that cause murmur3 hash collision, token value [6874760189787677834]. - pk1 = -4818441857111425024 - pk2 = -8686612841249112064 - await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, c) VALUES ({pk1}, 111)", consistency_level=ConsistencyLevel.ONE)) - await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, c) VALUES ({pk2}, 222)", consistency_level=ConsistencyLevel.ONE)) + # Add rows with partition kays that cause murmur3 hash collision, token value [6874760189787677834]. + pk1 = -4818441857111425024 + pk2 = -8686612841249112064 + await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, c) VALUES ({pk1}, 111)", consistency_level=ConsistencyLevel.ONE)) + await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, c) VALUES ({pk2}, 222)", consistency_level=ConsistencyLevel.ONE)) - # Start the offline node. - await manager.server_start(srvs[0].server_id, wait_others=2) + # Start the offline node. + await manager.server_start(srvs[0].server_id, wait_others=2) - # Run a SELECT query with ALL consistency level, forcing reading from all 3 nodes. - res = await cql.run_async(SimpleStatement("SELECT * FROM ks.t", consistency_level=ConsistencyLevel.ALL)) + # Run a SELECT query with ALL consistency level, forcing reading from all 3 nodes. + res = await cql.run_async(SimpleStatement(f"SELECT * FROM {table}", consistency_level=ConsistencyLevel.ALL)) - # Validate the results (should be OK). - assert len(res) == 2 - for row in res: - if (row.pk == pk1): - assert row.c == 111 - elif (row.pk == pk2): - assert row.c == 222 + # Validate the results (should be OK). 
+ assert len(res) == 2 + for row in res: + if (row.pk == pk1): + assert row.c == 111 + elif (row.pk == pk2): + assert row.c == 222 - res = await cql.run_async(SimpleStatement("SELECT * FROM ks.t", consistency_level=ConsistencyLevel.ALL)) + res = await cql.run_async(SimpleStatement(f"SELECT * FROM {table}", consistency_level=ConsistencyLevel.ALL)) - # Validate the results (will be wrong in case the diff calculation hash map uses tokens as keys). - assert len(res) == 2 - for row in res: - if (row.pk == pk1): - assert row.c == 111 - elif (row.pk == pk2): - assert row.c == 222 + # Validate the results (will be wrong in case the diff calculation hash map uses tokens as keys). + assert len(res) == 2 + for row in res: + if (row.pk == pk1): + assert row.c == 111 + elif (row.pk == pk2): + assert row.c == 222 From 4fefffe335cc8664b1a2acf11dc0edf7655954be Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 11/56] topology_custom/test_data_resurrection_after_cleanup: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_data_resurrection_after_cleanup.py | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/test/topology_custom/test_data_resurrection_after_cleanup.py b/test/topology_custom/test_data_resurrection_after_cleanup.py index 98c21fc87b..10ad3f0840 100644 --- a/test/topology_custom/test_data_resurrection_after_cleanup.py +++ b/test/topology_custom/test_data_resurrection_after_cleanup.py @@ -7,7 +7,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.rest_client import inject_error_one_shot from test.topology.conftest import skip_mode -from test.topology.util import check_token_ring_and_group0_consistency +from test.topology.util import check_token_ring_and_group0_consistency, new_test_keyspace import pytest import asyncio @@ -28,45 +28,46 @@ async def test_data_resurrection_after_cleanup(manager: ManagerClient): cql = manager.get_cql() - await 
cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int) WITH gc_grace_seconds=0;") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};") as ks: + table = f"{ks}.test" + await cql.run_async(f"CREATE TABLE {table} (pk int PRIMARY KEY, c int) WITH gc_grace_seconds=0;") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {table} (pk, c) VALUES ({k}, {k});") for k in keys]) - async def check(expected_keys): - logger.info("Checking table") - cql = manager.get_cql() - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == len(expected_keys) - for r in rows: - assert r.c == r.pk + async def check(expected_keys): + logger.info("Checking table") + cql = manager.get_cql() + rows = await cql.run_async(f"SELECT * FROM {table};") + assert len(rows) == len(expected_keys) + for r in rows: + assert r.c == r.pk - await manager.api.flush_keyspace(servers[0].ip_addr, "test") + await manager.api.flush_keyspace(servers[0].ip_addr, ks) - await check(keys) + await check(keys) - logger.info("Adding new server") - servers.append(await manager.server_add(cmdline=cmdline)) + logger.info("Adding new server") + servers.append(await manager.server_add(cmdline=cmdline)) - time.sleep(1) - await check(keys) + time.sleep(1) + await check(keys) - await inject_error_one_shot(manager.api, servers[0].ip_addr, "major_compaction_before_cleanup") - await manager.api.cleanup_keyspace(servers[0].ip_addr, "test") + await inject_error_one_shot(manager.api, servers[0].ip_addr, "major_compaction_before_cleanup") + await manager.api.cleanup_keyspace(servers[0].ip_addr, ks) - deleted_keys = range(128) - await 
asyncio.gather(*[cql.run_async(f"DELETE FROM test.test WHERE pk={k};") for k in deleted_keys]) - # Make sures tombstones are gone - await manager.api.flush_keyspace(servers[1].ip_addr, "test") - time.sleep(1) - await manager.api.keyspace_compaction(servers[1].ip_addr, "test") + deleted_keys = range(128) + await asyncio.gather(*[cql.run_async(f"DELETE FROM {table} WHERE pk={k};") for k in deleted_keys]) + # Make sures tombstones are gone + await manager.api.flush_keyspace(servers[1].ip_addr, ks) + time.sleep(1) + await manager.api.keyspace_compaction(servers[1].ip_addr, ks) - # Regains ownership of deleted data + # Regains ownership of deleted data - logger.info(f"Decommissioning node {servers[1]}") - await manager.decommission_node(servers[1].server_id) - await check_token_ring_and_group0_consistency(manager) + logger.info(f"Decommissioning node {servers[1]}") + await manager.decommission_node(servers[1].server_id) + await check_token_ring_and_group0_consistency(manager) - time.sleep(1) - await check(range(128)) + time.sleep(1) + await check(range(128)) From 480a5837ab732be16969fd00a6e42b5cc83e6b6a Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 12/56] topology_custom/test_group0_schema_versioning: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_group0_schema_versioning.py | 347 +++++++++--------- 1 file changed, 173 insertions(+), 174 deletions(-) diff --git a/test/topology_custom/test_group0_schema_versioning.py b/test/topology_custom/test_group0_schema_versioning.py index f1d399e27c..0cf98620fb 100644 --- a/test/topology_custom/test_group0_schema_versioning.py +++ b/test/topology_custom/test_group0_schema_versioning.py @@ -18,7 +18,7 @@ from test.pylib.manager_client import ManagerClient, ServerInfo from test.pylib.util import wait_for_cql_and_get_hosts from test.pylib.log_browsing import ScyllaLogFile from test.topology.util import reconnect_driver, wait_until_upgrade_finishes, \ - 
enter_recovery_state, delete_raft_data_and_upgrade_state + enter_recovery_state, delete_raft_data_and_upgrade_state, new_test_keyspace logger = logging.getLogger(__name__) @@ -89,15 +89,15 @@ async def verify_table_versions_synced(cql: Session, hs: list[Host], ignore_syst await verify_scylla_tables_versions_synced(cql, hs, ignore_system_tables) -async def verify_in_memory_table_versions(srvs: list[ServerInfo], logs: list[ScyllaLogFile], marks: list[int]): +async def verify_in_memory_table_versions(srvs: list[ServerInfo], logs: list[ScyllaLogFile], marks: list[int], table): """ Assumes that `logs` are log files of servers `srvs`, correspondingly in order. Assumes that `marks` are log markers (obtained by `ScyllaLogFile.mark()`) corresponding to `logs` in order. - Assumes that an 'alter table ks.t ...' statement was performed after obtaining `marks`. - Checks that every server printed the same version in `Altering ks.t...' log message. + Assumes that an 'alter table {table} ...' statement was performed after obtaining `marks`. + Checks that every server printed the same version in `Altering {table}...' log message. 
""" logger.info("Verifying that in-memory table schema versions are in sync") - matches = [await log.grep("Altering ks.t.*version=(.*)", from_mark=mark) for log, mark in zip(logs, marks)] + matches = [await log.grep(f"Altering {table}.*version=(.*)", from_mark=mark) for log, mark in zip(logs, marks)] def get_version(srv: ServerInfo, matches: list[tuple[str, re.Match[str]]]): if not matches: @@ -132,155 +132,154 @@ async def test_schema_versioning_with_recovery(manager: ManagerClient): hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) logger.info("Creating keyspace and table") - await cql.run_async("create keyspace ks with replication = " - "{'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") - await verify_table_versions_synced(cql, hosts) - await cql.run_async("create table ks.t (pk int primary key)") + async with new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") as ks_name: + await verify_table_versions_synced(cql, hosts) + table_name = "t" + table = f"{ks_name}.{table_name}" + await cql.run_async(f"create table {table} (pk int primary key)") - logger.info("Waiting for driver") - await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + logger.info("Waiting for driver") + await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - await verify_table_versions_synced(cql, hosts) - ks_t_version = await get_scylla_tables_version(cql, hosts[0], 'ks', 't') - assert ks_t_version - - logs = [await manager.server_open_log(srv.server_id) for srv in servers] - marks = [await log.mark() for log in logs] - - logger.info("Altering table") - await cql.run_async("alter table ks.t with comment = ''") - - await verify_table_versions_synced(cql, hosts) - await verify_in_memory_table_versions(servers, logs, marks) - - new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], 'ks', 't') - assert new_ks_t_version - assert new_ks_t_version != ks_t_version - 
ks_t_version = new_ks_t_version - - # We still have a group 0 majority, don't do this at home. - srv1 = servers[0] - logger.info(f"Rebooting {srv1} in RECOVERY mode") - h1 = next(h for h in hosts if h.address == srv1.ip_addr) - await cql.run_async("update system.scylla_local set value = 'recovery' where key = 'group0_upgrade_state'", host=h1) - await manager.server_restart(srv1.server_id) - - cql = await reconnect_driver(manager) - logger.info(f"Waiting for driver") - await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - - await verify_table_versions_synced(cql, hosts) - - # We're doing a schema change on RECOVERY node while we have two nodes running in group 0 mode. - # Don't do this at home. - # - # Now, the two nodes are not doing any schema changes right now, so this doesn't actually break anything: - # the RECOVERY node is operating using the old schema change procedure, which means - # that it pushes the schema mutations to other nodes directly with RPC, modifying - # the group 0 state machine on other two nodes. - # - # There is one problem with this however. If the RECOVERY node considers some other node - # as DOWN, it will silently *not* push the schema change, completing the operation - # "successfully" nevertheless (it will return to the driver without error). - # Usually in this case we rely on eventual convergence of schema through gossip, - # which will not happen here, because the group 0 nodes are not doing schema pulls! - # So we need to make sure that the RECOVERY node sees the other nodes as UP before - # we perform the schema change, so it pushes the mutations to them. 
- logger.info(f"Waiting until RECOVERY node ({srv1}) sees other servers as UP") - await manager.server_sees_others(srv1.server_id, 2) - - marks = [await log.mark() for log in logs] - logger.info(f"Altering table on RECOVERY node ({srv1})") - await cql.run_async("alter table ks.t with comment = ''", host=h1) - - await verify_table_versions_synced(cql, hosts) - await verify_in_memory_table_versions(servers, logs, marks) - - new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], 'ks', 't') - assert not new_ks_t_version - ks_t_version = new_ks_t_version - - logger.info(f"Stopping {srv1} gracefully") - await manager.server_stop_gracefully(srv1.server_id) - - srv2 = servers[1] - logger.info(f"Waiting until {srv2} sees {srv1} as dead") - await manager.server_not_sees_other_server(srv2.ip_addr, srv1.ip_addr) - - # Now we modify schema through group 0 while the RECOVERY node is dead. - # Don't do this at home. - marks = [await log.mark() for log in logs] - h2 = next(h for h in hosts if h.address == srv2.ip_addr) - logger.info(f"Altering table on group 0 node {srv2}") - await cql.run_async("alter table ks.t with comment = ''", host=h2) - - await manager.server_start(srv1.server_id) - cql = await reconnect_driver(manager) - logger.info(f"Waiting for driver") - await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - - logger.info(f"Waiting until {srv2} sees {srv1} as UP") - await manager.server_sees_other_server(srv2.ip_addr, srv1.ip_addr) - - # The RECOVERY node will pull schema when it gets a write. - # The other group 0 node will do a barrier so it will also sync schema before the write returns. 
- logger.info("Forcing schema sync through CL=ALL INSERT") - await cql.run_async(SimpleStatement("insert into ks.t (pk) values (0)", consistency_level=ConsistencyLevel.ALL), - host=h2) - - await verify_table_versions_synced(cql, hosts) - await verify_in_memory_table_versions(servers, logs, marks) - - new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], 'ks', 't') - assert new_ks_t_version - ks_t_version = new_ks_t_version - - srv3 = servers[2] - h3 = next(h for h in hosts if h.address == srv3.ip_addr) - logger.info("Finishing recovery") - for h in [h2, h3]: - await cql.run_async( - "update system.scylla_local set value = 'recovery' where key = 'group0_upgrade_state'", host=h) - await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in [srv2, srv3])) - - cql = await reconnect_driver(manager) - logger.info("Waiting for driver") - await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - - for h in [h1, h2, h3]: - await delete_raft_data_and_upgrade_state(cql, h) - - logger.info("Restarting servers") - await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in servers)) - - cql = await reconnect_driver(manager) - logger.info("Waiting for driver") - await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - - logging.info(f"Waiting until upgrade finishes") - for h in [h1, h2, h3]: - await wait_until_upgrade_finishes(cql, h, time.time() + 60) - - await verify_table_versions_synced(cql, hosts) - - for change in [ - "alter table ks.t with comment = ''", - "alter table ks.t add v int", - "alter table ks.t alter v type blob"]: + await verify_table_versions_synced(cql, hosts) + ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name) + assert ks_t_version + logs = [await manager.server_open_log(srv.server_id) for srv in servers] marks = [await log.mark() for log in logs] - logger.info(f"Altering table with \"{change}\"") - await cql.run_async(change) - new_ks_t_version = await 
get_scylla_tables_version(cql, hosts[0], 'ks', 't') + logger.info("Altering table") + await cql.run_async(f"alter table {table} with comment = ''") + + await verify_table_versions_synced(cql, hosts) + await verify_in_memory_table_versions(servers, logs, marks, table) + + new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name) assert new_ks_t_version assert new_ks_t_version != ks_t_version ks_t_version = new_ks_t_version - await verify_table_versions_synced(cql, hosts) - await verify_in_memory_table_versions(servers, logs, marks) + # We still have a group 0 majority, don't do this at home. + srv1 = servers[0] + logger.info(f"Rebooting {srv1} in RECOVERY mode") + h1 = next(h for h in hosts if h.address == srv1.ip_addr) + await cql.run_async("update system.scylla_local set value = 'recovery' where key = 'group0_upgrade_state'", host=h1) + await manager.server_restart(srv1.server_id) - await cql.run_async("drop keyspace ks") + cql = await reconnect_driver(manager) + logger.info(f"Waiting for driver") + await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + + await verify_table_versions_synced(cql, hosts) + + # We're doing a schema change on RECOVERY node while we have two nodes running in group 0 mode. + # Don't do this at home. + # + # Now, the two nodes are not doing any schema changes right now, so this doesn't actually break anything: + # the RECOVERY node is operating using the old schema change procedure, which means + # that it pushes the schema mutations to other nodes directly with RPC, modifying + # the group 0 state machine on other two nodes. + # + # There is one problem with this however. If the RECOVERY node considers some other node + # as DOWN, it will silently *not* push the schema change, completing the operation + # "successfully" nevertheless (it will return to the driver without error). 
+ # Usually in this case we rely on eventual convergence of schema through gossip, + # which will not happen here, because the group 0 nodes are not doing schema pulls! + # So we need to make sure that the RECOVERY node sees the other nodes as UP before + # we perform the schema change, so it pushes the mutations to them. + logger.info(f"Waiting until RECOVERY node ({srv1}) sees other servers as UP") + await manager.server_sees_others(srv1.server_id, 2) + + marks = [await log.mark() for log in logs] + logger.info(f"Altering table on RECOVERY node ({srv1})") + await cql.run_async(f"alter table {table} with comment = ''", host=h1) + + await verify_table_versions_synced(cql, hosts) + await verify_in_memory_table_versions(servers, logs, marks, table) + + new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name) + assert not new_ks_t_version + ks_t_version = new_ks_t_version + + logger.info(f"Stopping {srv1} gracefully") + await manager.server_stop_gracefully(srv1.server_id) + + srv2 = servers[1] + logger.info(f"Waiting until {srv2} sees {srv1} as dead") + await manager.server_not_sees_other_server(srv2.ip_addr, srv1.ip_addr) + + # Now we modify schema through group 0 while the RECOVERY node is dead. + # Don't do this at home. + marks = [await log.mark() for log in logs] + h2 = next(h for h in hosts if h.address == srv2.ip_addr) + logger.info(f"Altering table on group 0 node {srv2}") + await cql.run_async(f"alter table {table} with comment = ''", host=h2) + + await manager.server_start(srv1.server_id) + cql = await reconnect_driver(manager) + logger.info(f"Waiting for driver") + await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + + logger.info(f"Waiting until {srv2} sees {srv1} as UP") + await manager.server_sees_other_server(srv2.ip_addr, srv1.ip_addr) + + # The RECOVERY node will pull schema when it gets a write. + # The other group 0 node will do a barrier so it will also sync schema before the write returns. 
+ logger.info("Forcing schema sync through CL=ALL INSERT") + await cql.run_async(SimpleStatement(f"insert into {table} (pk) values (0)", consistency_level=ConsistencyLevel.ALL), + host=h2) + + await verify_table_versions_synced(cql, hosts) + await verify_in_memory_table_versions(servers, logs, marks, table) + + new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name) + assert new_ks_t_version + ks_t_version = new_ks_t_version + + srv3 = servers[2] + h3 = next(h for h in hosts if h.address == srv3.ip_addr) + logger.info("Finishing recovery") + for h in [h2, h3]: + await cql.run_async( + "update system.scylla_local set value = 'recovery' where key = 'group0_upgrade_state'", host=h) + await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in [srv2, srv3])) + + cql = await reconnect_driver(manager) + logger.info("Waiting for driver") + await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + + for h in [h1, h2, h3]: + await delete_raft_data_and_upgrade_state(cql, h) + + logger.info("Restarting servers") + await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in servers)) + + cql = await reconnect_driver(manager) + logger.info("Waiting for driver") + await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + + logging.info(f"Waiting until upgrade finishes") + for h in [h1, h2, h3]: + await wait_until_upgrade_finishes(cql, h, time.time() + 60) + + await verify_table_versions_synced(cql, hosts) + + for change in [ + f"alter table {table} with comment = ''", + f"alter table {table} add v int", + f"alter table {table} alter v type blob"]: + + marks = [await log.mark() for log in logs] + logger.info(f"Altering table with \"{change}\"") + await cql.run_async(change) + + new_ks_t_version = await get_scylla_tables_version(cql, hosts[0], ks_name, table_name) + assert new_ks_t_version + assert new_ks_t_version != ks_t_version + ks_t_version = new_ks_t_version + + await verify_table_versions_synced(cql, 
hosts) + await verify_in_memory_table_versions(servers, logs, marks, table) @pytest.mark.asyncio async def test_upgrade(manager: ManagerClient): @@ -311,42 +310,42 @@ async def test_upgrade(manager: ManagerClient): await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) logger.info("Creating keyspace and table") - await cql.run_async("create keyspace ks with replication = " - "{'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") - await verify_table_versions_synced(cql, hosts) - await cql.run_async("create table ks.t (pk int primary key)") + async with new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") as ks_name: + table = f"{ks_name}.t" + await verify_table_versions_synced(cql, hosts) + await cql.run_async(f"create table {table} (pk int primary key)") - logging.info(f"Deleting Raft data and upgrade state on {hosts}") - await asyncio.gather(*(delete_raft_data_and_upgrade_state(cql, h) for h in hosts)) + logging.info(f"Deleting Raft data and upgrade state on {hosts}") + await asyncio.gather(*(delete_raft_data_and_upgrade_state(cql, h) for h in hosts)) - logging.info(f"Restarting {servers}") - await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in servers)) - cql = await reconnect_driver(manager) + logging.info(f"Restarting {servers}") + await asyncio.gather(*(manager.server_restart(srv.server_id) for srv in servers)) + cql = await reconnect_driver(manager) - logger.info("Waiting for driver") - await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + logger.info("Waiting for driver") + await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - logging.info(f"Waiting until Raft upgrade procedure finishes") - await asyncio.gather(*(wait_until_upgrade_finishes(cql, h, time.time() + 60) for h in hosts)) + logging.info(f"Waiting until Raft upgrade procedure finishes") + await asyncio.gather(*(wait_until_upgrade_finishes(cql, h, time.time() + 60) for h in 
hosts)) - logs = [await manager.server_open_log(srv.server_id) for srv in servers] + logs = [await manager.server_open_log(srv.server_id) for srv in servers] - marks = [await log.mark() for log in logs] - logger.info("Altering table") - await cql.run_async("alter table ks.t with comment = ''") + marks = [await log.mark() for log in logs] + logger.info("Altering table") + await cql.run_async(f"alter table {table} with comment = ''") - await verify_table_versions_synced(cql, hosts) - await verify_in_memory_table_versions(servers, logs, marks) + await verify_table_versions_synced(cql, hosts) + await verify_in_memory_table_versions(servers, logs, marks, table) - # `group0_schema_version` should be present - # and the version column for `ks.t` should be non-null. - for h in hosts: - logger.info(f"Checking that `group0_schema_version` is set on {h}") - assert (await get_group0_schema_version(cql, h)) is not None + # `group0_schema_version` should be present + # and the version column for `{table}` should be non-null. 
+ for h in hosts: + logger.info(f"Checking that `group0_schema_version` is set on {h}") + assert (await get_group0_schema_version(cql, h)) is not None - for h in hosts: - logger.info(f"Checking that `version` column for `ks.t` is set on {h}") - versions = await get_scylla_tables_versions(cql, h) - for ks, _, v in versions: - if ks == "ks": - assert v is not None + for h in hosts: + logger.info(f"Checking that `version` column for `{table}` is set on {h}") + versions = await get_scylla_tables_versions(cql, h) + for ks, _, v in versions: + if ks == ks_name: + assert v is not None From fed078a38af9cf8ca603cd2a149800d833c355c6 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 13/56] topology_custom/test_hints: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_hints.py | 218 +++++++++++++++-------------- 1 file changed, 114 insertions(+), 104 deletions(-) diff --git a/test/topology_custom/test_hints.py b/test/topology_custom/test_hints.py index bbb8cd2092..fdcc83aca7 100644 --- a/test/topology_custom/test_hints.py +++ b/test/topology_custom/test_hints.py @@ -19,7 +19,7 @@ from test.pylib.rest_client import inject_error from test.pylib.util import wait_for from test.topology.conftest import skip_mode -from test.topology.util import get_topology_coordinator, find_server_by_host_id +from test.topology.util import get_topology_coordinator, find_server_by_host_id, new_test_keyspace logger = logging.getLogger(__name__) @@ -59,24 +59,28 @@ async def test_write_cl_any_to_dead_node_generates_hints(manager: ManagerClient) servers = await manager.servers_add(node_count) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") - await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}")
as ks: + table = f"{ks}.t" + await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)") - await manager.server_stop_gracefully(servers[1].server_id) + await manager.server_stop_gracefully(servers[1].server_id) - def get_hints_written_count(server): - return get_hint_manager_metric(server, "written") + def get_hints_written_count(server): + return get_hint_manager_metric(server, "written") - hints_before = get_hints_written_count(servers[0]) + hints_before = get_hints_written_count(servers[0]) - # Some of the inserts will be targeted to the dead node. - # The coordinator doesn't have live targets to send the write to, but it should write a hint. - for i in range(100): - await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, v) VALUES ({i}, {i+1})", consistency_level=ConsistencyLevel.ANY)) + # Some of the inserts will be targeted to the dead node. + # The coordinator doesn't have live targets to send the write to, but it should write a hint. + for i in range(100): + await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i+1})", consistency_level=ConsistencyLevel.ANY)) - # Verify hints are written - hints_after = get_hints_written_count(servers[0]) - assert hints_after > hints_before + # Verify hints are written + hints_after = get_hints_written_count(servers[0]) + assert hints_after > hints_before + + # For dropping the keyspace + await manager.server_start(servers[1].server_id) @pytest.mark.asyncio async def test_limited_concurrency_of_writes(manager: ManagerClient): @@ -91,19 +95,23 @@ async def test_limited_concurrency_of_writes(manager: ManagerClient): node2 = await manager.server_add() cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") - await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") as ks: + 
table = f"{ks}.t" + await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)") - await manager.server_stop_gracefully(node2.server_id) + await manager.server_stop_gracefully(node2.server_id) - async with inject_error(manager.api, node1.ip_addr, "slow_down_writing_hints"): - try: - for i in range(100): - await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, v) VALUES ({i}, {i})", consistency_level=ConsistencyLevel.ONE)) - pytest.fail("The coordinator node has not been overloaded, which indiciates that the concurrency of writing hints is NOT limited") - except NoHostAvailable as e: - for _, err in e.errors.items(): - assert err.summary == "Coordinator node overloaded" and re.match(r"Too many in flight hints: \d+", err.message) + async with inject_error(manager.api, node1.ip_addr, "slow_down_writing_hints"): + try: + for i in range(100): + await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i})", consistency_level=ConsistencyLevel.ONE)) + pytest.fail("The coordinator node has not been overloaded, which indiciates that the concurrency of writing hints is NOT limited") + except NoHostAvailable as e: + for _, err in e.errors.items(): + assert err.summary == "Coordinator node overloaded" and re.match(r"Too many in flight hints: \d+", err.message) + + # For dropping the keyspace + await manager.server_start(node2.server_id) @pytest.mark.asyncio async def test_sync_point(manager: ManagerClient): @@ -117,40 +125,41 @@ async def test_sync_point(manager: ManagerClient): [node1, node2, node3] = await manager.servers_add(node_count) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3}") - await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3}") as ks: + table = f"{ks}.t" + await cql.run_async(f"CREATE TABLE {table} (pk int 
primary key, v int)") - await manager.server_stop_gracefully(node2.server_id) - await manager.server_stop_gracefully(node3.server_id) + await manager.server_stop_gracefully(node2.server_id) + await manager.server_stop_gracefully(node3.server_id) - await manager.server_not_sees_other_server(node1.ip_addr, node2.ip_addr) - await manager.server_not_sees_other_server(node1.ip_addr, node3.ip_addr) + await manager.server_not_sees_other_server(node1.ip_addr, node2.ip_addr) + await manager.server_not_sees_other_server(node1.ip_addr, node3.ip_addr) - mutation_count = 5 - for primary_key in range(mutation_count): - await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, v) VALUES ({primary_key}, {primary_key})", consistency_level=ConsistencyLevel.ONE)) + mutation_count = 5 + for primary_key in range(mutation_count): + await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({primary_key}, {primary_key})", consistency_level=ConsistencyLevel.ONE)) - # Mutations need to be applied to hinted handoff's commitlog before we create the sync point. - # Otherwise, the sync point will correspond to no hints at all. + # Mutations need to be applied to hinted handoff's commitlog before we create the sync point. + # Otherwise, the sync point will correspond to no hints at all. - # We need to wrap the function in an async function to make `wait_for` be able to use it below. - async def check_no_hints_in_progress_node1() -> bool: - return get_hint_manager_metric(node1, "size_of_hints_in_progress") == 0 + # We need to wrap the function in an async function to make `wait_for` be able to use it below. 
+ async def check_no_hints_in_progress_node1() -> bool: + return get_hint_manager_metric(node1, "size_of_hints_in_progress") == 0 - deadline = time.time() + 30 - await wait_for(check_no_hints_in_progress_node1, deadline) + deadline = time.time() + 30 + await wait_for(check_no_hints_in_progress_node1, deadline) - sync_point1 = create_sync_point(node1) + sync_point1 = create_sync_point(node1) - await manager.server_start(node2.server_id) - await manager.server_sees_other_server(node1.ip_addr, node2.ip_addr) + await manager.server_start(node2.server_id) + await manager.server_sees_other_server(node1.ip_addr, node2.ip_addr) - assert not await_sync_point(node1, sync_point1, 30) + assert not await_sync_point(node1, sync_point1, 30) - await manager.server_start(node3.server_id) - await manager.server_sees_other_server(node1.ip_addr, node3.ip_addr) + await manager.server_start(node3.server_id) + await manager.server_sees_other_server(node1.ip_addr, node3.ip_addr) - assert await_sync_point(node1, sync_point1, 30) + assert await_sync_point(node1, sync_point1, 30) @pytest.mark.asyncio @@ -171,67 +180,68 @@ async def test_hints_consistency_during_decommission(manager: ManagerClient): cql = manager.cql logger.info("Creatting a keyspace with RF=1 and a table") - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = { 'enabled': false }") - await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = { 'enabled': false }") as ks: + table = f"{ks}.t" + await cql.run_async(f"CREATE TABLE {table} (pk int primary key, v int)") - logger.info("Stopping node 3") - await manager.server_stop_gracefully(server3.server_id) - await manager.others_not_see_server(server3.ip_addr) + logger.info("Stopping node 3") + await manager.server_stop_gracefully(server3.server_id) + 
await manager.others_not_see_server(server3.ip_addr) - # Write 100 rows with CL=ANY. Some of the rows will only be stored as hints because of RF=1 - logger.info("Writing 100 rows with CL=ANY") - for i in range(100): - await cql.run_async(SimpleStatement(f"INSERT INTO ks.t (pk, v) VALUES ({i}, {i + 1})", consistency_level=ConsistencyLevel.ANY)) + # Write 100 rows with CL=ANY. Some of the rows will only be stored as hints because of RF=1 + logger.info("Writing 100 rows with CL=ANY") + for i in range(100): + await cql.run_async(SimpleStatement(f"INSERT INTO {table} (pk, v) VALUES ({i}, {i + 1})", consistency_level=ConsistencyLevel.ANY)) - # Temporarily pause hints replay, we will unpause it after decommission starts and streaming is done, - # but before switching to writing to new nodes - logger.info("Pause hints replay on nodes 1 and 2") - for srv in (server1, server2): - await manager.api.enable_injection(srv.ip_addr, "hinted_handoff_pause_hint_replay", one_shot=False) - - # Start the node - logger.info("Start node 3") - await manager.server_start(server3.server_id) - await manager.servers_see_each_other([server1, server2, server3]) - - # Record the current position of hints so that we can wait for them later - sync_points = [create_sync_point(srv) for srv in (server1, server2)] - - async with asyncio.TaskGroup() as tg: - coord = await get_topology_coordinator(manager) - coord_srv = await find_server_by_host_id(manager, [server1, server2, server3], coord) - - # Make sure topology coordinator will pause right after streaming - logger.info("Enabling injection on the topology coordinator that will tell it to pause streaming") - await manager.api.enable_injection(coord_srv.ip_addr, "topology_coordinator_pause_after_streaming", one_shot=False) - coord_log = await manager.server_open_log(coord_srv.server_id) - coord_mark = await coord_log.mark() - - # Start decommission - it will get stuck on error injection so do it in the background - logger.info("Starting decommission 
in the background") - decommission_result = tg.create_task(manager.decommission_node(server3.server_id)) - - # Wait until streaming ends - logger.info("Wait until decomission finishes streaming") - await coord_log.wait_for(f'decommissioning: streaming completed for node', from_mark=coord_mark) - - # Now, unpause hints and let them be replayed - logger.info("Unpause hints replay on nodes 1 and 2") + # Temporarily pause hints replay, we will unpause it after decommission starts and streaming is done, + # but before switching to writing to new nodes + logger.info("Pause hints replay on nodes 1 and 2") for srv in (server1, server2): - await manager.api.disable_injection(srv.ip_addr, "hinted_handoff_pause_hint_replay") + await manager.api.enable_injection(srv.ip_addr, "hinted_handoff_pause_hint_replay", one_shot=False) - logger.info("Wait until hints are replayed from nodes 1 and 2") - await asyncio.gather(*(asyncio.to_thread(await_sync_point, srv, pt, timeout=30) for srv, pt in zip((server1, server2), sync_points))) + # Start the node + logger.info("Start node 3") + await manager.server_start(server3.server_id) + await manager.servers_see_each_other([server1, server2, server3]) - # Unpause streaming and let decommission finish - logger.info("Unpause streaming") - await manager.api.disable_injection(coord_srv.ip_addr, "topology_coordinator_pause_after_streaming") + # Record the current position of hints so that we can wait for them later + sync_points = [create_sync_point(srv) for srv in (server1, server2)] - logger.info("Wait until decomission finishes") - await decommission_result + async with asyncio.TaskGroup() as tg: + coord = await get_topology_coordinator(manager) + coord_srv = await find_server_by_host_id(manager, [server1, server2, server3], coord) - # Verify that no data has been lost - if the hints replay only sent the hints to the original destination (server3), - # then they will be only present on server3 which already left the cluster - 
logger.info("Verify that no data stored in hints have been lost") - for i in range(100): - assert list(await cql.run_async(f"SELECT v FROM ks.t WHERE pk = {i}")) == [(i + 1,)] + # Make sure topology coordinator will pause right after streaming + logger.info("Enabling injection on the topology coordinator that will tell it to pause streaming") + await manager.api.enable_injection(coord_srv.ip_addr, "topology_coordinator_pause_after_streaming", one_shot=False) + coord_log = await manager.server_open_log(coord_srv.server_id) + coord_mark = await coord_log.mark() + + # Start decommission - it will get stuck on error injection so do it in the background + logger.info("Starting decommission in the background") + decommission_result = tg.create_task(manager.decommission_node(server3.server_id)) + + # Wait until streaming ends + logger.info("Wait until decomission finishes streaming") + await coord_log.wait_for(f'decommissioning: streaming completed for node', from_mark=coord_mark) + + # Now, unpause hints and let them be replayed + logger.info("Unpause hints replay on nodes 1 and 2") + for srv in (server1, server2): + await manager.api.disable_injection(srv.ip_addr, "hinted_handoff_pause_hint_replay") + + logger.info("Wait until hints are replayed from nodes 1 and 2") + await asyncio.gather(*(asyncio.to_thread(await_sync_point, srv, pt, timeout=30) for srv, pt in zip((server1, server2), sync_points))) + + # Unpause streaming and let decommission finish + logger.info("Unpause streaming") + await manager.api.disable_injection(coord_srv.ip_addr, "topology_coordinator_pause_after_streaming") + + logger.info("Wait until decomission finishes") + await decommission_result + + # Verify that no data has been lost - if the hints replay only sent the hints to the original destination (server3), + # then they will be only present on server3 which already left the cluster + logger.info("Verify that no data stored in hints have been lost") + for i in range(100): + assert list(await 
cql.run_async(f"SELECT v FROM {table} WHERE pk = {i}")) == [(i + 1,)] From c6653e65ba26291dfc9cc550a7fb365366bc83e8 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 14/56] topology_custom/test_ip_mappings: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_ip_mappings.py | 64 ++++++++++++------------ 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/test/topology_custom/test_ip_mappings.py b/test/topology_custom/test_ip_mappings.py index 3d48192115..4f0bc12dc5 100644 --- a/test/topology_custom/test_ip_mappings.py +++ b/test/topology_custom/test_ip_mappings.py @@ -9,6 +9,7 @@ import pytest import logging from test.pylib.rest_client import inject_error_one_shot +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) @@ -17,39 +18,40 @@ async def test_broken_bootstrap(manager: ManagerClient): server_a = await manager.server_add() server_b = await manager.server_add(start=False) - await manager.cql.run_async("CREATE KEYSPACE test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") - await manager.cql.run_async("CREATE TABLE test.test (a int PRIMARY KEY, b int)") - for i in range(100): - await manager.cql.run_async(f"INSERT INTO test.test (a, b) VALUES ({i}, {i})") - await inject_error_one_shot(manager.api, server_a.ip_addr, "crash-before-bootstrapping-node-added") - try: - # Timeout fast since we do not expect the operation to complete - # because the coordinator is dead by now due to the error injection - # above - await manager.server_start(server_b.server_id, timeout=5) - pytest.fail("Expected server_add to fail") - except Exception: - pass + async with new_test_keyspace(manager, "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks: + table = f"{ks}.test" + await manager.cql.run_async(f"CREATE TABLE {table} (a int PRIMARY KEY, b int)") + for i in range(100): + await manager.cql.run_async(f"INSERT 
INTO {table} (a, b) VALUES ({i}, {i})") + await inject_error_one_shot(manager.api, server_a.ip_addr, "crash-before-bootstrapping-node-added") + try: + # Timeout fast since we do not expect the operation to complete + # because the coordinator is dead by now due to the error injection + # above + await manager.server_start(server_b.server_id, timeout=5) + pytest.fail("Expected server_add to fail") + except Exception: + pass - await manager.server_stop(server_b.server_id) - await manager.server_stop(server_a.server_id) + await manager.server_stop(server_b.server_id) + await manager.server_stop(server_a.server_id) - stop_event = asyncio.Event() - async def worker(): - logger.info("Worker started") - while not stop_event.is_set(): - for i in range(100): - await manager.cql.run_async(f"INSERT INTO test.test (a, b) VALUES ({i}, {i})") - response = await manager.cql.run_async(f"SELECT * FROM test.test WHERE a = {i}") - assert response[0].b == i - await asyncio.sleep(0.1) - logger.info("Worker stopped") + stop_event = asyncio.Event() + async def worker(): + logger.info("Worker started") + while not stop_event.is_set(): + for i in range(100): + await manager.cql.run_async(f"INSERT INTO {table} (a, b) VALUES ({i}, {i})") + response = await manager.cql.run_async(f"SELECT * FROM {table} WHERE a = {i}") + assert response[0].b == i + await asyncio.sleep(0.1) + logger.info("Worker stopped") - await manager.server_start(server_a.server_id) - await manager.driver_connect() + await manager.server_start(server_a.server_id) + await manager.driver_connect() - worker_task = asyncio.create_task(worker()) + worker_task = asyncio.create_task(worker()) - await asyncio.sleep(20) - stop_event.set() - await worker_task + await asyncio.sleep(20) + stop_event.set() + await worker_task From 9c095b622bff549d9dc1682f005048e171cd50f4 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 15/56] topology_custom/test_lwt_semaphore: use new_test_keyspace 
Signed-off-by: Benny Halevy --- test/topology_custom/test_lwt_semaphore.py | 28 ++++++++++++---------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/test/topology_custom/test_lwt_semaphore.py b/test/topology_custom/test_lwt_semaphore.py index e1f4b6a001..4304e70c8c 100644 --- a/test/topology_custom/test_lwt_semaphore.py +++ b/test/topology_custom/test_lwt_semaphore.py @@ -11,6 +11,7 @@ from test.pylib.util import wait_for_cql_and_get_hosts import pytest from cassandra.protocol import WriteTimeout from test.topology.conftest import skip_mode +from test.topology.util import new_test_keyspace @pytest.mark.asyncio @skip_mode('debug', 'aarch64/debug is unpredictably slow', platform_key='aarch64') @@ -20,20 +21,21 @@ async def test_cas_semaphore(manager): host = await wait_for_cql_and_get_hosts(manager.cql, {servers[0]}, time.time() + 60) - await manager.cql.run_async("CREATE KEYSPACE test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") - await manager.cql.run_async("CREATE TABLE test.test (a int PRIMARY KEY, b int)") + async with new_test_keyspace(manager, "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks: + table = f"{ks}.test" + await manager.cql.run_async(f"CREATE TABLE {table} (a int PRIMARY KEY, b int)") - async with inject_error(manager.api, servers[0].ip_addr, 'cas_timeout_after_lock'): - res = [manager.cql.run_async(f"INSERT INTO test.test (a) VALUES (0) IF NOT EXISTS", host=host[0]) for r in range(10)] - try: - await asyncio.gather(*res) - except WriteTimeout: - pass + async with inject_error(manager.api, servers[0].ip_addr, 'cas_timeout_after_lock'): + res = [manager.cql.run_async(f"INSERT INTO {table} (a) VALUES (0) IF NOT EXISTS", host=host[0]) for r in range(10)] + try: + await asyncio.gather(*res) + except WriteTimeout: + pass - res = [manager.cql.run_async(f"INSERT INTO test.test (a) VALUES (0) IF NOT EXISTS", host=host[0]) for r in range(10)] - await asyncio.gather(*res) + res = 
[manager.cql.run_async(f"INSERT INTO {table} (a) VALUES (0) IF NOT EXISTS", host=host[0]) for r in range(10)] + await asyncio.gather(*res) - metrics = await manager.metrics.query(servers[0].ip_addr) - contention = metrics.get(name="scylla_storage_proxy_coordinator_cas_write_contention_count") + metrics = await manager.metrics.query(servers[0].ip_addr) + contention = metrics.get(name="scylla_storage_proxy_coordinator_cas_write_contention_count") - assert contention == None + assert contention == None From 0668c642a2ad1f28c70d6f84fb670b4c0044e52a Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 16/56] topology_custom/test_maintenance_mode: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_maintenance_mode.py | 98 ++++++++++--------- 1 file changed, 50 insertions(+), 48 deletions(-) diff --git a/test/topology_custom/test_maintenance_mode.py b/test/topology_custom/test_maintenance_mode.py index 9e83b8859e..b17b088e94 100644 --- a/test/topology_custom/test_maintenance_mode.py +++ b/test/topology_custom/test_maintenance_mode.py @@ -11,6 +11,7 @@ from cassandra.policies import WhiteListRoundRobinPolicy from test.pylib.manager_client import ManagerClient from test.topology.conftest import cluster_con from test.pylib.util import wait_for_cql_and_get_hosts +from test.topology.util import new_test_keyspace import pytest import logging @@ -32,68 +33,69 @@ async def test_maintenance_mode(manager: ManagerClient): cluster = cluster_con([server_b.ip_addr], 9042, False) cql = cluster.connect() - await cql.run_async("CREATE KEYSPACE ks WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") - await cql.run_async("CREATE TABLE ks.t (k int PRIMARY KEY, v int)") + async with new_test_keyspace(manager, "WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks: + table = f"{ks}.t" + await cql.run_async(f"CREATE TABLE {table} (k int PRIMARY KEY, v int)") - # Token ranges 
of the server A - # [(start_token, end_token)] - ranges = [(int(row[0]), int(row[1])) for row in await cql.run_async(f"""SELECT start_token, end_token, endpoint - FROM system.token_ring WHERE keyspace_name = 'ks' - AND endpoint = '{server_a.ip_addr}' ALLOW FILTERING""")] + # Token ranges of the server A + # [(start_token, end_token)] + ranges = [(int(row[0]), int(row[1])) for row in await cql.run_async(f"""SELECT start_token, end_token, endpoint + FROM system.token_ring WHERE keyspace_name = '{ks}' + AND endpoint = '{server_a.ip_addr}' ALLOW FILTERING""")] - # Insert data to the cluster and find a key that is stored on server A. - for i in range(256): - await cql.run_async(f"INSERT INTO ks.t (k, v) VALUES ({i}, {i})") + # Insert data to the cluster and find a key that is stored on server A. + for i in range(256): + await cql.run_async(f"INSERT INTO {table} (k, v) VALUES ({i}, {i})") - # [(key, token of this key)] - keys_with_tokens = [(int(row[0]), int(row[1])) for row in await cql.run_async("SELECT k, token(k) FROM ks.t")] - key_on_server_a = None + # [(key, token of this key)] + keys_with_tokens = [(int(row[0]), int(row[1])) for row in await cql.run_async(f"SELECT k, token(k) FROM {table}")] + key_on_server_a = None - for key, token in keys_with_tokens: - for start, end in ranges: - if (start < end and start < token <= end) or (start >= end and (token <= end or start < token)): - key_on_server_a = key + for key, token in keys_with_tokens: + for start, end in ranges: + if (start < end and start < token <= end) or (start >= end and (token <= end or start < token)): + key_on_server_a = key - if key_on_server_a is None: - # There is only a chance ~(1/2)^256 that all keys are stored on the server B - # In this case we skip the test - pytest.skip("All keys are stored on the server B") + if key_on_server_a is None: + # There is only a chance ~(1/2)^256 that all keys are stored on the server B + # In this case we skip the test + pytest.skip("All keys are stored on the 
server B") - # Start server A in maintenance mode - await manager.server_stop_gracefully(server_a.server_id) - await manager.server_update_config(server_a.server_id, "maintenance_mode", "true") - await manager.server_start(server_a.server_id) + # Start server A in maintenance mode + await manager.server_stop_gracefully(server_a.server_id) + await manager.server_update_config(server_a.server_id, "maintenance_mode", "true") + await manager.server_start(server_a.server_id) - # Check that the regular CQL port is not available - assert socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex((server_a.ip_addr, 9042)) != 0 + # Check that the regular CQL port is not available + assert socket.socket(socket.AF_INET, socket.SOCK_STREAM).connect_ex((server_a.ip_addr, 9042)) != 0 - maintenance_cluster = cluster_con([socket_endpoint], 9042, False, - load_balancing_policy=WhiteListRoundRobinPolicy([socket_endpoint])) - maintenance_cql = maintenance_cluster.connect() + maintenance_cluster = cluster_con([socket_endpoint], 9042, False, + load_balancing_policy=WhiteListRoundRobinPolicy([socket_endpoint])) + maintenance_cql = maintenance_cluster.connect() - # Check that local data is available in maintenance mode - res = await maintenance_cql.run_async(f"SELECT v FROM ks.t WHERE k = {key_on_server_a}") - assert res[0][0] == key_on_server_a + # Check that local data is available in maintenance mode + res = await maintenance_cql.run_async(f"SELECT v FROM {table} WHERE k = {key_on_server_a}") + assert res[0][0] == key_on_server_a - # Check that group0 operations are disabled - with pytest.raises(ConfigurationException): - await maintenance_cql.run_async(f"CREATE TABLE ks.t2 (k int PRIMARY KEY, v int)") + # Check that group0 operations are disabled + with pytest.raises(ConfigurationException): + await maintenance_cql.run_async(f"CREATE TABLE {ks}.t2 (k int PRIMARY KEY, v int)") - await maintenance_cql.run_async(f"UPDATE ks.t SET v = {key_on_server_a + 1} WHERE k = {key_on_server_a}") + 
await maintenance_cql.run_async(f"UPDATE {table} SET v = {key_on_server_a + 1} WHERE k = {key_on_server_a}") - # Ensure that server B recognizes server A as being shutdown, not as being alive. - res = await cql.run_async(f"SELECT status FROM system.cluster_status WHERE peer = '{server_a.ip_addr}'") - assert res[0][0] == "shutdown" + # Ensure that server B recognizes server A as being shutdown, not as being alive. + res = await cql.run_async(f"SELECT status FROM system.cluster_status WHERE peer = '{server_a.ip_addr}'") + assert res[0][0] == "shutdown" - await manager.server_stop_gracefully(server_a.server_id) + await manager.server_stop_gracefully(server_a.server_id) - # Restart in normal mode to see if the changes made in maintenance mode are persisted - await manager.server_update_config(server_a.server_id, "maintenance_mode", "false") - await manager.server_start(server_a.server_id, wait_others=1) - await wait_for_cql_and_get_hosts(cql, [server_a], time.time() + 60) - await manager.servers_see_each_other([server_a, server_b]) + # Restart in normal mode to see if the changes made in maintenance mode are persisted + await manager.server_update_config(server_a.server_id, "maintenance_mode", "false") + await manager.server_start(server_a.server_id, wait_others=1) + await wait_for_cql_and_get_hosts(cql, [server_a], time.time() + 60) + await manager.servers_see_each_other([server_a, server_b]) - res = await cql.run_async(f"SELECT v FROM ks.t WHERE k = {key_on_server_a}") - assert res[0][0] == key_on_server_a + 1 + res = await cql.run_async(f"SELECT v FROM {table} WHERE k = {key_on_server_a}") + assert res[0][0] == key_on_server_a + 1 From 0e11aad9c5b25aace005a10c9c8b1ad0105c551b Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 17/56] topology_custom/test_major_compaction: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_major_compaction.py | 182 +++++++++--------- 1 file changed, 92 
insertions(+), 90 deletions(-) diff --git a/test/topology_custom/test_major_compaction.py b/test/topology_custom/test_major_compaction.py index f004ad649b..724b752d67 100644 --- a/test/topology_custom/test_major_compaction.py +++ b/test/topology_custom/test_major_compaction.py @@ -11,6 +11,7 @@ import asyncio from test.pylib.manager_client import ManagerClient from test.pylib.rest_client import inject_error_one_shot from test.topology.conftest import skip_mode +from test.topology.util import new_test_keyspace, reconnect_driver logger = logging.getLogger(__name__) @@ -42,53 +43,52 @@ async def test_major_compaction_consider_only_existing_data(manager: ManagerClie server = (await manager.servers_add(1))[0] logger.info("Creating table") - ks = "test_consider_only_existing_data" - cf = "t1" + cf = "test_consider_only_existing_data" cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") - await cql.run_async(f"CREATE TABLE {ks}.{cf} (pk int PRIMARY KEY) WITH tombstone_gc = {{'mode': 'immediate'}}") - await disable_autocompaction_across_keyspaces(manager, server.ip_addr, ks) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.{cf} (pk int PRIMARY KEY) WITH tombstone_gc = {{'mode': 'immediate'}}") + await disable_autocompaction_across_keyspaces(manager, server.ip_addr, ks) - logger.info("Populating table") - await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k});") for k in range(20)]) - await asyncio.gather(*[cql.run_async(f"DELETE FROM {ks}.{cf} WHERE pk = {k};") for k in range(10)]) - await manager.api.keyspace_flush(server.ip_addr, ks, cf) + logger.info("Populating table") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k});") for k in range(20)]) + await asyncio.gather(*[cql.run_async(f"DELETE FROM 
{ks}.{cf} WHERE pk = {k};") for k in range(10)]) + await manager.api.keyspace_flush(server.ip_addr, ks, cf) - # let a second pass, so that the tombstones are eligible for gc - await asyncio.sleep(1) + # let a second pass, so that the tombstones are eligible for gc + await asyncio.sleep(1) - # error injection to make compaction wait after collecting sstables - injection = "major_compaction_wait" - injection_handler = await inject_error_one_shot(manager.api, server.ip_addr, injection) + # error injection to make compaction wait after collecting sstables + injection = "major_compaction_wait" + injection_handler = await inject_error_one_shot(manager.api, server.ip_addr, injection) - logger.info("Start major compaction") - log = await manager.server_open_log(server.server_id) - mark = await log.mark() - compaction_task = asyncio.create_task(manager.api.keyspace_compaction(server.ip_addr, ks, cf, consider_only_existing_data=consider_only_existing_data)) - # wait for the injection to pause the compaction - await log.wait_for("major_compaction_wait: waiting", from_mark=mark, timeout=30) + logger.info("Start major compaction") + log = await manager.server_open_log(server.server_id) + mark = await log.mark() + compaction_task = asyncio.create_task(manager.api.keyspace_compaction(server.ip_addr, ks, cf, consider_only_existing_data=consider_only_existing_data)) + # wait for the injection to pause the compaction + await log.wait_for("major_compaction_wait: waiting", from_mark=mark, timeout=30) - # insert new backdated rows with deleted keys and flush them - # into a new sstable that will not be part of the major compaction - logger.info("Insert backdated data into the table") - await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k}) USING TIMESTAMP 1;") for k in range(5)]) - await manager.api.keyspace_flush(server.ip_addr, ks, cf) + # insert new backdated rows with deleted keys and flush them + # into a new sstable that will not be part of the major 
compaction + logger.info("Insert backdated data into the table") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k}) USING TIMESTAMP 1;") for k in range(5)]) + await manager.api.keyspace_flush(server.ip_addr, ks, cf) - # insert few more rows with deleted keys with backdated data into memtable - await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k}) USING TIMESTAMP 1;") for k in range(5, 10)]) + # insert few more rows with deleted keys with backdated data into memtable + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k}) USING TIMESTAMP 1;") for k in range(5, 10)]) - # resume compaction - await injection_handler.message() - await compaction_task + # resume compaction + await injection_handler.message() + await compaction_task - # evict cache to make backdated data visible for consider_only_existing_data mode - if consider_only_existing_data: - await manager.api.drop_sstable_caches(server.ip_addr) + # evict cache to make backdated data visible for consider_only_existing_data mode + if consider_only_existing_data: + await manager.api.drop_sstable_caches(server.ip_addr) - logger.info("Verify major compaction results") - expected_count = 1 if consider_only_existing_data else 0 - for k in range(10): - assert len(await cql.run_async(f"SELECT * FROM {ks}.{cf} WHERE pk = {k}")) == expected_count + logger.info("Verify major compaction results") + expected_count = 1 if consider_only_existing_data else 0 + for k in range(10): + assert len(await cql.run_async(f"SELECT * FROM {ks}.{cf} WHERE pk = {k}")) == expected_count @pytest.mark.asyncio @pytest.mark.parametrize("compaction_flush_all_tables_before_major_seconds", [0, 2, 10]) @@ -110,37 +110,36 @@ async def test_major_compaction_flush_all_tables(manager: ManagerClient, compact server = (await manager.servers_add(1, config=cfg, cmdline=['--smp=1']))[0] logger.info("Creating table") - ks = "test_flush_all_tables" - cf = "t1" + cf = 
"test_flush_all_tables" cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") - await cql.run_async(f"CREATE TABLE {ks}.{cf} (pk int PRIMARY KEY)") - await disable_autocompaction_across_keyspaces(manager, server.ip_addr, ks) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.{cf} (pk int PRIMARY KEY)") + await disable_autocompaction_across_keyspaces(manager, server.ip_addr, ks) - logger.info("Populating table") - await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k});") for k in range(256)]) - await manager.api.keyspace_flush(server.ip_addr, ks, cf) - log = await manager.server_open_log(server.server_id) + logger.info("Populating table") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k});") for k in range(256)]) + await manager.api.keyspace_flush(server.ip_addr, ks, cf) + log = await manager.server_open_log(server.server_id) - async def check_all_table_flush_in_major_compaction(expect_all_table_flush: bool): - mark = await log.mark() + async def check_all_table_flush_in_major_compaction(expect_all_table_flush: bool): + mark = await log.mark() - logger.info("Start major compaction") - await manager.api.keyspace_compaction(server.ip_addr, ks, cf) + logger.info("Start major compaction") + await manager.api.keyspace_compaction(server.ip_addr, ks, cf) - flush_log = await log.grep("Forcing new commitlog segment and flushing all tables", from_mark=mark) - assert len(flush_log) == (1 if expect_all_table_flush else 0) + flush_log = await log.grep("Forcing new commitlog segment and flushing all tables", from_mark=mark) + assert len(flush_log) == (1 if expect_all_table_flush else 0) - # all tables should be flushed the first time unless compaction_flush_all_tables_before_major_seconds == 0 - await 
check_all_table_flush_in_major_compaction(compaction_flush_all_tables_before_major_seconds != 0) + # all tables should be flushed the first time unless compaction_flush_all_tables_before_major_seconds == 0 + await check_all_table_flush_in_major_compaction(compaction_flush_all_tables_before_major_seconds != 0) - if compaction_flush_all_tables_before_major_seconds == 2: - # let 2 seconds pass before trying again - await asyncio.sleep(compaction_flush_all_tables_before_major_seconds) + if compaction_flush_all_tables_before_major_seconds == 2: + # let 2 seconds pass before trying again + await asyncio.sleep(compaction_flush_all_tables_before_major_seconds) - # for the second time, all tables should be flushed only if - # compaction_flush_all_tables_before_major_seconds == 2 as only 2 seconds have passed - await check_all_table_flush_in_major_compaction(compaction_flush_all_tables_before_major_seconds == 2) + # for the second time, all tables should be flushed only if + # compaction_flush_all_tables_before_major_seconds == 2 as only 2 seconds have passed + await check_all_table_flush_in_major_compaction(compaction_flush_all_tables_before_major_seconds == 2) # Testcase for https://github.com/scylladb/scylladb/issues/20197 @pytest.mark.asyncio @@ -159,38 +158,41 @@ async def test_shutdown_drain_during_compaction(manager: ManagerClient): server = await manager.server_add(cmdline=['--smp=1']) logger.info("Creating table") - ks = "test_shutdown_drain_during_compaction" - cf = "t1" + cf = "test_shutdown_drain_during_compaction" cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") - await cql.run_async(f"CREATE TABLE {ks}.{cf} (pk int PRIMARY KEY);") - await disable_autocompaction_across_keyspaces(manager, server.ip_addr, ks) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks: + await 
cql.run_async(f"CREATE TABLE {ks}.{cf} (pk int PRIMARY KEY);") + await disable_autocompaction_across_keyspaces(manager, server.ip_addr, ks) - logger.info("Populating table") - await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k});") for k in range(100)]) - await manager.api.keyspace_flush(server.ip_addr, ks, cf) + logger.info("Populating table") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{cf} (pk) VALUES ({k});") for k in range(100)]) + await manager.api.keyspace_flush(server.ip_addr, ks, cf) - # inject error to make compaction wait just before it updates the compaction_history table - injection = "update_history_wait" - injection_handler = await inject_error_one_shot(manager.api, server.ip_addr, injection) + # inject error to make compaction wait just before it updates the compaction_history table + injection = "update_history_wait" + injection_handler = await inject_error_one_shot(manager.api, server.ip_addr, injection) - log = await manager.server_open_log(server.server_id) - mark = await log.mark() - # start compaction and wait for it to pause at the injection point - logger.info("Start compaction") - compaction_task = asyncio.create_task(manager.api.keyspace_compaction(server.ip_addr, ks, cf)) - await log.wait_for("update_history_wait: waiting", mark, 30) + log = await manager.server_open_log(server.server_id) + mark = await log.mark() + # start compaction and wait for it to pause at the injection point + logger.info("Start compaction") + compaction_task = asyncio.create_task(manager.api.keyspace_compaction(server.ip_addr, ks, cf)) + await log.wait_for("update_history_wait: waiting", mark, 30) - mark = await log.mark() - # Start server shutdown - logger.info("Shutdown server") - stop_task = asyncio.create_task(manager.server_stop_gracefully(server.server_id)) - # wait until the shutdown drain request is sent to compaction_manager - await log.wait_for("Asked to drain", mark, 30) - # now resume compaction and let shutdown 
complete - await injection_handler.message() - # wait server to shutdown - await stop_task - # During shutdown, errors mentioning 'seastar::abort_requested_exception' is expected as we do abort the compaction midway. - # Verify that the shutdown completed without any other unexpected errors - assert len(await log.grep(expr="ERROR .*", filter_expr=".* seastar::abort_requested_exception \(abort requested\)", from_mark=mark)) == 0 + mark = await log.mark() + # Start server shutdown + logger.info("Shutdown server") + stop_task = asyncio.create_task(manager.server_stop_gracefully(server.server_id)) + # wait until the shutdown drain request is sent to compaction_manager + await log.wait_for("Asked to drain", mark, 30) + # now resume compaction and let shutdown complete + await injection_handler.message() + # wait server to shutdown + await stop_task + # During shutdown, errors mentioning 'seastar::abort_requested_exception' are expected as we do abort the compaction midway. + # Verify that the shutdown completed without any other unexpected errors + assert len(await log.grep(expr="ERROR .*", filter_expr=r".* seastar::abort_requested_exception \(abort requested\)", from_mark=mark)) == 0 + + # For dropping the keyspace + await manager.server_start(server.server_id) + await reconnect_driver(manager) From ef85c4b27efac9766cc4ef9c35f15ea20b39ddba Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 18/56] topology_custom/test_mv_admission_control: use new_test_keyspace Signed-off-by: Benny Halevy --- .../mv/test_mv_admission_control.py | 154 +++++++++--------- 1 file changed, 77 insertions(+), 77 deletions(-) diff --git a/test/topology_custom/mv/test_mv_admission_control.py b/test/topology_custom/mv/test_mv_admission_control.py index c1151e68ad..40af215aca 100644 --- a/test/topology_custom/mv/test_mv_admission_control.py +++ b/test/topology_custom/mv/test_mv_admission_control.py @@ -13,6 +13,7 @@ import logging from 
test.topology.conftest import skip_mode from test.pylib.util import wait_for_view from test.topology_custom.mv.tablets.test_mv_tablets import pin_the_only_tablet, get_tablet_replicas +from test.topology.util import new_test_keyspace from cassandra.cluster import ConsistencyLevel, EXEC_PROFILE_DEFAULT # type: ignore from cassandra.cqltypes import Int32Type # type: ignore @@ -31,31 +32,28 @@ async def test_mv_admission_control_exception(manager: ManagerClient) -> None: config = {'error_injections_at_startup': ['view_update_limit', 'update_backlog_immediately'], 'enable_tablets': True} servers = await manager.servers_add(node_count, config=config) cql, hosts = await manager.get_ready_cql(servers) - await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}" - "AND tablets = {'initial': 1}") - await cql.run_async(f"CREATE TABLE ks.tab (key int, c int, v text, PRIMARY KEY (key, c))") - await cql.run_async(f"CREATE MATERIALIZED VIEW ks.mv_cf_view AS SELECT * FROM ks.tab " - "WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") - await wait_for_view(cql, 'mv_cf_view', node_count) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (key int, c int, v text, PRIMARY KEY (key, c))") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv_cf_view AS SELECT * FROM {ks}.tab " + "WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") + await wait_for_view(cql, 'mv_cf_view', node_count) - # Only remote updates hold on to memory, so make the update remote by pinning base and view tablets to different nodes. 
- await pin_the_only_tablet(manager, "ks", "tab", servers[0]) - await pin_the_only_tablet(manager, "ks", "mv_cf_view", servers[1]) + # Only remote updates hold on to memory, so make the update remote by pinning base and view tablets to different nodes. + await pin_the_only_tablet(manager, ks, "tab", servers[0]) + await pin_the_only_tablet(manager, ks, "mv_cf_view", servers[1]) - # Prepare the statement so that the write goes to the same shard both - # times (the first write will cause only the shard on which it was - # performed to have the updated view update backlog). - stmt = cql.prepare(f"INSERT INTO ks.tab (key, c, v) VALUES (?, ?, ?)") - # To inspect the error message, we need to disable retries, which can't - # be done in `prepare()` or `run_async()`. Instead, we use `BoundStatement`. - bnd_stmt = BoundStatement(stmt, retry_policy=FallthroughRetryPolicy()) - await asyncio.gather(*(manager.api.enable_injection(s.ip_addr, "never_finish_remote_view_updates", one_shot=False) for s in servers)) - await cql.run_async(bnd_stmt.bind([0, 0, 240000*'a']), host=hosts[0]) - with pytest.raises(Exception, match="View update backlog is too high"): - await cql.run_async(bnd_stmt.bind([0, 0, 'a']), host=hosts[0]) - await asyncio.gather(*(manager.api.disable_injection(s.ip_addr, "never_finish_remote_view_updates") for s in servers)) - - await cql.run_async(f"DROP KEYSPACE ks") + # Prepare the statement so that the write goes to the same shard both + # times (the first write will cause only the shard on which it was + # performed to have the updated view update backlog). + stmt = cql.prepare(f"INSERT INTO {ks}.tab (key, c, v) VALUES (?, ?, ?)") + # To inspect the error message, we need to disable retries, which can't + # be done in `prepare()` or `run_async()`. Instead, we use `BoundStatement`. 
+ bnd_stmt = BoundStatement(stmt, retry_policy=FallthroughRetryPolicy()) + await asyncio.gather(*(manager.api.enable_injection(s.ip_addr, "never_finish_remote_view_updates", one_shot=False) for s in servers)) + await cql.run_async(bnd_stmt.bind([0, 0, 240000*'a']), host=hosts[0]) + with pytest.raises(Exception, match="View update backlog is too high"): + await cql.run_async(bnd_stmt.bind([0, 0, 'a']), host=hosts[0]) + await asyncio.gather(*(manager.api.disable_injection(s.ip_addr, "never_finish_remote_view_updates") for s in servers)) # In this test we have a table with a materialized view and a replication factor of 3 # and 4 nodes so that not all views get paired with replicas on the same nodes. @@ -73,62 +71,64 @@ async def test_mv_retried_writes_reach_all_replicas(manager: ManagerClient) -> N server = await manager.server_add(config={'error_injections_at_startup': ['view_update_limit', 'delay_before_remote_view_update', 'update_backlog_immediately'], 'enable_tablets': True}) cql, hosts = await manager.get_ready_cql(servers) - await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 3}}" - "AND tablets = {'initial': 1}") - await cql.run_async(f"CREATE TABLE ks.tab (key int, c int, v text, PRIMARY KEY (key, c))") - await cql.run_async(f"CREATE MATERIALIZED VIEW ks.mv_cf_view AS SELECT * FROM ks.tab " - "WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") - await wait_for_view(cql, 'mv_cf_view', node_count) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (key int, c int, v text, PRIMARY KEY (key, c))") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv_cf_view AS SELECT * FROM {ks}.tab " + "WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") + await wait_for_view(cql, 'mv_cf_view', node_count) - # Disable tablet 
balancing so that the slow node doesn't get tablets moved away from it. - for s in servers: - await manager.api.disable_tablet_balancing(s.ip_addr) - await manager.api.disable_tablet_balancing(server.ip_addr) - - # Make sure that the slow node has a base table tablet and no view tablets, so that the - # view updates from it are remote. (using shard 0 and token 0 when moving tablets as they don't make a difference here) - base_tablet_replicas = await get_tablet_replicas(manager, servers[0], "ks", "tab", 0) - base_tablet_hosts = [str(replica[0]) for replica in base_tablet_replicas] - slow_host_id = await manager.get_host_id(server.server_id) - if str(slow_host_id) not in base_tablet_hosts: - base_tablet_host_id, base_tablet_shard = base_tablet_replicas[0] - await manager.api.move_tablet(servers[0].ip_addr, "ks", "tab", base_tablet_host_id, base_tablet_shard, slow_host_id, 0, 0) - view_tablet_replicas = await get_tablet_replicas(manager, servers[0], "ks", "mv_cf_view", 0) - view_tablet_hosts = [str(replica[0]) for replica in view_tablet_replicas] - for replica_host, replica_shard in view_tablet_replicas: - if str(replica_host) != str(slow_host_id): - continue - slow_host_shard = replica_shard - # Move the view tablet to the node that doesn't have one + # Disable tablet balancing so that the slow node doesn't get tablets moved away from it. for s in servers: - fast_host_id = await manager.get_host_id(s.server_id) - if str(fast_host_id) not in view_tablet_hosts: - await manager.api.move_tablet(servers[0].ip_addr, "ks", "mv_cf_view", slow_host_id, slow_host_shard, fast_host_id, 0, 0) - break + await manager.api.disable_tablet_balancing(s.ip_addr) + await manager.api.disable_tablet_balancing(server.ip_addr) - # Prepare the statement so that the write goes to the same shard - # for all requests (the backlog increase caused by a write is only - # immediately noted on the shard that the write was performed on). 
- stmt = cql.prepare(f"INSERT INTO ks.tab (key, c, v) VALUES (?, ?, ?)") - for i in range(10): - # Perform a write that will increase the view update backlog on the slow node - # to a level causing admission control to reject the following writes. - await cql.run_async(stmt, [0, i, 240000*'a'], host=hosts[0]) - # Based on whether the response from the slow node is received before the next write, - # the following small write can serve two purposes: - # 1. If the response is received before the next write, the write will be rejected by - # admission control and retried until it reaches all replicas. - # 2. If the response is not received before the next write, the write will be sent to - # the slow node without causing the view update backlog limit to be exceeded. Then, - # due to cl=ALL, the coordinator will wait for the response from the slow node, which - # will carry an up-to-date view update backlog for the next large write. - cl_all_execution_profile = cql.execution_profile_clone_update(EXEC_PROFILE_DEFAULT, consistency_level = ConsistencyLevel.ALL) - await cql.run_async(stmt, [0, 10 + i, 'a'], host=hosts[0], execution_profile=cl_all_execution_profile) + # Make sure that the slow node has a base table tablet and no view tablets, so that the + # view updates from it are remote. 
(using shard 0 and token 0 when moving tablets as they don't make a difference here) + base_tablet_replicas = await get_tablet_replicas(manager, servers[0], ks, "tab", 0) + base_tablet_hosts = [str(replica[0]) for replica in base_tablet_replicas] + slow_host_id = await manager.get_host_id(server.server_id) + if str(slow_host_id) not in base_tablet_hosts: + base_tablet_host_id, base_tablet_shard = base_tablet_replicas[0] + await manager.api.move_tablet(servers[0].ip_addr, ks, "tab", base_tablet_host_id, base_tablet_shard, slow_host_id, 0, 0) + view_tablet_replicas = await get_tablet_replicas(manager, servers[0], ks, "mv_cf_view", 0) + view_tablet_hosts = [str(replica[0]) for replica in view_tablet_replicas] + for replica_host, replica_shard in view_tablet_replicas: + if str(replica_host) != str(slow_host_id): + continue + slow_host_shard = replica_shard + # Move the view tablet to the node that doesn't have one + for s in servers: + fast_host_id = await manager.get_host_id(s.server_id) + if str(fast_host_id) not in view_tablet_hosts: + await manager.api.move_tablet(servers[0].ip_addr, ks, "mv_cf_view", slow_host_id, slow_host_shard, fast_host_id, 0, 0) + break - # Verify that all writes reached the slow node - await asyncio.gather(*(manager.server_stop_gracefully(s.server_id) for s in servers)) - print(f"Connecting to {server.ip_addr}") - await manager.driver_connect(server=server) - cql = manager.get_cql() + # Prepare the statement so that the write goes to the same shard + # for all requests (the backlog increase caused by a write is only + # immediately noted on the shard that the write was performed on). + stmt = cql.prepare(f"INSERT INTO {ks}.tab (key, c, v) VALUES (?, ?, ?)") + for i in range(10): + # Perform a write that will increase the view update backlog on the slow node + # to a level causing admission control to reject the following writes. 
+ await cql.run_async(stmt, [0, i, 240000*'a'], host=hosts[0]) + # Based on whether the response from the slow node is received before the next write, + # the following small write can serve two purposes: + # 1. If the response is received before the next write, the write will be rejected by + # admission control and retried until it reaches all replicas. + # 2. If the response is not received before the next write, the write will be sent to + # the slow node without causing the view update backlog limit to be exceeded. Then, + # due to cl=ALL, the coordinator will wait for the response from the slow node, which + # will carry an up-to-date view update backlog for the next large write. + cl_all_execution_profile = cql.execution_profile_clone_update(EXEC_PROFILE_DEFAULT, consistency_level = ConsistencyLevel.ALL) + await cql.run_async(stmt, [0, 10 + i, 'a'], host=hosts[0], execution_profile=cl_all_execution_profile) - assert len(await cql.run_async(SimpleStatement(f"SELECT * FROM ks.tab", consistency_level=ConsistencyLevel.ONE))) == 20 + # Verify that all writes reached the slow node + await asyncio.gather(*(manager.server_stop_gracefully(s.server_id) for s in servers)) + print(f"Connecting to {server.ip_addr}") + await manager.driver_connect(server=server) + cql = manager.get_cql() + + assert len(await cql.run_async(SimpleStatement(f"SELECT * FROM {ks}.tab", consistency_level=ConsistencyLevel.ONE))) == 20 + + # For dropping the keyspace + await asyncio.gather(*(manager.server_start(s.server_id) for s in servers)) From b13e48b64891f450a3b04884cc444f5b2005f1ce Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 19/56] topology_custom/test_mv_backlog: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/mv/test_mv_backlog.py | 210 ++++++++++----------- 1 file changed, 102 insertions(+), 108 deletions(-) diff --git a/test/topology_custom/mv/test_mv_backlog.py b/test/topology_custom/mv/test_mv_backlog.py 
index afc0236c58..67fb5d0f83 100644 --- a/test/topology_custom/mv/test_mv_backlog.py +++ b/test/topology_custom/mv/test_mv_backlog.py @@ -13,6 +13,7 @@ from test.topology.conftest import skip_mode from test.pylib.util import wait_for_view, wait_for from test.topology_custom.mv.tablets.test_mv_tablets import pin_the_only_tablet from test.pylib.tablets import get_tablet_replica +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) @@ -28,29 +29,26 @@ async def test_view_backlog_increased_after_write(manager: ManagerClient) -> Non # Use a higher smp to make it more likely that the writes go to a different shard than the coordinator. servers = await manager.servers_add(node_count, cmdline=['--smp', '5'], config={'error_injections_at_startup': ['never_finish_remote_view_updates'], 'enable_tablets': True}) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}" - "AND tablets = {'initial': 1}") - await cql.run_async(f"CREATE TABLE ks.tab (base_key int, view_key int, v text, PRIMARY KEY (base_key, view_key))") - await cql.run_async(f"CREATE MATERIALIZED VIEW ks.mv_cf_view AS SELECT * FROM ks.tab " - "WHERE view_key IS NOT NULL and base_key IS NOT NULL PRIMARY KEY (view_key, base_key) ") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (base_key int, view_key int, v text, PRIMARY KEY (base_key, view_key))") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv_cf_view AS SELECT * FROM {ks}.tab " + "WHERE view_key IS NOT NULL and base_key IS NOT NULL PRIMARY KEY (view_key, base_key) ") - await wait_for_view(cql, 'mv_cf_view', node_count) - # Only remote updates hold on to memory, so make the update remote - await pin_the_only_tablet(manager, "ks", "tab", servers[0]) - (_, shard) = await 
get_tablet_replica(manager, servers[0], "ks", "tab", 0) - await pin_the_only_tablet(manager, "ks", "mv_cf_view", servers[1]) + await wait_for_view(cql, 'mv_cf_view', node_count) + # Only remote updates hold on to memory, so make the update remote + await pin_the_only_tablet(manager, ks, "tab", servers[0]) + (_, shard) = await get_tablet_replica(manager, servers[0], ks, "tab", 0) + await pin_the_only_tablet(manager, ks, "mv_cf_view", servers[1]) - for v in [1000, 4000, 16000, 64000, 256000]: - # Don't use a prepared statement, so that writes are likely sent to a different shard - # than the one containing the key. - await cql.run_async(f"INSERT INTO ks.tab (base_key, view_key, v) VALUES ({v}, {v}, '{v*'a'}')") - # The view update backlog should increase on the node generating view updates - local_metrics = await manager.metrics.query(servers[0].ip_addr) - view_backlog = local_metrics.get('scylla_storage_proxy_replica_view_update_backlog', shard=str(shard)) - # The read view_backlog might still contain backlogs from the previous iterations, so we only assert that it is large enough - assert view_backlog > v - - await cql.run_async(f"DROP KEYSPACE ks") + for v in [1000, 4000, 16000, 64000, 256000]: + # Don't use a prepared statement, so that writes are likely sent to a different shard + # than the one containing the key. + await cql.run_async(f"INSERT INTO {ks}.tab (base_key, view_key, v) VALUES ({v}, {v}, '{v*'a'}')") + # The view update backlog should increase on the node generating view updates + local_metrics = await manager.metrics.query(servers[0].ip_addr) + view_backlog = local_metrics.get('scylla_storage_proxy_replica_view_update_backlog', shard=str(shard)) + # The read view_backlog might still contain backlogs from the previous iterations, so we only assert that it is large enough + assert view_backlog > v # This test reproduces issues #18461 and #18783 # In the test, we create a table and perform a write to it that fills the view update backlog. 
@@ -61,26 +59,23 @@ async def test_gossip_same_backlog(manager: ManagerClient) -> None: node_count = 2 servers = await manager.servers_add(node_count, config={'error_injections_at_startup': ['view_update_limit', 'update_backlog_immediately'], 'enable_tablets': True}) cql, hosts = await manager.get_ready_cql(servers) - await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}" - "AND tablets = {'initial': 1}") - await cql.run_async(f"CREATE TABLE ks.tab (key int, c int, v text, PRIMARY KEY (key, c))") - await cql.run_async(f"CREATE MATERIALIZED VIEW ks.mv_cf_view AS SELECT * FROM ks.tab " - "WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") - await wait_for_view(cql, 'mv_cf_view', node_count) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (key int, c int, v text, PRIMARY KEY (key, c))") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv_cf_view AS SELECT * FROM {ks}.tab " + "WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") + await wait_for_view(cql, 'mv_cf_view', node_count) - # Only remote updates hold on to memory, so make the update remote - await pin_the_only_tablet(manager, "ks", "tab", servers[0]) - await pin_the_only_tablet(manager, "ks", "mv_cf_view", servers[1]) + # Only remote updates hold on to memory, so make the update remote + await pin_the_only_tablet(manager, ks, "tab", servers[0]) + await pin_the_only_tablet(manager, ks, "mv_cf_view", servers[1]) - stmt = cql.prepare(f"INSERT INTO ks.tab (key, c, v) VALUES (?, ?, ?)") + stmt = cql.prepare(f"INSERT INTO {ks}.tab (key, c, v) VALUES (?, ?, ?)") - await asyncio.gather(*(manager.api.enable_injection(s.ip_addr, "never_finish_remote_view_updates", one_shot=False) for s in servers)) - await cql.run_async(stmt, [0, 0, 240000*'a'], host=hosts[0]) - await 
asyncio.gather(*(manager.api.disable_injection(s.ip_addr, "never_finish_remote_view_updates") for s in servers)) - # The next write should be admitted eventually, after a gossip round (1s) is performed - await cql.run_async(stmt, [0, 0, 'a'], host=hosts[0]) - - await cql.run_async(f"DROP KEYSPACE ks") + await asyncio.gather(*(manager.api.enable_injection(s.ip_addr, "never_finish_remote_view_updates", one_shot=False) for s in servers)) + await cql.run_async(stmt, [0, 0, 240000*'a'], host=hosts[0]) + await asyncio.gather(*(manager.api.disable_injection(s.ip_addr, "never_finish_remote_view_updates") for s in servers)) + # The next write should be admitted eventually, after a gossip round (1s) is performed + await cql.run_async(stmt, [0, 0, 'a'], host=hosts[0]) # A test for the view_flow_control_delay_limit_in_ms parameter. # @@ -110,88 +105,87 @@ async def test_configurable_mv_control_flow_delay(manager: ManagerClient) -> Non config={'error_injections_at_startup': ['update_backlog_immediately', 'view_update_limit', 'skip_updating_local_backlog_via_view_update_backlog_broker'], 'enable_tablets': True}, cmdline=['--smp=1']) cql, hosts = await manager.get_ready_cql(servers) - await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}" - "AND tablets = {'initial': 1}") - await cql.run_async(f"CREATE TABLE ks.tab (key int, c int, v text, PRIMARY KEY (key, c))") - await cql.run_async(f"CREATE MATERIALIZED VIEW ks.mv_cf_view AS SELECT * FROM ks.tab " - "WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") - await wait_for_view(cql, 'mv_cf_view', node_count) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (key int, c int, v text, PRIMARY KEY (key, c))") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv_cf_view AS SELECT * FROM {ks}.tab " + 
"WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") + await wait_for_view(cql, 'mv_cf_view', node_count) - # Only remote updates hold on to memory, so make the update remote - srv_base = servers[0] - srv_view = servers[1] - host_base = next(h for h in hosts if h.address == srv_base.ip_addr) - await pin_the_only_tablet(manager, "ks", "tab", srv_base) - await pin_the_only_tablet(manager, "ks", "mv_cf_view", srv_view) + # Only remote updates hold on to memory, so make the update remote + srv_base = servers[0] + srv_view = servers[1] + host_base = next(h for h in hosts if h.address == srv_base.ip_addr) + await pin_the_only_tablet(manager, ks, "tab", srv_base) + await pin_the_only_tablet(manager, ks, "mv_cf_view", srv_view) - # All nodes in the cluster run with --smp=1, so there is only shard 0 - shard = 0 + # All nodes in the cluster run with --smp=1, so there is only shard 0 + shard = 0 - delay_metric_name = 'scylla_storage_proxy_coordinator_mv_flow_control_delay_total' - throttled_writes_metric_name = 'scylla_storage_proxy_coordinator_throttled_base_writes_total' + delay_metric_name = 'scylla_storage_proxy_coordinator_mv_flow_control_delay_total' + throttled_writes_metric_name = 'scylla_storage_proxy_coordinator_throttled_base_writes_total' - delay_limits = [0, 500, 1000, 2000, 10000] - computed_delays = [] + delay_limits = [0, 500, 1000, 2000, 10000] + computed_delays = [] - stmt = cql.prepare(f"INSERT INTO ks.tab (key, c, v) VALUES (?, ?, ?)") + stmt = cql.prepare(f"INSERT INTO {ks}.tab (key, c, v) VALUES (?, ?, ?)") - for delay_limit in delay_limits: - logger.info(f"delay_limit = {delay_limit}") + for delay_limit in delay_limits: + logger.info(f"delay_limit = {delay_limit}") - # Update the delay - await asyncio.gather(*(cql.run_async(f"UPDATE system.config SET value = '{delay_limit}' WHERE name = 'view_flow_control_delay_limit_in_ms'", host=h) for h in hosts)) + # Update the delay + await asyncio.gather(*(cql.run_async(f"UPDATE system.config SET value 
= '{delay_limit}' WHERE name = 'view_flow_control_delay_limit_in_ms'", host=h) for h in hosts)) - # Make sure that view updates will hang - await asyncio.gather(*(manager.api.enable_injection(s.ip_addr, "never_finish_remote_view_updates", one_shot=False) for s in servers)) + # Make sure that view updates will hang + await asyncio.gather(*(manager.api.enable_injection(s.ip_addr, "never_finish_remote_view_updates", one_shot=False) for s in servers)) - # Generate a large view update and then a small one. - # The reason why we do two writes is as follows: view backlog is propagated - # in responses from base writes and the coordinator caches it but it will - # not necessarily use it when calculating the delay of the same write. - # The second small write will use the value of the backlog from the previous write. - await cql.run_async(stmt, [0, 0, 100000*'a'], host=host_base) + # Generate a large view update and then a small one. + # The reason why we do two writes is as follows: view backlog is propagated + # in responses from base writes and the coordinator caches it but it will + # not necessarily use it when calculating the delay of the same write. + # The second small write will use the value of the backlog from the previous write. + await cql.run_async(stmt, [0, 0, 100000*'a'], host=host_base) - # Measure the total delay before the second write, and the number of delayed writes - local_metrics = await manager.metrics.query(srv_base.ip_addr) - before_computed_delay = local_metrics.get(delay_metric_name, shard=str(shard)) or 0.0 - before_total_throttled_writes = local_metrics.get(throttled_writes_metric_name, shard=str(shard)) or 0.0 - - # Do the second write, as mentioned previously - await cql.run_async(stmt, [0, 0, ''], host=host_base) - - # Make sure that there is exactly one throttled write and calculate a delay for it. - # If we're testing the 0ms delay, instead make sure that there were no delayed writes. 
- local_metrics = await manager.metrics.query(srv_base.ip_addr) - after_computed_delay = local_metrics.get(delay_metric_name, shard=str(shard)) or 0.0 - after_total_throttled_writes = local_metrics.get(throttled_writes_metric_name, shard=str(shard)) or 0.0 - - if delay_limit == 0: - assert after_total_throttled_writes == before_total_throttled_writes - else: - assert after_total_throttled_writes == before_total_throttled_writes + 1 - - computed_delay = after_computed_delay - before_computed_delay - computed_delays.append(computed_delay) - - # Unpause the view update and wait until it is drained in order to prepare for the next pass - await asyncio.gather(*(manager.api.disable_injection(s.ip_addr, "never_finish_remote_view_updates") for s in servers)) - async def view_updates_drained(): + # Measure the total delay before the second write, and the number of delayed writes local_metrics = await manager.metrics.query(srv_base.ip_addr) - backlog = local_metrics.get('scylla_storage_proxy_replica_view_update_backlog', shard=str(shard)) - if backlog == 0: - return True - await wait_for(view_updates_drained, deadline=time.time() + 30.0) + before_computed_delay = local_metrics.get(delay_metric_name, shard=str(shard)) or 0.0 + before_total_throttled_writes = local_metrics.get(throttled_writes_metric_name, shard=str(shard)) or 0.0 - ratios = [delay / limit for delay, limit in zip(computed_delays, delay_limits) if limit != 0] + # Do the second write, as mentioned previously + await cql.run_async(stmt, [0, 0, ''], host=host_base) - logger.info(f"delay_limits: {delay_limits}") - logger.info(f"computed_delays: {computed_delays}") - logger.info(f"ratios (for non-zero limits): {ratios}") + # Make sure that there is exactly one throttled write and calculate a delay for it. + # If we're testing the 0ms delay, instead make sure that there were no delayed writes. 
+ local_metrics = await manager.metrics.query(srv_base.ip_addr) + after_computed_delay = local_metrics.get(delay_metric_name, shard=str(shard)) or 0.0 + after_total_throttled_writes = local_metrics.get(throttled_writes_metric_name, shard=str(shard)) or 0.0 - # Check that the ratios are relatively stable, i.e. there is not much - # relative difference between minimum and maximum - assert min(ratios) / max(ratios) > 0.9 + if delay_limit == 0: + assert after_total_throttled_writes == before_total_throttled_writes + else: + assert after_total_throttled_writes == before_total_throttled_writes + 1 - # Additionally, check that the delay is zero for a zero value - # of the view_flow_control_delay_limit_in_ms parameter - assert computed_delays[0] == 0.0 + computed_delay = after_computed_delay - before_computed_delay + computed_delays.append(computed_delay) + + # Unpause the view update and wait until it is drained in order to prepare for the next pass + await asyncio.gather(*(manager.api.disable_injection(s.ip_addr, "never_finish_remote_view_updates") for s in servers)) + async def view_updates_drained(): + local_metrics = await manager.metrics.query(srv_base.ip_addr) + backlog = local_metrics.get('scylla_storage_proxy_replica_view_update_backlog', shard=str(shard)) + if backlog == 0: + return True + await wait_for(view_updates_drained, deadline=time.time() + 30.0) + + ratios = [delay / limit for delay, limit in zip(computed_delays, delay_limits) if limit != 0] + + logger.info(f"delay_limits: {delay_limits}") + logger.info(f"computed_delays: {computed_delays}") + logger.info(f"ratios (for non-zero limits): {ratios}") + + # Check that the ratios are relatively stable, i.e. 
there is not much + # relative difference between minimum and maximum + assert min(ratios) / max(ratios) > 0.9 + + # Additionally, check that the delay is zero for a zero value + # of the view_flow_control_delay_limit_in_ms parameter + assert computed_delays[0] == 0.0 From a82e7341100f713a10cc6b4632e859f13824ce95 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 20/56] topology_custom/test_mv_building: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/mv/test_mv_building.py | 103 ++++++++++---------- 1 file changed, 51 insertions(+), 52 deletions(-) diff --git a/test/topology_custom/mv/test_mv_building.py b/test/topology_custom/mv/test_mv_building.py index af548f25a3..dcbb648d15 100644 --- a/test/topology_custom/mv/test_mv_building.py +++ b/test/topology_custom/mv/test_mv_building.py @@ -9,10 +9,10 @@ import logging from test.pylib.manager_client import ManagerClient from test.pylib.tablets import get_tablet_replica from test.pylib.util import unique_name, wait_for_view +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) - # This test makes sure that view building is done mainly in the streaming scheduling group # and not the gossip scheduling group. 
We do that by measuring the time each group was # busy during the view building process and confirming that the gossip group was busy @@ -22,30 +22,30 @@ logger = logging.getLogger(__name__) async def test_view_building_scheduling_group(manager: ManagerClient): server = await manager.server_add() cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") - await cql.run_async(f"CREATE TABLE ks.tab (p int, c int, PRIMARY KEY (p, c))") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (p int, c int, PRIMARY KEY (p, c))") - # Insert 50000 rows to the table. Use unlogged batches to speed up the process. - for i in range(1000): - inserts = [f"INSERT INTO ks.tab(p, c) VALUES ({i+1000*x}, {i+1000*x})" for x in range(50)] - batch = "BEGIN UNLOGGED BATCH\n" + "\n".join(inserts) + "\nAPPLY BATCH\n" - await manager.cql.run_async(batch) + # Insert 50000 rows to the table. Use unlogged batches to speed up the process. 
+ for i in range(1000): + inserts = [f"INSERT INTO {ks}.tab(p, c) VALUES ({i+1000*x}, {i+1000*x})" for x in range(50)] + batch = "BEGIN UNLOGGED BATCH\n" + "\n".join(inserts) + "\nAPPLY BATCH\n" + await manager.cql.run_async(batch) - metrics_before = await manager.metrics.query(server.ip_addr) - ms_gossip_before = metrics_before.get('scylla_scheduler_runtime_ms', {'group': 'gossip'}) - ms_streaming_before = metrics_before.get('scylla_scheduler_runtime_ms', {'group': 'streaming'}) + metrics_before = await manager.metrics.query(server.ip_addr) + ms_gossip_before = metrics_before.get('scylla_scheduler_runtime_ms', {'group': 'gossip'}) + ms_streaming_before = metrics_before.get('scylla_scheduler_runtime_ms', {'group': 'streaming'}) - await cql.run_async("CREATE MATERIALIZED VIEW ks.mv AS SELECT p, c FROM ks.tab WHERE p IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, p)") - await wait_for_view(cql, 'mv', 1) + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv AS SELECT p, c FROM {ks}.tab WHERE p IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, p)") + await wait_for_view(cql, 'mv', 1) - metrics_after = await manager.metrics.query(server.ip_addr) - ms_gossip_after = metrics_after.get('scylla_scheduler_runtime_ms', {'group': 'gossip'}) - ms_streaming_after = metrics_after.get('scylla_scheduler_runtime_ms', {'group': 'streaming'}) - ms_streaming = ms_streaming_after - ms_streaming_before - ms_statement = ms_gossip_after - ms_gossip_before - ratio = ms_statement / ms_streaming - print(f"ms_streaming: {ms_streaming}, ms_statement: {ms_statement}, ratio: {ratio}") - assert ratio < 0.1 + metrics_after = await manager.metrics.query(server.ip_addr) + ms_gossip_after = metrics_after.get('scylla_scheduler_runtime_ms', {'group': 'gossip'}) + ms_streaming_after = metrics_after.get('scylla_scheduler_runtime_ms', {'group': 'streaming'}) + ms_streaming = ms_streaming_after - ms_streaming_before + ms_statement = ms_gossip_after - ms_gossip_before + ratio = ms_statement / ms_streaming + 
print(f"ms_streaming: {ms_streaming}, ms_statement: {ms_statement}, ratio: {ratio}") + assert ratio < 0.1 # A sanity check test ensures that starting and shutting down Scylla when view building is # disabled is conducted properly and we don't run into any issues. @@ -71,46 +71,45 @@ async def test_view_building_with_tablet_move(manager: ManagerClient, build_mode await manager.api.disable_tablet_balancing(servers[0].ip_addr) - ks = unique_name() table = 'test' view_count = 4 views = [f"{table}_view_{i}" for i in range(view_count)] cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': 4}}") - await cql.run_async(f"CREATE TABLE {ks}.{table} (pk int PRIMARY KEY, c int)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 4}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.{table} (pk int PRIMARY KEY, c int)") - # prefill the base table with enough rows so that view building takes some time - # and runs during the tablet move - keys = 200000 if build_mode != 'debug' else 10000 - batch_size = 50 - for k in range(0, keys, batch_size): - inserts = [f"INSERT INTO {ks}.{table}(pk, c) VALUES ({i}, {i})" for i in range(k, k+batch_size)] - batch = "BEGIN UNLOGGED BATCH\n" + "\n".join(inserts) + "\nAPPLY BATCH\n" - await manager.cql.run_async(batch) + # prefill the base table with enough rows so that view building takes some time + # and runs during the tablet move + keys = 200000 if build_mode != 'debug' else 10000 + batch_size = 50 + for k in range(0, keys, batch_size): + inserts = [f"INSERT INTO {ks}.{table}(pk, c) VALUES ({i}, {i})" for i in range(k, k+batch_size)] + batch = "BEGIN UNLOGGED BATCH\n" + "\n".join(inserts) + "\nAPPLY BATCH\n" + await manager.cql.run_async(batch) - logger.info("Adding new server") - servers.append(await manager.server_add()) 
+ logger.info("Adding new server") + servers.append(await manager.server_add()) - # create some views so they are built together but starting at different tokens - for view in views: - await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.{view} AS SELECT * FROM {ks}.{table} WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk)") - await asyncio.sleep(1) + # create some views so they are built together but starting at different tokens + for view in views: + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.{view} AS SELECT * FROM {ks}.{table} WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk)") + await asyncio.sleep(1) - s0_host_id = await manager.get_host_id(servers[0].server_id) - s1_host_id = await manager.get_host_id(servers[1].server_id) - dst_shard = 0 + s0_host_id = await manager.get_host_id(servers[0].server_id) + s1_host_id = await manager.get_host_id(servers[1].server_id) + dst_shard = 0 - # move all tablets except the first one (with lowest token range) to the other node. - table_id = await manager.get_table_id(ks, table) - rows = await manager.cql.run_async(f"SELECT last_token FROM system.tablets where table_id = {table_id}") - move_tablets_tasks = [] - for r in rows[1:]: - tablet_token = r.last_token - replica = await get_tablet_replica(manager, servers[0], ks, table, tablet_token) - move_tablets_tasks.append(asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, table, replica[0], replica[1], s1_host_id, dst_shard, tablet_token))) - await asyncio.gather(*move_tablets_tasks) + # move all tablets except the first one (with lowest token range) to the other node. 
+ table_id = await manager.get_table_id(ks, table) + rows = await manager.cql.run_async(f"SELECT last_token FROM system.tablets where table_id = {table_id}") + move_tablets_tasks = [] + for r in rows[1:]: + tablet_token = r.last_token + replica = await get_tablet_replica(manager, servers[0], ks, table, tablet_token) + move_tablets_tasks.append(asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, table, replica[0], replica[1], s1_host_id, dst_shard, tablet_token))) + await asyncio.gather(*move_tablets_tasks) - for view in views: - await wait_for_view(cql, view, len(servers)) + for view in views: + await wait_for_view(cql, view, len(servers)) From 629ee3cb46ca74ec1382f5dce60fed8c89d630fd Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 21/56] topology_custom/test_mv_delete_partitions: use new_test_keyspace Signed-off-by: Benny Halevy --- .../mv/test_mv_delete_partitions.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/topology_custom/mv/test_mv_delete_partitions.py b/test/topology_custom/mv/test_mv_delete_partitions.py index 35a2add62f..b54b72b07b 100644 --- a/test/topology_custom/mv/test_mv_delete_partitions.py +++ b/test/topology_custom/mv/test_mv_delete_partitions.py @@ -11,15 +11,17 @@ import time import logging from test.topology.conftest import skip_mode from test.pylib.util import wait_for_view +from test.topology.util import new_test_keyspace from cassandra.cqltypes import Int32Type logger = logging.getLogger(__name__) -async def insert_with_concurrency(cql, value_count, concurrency): +async def insert_with_concurrency(cql, table, value_count, concurrency): + ks = table.split(".")[0] def serialize_int(i): return Int32Type.serialize(i, cql.cluster.protocol_version) def get_replicas(key): - return cql.cluster.metadata.get_replicas("ks", serialize_int(key)) + return cql.cluster.metadata.get_replicas(ks, serialize_int(key)) local_node = 
get_replicas(0)[0] logger.info(f"Starting writes with concurrency {concurrency}") async def do_inserts(m: int): @@ -28,7 +30,7 @@ async def insert_with_concurrency(cql, value_count, concurrency): m_count += 1 update_key = m # For each row in [0, value_count) with key % concurrency == m, insert a row with the same remainder m - insert_stmt = cql.prepare(f"INSERT INTO ks.tab (key, c) VALUES (?, ?)") + insert_stmt = cql.prepare(f"INSERT INTO {ks}.tab (key, c) VALUES (?, ?)") inserted_count = 0 while inserted_count < m_count: # Only remote updates hold on to memory, so try another key until the update is remote @@ -61,16 +63,14 @@ async def test_delete_partition_rows_from_table_with_mv(manager: ManagerClient) node_count = 2 await manager.servers_add(node_count, config={'error_injections_at_startup': ['view_update_limit', 'delay_before_remote_view_update']}) cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'SimpleStrategy', 'replication_factor': 1}}") - await cql.run_async(f"CREATE TABLE ks.tab (key int, c int, PRIMARY KEY (key, c))") - await insert_with_concurrency(cql, 200, 100) + async with new_test_keyspace(manager, "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (key int, c int, PRIMARY KEY (key, c))") + await insert_with_concurrency(cql, f"{ks}.tab", 200, 100) - await cql.run_async(f"CREATE MATERIALIZED VIEW ks.mv_cf_view AS SELECT * FROM ks.tab " - "WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv_cf_view AS SELECT * FROM {ks}.tab " + "WHERE c IS NOT NULL and key IS NOT NULL PRIMARY KEY (c, key) ") - await wait_for_view(cql, "mv_cf_view", node_count) + await wait_for_view(cql, "mv_cf_view", node_count) - logger.info(f"Deleting all rows from partition with key 0") - await cql.run_async(f"DELETE FROM ks.tab WHERE key = 0", timeout=300) - - await cql.run_async(f"DROP 
KEYSPACE ks") + logger.info(f"Deleting all rows from partition with key 0") + await cql.run_async(f"DELETE FROM {ks}.tab WHERE key = 0", timeout=300) From 42a104038da78096902f25cc35b6261ccfda1f77 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 22/56] topology_custom/test_mv_fail_building: use new_test_keyspace Signed-off-by: Benny Halevy --- .../mv/test_mv_fail_building.py | 78 ++++++++++--------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/test/topology_custom/mv/test_mv_fail_building.py b/test/topology_custom/mv/test_mv_fail_building.py index 21cf2dbbb5..8b3cbee22d 100644 --- a/test/topology_custom/mv/test_mv_fail_building.py +++ b/test/topology_custom/mv/test_mv_fail_building.py @@ -9,6 +9,7 @@ import time from test.topology.conftest import skip_mode from test.pylib.manager_client import ManagerClient from test.pylib.util import wait_for_view +from test.topology.util import new_test_keyspace, reconnect_driver from cassandra.cluster import ConsistencyLevel # type: ignore from cassandra.query import SimpleStatement # type: ignore @@ -22,28 +23,26 @@ async def test_mv_fail_building(manager: ManagerClient) -> None: node_count = 3 servers = await manager.servers_add(node_count) cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'SimpleStrategy', 'replication_factor': 3}}") - await cql.run_async(f"CREATE TABLE ks.tab (key int, c int, PRIMARY KEY (key, c))") - # Insert initial rows for building an index - for i in range(10): - await cql.run_async(f"INSERT INTO ks.tab (key, c) VALUES ({i}, 0)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (key int, c int, PRIMARY KEY (key, c))") + # Insert initial rows for building an index + for i in range(10): + await cql.run_async(f"INSERT INTO {ks}.tab (key, c) VALUES ({i}, 0)") - for s in servers: 
- await manager.api.enable_injection(s.ip_addr, 'view_building_failure', one_shot=True) + for s in servers: + await manager.api.enable_injection(s.ip_addr, 'view_building_failure', one_shot=True) - await cql.run_async(f"CREATE INDEX tab_by_c ON ks.tab (c)") + await cql.run_async(f"CREATE INDEX tab_by_c ON {ks}.tab (c)") - # Insert more rows while building an index which is delayed by the 'view_building_failure' injection. - for i in range(10, 20): - await cql.run_async(f"INSERT INTO ks.tab (key, c) VALUES ({i}, 0)") - await wait_for_view(cql, "tab_by_c_index", node_count) + # Insert more rows while building an index which is delayed by the 'view_building_failure' injection. + for i in range(10, 20): + await cql.run_async(f"INSERT INTO {ks}.tab (key, c) VALUES ({i}, 0)") + await wait_for_view(cql, "tab_by_c_index", node_count) - # Verify that all rows were inserted to the view by reading from the index - rows = await cql.run_async(SimpleStatement(f"SELECT * FROM ks.tab WHERE c = 0", consistency_level=ConsistencyLevel.ALL)) - base_rows = await cql.run_async(SimpleStatement(f"SELECT * FROM ks.tab", consistency_level=ConsistencyLevel.ALL)) - assert sorted(rows) == sorted(base_rows) - - await cql.run_async(f"DROP KEYSPACE ks") + # Verify that all rows were inserted to the view by reading from the index + rows = await cql.run_async(SimpleStatement(f"SELECT * FROM {ks}.tab WHERE c = 0", consistency_level=ConsistencyLevel.ALL)) + base_rows = await cql.run_async(SimpleStatement(f"SELECT * FROM {ks}.tab", consistency_level=ConsistencyLevel.ALL)) + assert sorted(rows) == sorted(base_rows) # Reproduces #18929 # Test view build operations running during node shutdown and view drain. 
@@ -55,27 +54,32 @@ async def test_mv_build_during_shutdown(manager: ManagerClient): server = await manager.server_add() cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") - await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.t (pk int primary key, v int)") - for i in range(100): - await cql.run_async(f"insert into ks.t (pk, v) values ({i}, {i+1})") + for i in range(100): + await cql.run_async(f"insert into {ks}.t (pk, v) values ({i}, {i+1})") - # Start building two views. The first is delayed by the injection, and the second - # view build is queued, waiting on the view builder semaphore. - await manager.api.enable_injection(server.ip_addr, "delay_before_get_view_natural_endpoint", one_shot=True) - cql.run_async("CREATE materialized view ks.t_view1 AS select pk, v from ks.t where v is not null primary key (v, pk)") - cql.run_async("CREATE materialized view ks.t_view2 AS select pk, v from ks.t where v is not null primary key (v, pk)") + # Start building two views. The first is delayed by the injection, and the second + # view build is queued, waiting on the view builder semaphore. + await manager.api.enable_injection(server.ip_addr, "delay_before_get_view_natural_endpoint", one_shot=True) + create_task1 = cql.run_async(f"CREATE materialized view {ks}.t_view1 AS select pk, v from {ks}.t where v is not null primary key (v, pk)") + create_task2 = cql.run_async(f"CREATE materialized view {ks}.t_view2 AS select pk, v from {ks}.t where v is not null primary key (v, pk)") - log = await manager.server_open_log(server.server_id) - mark = await log.mark() + log = await manager.server_open_log(server.server_id) + mark = await log.mark() - # Start node shutdown. 
this will drain and abort the running view build. - # As we continue and drain the view building of view1 and view2 we will - # have writes to the database, running during the draining phase. - # If the drain order is correct it should succeed without errors. - await manager.server_stop_gracefully(server.server_id) + # Start node shutdown. this will drain and abort the running view build. + # As we continue and drain the view building of view1 and view2 we will + # have writes to the database, running during the draining phase. + # If the drain order is correct it should succeed without errors. + await manager.server_stop_gracefully(server.server_id) - # Verify no db write errors during the shutdown - occurrences = await log.grep(expr="exception during mutation write", from_mark=mark) - assert len(occurrences) == 0 + # Verify no db write errors during the shutdown + occurrences = await log.grep(expr="exception during mutation write", from_mark=mark) + assert len(occurrences) == 0 + + # For dropping the keyspace + await manager.server_start(server.server_id) + await reconnect_driver(manager) + asyncio.gather(create_task1, create_task2) From d5e3c578f56a4bd8fe35330ea779765ac44c2919 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 23/56] topology_custom/test_mv_read_concurrency: use new_test_keyspace Signed-off-by: Benny Halevy --- .../mv/test_mv_read_concurrency.py | 128 +++++++++--------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/test/topology_custom/mv/test_mv_read_concurrency.py b/test/topology_custom/mv/test_mv_read_concurrency.py index f4d68cae49..1f3dc288d7 100644 --- a/test/topology_custom/mv/test_mv_read_concurrency.py +++ b/test/topology_custom/mv/test_mv_read_concurrency.py @@ -11,6 +11,7 @@ import logging from test.topology.conftest import skip_mode from test.pylib.util import wait_for_view +from test.topology.util import new_test_keyspace from cassandra import ReadTimeout, WriteTimeout 
logger = logging.getLogger(__name__) @@ -36,54 +37,54 @@ async def test_mv_read_concurrency(manager: ManagerClient) -> None: servers = await manager.servers_add(node_count, config=cfg) cql, _ = await manager.get_ready_cql(servers) - await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") - await cql.run_async(f"CREATE TABLE ks.tab (p int PRIMARY KEY, mvp int, v text)") - await cql.run_async(f"CREATE TABLE ks.tab2 (p int PRIMARY KEY, mvp int)") - await cql.run_async(f"CREATE MATERIALIZED VIEW IF NOT EXISTS ks.mv AS SELECT p, mvp FROM ks.tab \ - WHERE p IS NOT NULL AND mvp IS NOT NULL PRIMARY KEY (mvp, p)") - await wait_for_view(cql, 'mv', node_count) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (p int PRIMARY KEY, mvp int, v text)") + await cql.run_async(f"CREATE TABLE {ks}.tab2 (p int PRIMARY KEY, mvp int)") + await cql.run_async(f"CREATE MATERIALIZED VIEW IF NOT EXISTS {ks}.mv AS SELECT p, mvp FROM {ks}.tab \ + WHERE p IS NOT NULL AND mvp IS NOT NULL PRIMARY KEY (mvp, p)") + await wait_for_view(cql, 'mv', node_count) - row_count = 300 - for i in range(10): - await cql.run_async(f"INSERT INTO ks.tab2 (p, mvp) VALUES ({i}, {i})") + row_count = 300 + for i in range(10): + await cql.run_async(f"INSERT INTO {ks}.tab2 (p, mvp) VALUES ({i}, {i})") - # The injection prolongs the time we hold the read concurrency semaphore resources during the rbw during a view update - await manager.api.enable_injection(servers[0].ip_addr, "keep_mv_read_semaphore_units_10ms_longer", one_shot=False) + # The injection prolongs the time we hold the read concurrency semaphore resources during the rbw during a view update + await manager.api.enable_injection(servers[0].ip_addr, "keep_mv_read_semaphore_units_10ms_longer", one_shot=False) - failed = None - stop_event = asyncio.Event() - async def 
do_read(i: int): - read_stmt = cql.prepare(f"SELECT mvp FROM ks.tab2 WHERE p=? USING TIMEOUT 10s") - while not stop_event.is_set(): - try: - await manager.cql.run_async(read_stmt, [i]) - await asyncio.sleep(0.1) - except ReadTimeout as err: - stop_event.set() - # Fail the test after waiting for the other tasks to finish to avoid clogging the test logs with 100000*'a' - nonlocal failed - failed = err + failed = None + stop_event = asyncio.Event() + async def do_read(i: int): + read_stmt = cql.prepare(f"SELECT mvp FROM {ks}.tab2 WHERE p=? USING TIMEOUT 10s") + while not stop_event.is_set(): + try: + await manager.cql.run_async(read_stmt, [i]) + await asyncio.sleep(0.1) + except ReadTimeout as err: + stop_event.set() + # Fail the test after waiting for the other tasks to finish to avoid clogging the test logs with 100000*'a' + nonlocal failed + failed = err - async def do_mv_inserts(i: int): - insert_stmt = cql.prepare(f"INSERT INTO ks.tab(p, mvp, v) VALUES (?, ?, '{100000*'a'}') USING TIMEOUT 10s") - reps = 0 - while not stop_event.is_set() and reps < 50: - try: - await manager.cql.run_async(insert_stmt, [i, i]) - reps += 1 - except WriteTimeout: - # The writes may timeout for the same reason as the reads, but this test is focused on the reads specifically, so don't fail - logger.info(f"Write timeout on {i}") + async def do_mv_inserts(i: int): + insert_stmt = cql.prepare(f"INSERT INTO {ks}.tab(p, mvp, v) VALUES (?, ?, '{100000*'a'}') USING TIMEOUT 10s") + reps = 0 + while not stop_event.is_set() and reps < 50: + try: + await manager.cql.run_async(insert_stmt, [i, i]) + reps += 1 + except WriteTimeout: + # The writes may timeout for the same reason as the reads, but this test is focused on the reads specifically, so don't fail + logger.info(f"Write timeout on {i}") - read_tasks = [asyncio.create_task(do_read(i)) for i in range(10)] - insert_tasks = [asyncio.create_task(do_mv_inserts(i)) for i in range(row_count)] + read_tasks = [asyncio.create_task(do_read(i)) for i 
in range(10)] + insert_tasks = [asyncio.create_task(do_mv_inserts(i)) for i in range(row_count)] - await asyncio.gather(*insert_tasks) - stop_event.set() - await asyncio.gather(*read_tasks) + await asyncio.gather(*insert_tasks) + stop_event.set() + await asyncio.gather(*read_tasks) - if failed: - raise failed + if failed: + raise failed # This test verifies that the writes causing view updates don't make Scylla use excessive memory. # Similarly to the read timeout test, we create a table with a materialized view, and then run @@ -108,31 +109,30 @@ async def test_mv_read_memory(manager: ManagerClient) -> None: cql, _ = await manager.get_ready_cql(servers) # Use just 1 tablet to make the test more predictable by running all view updates on the same shard - await cql.run_async(f"CREATE KEYSPACE ks WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}" - "AND tablets = {'initial': 1}") - await cql.run_async(f"CREATE TABLE ks.tab (p int PRIMARY KEY, mvp int, v text)") - await cql.run_async(f"CREATE MATERIALIZED VIEW IF NOT EXISTS ks.mv AS SELECT p, mvp FROM ks.tab \ - WHERE p IS NOT NULL AND mvp IS NOT NULL PRIMARY KEY (mvp, p)") - await wait_for_view(cql, 'mv', node_count) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.tab (p int PRIMARY KEY, mvp int, v text)") + await cql.run_async(f"CREATE MATERIALIZED VIEW IF NOT EXISTS {ks}.mv AS SELECT p, mvp FROM {ks}.tab \ + WHERE p IS NOT NULL AND mvp IS NOT NULL PRIMARY KEY (mvp, p)") + await wait_for_view(cql, 'mv', node_count) - row_count = 500 + row_count = 500 - # The injection prolongs the time we hold the read concurrency semaphore resources during the rbw during a view update - await manager.api.enable_injection(servers[0].ip_addr, "keep_mv_read_semaphore_units_10ms_longer", one_shot=False) + # The injection prolongs the time we hold the 
read concurrency semaphore resources during the rbw during a view update + await manager.api.enable_injection(servers[0].ip_addr, "keep_mv_read_semaphore_units_10ms_longer", one_shot=False) - stop_event = asyncio.Event() - async def do_mv_inserts(i: int): - insert_stmt = cql.prepare(f"INSERT INTO ks.tab(p, mvp, v) VALUES (?, ?, '{100000*'a'}') USING TIMEOUT 30s") - reps = 0 - while not stop_event.is_set() and reps < 10: - try: - await manager.cql.run_async(insert_stmt, [i, i]) - reps += 1 - except WriteTimeout: - # A write timeout doesn't necessarily show that we run out of memory - the read queueing - # might just have done its job, so don't fail the test to avoid false negatives - logger.info(f"Write timeout on {i}") + stop_event = asyncio.Event() + async def do_mv_inserts(i: int): + insert_stmt = cql.prepare(f"INSERT INTO {ks}.tab(p, mvp, v) VALUES (?, ?, '{100000*'a'}') USING TIMEOUT 30s") + reps = 0 + while not stop_event.is_set() and reps < 10: + try: + await manager.cql.run_async(insert_stmt, [i, i]) + reps += 1 + except WriteTimeout: + # A write timeout doesn't necessarily show that we run out of memory - the read queueing + # might just have done its job, so don't fail the test to avoid false negatives + logger.info(f"Write timeout on {i}") - insert_tasks = [asyncio.create_task(do_mv_inserts(i)) for i in range(row_count)] + insert_tasks = [asyncio.create_task(do_mv_inserts(i)) for i in range(row_count)] - await asyncio.gather(*insert_tasks) + await asyncio.gather(*insert_tasks) From c05794c15660a256c06a7e987577335db60c4564 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 24/56] topology_custom/test_mv_tablets: use new_test_keyspace Signed-off-by: Benny Halevy --- .../mv/tablets/test_mv_tablets.py | 102 +++++++++--------- 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/test/topology_custom/mv/tablets/test_mv_tablets.py b/test/topology_custom/mv/tablets/test_mv_tablets.py index 
beff3eabcb..3726fe0e27 100644 --- a/test/topology_custom/mv/tablets/test_mv_tablets.py +++ b/test/topology_custom/mv/tablets/test_mv_tablets.py @@ -11,6 +11,7 @@ from test.pylib.rest_client import read_barrier from test.pylib.util import wait_for_cql_and_get_hosts from test.pylib.internal_types import ServerInfo from test.topology.conftest import skip_mode +from test.topology.util import new_test_keyspace from test.topology_custom.test_alternator import get_alternator, alternator_config, full_query @@ -88,10 +89,9 @@ async def test_tablet_mv_create(manager: ManagerClient): servers = await manager.servers_add(1) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int)") - await cql.run_async("CREATE MATERIALIZED VIEW test.tv AS SELECT * FROM test.test WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk)") - await cql.run_async("DROP KEYSPACE test") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.tv AS SELECT * FROM {ks}.test WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk)") @pytest.mark.asyncio @@ -106,13 +106,12 @@ async def test_tablet_mv_simple(manager: ManagerClient): servers = await manager.servers_add(1) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int)") - await cql.run_async("CREATE MATERIALIZED VIEW test.tv AS SELECT * FROM test.test WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk) WITH 
SYNCHRONOUS_UPDATES = TRUE") - await cql.run_async("INSERT INTO test.test (pk, c) VALUES (2, 3)") - # We used SYNCHRONOUS_UPDATES=TRUE, so the view should be updated: - assert [(3,2)] == list(await cql.run_async("SELECT * FROM test.tv WHERE c=3")) - await cql.run_async("DROP KEYSPACE test") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.tv AS SELECT * FROM {ks}.test WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk) WITH SYNCHRONOUS_UPDATES = TRUE") + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES (2, 3)") + # We used SYNCHRONOUS_UPDATES=TRUE, so the view should be updated: + assert [(3,2)] == list(await cql.run_async(f"SELECT * FROM {ks}.tv WHERE c=3")) @pytest.mark.asyncio async def test_tablet_mv_simple_6node(manager: ManagerClient): @@ -128,13 +127,12 @@ async def test_tablet_mv_simple_6node(manager: ManagerClient): """ servers = await manager.servers_add(6) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int)") - await cql.run_async("CREATE MATERIALIZED VIEW test.tv AS SELECT * FROM test.test WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk) WITH SYNCHRONOUS_UPDATES = TRUE") - await cql.run_async("INSERT INTO test.test (pk, c) VALUES (2, 3)") - # We used SYNCHRONOUS_UPDATES=TRUE, so the view should be updated: - assert [(3,2)] == list(await cql.run_async("SELECT * FROM test.tv WHERE c=3")) - await cql.run_async("DROP KEYSPACE test") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") as ks: 
+ await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.tv AS SELECT * FROM {ks}.test WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk) WITH SYNCHRONOUS_UPDATES = TRUE") + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES (2, 3)") + # We used SYNCHRONOUS_UPDATES=TRUE, so the view should be updated: + assert [(3,2)] == list(await cql.run_async(f"SELECT * FROM {ks}.tv WHERE c=3")) async def inject_error_on(manager, error_name, servers): errs = [manager.api.enable_injection(s.ip_addr, error_name, False) for s in servers] @@ -228,11 +226,10 @@ async def test_tablet_si_create(manager: ManagerClient): servers = await manager.servers_add(1) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int)") - await cql.run_async("CREATE INDEX my_idx ON test.test(c)") - await cql.run_async("DROP INDEX test.my_idx") - await cql.run_async("DROP KEYSPACE test") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)") + await cql.run_async(f"CREATE INDEX my_idx ON {ks}.test(c)") + await cql.run_async(f"DROP INDEX {ks}.my_idx") async def test_tablet_lsi_create(manager: ManagerClient): """A basic test for creating a *local* secondary index on a table stored @@ -243,11 +240,10 @@ async def test_tablet_lsi_create(manager: ManagerClient): servers = await manager.servers_add(1) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c 
int)") - await cql.run_async("CREATE INDEX my_idx ON test.test((pk),c)") - await cql.run_async("DROP INDEX test.my_idx") - await cql.run_async("DROP KEYSPACE test") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 100}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)") + await cql.run_async(f"CREATE INDEX my_idx ON {ks}.test((pk),c)") + await cql.run_async(f"DROP INDEX {ks}.my_idx") @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -272,32 +268,30 @@ async def test_tablet_cql_lsi(manager: ManagerClient): # Create a table with an LSI, using tablets. Use just 1 tablets, # which is silly in any real-world use case, but makes this test simpler # and faster. - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int)") - await cql.run_async("CREATE INDEX my_idx ON test.test((pk),c)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)") + await cql.run_async(f"CREATE INDEX my_idx ON {ks}.test((pk),c)") - # Move the base tablet (there's just one) to node 0, and the view tablet - # (of the view backing the index) to node 1. In particular all view - # updates will then be remote: node 0 will send view updates to node 1. - await pin_the_only_tablet(manager, 'test', 'test', servers[0]) - await pin_the_only_tablet(manager, 'test', 'my_idx_index', servers[1]) + # Move the base tablet (there's just one) to node 0, and the view tablet + # (of the view backing the index) to node 1.
In particular all view + # updates will then be remote: node 0 will send view updates to node 1. + await pin_the_only_tablet(manager, ks, 'test', servers[0]) + await pin_the_only_tablet(manager, ks, 'my_idx_index', servers[1]) - # Add a fixed (0.5 second) delay before view updates, to increase the - # likehood that if the write didn't wait for the view update, we can try - # reading before the view update happened and fail the test. - await inject_error_on(manager, "delay_before_remote_view_update", servers); + # Add a fixed (0.5 second) delay before view updates, to increase the + # likelihood that if the write didn't wait for the view update, we can try + # reading before the view update happened and fail the test. + await inject_error_on(manager, "delay_before_remote_view_update", servers); - # Write to the base table (whose only replica is on node 0). - zzz = time.time() - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES (7, 42)") - # If synchronous update worked, this log message should say more - # than 0.5 seconds (the delay added by injection). If it didn't work, - # the time will be less than 0.5 seconds and the read is likely to fail. - logger.info(f"Insert took {time.time()-zzz}") - # Read using the index (whose only replica is on node 1, and delayed - # by the injection above). LSI should use synchronous view updates, - # so the data should be searchable through the local secondary index - # immediately after the previous INSERT returned. - assert [(7,42)] == list(await cql.run_async(f"SELECT * FROM test.test WHERE pk=7 AND c=42")) - - await cql.run_async("DROP KEYSPACE test") + # Write to the base table (whose only replica is on node 0). + zzz = time.time() + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES (7, 42)") + # If synchronous update worked, this log message should say more + # than 0.5 seconds (the delay added by injection). If it didn't work, + # the time will be less than 0.5 seconds and the read is likely to fail.
+ logger.info(f"Insert took {time.time()-zzz}") + # Read using the index (whose only replica is on node 1, and delayed + # by the injection above). LSI should use synchronous view updates, + # so the data should be searchable through the local secondary index + # immediately after the previous INSERT returned. + assert [(7,42)] == list(await cql.run_async(f"SELECT * FROM {ks}.test WHERE pk=7 AND c=42")) From 966cf82dae9aa2924eaec9c2b681ead6978479bb Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 25/56] topology_custom/test_mv_tablets_empty_ip: use new_test_keyspace Signed-off-by: Benny Halevy --- .../mv/tablets/test_mv_tablets_empty_ip.py | 67 ++++++++++--------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/test/topology_custom/mv/tablets/test_mv_tablets_empty_ip.py b/test/topology_custom/mv/tablets/test_mv_tablets_empty_ip.py index a3d00f4a6f..4c07764e85 100644 --- a/test/topology_custom/mv/tablets/test_mv_tablets_empty_ip.py +++ b/test/topology_custom/mv/tablets/test_mv_tablets_empty_ip.py @@ -14,6 +14,7 @@ from cassandra.cluster import ConnectionException, NoHostAvailable # type: igno from test.pylib.scylla_cluster import ReplaceConfig from test.pylib.manager_client import ManagerClient from test.topology.conftest import skip_mode +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) @@ -32,41 +33,41 @@ async def test_mv_tablets_empty_ip(manager: ManagerClient): servers = await manager.servers_add(4, config = cfg) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") - await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)") - await cql.run_async("CREATE materialized view ks.t_view AS select pk, v from ks.t where v is not null primary key (v, pk)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 
'replication_factor': 3}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.t (pk int primary key, v int)") + await cql.run_async(f"CREATE materialized view {ks}.t_view AS select pk, v from {ks}.t where v is not null primary key (v, pk)") - stop_event = asyncio.Event() - concurrency = 10 - async def do_writes(start_it) -> int: - iteration = start_it - while not stop_event.is_set(): - start_time = time.time() - try: - await cql.run_async(f"insert into ks.t (pk, v) values ({iteration}, {iteration+1})") - except NoHostAvailable as e: - for _, err in e.errors.items(): - # ConnectionException can be raised when the node is shutting down. - if not isinstance(err, ConnectionException): - logger.error(f"Write started {time.time() - start_time}s ago failed: {e}") - raise - except Exception as e: - logger.error(f"Write started {time.time() - start_time}s ago failed: {e}") - raise - iteration += concurrency - await asyncio.sleep(0.01) - return iteration + stop_event = asyncio.Event() + concurrency = 10 + async def do_writes(start_it) -> int: + iteration = start_it + while not stop_event.is_set(): + start_time = time.time() + try: + await cql.run_async(f"insert into {ks}.t (pk, v) values ({iteration}, {iteration+1})") + except NoHostAvailable as e: + for _, err in e.errors.items(): + # ConnectionException can be raised when the node is shutting down. 
+ if not isinstance(err, ConnectionException): + logger.error(f"Write started {time.time() - start_time}s ago failed: {e}") + raise + except Exception as e: + logger.error(f"Write started {time.time() - start_time}s ago failed: {e}") + raise + iteration += concurrency + await asyncio.sleep(0.01) + return iteration - logger.info("Starting to write") - tasks = [asyncio.create_task(do_writes(i)) for i in range(concurrency)] + logger.info("Starting to write") + tasks = [asyncio.create_task(do_writes(i)) for i in range(concurrency)] - logger.info("Stopping the last node") - await manager.server_stop_gracefully(servers[-1].server_id) - replace_cfg = ReplaceConfig(replaced_id = servers[-1].server_id, reuse_ip_addr = False, use_host_id = True) + logger.info("Stopping the last node") + await manager.server_stop_gracefully(servers[-1].server_id) + replace_cfg = ReplaceConfig(replaced_id = servers[-1].server_id, reuse_ip_addr = False, use_host_id = True) - logger.info("Replacing the last node") - await manager.server_add(replace_cfg=replace_cfg, config = cfg) + logger.info("Replacing the last node") + await manager.server_add(replace_cfg=replace_cfg, config = cfg) - logger.info("Stopping writes") - stop_event.set() - await asyncio.gather(*tasks) + logger.info("Stopping writes") + stop_event.set() + await asyncio.gather(*tasks) From 11005b10dbe52caa9f38ebc2e790b27f68d1ee7d Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 26/56] topology_custom/test_mv_tablets_replace: use new_test_keyspace Signed-off-by: Benny Halevy --- .../mv/tablets/test_mv_tablets_replace.py | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/test/topology_custom/mv/tablets/test_mv_tablets_replace.py b/test/topology_custom/mv/tablets/test_mv_tablets_replace.py index 8756491cb6..5f52671a08 100644 --- a/test/topology_custom/mv/tablets/test_mv_tablets_replace.py +++ b/test/topology_custom/mv/tablets/test_mv_tablets_replace.py 
@@ -19,6 +19,7 @@ import logging from test.topology.conftest import skip_mode from test.topology.util import get_topology_coordinator, find_server_by_host_id from test.topology_custom.mv.tablets.test_mv_tablets import get_tablet_replicas +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) @@ -35,58 +36,57 @@ async def test_tablet_mv_replica_pairing_during_replace(manager: ManagerClient): servers = await manager.servers_add(4) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}" - " AND tablets = {'initial': 1}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int)") - await cql.run_async("CREATE MATERIALIZED VIEW test.tv AS SELECT * FROM test.test WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk) WITH SYNCHRONOUS_UPDATES = TRUE") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int)") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.tv AS SELECT * FROM {ks}.test WHERE c IS NOT NULL AND pk IS NOT NULL PRIMARY KEY (c, pk) WITH SYNCHRONOUS_UPDATES = TRUE") - # Disable migrations concurrent with replace since we don't handle nodes going down during migration yet. - # See https://github.com/scylladb/scylladb/issues/16527 - await manager.api.disable_tablet_balancing(servers[0].ip_addr) + # Disable migrations concurrent with replace since we don't handle nodes going down during migration yet. 
+ # See https://github.com/scylladb/scylladb/issues/16527 + await manager.api.disable_tablet_balancing(servers[0].ip_addr) - base_replicas = await get_tablet_replicas(manager, servers[0], "test", "test", 0) - logger.info(f'test.test replicas: {base_replicas}') - view_replicas = await get_tablet_replicas(manager, servers[0], "test", "tv", 0) - logger.info(f'test.tv replicas: {view_replicas}') - server_to_replace = await find_server_by_host_id(manager, servers, HostID(str(view_replicas[0][0]))) - server_to_down = await find_server_by_host_id(manager, servers, HostID(str(base_replicas[0][0]))) + base_replicas = await get_tablet_replicas(manager, servers[0], ks, "test", 0) + logger.info(f'{ks}.test replicas: {base_replicas}') + view_replicas = await get_tablet_replicas(manager, servers[0], ks, "tv", 0) + logger.info(f'{ks}.tv replicas: {view_replicas}') + server_to_replace = await find_server_by_host_id(manager, servers, HostID(str(view_replicas[0][0]))) + server_to_down = await find_server_by_host_id(manager, servers, HostID(str(base_replicas[0][0]))) - logger.info('Downing a node to be replaced') - await manager.server_stop(server_to_replace.server_id) + logger.info('Downing a node to be replaced') + await manager.server_stop(server_to_replace.server_id) - logger.info('Blocking tablet rebuild') - coord = await get_topology_coordinator(manager) - coord_serv = await find_server_by_host_id(manager, servers, coord) - await manager.api.enable_injection(coord_serv.ip_addr, "tablet_transition_updates", one_shot=True) - coord_log = await manager.server_open_log(coord_serv.server_id) - coord_mark = await coord_log.mark() + logger.info('Blocking tablet rebuild') + coord = await get_topology_coordinator(manager) + coord_serv = await find_server_by_host_id(manager, servers, coord) + await manager.api.enable_injection(coord_serv.ip_addr, "tablet_transition_updates", one_shot=True) + coord_log = await manager.server_open_log(coord_serv.server_id) + coord_mark = await 
coord_log.mark() - logger.info('Replacing the node') - replace_cfg = ReplaceConfig(replaced_id = server_to_replace.server_id, reuse_ip_addr = False, use_host_id = True) - replace_task = asyncio.create_task(manager.server_add(replace_cfg)) + logger.info('Replacing the node') + replace_cfg = ReplaceConfig(replaced_id = server_to_replace.server_id, reuse_ip_addr = False, use_host_id = True) + replace_task = asyncio.create_task(manager.server_add(replace_cfg)) - await coord_log.wait_for('tablet_transition_updates: waiting', from_mark=coord_mark) + await coord_log.wait_for('tablet_transition_updates: waiting', from_mark=coord_mark) - if server_to_down.server_id != server_to_replace.server_id: - await manager.server_stop(server_to_down.server_id) + if server_to_down.server_id != server_to_replace.server_id: + await manager.server_stop(server_to_down.server_id) - # The update is supposed to go to the second replica only, since the other one is downed. - # If pairing would shift, the update to the view would be lost because the first replica - # is the one which is in the left state. - logger.info('Updating base table') - await cql.run_async(SimpleStatement("INSERT INTO test.test (pk, c) VALUES (3, 4)", consistency_level=ConsistencyLevel.ONE)) - logger.info('Querying the view') - assert [(4,3)] == list(await cql.run_async(SimpleStatement("SELECT * FROM test.tv WHERE c=4", consistency_level=ConsistencyLevel.ONE))) + # The update is supposed to go to the second replica only, since the other one is downed. + # If pairing would shift, the update to the view would be lost because the first replica + # is the one which is in the left state. 
+ logger.info('Updating base table') + await cql.run_async(SimpleStatement(f"INSERT INTO {ks}.test (pk, c) VALUES (3, 4)", consistency_level=ConsistencyLevel.ONE)) + logger.info('Querying the view') + assert [(4,3)] == list(await cql.run_async(SimpleStatement(f"SELECT * FROM {ks}.tv WHERE c=4", consistency_level=ConsistencyLevel.ONE))) - if server_to_down.server_id != server_to_replace.server_id: - await manager.server_start(server_to_down.server_id) + if server_to_down.server_id != server_to_replace.server_id: + await manager.server_start(server_to_down.server_id) - logger.info('Unblocking tablet rebuild') - if coord_serv.server_id != server_to_down.server_id: - await manager.api.message_injection(coord_serv.ip_addr, "tablet_transition_updates") + logger.info('Unblocking tablet rebuild') + if coord_serv.server_id != server_to_down.server_id: + await manager.api.message_injection(coord_serv.ip_addr, "tablet_transition_updates") - logger.info('Waiting for replace') - await replace_task + logger.info('Waiting for replace') + await replace_task - logger.info('Querying') - assert [(4,3)] == list(await cql.run_async("SELECT * FROM test.tv WHERE c=4")) + logger.info('Querying') + assert [(4,3)] == list(await cql.run_async(f"SELECT * FROM {ks}.tv WHERE c=4")) From ff9c8428df3039bd28ec3ce2202381ef196df46c Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 27/56] topology_custom/test_mv_topology_change: use new_test_keyspace Signed-off-by: Benny Halevy --- .../mv/test_mv_topology_change.py | 193 +++++++++--------- 1 file changed, 97 insertions(+), 96 deletions(-) diff --git a/test/topology_custom/mv/test_mv_topology_change.py b/test/topology_custom/mv/test_mv_topology_change.py index 20bac33889..8e11b3f9c0 100644 --- a/test/topology_custom/mv/test_mv_topology_change.py +++ b/test/topology_custom/mv/test_mv_topology_change.py @@ -16,6 +16,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.tablets import 
get_tablet_replica from test.topology.conftest import skip_mode from test.pylib.util import wait_for +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) @@ -36,51 +37,51 @@ async def test_mv_topology_change(manager: ManagerClient): servers = [await manager.server_add(config=cfg, timeout=60) for _ in range(3)] cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3};") - await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)") - await cql.run_async("CREATE materialized view ks.t_view AS select pk, v from ks.t where v is not null primary key (v, pk)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.t (pk int primary key, v int)") + await cql.run_async(f"CREATE materialized view {ks}.t_view AS select pk, v from {ks}.t where v is not null primary key (v, pk)") - stop_event = asyncio.Event() - concurrency = 10 - async def do_writes(start_it, repeat) -> int: - iteration = start_it - while not stop_event.is_set(): - start_time = time.time() - try: - await cql.run_async(f"insert into ks.t (pk, v) values ({iteration}, {iteration})") - except NoHostAvailable as e: - for _, err in e.errors.items(): - # ConnectionException can be raised when the node is shutting down. 
- if not isinstance(err, ConnectionException): - logger.error(f"Write started {time.time() - start_time}s ago failed: {e}") - raise - except Exception as e: - logger.error(f"Write started {time.time() - start_time}s ago failed: {e}") - raise - iteration += concurrency - if not repeat: - break - await asyncio.sleep(0.01) - return iteration + stop_event = asyncio.Event() + concurrency = 10 + async def do_writes(start_it, repeat) -> int: + iteration = start_it + while not stop_event.is_set(): + start_time = time.time() + try: + await cql.run_async(f"insert into {ks}.t (pk, v) values ({iteration}, {iteration})") + except NoHostAvailable as e: + for _, err in e.errors.items(): + # ConnectionException can be raised when the node is shutting down. + if not isinstance(err, ConnectionException): + logger.error(f"Write started {time.time() - start_time}s ago failed: {e}") + raise + except Exception as e: + logger.error(f"Write started {time.time() - start_time}s ago failed: {e}") + raise + iteration += concurrency + if not repeat: + break + await asyncio.sleep(0.01) + return iteration - # to hit the issue #18709 it's enough to start one batch of writes, the effective - # replication maps for base and view will change after the writes start but before they finish - tasks = [asyncio.create_task(do_writes(i, repeat=False)) for i in range(concurrency)] + # to hit the issue #18709 it's enough to start one batch of writes, the effective + # replication maps for base and view will change after the writes start but before they finish + tasks = [asyncio.create_task(do_writes(i, repeat=False)) for i in range(concurrency)] - server = await manager.server_add() + server = await manager.server_add() - await asyncio.gather(*tasks) + await asyncio.gather(*tasks) - [await manager.api.disable_injection(s.ip_addr, "delay_before_get_view_natural_endpoint") for s in servers] + [await manager.api.disable_injection(s.ip_addr, "delay_before_get_view_natural_endpoint") for s in servers] - # to hit 
the issue #17786 we need to run multiple batches of writes, so that some write is processed while the - # effective replication maps for base and view are different - tasks = [asyncio.create_task(do_writes(i, repeat=True)) for i in range(concurrency)] - await manager.decommission_node(server.server_id) + # to hit the issue #17786 we need to run multiple batches of writes, so that some write is processed while the + # effective replication maps for base and view are different + tasks = [asyncio.create_task(do_writes(i, repeat=True)) for i in range(concurrency)] + await manager.decommission_node(server.server_id) - stop_event.set() - await asyncio.gather(*tasks) + stop_event.set() + await asyncio.gather(*tasks) # Reproduces #19152 # Verify a pending replica is not doing unnecessary work of building and sending view updates. @@ -103,68 +104,68 @@ async def test_mv_update_on_pending_replica(manager: ManagerClient, intranode): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - await cql.run_async("CREATE MATERIALIZED VIEW test.mv1 AS SELECT * FROM test.test WHERE pk IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, pk);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv1 AS SELECT * FROM {ks}.test WHERE pk IS NOT NULL AND c IS NOT NULL PRIMARY KEY (c, pk);") - table_id = await manager.get_table_id('test', 'test') + table_id = await manager.get_table_id(ks, 'test') - servers.append(await manager.server_add(config=cfg, cmdline=cmd)) + servers.append(await 
manager.server_add(config=cfg, cmdline=cmd)) - key = 7 # Whatever - tablet_token = 0 # Doesn't matter since there is one tablet - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({key}, 0)") + key = 7 # Whatever + tablet_token = 0 # Doesn't matter since there is one tablet + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({key}, 0)") - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) - s0_host_id = await manager.get_host_id(servers[0].server_id) - s1_host_id = await manager.get_host_id(servers[1].server_id) - src_shard = replica[1] - dst_shard = 1-replica[1] - assert replica[0] == s0_host_id + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) + s0_host_id = await manager.get_host_id(servers[0].server_id) + s1_host_id = await manager.get_host_id(servers[1].server_id) + src_shard = replica[1] + dst_shard = 1-replica[1] + assert replica[0] == s0_host_id - if intranode: - dst_host = s0_host_id - dst_ip = servers[0].ip_addr - streaming_wait_injection = "intranode_migration_streaming_wait" - else: - dst_host = s1_host_id - dst_ip = servers[1].ip_addr - streaming_wait_injection = "stream_mutation_fragments" + if intranode: + dst_host = s0_host_id + dst_ip = servers[0].ip_addr + streaming_wait_injection = "intranode_migration_streaming_wait" + else: + dst_host = s1_host_id + dst_ip = servers[1].ip_addr + streaming_wait_injection = "stream_mutation_fragments" - await manager.api.enable_injection(dst_ip, streaming_wait_injection, one_shot=True) + await manager.api.enable_injection(dst_ip, streaming_wait_injection, one_shot=True) - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", s0_host_id, src_shard, dst_host, dst_shard, tablet_token)) + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", s0_host_id, src_shard, dst_host, dst_shard, tablet_token)) - async def tablet_is_streaming(): - res = 
await cql.run_async(f"SELECT stage FROM system.tablets WHERE table_id={table_id}") - stage = res[0].stage - return stage == 'streaming' or None + async def tablet_is_streaming(): + res = await cql.run_async(f"SELECT stage FROM system.tablets WHERE table_id={table_id}") + stage = res[0].stage + return stage == 'streaming' or None - await wait_for(tablet_is_streaming, time.time() + 60) + await wait_for(tablet_is_streaming, time.time() + 60) - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({key}, {1})") + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({key}, {1})") - # Release abandoned streaming - await manager.api.message_injection(dst_ip, streaming_wait_injection) + # Release abandoned streaming + await manager.api.message_injection(dst_ip, streaming_wait_injection) - logger.info("Waiting for migration to finish") - await migration_task - logger.info("Migration done") + logger.info("Waiting for migration to finish") + await migration_task + logger.info("Migration done") - def get_view_updates_on_wrong_node_count(server): - metrics = requests.get(f"http://{server.ip_addr}:9180/metrics").text - pattern = re.compile("^scylla_database_total_view_updates_on_wrong_node") - for metric in metrics.split('\n'): - if pattern.match(metric) is not None: - return int(float(metric.split()[1])) + def get_view_updates_on_wrong_node_count(server): + metrics = requests.get(f"http://{server.ip_addr}:9180/metrics").text + pattern = re.compile("^scylla_database_total_view_updates_on_wrong_node") + for metric in metrics.split('\n'): + if pattern.match(metric) is not None: + return int(float(metric.split()[1])) - assert all(map(lambda x: x is None or x == 0, [get_view_updates_on_wrong_node_count(server) for server in servers])) + assert all(map(lambda x: x is None or x == 0, [get_view_updates_on_wrong_node_count(server) for server in servers])) - res = await cql.run_async(f"SELECT c FROM test.test WHERE pk={key}") - assert [1] == [x.c for x in res] - res = await 
cql.run_async(f"SELECT c FROM test.mv1 WHERE pk={key} ALLOW FILTERING") - assert [1] == [x.c for x in res] + res = await cql.run_async(f"SELECT c FROM {ks}.test WHERE pk={key}") + assert [1] == [x.c for x in res] + res = await cql.run_async(f"SELECT c FROM {ks}.mv1 WHERE pk={key} ALLOW FILTERING") + assert [1] == [x.c for x in res] # Reproduces issue #19529 # Write to a table with MV while one node is stopped, and verify @@ -179,18 +180,18 @@ async def test_mv_write_to_dead_node(manager: ManagerClient): servers = await manager.servers_add(4) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") - await cql.run_async("CREATE TABLE ks.t (pk int primary key, v int)") - await cql.run_async("CREATE materialized view ks.t_view AS select pk, v from ks.t where v is not null primary key (v, pk)") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.t (pk int primary key, v int)") + await cql.run_async(f"CREATE materialized view {ks}.t_view AS select pk, v from {ks}.t where v is not null primary key (v, pk)") - await manager.server_stop_gracefully(servers[-1].server_id) + await manager.server_stop_gracefully(servers[-1].server_id) - # Do inserts. some should generate MV writes to the stopped node - for i in range(100): - await cql.run_async(f"insert into ks.t (pk, v) values ({i}, {i+1})") + # Do inserts. some should generate MV writes to the stopped node + for i in range(100): + await cql.run_async(f"insert into {ks}.t (pk, v) values ({i}, {i+1})") - # Remove the node to trigger a topology change. - # If the MV write is not completed, as in issue #19529, the topology change - # will be held for long time until the write timeouts. - # Otherwise, it is expected to complete in short time. 
- await manager.remove_node(servers[0].server_id, servers[-1].server_id, timeout=30) + # Remove the node to trigger a topology change. + # If the MV write is not completed, as in issue #19529, the topology change + # will be held for a long time until the write times out. + # Otherwise, it is expected to complete in a short time. + await manager.remove_node(servers[0].server_id, servers[-1].server_id, timeout=30) From 55b35eb21c31589476481e96b225697d074dd969 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 28/56] topology_custom/test_node_isolation: use create_new_test_keyspace new_test_keyspace is problematic here since the presence of the banned node can fail the automatic drop of the test keyspace due to NoHostAvailable (in debug mode for some reason) Signed-off-by: Benny Halevy --- test/topology_custom/test_node_isolation.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/topology_custom/test_node_isolation.py b/test/topology_custom/test_node_isolation.py index 16518b89a9..da7898ebde 100644 --- a/test/topology_custom/test_node_isolation.py +++ b/test/topology_custom/test_node_isolation.py @@ -14,6 +14,7 @@ from cassandra.policies import WhiteListRoundRobinPolicy # type: ignore from test.pylib.manager_client import ManagerClient from test.pylib.rest_client import read_barrier +from test.topology.util import create_new_test_keyspace logger = logging.getLogger(__name__) @@ -32,9 +33,8 @@ async def test_banned_node_cannot_communicate(manager: ManagerClient) -> None: # Use RF=2 keyspace and below CL=ALL so that performing an INSERT requires # communicating with another node.
- await cql.run_async("create keyspace ks with replication = " - "{'class': 'SimpleStrategy', 'replication_factor': 2}") - await cql.run_async("create table ks.t (pk int primary key)") + ks = await create_new_test_keyspace(cql, "with replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") + await cql.run_async(f"create table {ks}.t (pk int primary key)") # Pause one of the servers so other nodes mark it as dead and we can remove it. # We deliberately don't shut it down, but only pause it - we want to test @@ -55,7 +55,7 @@ async def test_banned_node_cannot_communicate(manager: ManagerClient) -> None: with manager.con_gen([srvs[2].ip_addr], manager.port, manager.use_ssl) as c: with c.connect() as s: logger.info(f"Connected, sending request") - q = SimpleStatement('insert into ks.t (pk) values (0)', consistency_level=ConsistencyLevel.ALL) + q = SimpleStatement(f'insert into {ks}.t (pk) values (0)', consistency_level=ConsistencyLevel.ALL) # Before introducing host banning, a removed node was able to participate # as if it was a normal node and, for example, could insert data into the cluster. # Now other nodes refuse to communicate so we'll get an exception. 
From 5759a97eb46f7f41b26da3ecf53e4cbc356d0245 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 29/56] topology_custom/test_node_shutdown_waits_for_pending_requests: use new_test_keyspace Signed-off-by: Benny Halevy --- ...ode_shutdown_waits_for_pending_requests.py | 75 ++++++++++--------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/test/topology_custom/test_node_shutdown_waits_for_pending_requests.py b/test/topology_custom/test_node_shutdown_waits_for_pending_requests.py index e2831b2a8d..e8148ca4be 100644 --- a/test/topology_custom/test_node_shutdown_waits_for_pending_requests.py +++ b/test/topology_custom/test_node_shutdown_waits_for_pending_requests.py @@ -10,6 +10,7 @@ from cassandra.cluster import ConsistencyLevel # type: ignore from cassandra.protocol import ReadTimeout # type: ignore from test.pylib.util import wait_for_cql_and_get_hosts from test.topology.conftest import skip_mode +from test.topology.util import new_test_keyspace, reconnect_driver logger = logging.getLogger(__name__) @@ -28,48 +29,52 @@ async def test_node_shutdown_waits_for_pending_requests(manager: ManagerClient) h0 = (await wait_for_cql_and_get_hosts(cql, [servers[0]], time.time() + 60))[0] logger.info('create keyspace and table') - await cql.run_async("create keyspace ks with replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") - await cql.run_async('create table ks.test_table (pk int primary key)') + async with new_test_keyspace(manager, "with replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") as ks: + await cql.run_async(f'create table {ks}.test_table (pk int primary key)') - logger.info('insert test row into the table') - await cql.run_async('insert into ks.test_table(pk) values (42)') + logger.info('insert test row into the table') + await cql.run_async(f'insert into {ks}.test_table(pk) values (42)') - logger.info(f'make storage_proxy::handle_read error injection on the node 
{servers[1]}') - injection_handler = await inject_error_one_shot( - manager.api, servers[1].ip_addr, 'storage_proxy::handle_read', parameters={'cf_name': 'test_table'}) + logger.info(f'make storage_proxy::handle_read error injection on the node {servers[1]}') + injection_handler = await inject_error_one_shot( + manager.api, servers[1].ip_addr, 'storage_proxy::handle_read', parameters={'cf_name': 'test_table'}) - logger.info(f'start ConsistencyLevel.ALL read request on {servers[0]} as coordinator') - read_future = cql.run_async(SimpleStatement('select pk from ks.test_table using timeout 1000ms', - consistency_level=ConsistencyLevel.ALL), - host=h0) + logger.info(f'start ConsistencyLevel.ALL read request on {servers[0]} as coordinator') + read_future = cql.run_async(SimpleStatement(f'select pk from {ks}.test_table using timeout 1000ms', + consistency_level=ConsistencyLevel.ALL), + host=h0) - logger.info(f'wait until the read request hit storage_proxy::handle_read on the node {servers[1]}') - log_file2 = await manager.server_open_log(servers[1].server_id) - await log_file2.wait_for("storage_proxy::handle_read injection hit", timeout=60) + logger.info(f'wait until the read request hit storage_proxy::handle_read on the node {servers[1]}') + log_file2 = await manager.server_open_log(servers[1].server_id) + await log_file2.wait_for("storage_proxy::handle_read injection hit", timeout=60) - logger.info(f'trigger shutdown of the node {servers[1]}') - stop_future = asyncio.create_task(manager.server_stop_gracefully(servers[1].server_id)) + logger.info(f'trigger shutdown of the node {servers[1]}') + stop_future = asyncio.create_task(manager.server_stop_gracefully(servers[1].server_id)) - logger.info(f'wait until node shutdown process reaches the storage proxy verbs') - await log_file2.wait_for("Shutting down storage proxy RPC verbs", timeout=60) + logger.info(f'wait until node shutdown process reaches the storage proxy verbs') + await log_file2.wait_for("Shutting down storage 
proxy RPC verbs", timeout=60) - logger.info(f'release the read request') - await injection_handler.message() + logger.info(f'release the read request') + await injection_handler.message() - # We get a timeout instead of the actual response here. - # This seems to be a flaw in the current Scylla code — when a node - # is shutting down, the drain_on_shutdown method if storage_service is called before - # storage_proxy::stop_remote. The drain_on_shutdown calls messaging_service::shutdown, - # which means that although storage_proxy::stop_remote waits for current requests to complete, - # client sockets are already closed so the responses can't be delivered to the clients. - # We get a timeout and not a failure because digest_read_resolver::on_error has - # a magic special case for error_kind::DISCONNECT: - # "wait for timeout in hope that the client will issue speculative read" - logger.info(f'wait for read request') - with pytest.raises(ReadTimeout): - await read_future + # We get a timeout instead of the actual response here. + # This seems to be a flaw in the current Scylla code — when a node + # is shutting down, the drain_on_shutdown method of storage_service is called before + # storage_proxy::stop_remote. The drain_on_shutdown calls messaging_service::shutdown, + # which means that although storage_proxy::stop_remote waits for current requests to complete, + # client sockets are already closed so the responses can't be delivered to the clients.
+ # We get a timeout and not a failure because digest_read_resolver::on_error has + # a magic special case for error_kind::DISCONNECT: + # "wait for timeout in hope that the client will issue speculative read" + logger.info(f'wait for read request') + with pytest.raises(ReadTimeout): + await read_future - logger.info(f'wait for successful node {servers[1]} shutdown') - await stop_future + logger.info(f'wait for successful node {servers[1]} shutdown') + await stop_future - logger.info('done') + logger.info('done') + + # For dropping the keyspace + await manager.server_start(servers[1].server_id) + await reconnect_driver(manager) From c68d2a471c872906fe27ac189e0a27226887a683 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 30/56] topology_custom/test_not_enough_token_owners: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_not_enough_token_owners.py | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/test/topology_custom/test_not_enough_token_owners.py b/test/topology_custom/test_not_enough_token_owners.py index c3e3493ab3..a00a4cab4f 100644 --- a/test/topology_custom/test_not_enough_token_owners.py +++ b/test/topology_custom/test_not_enough_token_owners.py @@ -9,6 +9,7 @@ import time from test.pylib.manager_client import ManagerClient from test.pylib.util import unique_name, wait_for_cql_and_get_hosts +from test.topology.util import new_test_keyspace @pytest.mark.asyncio @@ -52,20 +53,18 @@ async def test_not_enough_token_owners(manager: ManagerClient): await wait_for_cql_and_get_hosts(cql, [server_a], time.time() + 60) - ks_name = unique_name() - await cql.run_async(f"""CREATE KEYSPACE {ks_name} WITH replication = {{'class': 'NetworkTopologyStrategy', - 'replication_factor': 2}} AND tablets = {{ 'enabled': true }}""") - await cql.run_async(f'CREATE TABLE {ks_name}.tbl (pk int PRIMARY KEY, v int)') - await cql.run_async(f'INSERT INTO {ks_name}.tbl (pk, v) VALUES (1, 
1)') + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = { 'enabled': true }") as ks_name: + await cql.run_async(f'CREATE TABLE {ks_name}.tbl (pk int PRIMARY KEY, v int)') + await cql.run_async(f'INSERT INTO {ks_name}.tbl (pk, v) VALUES (1, 1)') - # FIXME: Once scylladb/scylladb#16195 is fixed, we will have to replace the expected error message. - # A similar change may be needed for remove_node below. - logging.info(f'Trying to decommission {server_a} - one of the two token owners') - await manager.decommission_node(server_a.server_id, expected_error='Unable to find new replica for tablet') + # FIXME: Once scylladb/scylladb#16195 is fixed, we will have to replace the expected error message. + # A similar change may be needed for remove_node below. + logging.info(f'Trying to decommission {server_a} - one of the two token owners') + await manager.decommission_node(server_a.server_id, expected_error='Unable to find new replica for tablet') - logging.info(f'Stopping {server_a}') - await manager.server_stop_gracefully(server_a.server_id) + logging.info(f'Stopping {server_a}') + await manager.server_stop_gracefully(server_a.server_id) - logging.info(f'Trying to remove {server_a}, one of the two token owners, by {server_b}') - await manager.remove_node(server_b.server_id, server_a.server_id, - expected_error='Unable to find new replica for tablet') + logging.info(f'Trying to remove {server_a}, one of the two token owners, by {server_b}') + await manager.remove_node(server_b.server_id, server_a.server_id, + expected_error='Unable to find new replica for tablet') From e05372afa45972f482f4bb15c95a3200862ea63a Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 31/56] topology_custom/test_query_rebounce: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_query_rebounce.py | 25 ++++++++++----------- 1 file changed, 12 
insertions(+), 13 deletions(-) diff --git a/test/topology_custom/test_query_rebounce.py b/test/topology_custom/test_query_rebounce.py index efc4046449..6425fd2a35 100644 --- a/test/topology_custom/test_query_rebounce.py +++ b/test/topology_custom/test_query_rebounce.py @@ -12,6 +12,7 @@ import time from test.pylib.internal_types import IPAddress from test.pylib.manager_client import ManagerClient from test.pylib.rest_client import inject_error +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) @@ -44,22 +45,20 @@ async def test_query_rebounce(manager: ManagerClient): servers = await manager.running_servers() cql = manager.get_cql() - await cql.run_async("create keyspace ks with replication = {'class': 'SimpleStrategy', 'replication_factor': 1}" - "and tablets = {'enabled': false};") + async with new_test_keyspace(manager, "with replication = {'class': 'SimpleStrategy', 'replication_factor': 1} and tablets = {'enabled': false};") as ks: + await cql.run_async(f"create table {ks}.lwt (a int, b int, primary key(a));") - await cql.run_async("create table ks.lwt (a int, b int, primary key(a));") + await cql.run_async(f"insert into {ks}.lwt (a,b ) values (1, 10);") + await cql.run_async(f"insert into {ks}.lwt (a,b ) values (2, 20);") - await cql.run_async("insert into ks.lwt (a,b ) values (1, 10);") - await cql.run_async("insert into ks.lwt (a,b ) values (2, 20);") + errs = [manager.api.enable_injection(s.ip_addr, "forced_bounce_to_shard_counter", one_shot=False, + parameters={'value': '2'}) + for s in servers] - errs = [manager.api.enable_injection(s.ip_addr, "forced_bounce_to_shard_counter", one_shot=False, - parameters={'value': '2'}) - for s in servers] + await asyncio.gather(*errs) - await asyncio.gather(*errs) + await cql.run_async(f"update {ks}.lwt set b = 11 where a = 1 if b = 10;") - await cql.run_async("update ks.lwt set b = 11 where a = 1 if b = 10;") + rows = await cql.run_async(f"select b from {ks}.lwt where a = 1;") - rows 
= await cql.run_async("select b from ks.lwt where a = 1;") - - assert rows[0].b == 11 + assert rows[0].b == 11 From 380c5e5ac8cc1dcd0d60b077c27218de461404c2 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 32/56] topology_custom/test_raft_fix_broken_snapshot: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_raft_fix_broken_snapshot.py | 49 +++++++++---------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/test/topology_custom/test_raft_fix_broken_snapshot.py b/test/topology_custom/test_raft_fix_broken_snapshot.py index 4d3b0d69db..5281391e76 100644 --- a/test/topology_custom/test_raft_fix_broken_snapshot.py +++ b/test/topology_custom/test_raft_fix_broken_snapshot.py @@ -12,7 +12,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.util import wait_for_cql_and_get_hosts from test.topology.util import reconnect_driver, enter_recovery_state, \ delete_raft_data_and_upgrade_state, wait_until_upgrade_finishes, \ - wait_for_token_ring_and_group0_consistency + wait_for_token_ring_and_group0_consistency, new_test_keyspace from test.topology.conftest import skip_mode @@ -49,33 +49,32 @@ async def test_raft_fix_broken_snapshot(manager: ManagerClient): await wait_for_cql_and_get_hosts(cql, [srv], time.time() + 60) logger.info(f"Creating keyspace") - await cql.run_async( - "create keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}") - await cql.run_async("create table ks.t (pk int primary key)") + async with new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}") as ks: + await cql.run_async(f"create table {ks}.t (pk int primary key)") - logger.info(f"Leaving recovery state") - await delete_raft_data_and_upgrade_state(cql, h) - await manager.server_stop_gracefully(srv.server_id) - await manager.server_start(srv.server_id) - cql = await reconnect_driver(manager) - await 
wait_for_cql_and_get_hosts(cql, [srv], time.time() + 60) + logger.info(f"Leaving recovery state") + await delete_raft_data_and_upgrade_state(cql, h) + await manager.server_stop_gracefully(srv.server_id) + await manager.server_start(srv.server_id) + cql = await reconnect_driver(manager) + await wait_for_cql_and_get_hosts(cql, [srv], time.time() + 60) - logger.info(f"Waiting for group 0 upgrade to finish") - await wait_until_upgrade_finishes(cql, h, time.time() + 60) + logger.info(f"Waiting for group 0 upgrade to finish") + await wait_until_upgrade_finishes(cql, h, time.time() + 60) - # The Raft log will only contain this change, - # older schema changes can only be obtained through snapshot transfer. - await cql.run_async("create table ks.t2 (pk int primary key)") + # The Raft log will only contain this change, + # older schema changes can only be obtained through snapshot transfer. + await cql.run_async(f"create table {ks}.t2 (pk int primary key)") - # Restarting the server should trigger snapshot creation. - await manager.server_restart(srv.server_id) - cql = await reconnect_driver(manager) - await wait_for_cql_and_get_hosts(cql, [srv], time.time() + 60) + # Restarting the server should trigger snapshot creation. + await manager.server_restart(srv.server_id) + cql = await reconnect_driver(manager) + await wait_for_cql_and_get_hosts(cql, [srv], time.time() + 60) - await manager.server_add(config=cfg) - await manager.server_sees_others(srv.server_id, 1) - await wait_for_token_ring_and_group0_consistency(manager, time.time() + 60) + await manager.server_add(config=cfg) + await manager.server_sees_others(srv.server_id, 1) + await wait_for_token_ring_and_group0_consistency(manager, time.time() + 60) - # This would fail if snapshot creation wasn't triggered, - # second node reporting 'Failed to apply mutation ... 
no_such_column_family` - await cql.run_async("insert into ks.t (pk) values (0)", host=h) + # This would fail if snapshot creation wasn't triggered, + # second node reporting 'Failed to apply mutation ... no_such_column_family` + await cql.run_async(f"insert into {ks}.t (pk) values (0)", host=h) From 3f3549126455883b66d7d155c8bbf764e429eac2 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 33/56] topology_custom/test_raft_no_quorum: use create_new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_raft_no_quorum.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/topology_custom/test_raft_no_quorum.py b/test/topology_custom/test_raft_no_quorum.py index 1294d71fd9..4b98cc31f8 100644 --- a/test/topology_custom/test_raft_no_quorum.py +++ b/test/topology_custom/test_raft_no_quorum.py @@ -10,6 +10,7 @@ import asyncio from test.pylib.manager_client import ManagerClient from test.topology.conftest import skip_mode from test.pylib.rest_client import inject_error_one_shot, InjectionHandler +from test.topology.util import create_new_test_keyspace logger = logging.getLogger(__name__) @@ -189,17 +190,16 @@ async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: in servers += [await manager.server_add()] logger.info('create keyspace and table') - await manager.get_cql().run_async("create keyspace ks " - "with replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") - await manager.get_cql().run_async('create table ks.test_table (pk int primary key)') + ks = await create_new_test_keyspace(manager.get_cql(), "with replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") + await manager.get_cql().run_async(f'create table {ks}.test_table (pk int primary key)') logger.info("stopping the second node") await manager.server_stop_gracefully(servers[1].server_id) logger.info("attempting removenode for the second node") await 
manager.remove_node(servers[0].server_id, servers[1].server_id, - expected_error="raft operation [read_barrier] timed out, there is no raft quorum", - timeout=60) + expected_error="raft operation [read_barrier] timed out, there is no raft quorum", + timeout=60) logger.info("attempting decommission_node for the first node") await manager.decommission_node(servers[0].server_id, @@ -208,11 +208,11 @@ async def test_cannot_run_operations(manager: ManagerClient, raft_op_timeout: in logger.info("attempting rebuild_node for the first node") await manager.rebuild_node(servers[0].server_id, - expected_error="raft operation [read_barrier] timed out, there is no raft quorum", - timeout=60) + expected_error="raft operation [read_barrier] timed out, there is no raft quorum", + timeout=60) with pytest.raises(Exception, match="raft operation \[read_barrier\] timed out, " "there is no raft quorum, total voters count 2, alive voters count 1"): - await manager.get_cql().run_async('drop table ks.test_table', timeout=60) + await manager.get_cql().run_async(f'drop table {ks}.test_table', timeout=60) logger.info("done") From e72a9d3faa1c57415661e86fa6aa8e36cedbc2b0 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 34/56] topology_custom/test_raft_snapshot_truncation: use create_new_test_keyspace Using the new_test_keyspace fixture is awkward for this test as it is written to explicitly drop the created keyspaces at certain points. Therefore, just use create_new_test_keyspace to standardize the creation procedure. 
Signed-off-by: Benny Halevy --- .../test_raft_snapshot_truncation.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/topology_custom/test_raft_snapshot_truncation.py b/test/topology_custom/test_raft_snapshot_truncation.py index dc7624860f..cd8bc88f8c 100644 --- a/test/topology_custom/test_raft_snapshot_truncation.py +++ b/test/topology_custom/test_raft_snapshot_truncation.py @@ -11,7 +11,7 @@ import logging from test.pylib.manager_client import ManagerClient from test.pylib.util import wait_for, wait_for_cql_and_get_hosts -from test.topology.util import reconnect_driver, trigger_snapshot, get_topology_coordinator, get_raft_log_size, get_raft_snap_id +from test.topology.util import reconnect_driver, trigger_snapshot, get_topology_coordinator, get_raft_log_size, get_raft_snap_id, create_new_test_keyspace from test.pylib.rest_client import inject_error_one_shot logger = logging.getLogger(__name__) @@ -49,7 +49,7 @@ async def test_raft_snapshot_truncation(manager: ManagerClient): logger.info(f"Log size on {s1}: {log_size}") assert (log_size > 0) - await cql.run_async("create keyspace ks with replication = {'class': 'SimpleStrategy', 'replication_factor': 1}") + ks = await create_new_test_keyspace(cql, "with replication = {'class': 'SimpleStrategy', 'replication_factor': 1}") log_size = await get_raft_log_size(cql, h1) logger.info(f"After add keyspace Log size on {s1}: {log_size}") @@ -62,7 +62,7 @@ async def test_raft_snapshot_truncation(manager: ManagerClient): await asyncio.gather(*errs) # Change schema - trigger log truncation. - await cql.run_async("drop keyspace ks") + await cql.run_async(f"drop keyspace {ks}") log_size = await get_raft_log_size(cql, h1) logger.info(f"After drop keyspace Log size on {s1}: {log_size}") @@ -82,12 +82,13 @@ async def test_raft_snapshot_truncation(manager: ManagerClient): original_snap_id = await get_raft_snap_id(cql, h1) # Create 3 keyspaces. 
+ keyspaces = [] for i in range(3): - await cql.run_async(f"create keyspace ks{i} with replication = {{'class': 'SimpleStrategy', 'replication_factor': 1}}") + keyspaces.append(await create_new_test_keyspace(cql, "with replication = {'class': 'SimpleStrategy', 'replication_factor': 1}")) # Drop 2 keyspaces. for i in range(2): - await cql.run_async(f"drop keyspace ks{i}") + await cql.run_async(f"drop keyspace {keyspaces[i]}") log_size = await get_raft_log_size(cql, h1) logger.info(f"After add/drop keyspace Log size on {s1}: {log_size}.") @@ -99,7 +100,7 @@ async def test_raft_snapshot_truncation(manager: ManagerClient): await asyncio.gather(*errs) # Change schema by dropping the last keyspace, that will trigger log truncation. - await cql.run_async("drop keyspace ks2") + await cql.run_async(f"drop keyspace {keyspaces[2]}") new_snap_id = await get_raft_snap_id(cql, h1) From 47326d01b795a566b3422730f5f8c315a748e8fa Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 35/56] topology_custom/test_reversed_queries_during_simulated_upgrade_process: use new_test_keyspace Signed-off-by: Benny Halevy --- ...ueries_during_simulated_upgrade_process.py | 59 ++++++++++--------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/test/topology_custom/test_reversed_queries_during_simulated_upgrade_process.py b/test/topology_custom/test_reversed_queries_during_simulated_upgrade_process.py index c726246284..8423803702 100644 --- a/test/topology_custom/test_reversed_queries_during_simulated_upgrade_process.py +++ b/test/topology_custom/test_reversed_queries_during_simulated_upgrade_process.py @@ -8,6 +8,7 @@ from itertools import zip_longest from cassandra.query import SimpleStatement, ConsistencyLevel from test.pylib.manager_client import ManagerClient from test.topology.conftest import skip_mode +from test.topology.util import new_test_keyspace def verify_data(response, expected_data): @@ -30,39 +31,39 @@ async def 
test_reversed_queries_during_upgrade(manager: ManagerClient) -> None: cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}") - await cql.run_async("CREATE TABLE ks.test (pk int, ck1 int, ck2 int, PRIMARY KEY (pk, ck1, ck2));") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int, ck1 int, ck2 int, PRIMARY KEY (pk, ck1, ck2));") - await asyncio.gather(*[cql.run_async(f"INSERT INTO ks.test (pk, ck1, ck2) VALUES ({k % 10}, {k % 3}, {k});") for k in range(100)]) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, ck1, ck2) VALUES ({k % 10}, {k % 3}, {k});") for k in range(100)]) - native_reverse_format_config = [] - legacy_reverse_format_config = [ - {"name": "suppress_features", "value": ["NATIVE_REVERSE_QUERIES"]} - ] + native_reverse_format_config = [] + legacy_reverse_format_config = [ + {"name": "suppress_features", "value": ["NATIVE_REVERSE_QUERIES"]} + ] - queries = [ - (SimpleStatement("SELECT * FROM ks.test WHERE pk = 6 ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", consistency_level=ConsistencyLevel.ALL), - ((6, 2, 86), (6, 2, 56), (6, 2, 26), (6, 1, 76), (6, 1, 46), (6, 1, 16), (6, 0, 96), (6, 0, 66), (6, 0, 36), (6, 0, 6))), - (SimpleStatement("SELECT * FROM ks.test WHERE pk = 6 AND ck1 < 2 ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", consistency_level=ConsistencyLevel.ALL), - ((6, 1, 76), (6, 1, 46), (6, 1, 16), (6, 0, 96), (6, 0, 66), (6, 0, 36), (6, 0, 6))), - (SimpleStatement("SELECT * FROM ks.test WHERE pk = 6 AND ck1 = 0 AND ck2 > 10 AND ck2 < 80 ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", consistency_level=ConsistencyLevel.ALL), - ((6, 0, 66), (6, 0, 36))), - (SimpleStatement("SELECT * FROM ks.test WHERE pk = 6 AND (ck1, ck2) >= (1, 46) ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", 
consistency_level=ConsistencyLevel.ALL), - ((6, 2, 86), (6, 2, 56), (6, 2, 26), (6, 1, 76), (6, 1, 46))), - (SimpleStatement("SELECT * FROM ks.test WHERE pk IN (5, 6) AND (ck1, ck2) >= (1, 55) ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", consistency_level=ConsistencyLevel.ALL, fetch_size=0), - ((5, 2, 95), (6, 2, 86), (5, 2, 65), (6, 2, 56), (5, 2, 35), (6, 2, 26), (5, 2, 5), (5, 1, 85), (6, 1, 76), (5, 1, 55))), - ] + queries = [ + (SimpleStatement(f"SELECT * FROM {ks}.test WHERE pk = 6 ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", consistency_level=ConsistencyLevel.ALL), + ((6, 2, 86), (6, 2, 56), (6, 2, 26), (6, 1, 76), (6, 1, 46), (6, 1, 16), (6, 0, 96), (6, 0, 66), (6, 0, 36), (6, 0, 6))), + (SimpleStatement(f"SELECT * FROM {ks}.test WHERE pk = 6 AND ck1 < 2 ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", consistency_level=ConsistencyLevel.ALL), + ((6, 1, 76), (6, 1, 46), (6, 1, 16), (6, 0, 96), (6, 0, 66), (6, 0, 36), (6, 0, 6))), + (SimpleStatement(f"SELECT * FROM {ks}.test WHERE pk = 6 AND ck1 = 0 AND ck2 > 10 AND ck2 < 80 ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", consistency_level=ConsistencyLevel.ALL), + ((6, 0, 66), (6, 0, 36))), + (SimpleStatement(f"SELECT * FROM {ks}.test WHERE pk = 6 AND (ck1, ck2) >= (1, 46) ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", consistency_level=ConsistencyLevel.ALL), + ((6, 2, 86), (6, 2, 56), (6, 2, 26), (6, 1, 76), (6, 1, 46))), + (SimpleStatement(f"SELECT * FROM {ks}.test WHERE pk IN (5, 6) AND (ck1, ck2) >= (1, 55) ORDER BY ck1 DESC, ck2 DESC BYPASS CACHE;", consistency_level=ConsistencyLevel.ALL, fetch_size=0), + ((5, 2, 95), (6, 2, 86), (5, 2, 65), (6, 2, 56), (5, 2, 35), (6, 2, 26), (5, 2, 5), (5, 1, 85), (6, 1, 76), (5, 1, 55))), + ] - for config in [legacy_reverse_format_config, native_reverse_format_config]: - await manager.server_stop_gracefully(node1.server_id) - await manager.server_update_config( - node1.server_id, "error_injections_at_startup", config - ) - await manager.server_start(node1.server_id) - - for query, 
expected_data in queries: + for config in [legacy_reverse_format_config, native_reverse_format_config]: await manager.server_stop_gracefully(node1.server_id) - await manager.server_wipe_sstables(node1.server_id, "ks", "test") + await manager.server_update_config( + node1.server_id, "error_injections_at_startup", config + ) await manager.server_start(node1.server_id) - verify_data(cql.execute(query), expected_data) + for query, expected_data in queries: + await manager.server_stop_gracefully(node1.server_id) + await manager.server_wipe_sstables(node1.server_id, ks, "test") + await manager.server_start(node1.server_id) + + verify_data(cql.execute(query), expected_data) From 72bc4016e75063e04d68c53cff501eb4b2f19ca8 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 36/56] topology_custom/test_rpc_compression: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_rpc_compression.py | 210 +++++++++---------- 1 file changed, 104 insertions(+), 106 deletions(-) diff --git a/test/topology_custom/test_rpc_compression.py b/test/topology_custom/test_rpc_compression.py index 37234a28a3..39ef27b079 100644 --- a/test/topology_custom/test_rpc_compression.py +++ b/test/topology_custom/test_rpc_compression.py @@ -10,6 +10,8 @@ from test.pylib.internal_types import ServerInfo from test.pylib.rest_client import ScyllaMetrics from test.pylib.util import wait_for_cql_and_get_hosts, unique_name from test.pylib.manager_client import ManagerClient +from test.topology.util import new_test_keyspace + import pytest import asyncio import time @@ -64,32 +66,31 @@ async def test_basic(manager: ManagerClient) -> None: cql = manager.get_cql() replication_factor = 2 - ks = unique_name() - await cql.run_async(f"create keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {replication_factor}}}") - await cql.run_async(f"create table {ks}.cf (pk int, v blob, primary key (pk))") - 
write_stmt = cql.prepare(f"update {ks}.cf set v = ? where pk = ?") - write_stmt.consistency_level = ConsistencyLevel.ALL + async with new_test_keyspace(manager, f"with replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {replication_factor}}}") as ks: + await cql.run_async(f"create table {ks}.cf (pk int, v blob, primary key (pk))") + write_stmt = cql.prepare(f"update {ks}.cf set v = ? where pk = ?") + write_stmt.consistency_level = ConsistencyLevel.ALL - # 128 kiB message, should give compression ratio of ~0.5 for lz4 and ~0.25 for zstd. - message = b''.join(bytes(random.choices(range(16), k=1024)) * 2 for _ in range(64)) + # 128 kiB message, should give compression ratio of ~0.5 for lz4 and ~0.25 for zstd. + message = b''.join(bytes(random.choices(range(16), k=1024)) * 2 for _ in range(64)) - async def test_algo(algo: str, expected_ratio: float): - n_messages = 100 - metrics_before = await get_metrics(manager, servers) - await asyncio.gather(*[cql.run_async(write_stmt, parameters=[message, pk]) for pk in range(n_messages)]) - metrics_after = await get_metrics(manager, servers) + async def test_algo(algo: str, expected_ratio: float): + n_messages = 100 + metrics_before = await get_metrics(manager, servers) + await asyncio.gather(*[cql.run_async(write_stmt, parameters=[message, pk]) for pk in range(n_messages)]) + metrics_after = await get_metrics(manager, servers) - volume = n_messages * len(message) * (replication_factor - 1) - uncompressed = uncompressed_sent(metrics_after, algo) - uncompressed_sent(metrics_before, algo) - compressed = compressed_sent(metrics_after, algo) - compressed_sent(metrics_before, algo) - assert approximately_equal(uncompressed, volume, 0.9) - assert approximately_equal(compressed, expected_ratio * volume, 0.9) + volume = n_messages * len(message) * (replication_factor - 1) + uncompressed = uncompressed_sent(metrics_after, algo) - uncompressed_sent(metrics_before, algo) + compressed = compressed_sent(metrics_after, 
algo) - compressed_sent(metrics_before, algo) + assert approximately_equal(uncompressed, volume, 0.9) + assert approximately_equal(compressed, expected_ratio * volume, 0.9) - await with_retries(functools.partial(test_algo, "lz4", 0.5), timeout=600) + await with_retries(functools.partial(test_algo, "lz4", 0.5), timeout=600) - await live_update_config(manager, servers, "internode_compression_zstd_max_cpu_fraction", "1.0") + await live_update_config(manager, servers, "internode_compression_zstd_max_cpu_fraction", "1.0") - await with_retries(functools.partial(test_algo, "zstd", 0.25), timeout=600) + await with_retries(functools.partial(test_algo, "zstd", 0.25), timeout=600) @pytest.mark.asyncio async def test_dict_training(manager: ManagerClient) -> None: @@ -112,47 +113,46 @@ async def test_dict_training(manager: ManagerClient) -> None: cql = manager.get_cql() replication_factor = 2 - ks = unique_name() - await cql.run_async(f"create keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {replication_factor}}}") - await cql.run_async(f"create table {ks}.cf (pk int, v blob, primary key (pk))") - write_stmt = cql.prepare(f"update {ks}.cf set v = ? where pk = ?") - dict_stmt = cql.prepare(f"select data from system.dicts") - write_stmt.consistency_level = ConsistencyLevel.ALL + async with new_test_keyspace(manager, f"with replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {replication_factor}}}") as ks: + await cql.run_async(f"create table {ks}.cf (pk int, v blob, primary key (pk))") + write_stmt = cql.prepare(f"update {ks}.cf set v = ? 
where pk = ?") + dict_stmt = cql.prepare(f"select data from system.dicts") + write_stmt.consistency_level = ConsistencyLevel.ALL - msg_size = 16*1024 - msg_train = random.randbytes(msg_size) - msg_notrain = random.randbytes(msg_size) + msg_size = 16*1024 + msg_train = random.randbytes(msg_size) + msg_notrain = random.randbytes(msg_size) - async def write_messages(m: bytes): - n_messages = 100 - assert n_messages * len(m) > training_min_bytes - await asyncio.gather(*[cql.run_async(write_stmt, parameters=[m, pk]) for pk in range(n_messages)]) + async def write_messages(m: bytes): + n_messages = 100 + assert n_messages * len(m) > training_min_bytes + await asyncio.gather(*[cql.run_async(write_stmt, parameters=[m, pk]) for pk in range(n_messages)]) - async def set_dict_training_when(x: str): - await live_update_config(manager, servers, "rpc_dict_training_when", x) + async def set_dict_training_when(x: str): + await live_update_config(manager, servers, "rpc_dict_training_when", x) - await write_messages(msg_notrain) - await set_dict_training_when("when_leader") - await write_messages(msg_train) - await set_dict_training_when("never") - await write_messages(msg_notrain) - await set_dict_training_when("when_leader") - await write_messages(msg_train) + await write_messages(msg_notrain) + await set_dict_training_when("when_leader") + await write_messages(msg_train) + await set_dict_training_when("never") + await write_messages(msg_notrain) + await set_dict_training_when("when_leader") + await write_messages(msg_train) - ngram_size = 8 - def make_ngrams(x: bytes) -> list[bytes]: - return [x[i:i+ngram_size] for i in range(len(x) - ngram_size)] - msg_train_ngrams = set(make_ngrams(msg_train)) - msg_notrain_ngrams = set(make_ngrams(msg_notrain)) + ngram_size = 8 + def make_ngrams(x: bytes) -> list[bytes]: + return [x[i:i+ngram_size] for i in range(len(x) - ngram_size)] + msg_train_ngrams = set(make_ngrams(msg_train)) + msg_notrain_ngrams = set(make_ngrams(msg_notrain)) - async 
def test_once() -> None: - results = await cql.run_async(dict_stmt) - dicts = [bytes(x[0]) for x in results] - dict_ngrams = set(make_ngrams(bytes().join(dicts))) - assert len(msg_train_ngrams & dict_ngrams) > 0.5 * len(msg_train_ngrams) - assert len(msg_notrain_ngrams & dict_ngrams) < 0.5 * len(msg_notrain_ngrams) + async def test_once() -> None: + results = await cql.run_async(dict_stmt) + dicts = [bytes(x[0]) for x in results] + dict_ngrams = set(make_ngrams(bytes().join(dicts))) + assert len(msg_train_ngrams & dict_ngrams) > 0.5 * len(msg_train_ngrams) + assert len(msg_notrain_ngrams & dict_ngrams) < 0.5 * len(msg_notrain_ngrams) - await with_retries(test_once, timeout=600) + await with_retries(test_once, timeout=600) @pytest.mark.asyncio async def test_external_dicts(manager: ManagerClient) -> None: @@ -175,47 +175,46 @@ async def test_external_dicts(manager: ManagerClient) -> None: cql = manager.get_cql() replication_factor = 2 - ks = unique_name() - await cql.run_async(f"create keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {replication_factor}}}") - await cql.run_async(f"create table {ks}.cf (pk int, v blob, primary key (pk))") - write_stmt = cql.prepare(f"update {ks}.cf set v = ? where pk = ?") - write_stmt.consistency_level = ConsistencyLevel.ALL + async with new_test_keyspace(manager, f"with replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {replication_factor}}}") as ks: + await cql.run_async(f"create table {ks}.cf (pk int, v blob, primary key (pk))") + write_stmt = cql.prepare(f"update {ks}.cf set v = ? 
where pk = ?") + write_stmt.consistency_level = ConsistencyLevel.ALL - msg_size = 32*1024 - ngram_size = 64 - common_ngrams = [random.randbytes(ngram_size) for _ in range(msg_size//2//ngram_size)] + msg_size = 32*1024 + ngram_size = 64 + common_ngrams = [random.randbytes(ngram_size) for _ in range(msg_size//2//ngram_size)] - # 128 kiB messages, should give compression ratio of ~0.5 for lz4 and ~0.25 for zstd - # when compressed with a common dictionary. - def make_message() -> bytes: - common_part = b''.join(random.sample(common_ngrams, k=msg_size//2//ngram_size)) - assert len(common_part) == msg_size // 2 - unique_part = bytes(random.choices(range(16), k=msg_size//2)) - assert len(unique_part) == msg_size // 2 - return common_part + unique_part + # 128 kiB messages, should give compression ratio of ~0.5 for lz4 and ~0.25 for zstd + # when compressed with a common dictionary. + def make_message() -> bytes: + common_part = b''.join(random.sample(common_ngrams, k=msg_size//2//ngram_size)) + assert len(common_part) == msg_size // 2 + unique_part = bytes(random.choices(range(16), k=msg_size//2)) + assert len(unique_part) == msg_size // 2 + return common_part + unique_part - async def test_once(algo: str, expected_ratio: float): - n_messages = 1000 - metrics_before = await get_metrics(manager, servers) - messages = [make_message() for _ in range(n_messages)] - await asyncio.gather(*[cql.run_async(write_stmt, parameters=[m, pk]) for pk, m in enumerate(messages)]) - metrics_after = await get_metrics(manager, servers) + async def test_once(algo: str, expected_ratio: float): + n_messages = 1000 + metrics_before = await get_metrics(manager, servers) + messages = [make_message() for _ in range(n_messages)] + await asyncio.gather(*[cql.run_async(write_stmt, parameters=[m, pk]) for pk, m in enumerate(messages)]) + metrics_after = await get_metrics(manager, servers) - volume = sum(len(m) for m in messages) * (replication_factor - 1) - uncompressed = 
uncompressed_sent(metrics_after, algo) - uncompressed_sent(metrics_before, algo) - compressed = compressed_sent(metrics_after, algo) - compressed_sent(metrics_before, algo) - assert approximately_equal(uncompressed, volume, 0.8) - assert approximately_equal(compressed, expected_ratio * volume, 0.8) + volume = sum(len(m) for m in messages) * (replication_factor - 1) + uncompressed = uncompressed_sent(metrics_after, algo) - uncompressed_sent(metrics_before, algo) + compressed = compressed_sent(metrics_after, algo) - compressed_sent(metrics_before, algo) + assert approximately_equal(uncompressed, volume, 0.8) + assert approximately_equal(compressed, expected_ratio * volume, 0.8) - await with_retries(functools.partial(test_once, "lz4", 0.5), timeout=600) - await live_update_config(manager, servers, "internode_compression_zstd_max_cpu_fraction", "1.0"), - await with_retries(functools.partial(test_once, "zstd", 0.25), timeout=600) + await with_retries(functools.partial(test_once, "lz4", 0.5), timeout=600) + await live_update_config(manager, servers, "internode_compression_zstd_max_cpu_fraction", "1.0"), + await with_retries(functools.partial(test_once, "zstd", 0.25), timeout=600) - # Test that the dicts are loaded on startup. - await asyncio.gather(*[manager.server_stop_gracefully(s.server_id) for s in servers]) - await asyncio.gather(*[manager.server_update_config(s.server_id, 'rpc_dict_training_when', 'never') for s in servers]) - await asyncio.gather(*[manager.server_start(s.server_id) for s in servers]) - await with_retries(functools.partial(test_once, "lz4", 0.5), timeout=600) + # Test that the dicts are loaded on startup. 
+ await asyncio.gather(*[manager.server_stop_gracefully(s.server_id) for s in servers]) + await asyncio.gather(*[manager.server_update_config(s.server_id, 'rpc_dict_training_when', 'never') for s in servers]) + await asyncio.gather(*[manager.server_start(s.server_id) for s in servers]) + await with_retries(functools.partial(test_once, "lz4", 0.5), timeout=600) # Similar to test_external_dicts, but simpler. @pytest.mark.asyncio @@ -239,24 +238,23 @@ async def test_external_dicts_sanity(manager: ManagerClient) -> None: cql = manager.get_cql() replication_factor = 2 - ks = unique_name() - await cql.run_async(f"create keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {replication_factor}}}") - await cql.run_async(f"create table {ks}.cf (pk int, v blob, primary key (pk))") - write_stmt = cql.prepare(f"update {ks}.cf set v = ? where pk = ?") - write_stmt.consistency_level = ConsistencyLevel.ALL + async with new_test_keyspace(manager, f"with replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {replication_factor}}}") as ks: + await cql.run_async(f"create table {ks}.cf (pk int, v blob, primary key (pk))") + write_stmt = cql.prepare(f"update {ks}.cf set v = ? 
where pk = ?") + write_stmt.consistency_level = ConsistencyLevel.ALL - msg = random.randbytes(8192) + msg = random.randbytes(8192) - async def test_algo(algo: str, expected_ratio): - n_messages = 1000 - metrics_before = await get_metrics(manager, servers) - await asyncio.gather(*[cql.run_async(write_stmt, parameters=[msg, pk]) for pk in range(n_messages)]) - metrics_after = await get_metrics(manager, servers) + async def test_algo(algo: str, expected_ratio): + n_messages = 1000 + metrics_before = await get_metrics(manager, servers) + await asyncio.gather(*[cql.run_async(write_stmt, parameters=[msg, pk]) for pk in range(n_messages)]) + metrics_after = await get_metrics(manager, servers) - volume = len(msg) * n_messages * (replication_factor - 1) - uncompressed = uncompressed_sent(metrics_after, algo) - uncompressed_sent(metrics_before, algo) - compressed = compressed_sent(metrics_after, algo) - compressed_sent(metrics_before, algo) - assert approximately_equal(uncompressed, volume, 0.8) - assert compressed < expected_ratio * uncompressed + volume = len(msg) * n_messages * (replication_factor - 1) + uncompressed = uncompressed_sent(metrics_after, algo) - uncompressed_sent(metrics_before, algo) + compressed = compressed_sent(metrics_after, algo) - compressed_sent(metrics_before, algo) + assert approximately_equal(uncompressed, volume, 0.8) + assert compressed < expected_ratio * uncompressed - await with_retries(functools.partial(test_algo, "lz4", 0.04), timeout=600) + await with_retries(functools.partial(test_algo, "lz4", 0.04), timeout=600) From 4fd6c2d24ecbc541effed5ad80fc8e74b3b7fd4e Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 37/56] topology_custom/test_select_from_mutation_fragments: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_select_from_mutation_fragments.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git 
a/test/topology_custom/test_select_from_mutation_fragments.py b/test/topology_custom/test_select_from_mutation_fragments.py index b002dc2836..45d10760a8 100644 --- a/test/topology_custom/test_select_from_mutation_fragments.py +++ b/test/topology_custom/test_select_from_mutation_fragments.py @@ -9,6 +9,7 @@ import pytest from cassandra.protocol import InvalidRequest # type: ignore from cassandra.query import SimpleStatement # type: ignore +from test.topology.util import new_test_keyspace from test.pylib.manager_client import ManagerClient @@ -19,23 +20,22 @@ async def test_sticky_coordinator_enforced(manager: ManagerClient) -> None: cql = manager.get_cql() - await cql.run_async("create keyspace ks" - " with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}") - await cql.run_async("create table ks.tbl (pk int, ck int, v int, primary key (pk, ck))") + async with new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}") as ks: + await cql.run_async(f"create table {ks}.tbl (pk int, ck int, v int, primary key (pk, ck))") - num_rows = 43 - expected_num_rows = num_rows + 2 # rows + partition-start + partitione-end - for ck in range(0, num_rows): - await cql.run_async(f"INSERT INTO ks.tbl (pk, ck, v) VALUES (0, {ck}, 100)") + num_rows = 43 + expected_num_rows = num_rows + 2 # rows + partition-start + partition-end + for ck in range(0, num_rows): + await cql.run_async(f"INSERT INTO {ks}.tbl (pk, ck, v) VALUES (0, {ck}, 100)") - unpaged_res = await cql.run_async("SELECT * FROM MUTATION_FRAGMENTS(ks.tbl) WHERE pk = 0") - assert len(unpaged_res) == expected_num_rows + unpaged_res = await cql.run_async(f"SELECT * FROM MUTATION_FRAGMENTS({ks}.tbl) WHERE pk = 0") + assert len(unpaged_res) == expected_num_rows - read_stmt = SimpleStatement("SELECT * FROM MUTATION_FRAGMENTS(ks.tbl) WHERE pk = 0", fetch_size=10) + read_stmt = SimpleStatement(f"SELECT * FROM MUTATION_FRAGMENTS({ks}.tbl) WHERE pk = 0", 
fetch_size=10) - # The default round-robin load-balancing policy will jump between the nodes. - # This should trigger an exception. - with pytest.raises( - InvalidRequest, - match="Moving between coordinators is not allowed in SELECT FROM MUTATION_FRAGMENTS\\(\\) statements.*"): - await cql.run_async(read_stmt, all_pages=True) + # The default round-robin load-balancing policy will jump between the nodes. + # This should trigger an exception. + with pytest.raises( + InvalidRequest, + match="Moving between coordinators is not allowed in SELECT FROM MUTATION_FRAGMENTS\\(\\) statements.*"): + await cql.run_async(read_stmt, all_pages=True) From 50a8f5c1c049a5e626e192380ddd5340ac52a239 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 38/56] topology_custom/test_shutdown_hang: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_shutdown_hang.py | 66 ++++++++++++---------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/test/topology_custom/test_shutdown_hang.py b/test/topology_custom/test_shutdown_hang.py index 74b42d40d3..c4f22339d6 100644 --- a/test/topology_custom/test_shutdown_hang.py +++ b/test/topology_custom/test_shutdown_hang.py @@ -14,7 +14,7 @@ from cassandra.cluster import ConsistencyLevel # type: ignore from cassandra.protocol import WriteTimeout # type: ignore from test.pylib.manager_client import ManagerClient -from test.topology.util import wait_for_token_ring_and_group0_consistency +from test.topology.util import wait_for_token_ring_and_group0_consistency, new_test_keyspace, reconnect_driver from test.topology.conftest import skip_mode @@ -34,38 +34,42 @@ async def test_hints_manager_shutdown_hang(manager: ManagerClient) -> None: cql = manager.get_cql() logger.info("Create keyspace and table") - await cql.run_async("create keyspace ks with replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") - await cql.run_async("create table ks.t (pk int primary 
key)") + async with new_test_keyspace(manager, "with replication = {'class': 'SimpleStrategy', 'replication_factor': 2}") as ks: + await cql.run_async(f"create table {ks}.t (pk int primary key)") - logger.info(f"Stop {s2}") - await manager.server_stop(s2.server_id) + logger.info(f"Stop {s2}") + await manager.server_stop(s2.server_id) - logger.info("Write data with small timeout") - # We're using a small timeout for the insert so it's not unexpected that it would fail on slow - # CI machines. To avoid flakiness we disable the test in debug mode (as well as release since - # it requires an error injection - so it will run only in dev mode) and we retry the write 10 times. - passed = False - for _ in range(10): - try: - await cql.run_async(SimpleStatement("insert into ks.t (pk) values (0) using timeout 500ms", - consistency_level=ConsistencyLevel.ONE)) - except WriteTimeout: - logger.info("write timeout, retrying") - else: - passed = True - break + logger.info("Write data with small timeout") + # We're using a small timeout for the insert so it's not unexpected that it would fail on slow + # CI machines. To avoid flakiness we disable the test in debug mode (as well as release since + # it requires an error injection - so it will run only in dev mode) and we retry the write 10 times. + passed = False + for _ in range(10): + try: + await cql.run_async(SimpleStatement(f"insert into {ks}.t (pk) values (0) using timeout 500ms", + consistency_level=ConsistencyLevel.ONE)) + except WriteTimeout: + logger.info("write timeout, retrying") + else: + passed = True + break - if not passed: - pytest.fail("Write timed out on each attempt") + if not passed: + pytest.fail("Write timed out on each attempt") - # The write succeeded but a background task was left to finish the write to the other node - # (which is dead but the first node didn't mark it as dead yet). - # The background task will timeout shortly because of 'using timeout' in the statement. 
- # This will cause a hint to get created. - # The hints manager starts sending the hint soon after (hint flushing happens every - # ~1 second with the error injection). - logger.info("Sleep") - await asyncio.sleep(2) + # The write succeeded but a background task was left to finish the write to the other node + # (which is dead but the first node didn't mark it as dead yet). + # The background task will timeout shortly because of 'using timeout' in the statement. + # This will cause a hint to get created. + # The hints manager starts sending the hint soon after (hint flushing happens every + # ~1 second with the error injection). + logger.info("Sleep") + await asyncio.sleep(2) - logger.info(f"Stop {s1} gracefully") - await manager.server_stop_gracefully(s1.server_id) + logger.info(f"Stop {s1} gracefully") + await manager.server_stop_gracefully(s1.server_id) + + # For dropping the keyspace + await asyncio.gather(*[manager.server_start(s.server_id) for s in [s1, s2]]) + await reconnect_driver(manager) From 005ceb77d3b43efafa2c1c3a1aa22d6d8a74c093 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 39/56] topology_custom/test_table_desc_read_barrier: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_table_desc_read_barrier.py | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/test/topology_custom/test_table_desc_read_barrier.py b/test/topology_custom/test_table_desc_read_barrier.py index 9df4807a48..e588d85f28 100644 --- a/test/topology_custom/test_table_desc_read_barrier.py +++ b/test/topology_custom/test_table_desc_read_barrier.py @@ -10,6 +10,7 @@ import pytest from test.pylib.manager_client import ManagerClient from test.pylib.rest_client import inject_error, read_barrier from test.topology.conftest import skip_mode +from test.topology.util import new_test_keyspace logger = logging.getLogger(__name__) @@ -40,22 +41,21 @@ async def test_table_desc_read_barrier(manager: 
ManagerClient) -> None: cql, hosts = await manager.get_ready_cql(servers) logger.info("Creating keyspace and table") - await cql.run_async("create keyspace ks with replication = " - "{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") - await cql.run_async("create table ks.t (pk int primary key)") + async with new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks: + await cql.run_async(f"create table {ks}.t (pk int primary key)") - logger.info("Disabling the schema agreement wait") - assert hasattr(cql.cluster, "max_schema_agreement_wait") - cql.cluster.max_schema_agreement_wait = 0 + logger.info("Disabling the schema agreement wait") + assert hasattr(cql.cluster, "max_schema_agreement_wait") + cql.cluster.max_schema_agreement_wait = 0 - async with inject_error(manager.api, servers[0].ip_addr, 'group0_state_machine::delay_apply'): - logger.info("Altering table") - sec_host = next(h for h in hosts if h.address == servers[1].ip_addr) - await cql.run_async("alter table ks.t add s1 int", host=sec_host) + async with inject_error(manager.api, servers[0].ip_addr, 'group0_state_machine::delay_apply'): + logger.info("Altering table") + sec_host = next(h for h in hosts if h.address == servers[1].ip_addr) + await cql.run_async(f"alter table {ks}.t add s1 int", host=sec_host) - # wait for the first node to see the latest state (after the delay ends) - await read_barrier(manager.api, servers[0].ip_addr) + # wait for the first node to see the latest state (after the delay ends) + await read_barrier(manager.api, servers[0].ip_addr) - # verify that there is no schema difference after the read barrier - desc_schema = [await cql.run_async("DESC SCHEMA", host=h) for h in hosts] - assert desc_schema[0] == desc_schema[1] + # verify that there is no schema difference after the read barrier + desc_schema = [await cql.run_async("DESC SCHEMA", host=h) for h in hosts] + assert desc_schema[0] == desc_schema[1] From 
649e68c6db3fa7fbd42c3794778bdf5ca68dd8db Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 40/56] topology_custom/test_tablets: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_tablets.py | 675 ++++++++++++++------------- 1 file changed, 339 insertions(+), 336 deletions(-) diff --git a/test/topology_custom/test_tablets.py b/test/topology_custom/test_tablets.py index 4da4a020f0..98ce82750f 100644 --- a/test/topology_custom/test_tablets.py +++ b/test/topology_custom/test_tablets.py @@ -12,7 +12,7 @@ from test.pylib.scylla_cluster import ReplaceConfig from test.pylib.tablets import get_tablet_replica, get_all_tablet_replicas from test.pylib.util import unique_name from test.topology.conftest import skip_mode -from test.topology.util import wait_for_cql_and_get_hosts +from test.topology.util import wait_for_cql_and_get_hosts, create_new_test_keyspace, new_test_keyspace, reconnect_driver from contextlib import nullcontext as does_not_raise import time import pytest @@ -37,12 +37,12 @@ async def test_tablet_replication_factor_enough_nodes(manager: ManagerClient): res = await cql.run_async("SELECT data_center FROM system.local") this_dc = res[0].data_center - await cql.run_async(f"CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': 3}}") - with pytest.raises(ConfigurationException, match=f"Datacenter {this_dc} doesn't have enough token-owning nodes"): - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': 3}}") as ks: + with pytest.raises(ConfigurationException, match=f"Datacenter {this_dc} doesn't have enough token-owning nodes"): + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - await cql.run_async(f"ALTER KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': 
2}}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + await cql.run_async(f"ALTER KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': 2}}") + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") @pytest.mark.asyncio @@ -53,27 +53,27 @@ async def test_tablet_cannot_decommision_below_replication_factor(manager: Manag logger.info("Creating table") cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - logger.info("Populating table") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + logger.info("Populating table") + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - logger.info("Decommission some node") - await manager.decommission_node(servers[0].server_id) + logger.info("Decommission some node") + await manager.decommission_node(servers[0].server_id) - with pytest.raises(HTTPError, match="Decommission failed"): - logger.info("Decommission another node") - await manager.decommission_node(servers[1].server_id) + with pytest.raises(HTTPError, match="Decommission failed"): + logger.info("Decommission another node") + await manager.decommission_node(servers[1].server_id) - # Three nodes should still provide CL=3 - logger.info("Checking table") - query = SimpleStatement("SELECT * FROM test.test;", consistency_level=ConsistencyLevel.THREE) - rows = await cql.run_async(query) - assert len(rows) == len(keys) - for r in rows: - assert r.c == r.pk + # Three nodes 
should still provide CL=3 + logger.info("Checking table") + query = SimpleStatement(f"SELECT * FROM {ks}.test;", consistency_level=ConsistencyLevel.THREE) + rows = await cql.run_async(query) + assert len(rows) == len(keys) + for r in rows: + assert r.c == r.pk async def test_reshape_with_tablets(manager: ManagerClient): logger.info("Bootstrapping cluster") @@ -83,31 +83,32 @@ async def test_reshape_with_tablets(manager: ManagerClient): logger.info("Creating table") cql = manager.get_cql() number_of_tablets = 2 - await cql.run_async(f"CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} and tablets = {{'initial': {number_of_tablets} }}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} and tablets = {{'initial': {number_of_tablets} }}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - logger.info("Disabling autocompaction for the table") - await manager.api.disable_autocompaction(server.ip_addr, "test", "test") + logger.info("Disabling autocompaction for the table") + await manager.api.disable_autocompaction(server.ip_addr, ks, "test") - logger.info("Populating table") - loop_count = 32 - for _ in range(loop_count): - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in range(64)]) - await manager.api.keyspace_flush(server.ip_addr, "test", "test") - # After populating the table, expect loop_count number of sstables per tablet - sstable_info = await manager.api.get_sstable_info(server.ip_addr, "test", "test") - assert len(sstable_info[0]['sstables']) == number_of_tablets * loop_count + logger.info("Populating table") + loop_count = 32 + for _ in range(loop_count): + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in range(64)]) + await 
manager.api.keyspace_flush(server.ip_addr, ks, "test") + # After populating the table, expect loop_count number of sstables per tablet + sstable_info = await manager.api.get_sstable_info(server.ip_addr, ks, "test") + assert len(sstable_info[0]['sstables']) == number_of_tablets * loop_count - log = await manager.server_open_log(server.server_id) - mark = await log.mark() + log = await manager.server_open_log(server.server_id) + mark = await log.mark() - # Restart the server and verify that the sstables have been reshaped down to one sstable per tablet - logger.info("Restart the server") - await manager.server_restart(server.server_id) + # Restart the server and verify that the sstables have been reshaped down to one sstable per tablet + logger.info("Restart the server") + await manager.server_restart(server.server_id) + await reconnect_driver(manager) - await log.wait_for("Reshape test.test .* Reshaped 32 sstables to .*", mark, 30) - sstable_info = await manager.api.get_sstable_info(server.ip_addr, "test", "test") - assert len(sstable_info[0]['sstables']) == number_of_tablets + await log.wait_for(f"Reshape {ks}.test .* Reshaped 32 sstables to .*", mark, 30) + sstable_info = await manager.api.get_sstable_info(server.ip_addr, ks, "test") + assert len(sstable_info[0]['sstables']) == number_of_tablets @pytest.mark.parametrize("direction", ["up", "down", "none"]) @@ -132,47 +133,47 @@ async def test_tablet_rf_change(manager: ManagerClient, direction): rf_from = 2 rf_to = 2 - await cql.run_async(f"CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': {rf_from}}}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - await cql.run_async("CREATE MATERIALIZED VIEW test.test_mv AS SELECT pk FROM test.test WHERE pk IS NOT NULL PRIMARY KEY (pk)") + async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': {rf_from}}}") as ks: + await cql.run_async(f"CREATE TABLE 
{ks}.test (pk int PRIMARY KEY, c int);") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.test_mv AS SELECT pk FROM {ks}.test WHERE pk IS NOT NULL PRIMARY KEY (pk)") - logger.info("Populating table") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in range(128)]) + logger.info("Populating table") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in range(128)]) - async def check_allocated_replica(expected: int): - replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - replicas = replicas + await get_all_tablet_replicas(manager, servers[0], 'test', 'test_mv', is_view=True) - for r in replicas: - logger.info(f"{r.replicas}") - assert len(r.replicas) == expected + async def check_allocated_replica(expected: int): + replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + replicas = replicas + await get_all_tablet_replicas(manager, servers[0], ks, 'test_mv', is_view=True) + for r in replicas: + logger.info(f"{r.replicas}") + assert len(r.replicas) == expected - logger.info(f"Checking {rf_from} allocated replicas") - await check_allocated_replica(rf_from) + logger.info(f"Checking {rf_from} allocated replicas") + await check_allocated_replica(rf_from) - logger.info(f"Altering RF {rf_from} -> {rf_to}") - await cql.run_async(f"ALTER KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': {rf_to}}}") + logger.info(f"Altering RF {rf_from} -> {rf_to}") + await cql.run_async(f"ALTER KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': {rf_to}}}") - logger.info(f"Checking {rf_to} re-allocated replicas") - await check_allocated_replica(rf_to) + logger.info(f"Checking {rf_to} re-allocated replicas") + await check_allocated_replica(rf_to) - if direction != 'up': - # Don't check fragments for up/none changes, scylla crashes when checking nodes - # that (validly) miss the replica, 
see scylladb/scylladb#18786 - return + if direction != 'up': + # Don't check fragments for up/none changes, scylla crashes when checking nodes + # that (validly) miss the replica, see scylladb/scylladb#18786 + return - fragments = { pk: set() for pk in random.sample(range(128), 17) } - for s in servers: - host_id = await manager.get_host_id(s.server_id) - host = await wait_for_cql_and_get_hosts(cql, [s], time.time() + 30) - await read_barrier(manager.api, s.ip_addr) # scylladb/scylladb#18199 + fragments = { pk: set() for pk in random.sample(range(128), 17) } + for s in servers: + host_id = await manager.get_host_id(s.server_id) + host = await wait_for_cql_and_get_hosts(cql, [s], time.time() + 30) + await read_barrier(manager.api, s.ip_addr) # scylladb/scylladb#18199 + for k in fragments: + res = await cql.run_async(f"SELECT partition_region FROM MUTATION_FRAGMENTS({ks}.test) WHERE pk={k}", host=host[0]) + for fragment in res: + if fragment.partition_region == 0: # partition start + fragments[k].add(host_id) + logger.info("Checking fragments") for k in fragments: - res = await cql.run_async(f"SELECT partition_region FROM MUTATION_FRAGMENTS(test.test) WHERE pk={k}", host=host[0]) - for fragment in res: - if fragment.partition_region == 0: # partition start - fragments[k].add(host_id) - logger.info("Checking fragments") - for k in fragments: - assert len(fragments[k]) == rf_to, f"Found mutations for {k} key on {fragments[k]} hosts, but expected only {rf_to} of them" + assert len(fragments[k]) == rf_to, f"Found mutations for {k} key on {fragments[k]} hosts, but expected only {rf_to} of them" @pytest.mark.asyncio @@ -185,17 +186,17 @@ async def test_tablet_mutation_fragments_unowned_partition(manager: ManagerClien cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 2}}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with 
new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - logger.info("Populating table") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in range(4)]) + logger.info("Populating table") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in range(4)]) - for s in servers: - host_id = await manager.get_host_id(s.server_id) - host = await wait_for_cql_and_get_hosts(cql, [s], time.time() + 30) - for k in range(4): - await cql.run_async(f"SELECT partition_region FROM MUTATION_FRAGMENTS(test.test) WHERE pk={k}", host=host[0]) + for s in servers: + host_id = await manager.get_host_id(s.server_id) + host = await wait_for_cql_and_get_hosts(cql, [s], time.time() + 30) + for k in range(4): + await cql.run_async(f"SELECT partition_region FROM MUTATION_FRAGMENTS({ks}.test) WHERE pk={k}", host=host[0]) # ALTER tablets KS cannot change RF of any DC by more than 1 at a time. 
@@ -213,37 +214,37 @@ async def test_multidc_alter_tablets_rf(request: pytest.FixtureRequest, manager: await manager.servers_add(2, config=config, property_file={'dc': f'dc2', 'rack': 'myrack'}) cql = manager.get_cql() - await cql.run_async("create keyspace if not exists ks with replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1}") - # need to create a table to not change only the schema, but also tablets replicas - await cql.run_async("create table ks.t (pk int primary key)") - with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): - # changing RF of dc2 from 0 to 2 should fail - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc2': 2}") + async with new_test_keyspace(manager, "with replication = {'class': 'NetworkTopologyStrategy', 'dc1': 1}") as ks: + # need to create a table to not change only the schema, but also tablets replicas + await cql.run_async(f"create table {ks}.t (pk int primary key)") + with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): + # changing RF of dc2 from 0 to 2 should fail + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc2': 2}}") - # changing RF of dc2 from 0 to 1 should succeed - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc2': 1}") - # ensure that RFs of both DCs are equal to 1 now, i.e. that omitting dc1 in above command didn't change it - res = await cql.run_async("SELECT * FROM system_schema.keyspaces WHERE keyspace_name = 'ks'") - assert res[0].replication['dc1'] == '1' - assert res[0].replication['dc2'] == '1' + # changing RF of dc2 from 0 to 1 should succeed + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc2': 1}}") + # ensure that RFs of both DCs are equal to 1 now, i.e. 
that omitting dc1 in above command didn't change it + res = await cql.run_async(f"SELECT * FROM system_schema.keyspaces WHERE keyspace_name = '{ks}'") + assert res[0].replication['dc1'] == '1' + assert res[0].replication['dc2'] == '1' - # incrementing RF of 2 DCs at once should NOT succeed, because it'd leave 2 pending tablets replicas - with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 2}") - # as above, but decrementing - with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc1': 0, 'dc2': 0}") - # as above, but decrement 1 RF and increment the other - with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 0}") - # as above, but RFs are swapped - with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc1': 0, 'dc2': 2}") + # incrementing RF of 2 DCs at once should NOT succeed, because it'd leave 2 pending tablets replicas + with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 2}}") + # as above, but decrementing + with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc1': 0, 'dc2': 0}}") + # as 
above, but decrement 1 RF and increment the other + with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 0}}") + # as above, but RFs are swapped + with pytest.raises(InvalidRequest, match="Only one DC's RF can be changed at a time and not by more than 1"): + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc1': 0, 'dc2': 2}}") - # check that we can remove all replicas from dc2 by changing RF from 1 to 0 - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc2': 0}") - # check that we can remove all replicas from the cluster, i.e. change RF of dc1 from 1 to 0 as well: - await cql.run_async("alter keyspace ks with replication = {'class': 'NetworkTopologyStrategy', 'dc1': 0}") + # check that we can remove all replicas from dc2 by changing RF from 1 to 0 + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc2': 0}}") + # check that we can remove all replicas from the cluster, i.e. 
change RF of dc1 from 1 to 0 as well: + await cql.run_async(f"alter keyspace {ks} with replication = {{'class': 'NetworkTopologyStrategy', 'dc1': 0}}") # Reproducer for https://github.com/scylladb/scylladb/issues/18110 @@ -260,55 +261,55 @@ async def test_saved_readers_tablet_migration(manager: ManagerClient, build_mode cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH" + async with new_test_keyspace(manager, "WITH" " replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}" - " and tablets = {'initial': 1}") - await cql.run_async("CREATE TABLE test.test (pk int, ck int, c int, PRIMARY KEY (pk, ck));") + " and tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int, ck int, c int, PRIMARY KEY (pk, ck));") - logger.info("Populating table") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, ck, c) VALUES (0, {k}, 0);") for k in range(128)]) + logger.info("Populating table") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, ck, c) VALUES (0, {k}, 0);") for k in range(128)]) - statement = SimpleStatement("SELECT * FROM test.test WHERE pk = 0", fetch_size=10) - cql.execute(statement) + statement = SimpleStatement(f"SELECT * FROM {ks}.test WHERE pk = 0", fetch_size=10) + cql.execute(statement) - def get_querier_cache_population(server): - metrics = requests.get(f"http://{server.ip_addr}:9180/metrics").text - pattern = re.compile("^scylla_database_querier_cache_population") - for metric in metrics.split('\n'): - if pattern.match(metric) is not None: - return int(float(metric.split()[1])) + def get_querier_cache_population(server): + metrics = requests.get(f"http://{server.ip_addr}:9180/metrics").text + pattern = re.compile("^scylla_database_querier_cache_population") + for metric in metrics.split('\n'): + if pattern.match(metric) is not None: + return int(float(metric.split()[1])) - assert any(map(lambda x: x > 0, [get_querier_cache_population(server) for server 
in servers])) + assert any(map(lambda x: x > 0, [get_querier_cache_population(server) for server in servers])) - table_id = await cql.run_async("SELECT id FROM system_schema.tables WHERE keyspace_name = 'test' AND table_name = 'test'") - table_id = table_id[0].id + table_id = await cql.run_async(f"SELECT id FROM system_schema.tables WHERE keyspace_name = '{ks}' AND table_name = 'test'") + table_id = table_id[0].id - tablet_infos = await cql.run_async(f"SELECT last_token, replicas FROM system.tablets WHERE table_id = {table_id}") - tablet_infos = list(tablet_infos) + tablet_infos = await cql.run_async(f"SELECT last_token, replicas FROM system.tablets WHERE table_id = {table_id}") + tablet_infos = list(tablet_infos) - assert len(tablet_infos) == 1 - tablet_info = tablet_infos[0] - assert len(tablet_info.replicas) == 1 + assert len(tablet_infos) == 1 + tablet_info = tablet_infos[0] + assert len(tablet_info.replicas) == 1 - hosts = {await manager.get_host_id(server.server_id) for server in servers} - print(f"HOSTS: {hosts}") - source_host, source_shard = tablet_info.replicas[0] + hosts = {await manager.get_host_id(server.server_id) for server in servers} + print(f"HOSTS: {hosts}") + source_host, source_shard = tablet_info.replicas[0] - hosts.remove(str(source_host)) - target_host, target_shard = list(hosts)[0], source_shard + hosts.remove(str(source_host)) + target_host, target_shard = list(hosts)[0], source_shard - await manager.api.move_tablet( - node_ip=servers[0].ip_addr, - ks="test", - table="test", - src_host=source_host, - src_shard=source_shard, - dst_host=target_host, - dst_shard=target_shard, - token=tablet_info.last_token) + await manager.api.move_tablet( + node_ip=servers[0].ip_addr, + ks=ks, + table="test", + src_host=source_host, + src_shard=source_shard, + dst_host=target_host, + dst_shard=target_shard, + token=tablet_info.last_token) - # The tablet move should have evicted the cached reader. 
- assert all(map(lambda x: x == 0, [get_querier_cache_population(server) for server in servers])) + # The tablet move should have evicted the cached reader. + assert all(map(lambda x: x == 0, [get_querier_cache_population(server) for server in servers])) # Reproducer for https://github.com/scylladb/scylladb/issues/19052 # 1) table A has N tablets and views @@ -334,53 +335,53 @@ async def test_read_of_pending_replica_during_migration(manager: ManagerClient, await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - await cql.run_async("CREATE MATERIALIZED VIEW test.mv1 AS \ - SELECT * FROM test.test WHERE pk IS NOT NULL AND c IS NOT NULL \ - PRIMARY KEY (c, pk);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv1 AS \ + SELECT * FROM {ks}.test WHERE pk IS NOT NULL AND c IS NOT NULL \ + PRIMARY KEY (c, pk);") - servers.append(await manager.server_add(cmdline=cmdline, config=cfg)) + servers.append(await manager.server_add(cmdline=cmdline, config=cfg)) - key = 7 # Whatever - tablet_token = 0 # Doesn't matter since there is one tablet - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({key}, 0)") - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == 1 + key = 7 # Whatever + tablet_token = 0 # Doesn't matter since there is one tablet + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({key}, 0)") + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == 1 - replica = await 
get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) - s0_host_id = await manager.get_host_id(servers[0].server_id) - s1_host_id = await manager.get_host_id(servers[1].server_id) - dst_shard = 0 + s0_host_id = await manager.get_host_id(servers[0].server_id) + s1_host_id = await manager.get_host_id(servers[1].server_id) + dst_shard = 0 - await manager.api.enable_injection(servers[1].ip_addr, "stream_mutation_fragments", one_shot=True) - s1_log = await manager.server_open_log(servers[1].server_id) - s1_mark = await s1_log.mark() + await manager.api.enable_injection(servers[1].ip_addr, "stream_mutation_fragments", one_shot=True) + s1_log = await manager.server_open_log(servers[1].server_id) + s1_mark = await s1_log.mark() - # Drop cache to remove dummy entry indicating that underlying mutation source is empty - await manager.api.drop_sstable_caches(servers[1].ip_addr) + # Drop cache to remove dummy entry indicating that underlying mutation source is empty + await manager.api.drop_sstable_caches(servers[1].ip_addr) - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) - await s1_log.wait_for('stream_mutation_fragments: waiting', from_mark=s1_mark) - s1_mark = await s1_log.mark() + await s1_log.wait_for('stream_mutation_fragments: waiting', from_mark=s1_mark) + s1_mark = await s1_log.mark() - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({key}, 1)") - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == 1 + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({key}, 1)") + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == 
1 - # Release abandoned streaming - await manager.api.message_injection(servers[1].ip_addr, "stream_mutation_fragments") - await s1_log.wait_for('stream_mutation_fragments: done', from_mark=s1_mark) + # Release abandoned streaming + await manager.api.message_injection(servers[1].ip_addr, "stream_mutation_fragments") + await s1_log.wait_for('stream_mutation_fragments: done', from_mark=s1_mark) - logger.info("Waiting for migration to finish") - await migration_task - logger.info("Migration done") + logger.info("Waiting for migration to finish") + await migration_task + logger.info("Migration done") - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == 1 + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == 1 # This test checks that --enable-tablets option and the TABLETS parameters of the CQL CREATE KEYSPACE @@ -400,12 +401,12 @@ async def test_keyspace_creation_cql_vs_config_sanity(manager: ManagerClient, wi # First, check if a kesypace is able to be created with default CQL statement that # doesn't contain tablets parameters. When possible, tablets should be activated - await cql.run_async(f"CREATE KEYSPACE test_d WITH replication = {{'class': '{replication_strategy}', 'replication_factor': 1}};") - res = cql.execute(f"SELECT initial_tablets FROM system_schema.scylla_keyspaces WHERE keyspace_name = 'test_d'").one() - if tablets_enabled_by_default: - assert res.initial_tablets == 0 - else: - assert res is None + async with new_test_keyspace(manager, f"WITH replication = {{'class': '{replication_strategy}', 'replication_factor': 1}}") as ks: + res = cql.execute(f"SELECT initial_tablets FROM system_schema.scylla_keyspaces WHERE keyspace_name = '{ks}'").one() + if tablets_enabled_by_default: + assert res.initial_tablets == 0 + else: + assert res is None # Next, check that explicit CQL request for enabling tablets can only be satisfied when # tablets are possible. 
Tablets must be activated in this case @@ -414,15 +415,16 @@ async def test_keyspace_creation_cql_vs_config_sanity(manager: ManagerClient, wi else: expectation = pytest.raises(ConfigurationException) with expectation: - await cql.run_async(f"CREATE KEYSPACE test_y WITH replication = {{'class': '{replication_strategy}', 'replication_factor': 1}} AND TABLETS = {{'enabled': true}};") - res = cql.execute(f"SELECT initial_tablets FROM system_schema.scylla_keyspaces WHERE keyspace_name = 'test_y'").one() + ks = await create_new_test_keyspace(cql, f"WITH replication = {{'class': '{replication_strategy}', 'replication_factor': 1}} AND TABLETS = {{'enabled': true}}") + res = cql.execute(f"SELECT initial_tablets FROM system_schema.scylla_keyspaces WHERE keyspace_name = '{ks}'").one() assert res.initial_tablets == 0 + await cql.run_async(f"drop keyspace {ks}") # Finally, check that explicitly disabling tablets in CQL results in vnode-based keyspace # whenever tablets are enabled or not in config - await cql.run_async(f"CREATE KEYSPACE test_n WITH replication = {{'class': '{replication_strategy}', 'replication_factor': 1}} AND TABLETS = {{'enabled': false}};") - res = cql.execute(f"SELECT initial_tablets FROM system_schema.scylla_keyspaces WHERE keyspace_name = 'test_n'").one() - assert res is None + async with new_test_keyspace(manager, f"WITH replication = {{'class': '{replication_strategy}', 'replication_factor': 1}} AND TABLETS = {{'enabled': false}}") as ks: + res = cql.execute(f"SELECT initial_tablets FROM system_schema.scylla_keyspaces WHERE keyspace_name = '{ks}'").one() + assert res is None @pytest.mark.asyncio async def test_tablets_and_gossip_topology_changes_are_incompatible(manager: ManagerClient): @@ -435,11 +437,9 @@ async def test_tablets_disabled_with_gossip_topology_changes(manager: ManagerCli cfg = {"enable_tablets": False, "force_gossip_topology_changes": True} await manager.server_add(config=cfg) cql = manager.get_cql() - ks_name = unique_name() - await 
cql.run_async(f"CREATE KEYSPACE {ks_name} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}};") - res = cql.execute(f"SELECT * FROM system_schema.scylla_keyspaces WHERE keyspace_name = '{ks_name}'").one() - logger.info(res) - await cql.run_async(f"DROP KEYSPACE {ks_name}") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks_name: + res = cql.execute(f"SELECT * FROM system_schema.scylla_keyspaces WHERE keyspace_name = '{ks_name}'").one() + logger.info(res) for enabled in ["false", "true"]: expected = r"Error from server: code=2000 \[Syntax error in CQL query\] message=\"line 1:126 no viable alternative at input 'tablets'\"" @@ -469,37 +469,37 @@ async def test_tablet_streaming_with_unbuilt_view(manager: ManagerClient): logger.info("Create table, populate it and flush the table to disk") cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - num_of_rows = 64 - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k%3});") for k in range(num_of_rows)]) - await manager.api.keyspace_flush(servers[0].ip_addr, "test", "test") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + num_of_rows = 64 + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k%3});") for k in range(num_of_rows)]) + await manager.api.keyspace_flush(servers[0].ip_addr, ks, "test") - logger.info("Starting Node 2") - servers.append(await manager.server_add(cmdline=cmdline, config=cfg)) - s1_host_id = await manager.get_host_id(servers[1].server_id) + 
logger.info("Starting Node 2") + servers.append(await manager.server_add(cmdline=cmdline, config=cfg)) + s1_host_id = await manager.get_host_id(servers[1].server_id) - logger.info("Inject error to make view generator pause before processing the sstable") - injection_name = "view_builder_pause_add_new_view" - await manager.api.enable_injection(servers[0].ip_addr, injection_name, one_shot=True) + logger.info("Inject error to make view generator pause before processing the sstable") + injection_name = "view_builder_pause_add_new_view" + await manager.api.enable_injection(servers[0].ip_addr, injection_name, one_shot=True) - logger.info("Create view") - await cql.run_async("CREATE MATERIALIZED VIEW test.mv1 AS \ - SELECT * FROM test.test WHERE pk IS NOT NULL AND c IS NOT NULL \ - PRIMARY KEY (c, pk);") + logger.info("Create view") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv1 AS \ + SELECT * FROM {ks}.test WHERE pk IS NOT NULL AND c IS NOT NULL \ + PRIMARY KEY (c, pk);") - logger.info("Migrate the tablet to node 2") - tablet_token = 0 # Doesn't matter since there is one tablet - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, 0, tablet_token) - logger.info("Migration done") + logger.info("Migrate the tablet to node 2") + tablet_token = 0 # Doesn't matter since there is one tablet + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, 0, tablet_token) + logger.info("Migration done") - # Verify the table has expected number of rows - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == num_of_rows - # Verify that the view has the expected number of rows - rows = await cql.run_async("SELECT c from test.mv1") - assert len(list(rows)) == num_of_rows + # Verify the 
table has expected number of rows + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == num_of_rows + # Verify that the view has the expected number of rows + rows = await cql.run_async(f"SELECT c from {ks}.mv1") + assert len(list(rows)) == num_of_rows @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -525,55 +525,55 @@ async def test_tablet_streaming_with_staged_sstables(manager: ManagerClient): logger.info("Create the test table, populate few rows and flush to disk") cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k%3});") for k in range(64)]) - await manager.api.keyspace_flush(servers[0].ip_addr, "test", "test") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k%3});") for k in range(64)]) + await manager.api.keyspace_flush(servers[0].ip_addr, ks, "test") - logger.info("Create view") - await cql.run_async("CREATE MATERIALIZED VIEW test.mv1 AS \ - SELECT * FROM test.test WHERE pk IS NOT NULL AND c IS NOT NULL \ - PRIMARY KEY (c, pk);") + logger.info("Create view") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv1 AS \ + SELECT * FROM {ks}.test WHERE pk IS NOT NULL AND c IS NOT NULL \ + PRIMARY KEY (c, pk);") - logger.info("Generate an sstable and move it to upload directory of test table") - # create an sstable using a dummy table - await cql.run_async("CREATE TABLE test.dummy (pk int PRIMARY KEY, c int);") - await 
asyncio.gather(*[cql.run_async(f"INSERT INTO test.dummy (pk, c) VALUES ({k}, {k%3});") for k in range(64, 128)]) - await manager.api.keyspace_flush(servers[0].ip_addr, "test", "dummy") - node_workdir = await manager.server_get_workdir(servers[0].server_id) - dummy_table_dir = glob.glob(os.path.join(node_workdir, "data", "test", "dummy-*"))[0] - test_table_upload_dir = glob.glob(os.path.join(node_workdir, "data", "test", "test-*", "upload"))[0] - for src_path in glob.glob(os.path.join(dummy_table_dir, "me-*")): - dst_path = os.path.join(test_table_upload_dir, os.path.basename(src_path)) - os.rename(src_path, dst_path) - await cql.run_async("DROP TABLE test.dummy;") + logger.info("Generate an sstable and move it to upload directory of test table") + # create an sstable using a dummy table + await cql.run_async(f"CREATE TABLE {ks}.dummy (pk int PRIMARY KEY, c int);") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.dummy (pk, c) VALUES ({k}, {k%3});") for k in range(64, 128)]) + await manager.api.keyspace_flush(servers[0].ip_addr, ks, "dummy") + node_workdir = await manager.server_get_workdir(servers[0].server_id) + dummy_table_dir = glob.glob(os.path.join(node_workdir, "data", ks, "dummy-*"))[0] + test_table_upload_dir = glob.glob(os.path.join(node_workdir, "data", ks, "test-*", "upload"))[0] + for src_path in glob.glob(os.path.join(dummy_table_dir, "me-*")): + dst_path = os.path.join(test_table_upload_dir, os.path.basename(src_path)) + os.rename(src_path, dst_path) + await cql.run_async(f"DROP TABLE {ks}.dummy;") - logger.info("Starting Node 2") - servers.append(await manager.server_add(cmdline=cmdline, config=cfg)) - s1_host_id = await manager.get_host_id(servers[1].server_id) + logger.info("Starting Node 2") + servers.append(await manager.server_add(cmdline=cmdline, config=cfg)) + s1_host_id = await manager.get_host_id(servers[1].server_id) - logger.info("Inject error to prevent view generator from processing staged sstables") - injection_name =
"view_update_generator_consume_staging_sstable" - await manager.api.enable_injection(servers[0].ip_addr, injection_name, one_shot=True) + logger.info("Inject error to prevent view generator from processing staged sstables") + injection_name = "view_update_generator_consume_staging_sstable" + await manager.api.enable_injection(servers[0].ip_addr, injection_name, one_shot=True) - logger.info("Load the sstables from upload directory") - await manager.api.load_new_sstables(servers[0].ip_addr, "test", "test") + logger.info("Load the sstables from upload directory") + await manager.api.load_new_sstables(servers[0].ip_addr, ks, "test") - # The table now has both staged and unstaged sstables. - # Verify that tablet migration handles them both without causing any base-view inconsistencies. - logger.info("Migrate the tablet to node 2") - tablet_token = 0 # Doesn't matter since there is one tablet - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, 0, tablet_token) - logger.info("Migration done") + # The table now has both staged and unstaged sstables. + # Verify that tablet migration handles them both without causing any base-view inconsistencies. 
+ logger.info("Migrate the tablet to node 2") + tablet_token = 0 # Doesn't matter since there is one tablet + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, 0, tablet_token) + logger.info("Migration done") - expected_num_of_rows = 128 - # Verify the table has expected number of rows - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == expected_num_of_rows - # Verify that the view has the expected number of rows - rows = await cql.run_async("SELECT c from test.mv1") - assert len(list(rows)) == expected_num_of_rows + expected_num_of_rows = 128 + # Verify the table has expected number of rows + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == expected_num_of_rows + # Verify that the view has the expected number of rows + rows = await cql.run_async(f"SELECT c from {ks}.mv1") + assert len(list(rows)) == expected_num_of_rows @pytest.mark.asyncio async def test_orphaned_sstables_on_startup(manager: ManagerClient): @@ -598,24 +598,24 @@ async def test_orphaned_sstables_on_startup(manager: ManagerClient): logger.info("Create the test table, populate few rows and flush to disk") cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k%3});") for k in range(256)]) - await manager.api.keyspace_flush(servers[0].ip_addr, "test", "test") + ks = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + await 
asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k%3});") for k in range(256)]) + await manager.api.keyspace_flush(servers[0].ip_addr, ks, "test") node0_workdir = await manager.server_get_workdir(servers[0].server_id) - node0_table_dir = glob.glob(os.path.join(node0_workdir, "data", "test", "test-*"))[0] + node0_table_dir = glob.glob(os.path.join(node0_workdir, "data", ks, "test-*"))[0] logger.info("Start Node 2") servers.append(await manager.server_add(cmdline=cmdline, config=cfg)) await manager.api.disable_tablet_balancing(servers[1].ip_addr) node1_workdir = await manager.server_get_workdir(servers[1].server_id) - node1_table_dir = glob.glob(os.path.join(node1_workdir, "data", "test", "test-*"))[0] + node1_table_dir = glob.glob(os.path.join(node1_workdir, "data", ks, "test-*"))[0] s1_host_id = await manager.get_host_id(servers[1].server_id) logger.info("Migrate the tablet from node1 to node2") tablet_token = 0 # Doesn't matter since there is one tablet - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, 0, tablet_token) + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, 0, tablet_token) logger.info("Migration done") logger.info("Stop node1 and copy the sstables from node2") @@ -626,7 +626,7 @@ async def test_orphaned_sstables_on_startup(manager: ManagerClient): # try starting the server again logger.info("Start node1 with the orphaned sstables and expect it to fail") - # Error thrown is of format : "Unable to load SSTable {sstable_name} : Storage wasn't found for tablet {tablet_id} of table test.test" + # Error thrown is of format : "Unable to load SSTable {sstable_name} : Storage wasn't found for tablet {tablet_id} of table {ks}.test" await 
manager.server_start(servers[0].server_id, expected_error="Storage wasn't found for tablet") @pytest.mark.asyncio @@ -649,25 +649,25 @@ async def test_remove_failure_with_no_normal_token_owners_in_dc(manager: Manager servers['dc3'] = [await manager.server_add(config={'join_ring': False}, property_file={'dc': 'dc3', 'rack': 'rack3'})] cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE test WITH replication = {{ 'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 1 }} AND tablets = {{ 'initial': 1 }}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = { 'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 1 } AND tablets = { 'initial': 1 }") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - node_to_remove = servers['dc1'][0] - node_to_replace = servers['dc1'][1] - replaced_host_id = await manager.get_host_id(node_to_replace.server_id) - initiator_node = servers['dc2'][0] + node_to_remove = servers['dc1'][0] + node_to_replace = servers['dc1'][1] + replaced_host_id = await manager.get_host_id(node_to_replace.server_id) + initiator_node = servers['dc2'][0] - # Stop both token owners in dc1 to leave no token owners in the datacenter - await manager.server_stop_gracefully(node_to_remove.server_id) - await manager.server_stop_gracefully(node_to_replace.server_id) + # Stop both token owners in dc1 to leave no token owners in the datacenter + await manager.server_stop_gracefully(node_to_remove.server_id) + await manager.server_stop_gracefully(node_to_replace.server_id) - logger.info("Attempting removenode - expected to fail") - await manager.remove_node(initiator_node.server_id, server_id=node_to_remove.server_id, ignore_dead=[replaced_host_id], - expected_error="Removenode failed. 
See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (There are nodes with tablets to drain") + logger.info("Attempting removenode - expected to fail") + await manager.remove_node(initiator_node.server_id, server_id=node_to_remove.server_id, ignore_dead=[replaced_host_id], + expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (There are nodes with tablets to drain") - logger.info(f"Replacing {node_to_replace} with a new node") - replace_cfg = ReplaceConfig(replaced_id=node_to_remove.server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True) - await manager.server_add(replace_cfg=replace_cfg, property_file={'dc': 'dc1', 'rack': f'rack1'}) + logger.info(f"Replacing {node_to_replace} with a new node") + replace_cfg = ReplaceConfig(replaced_id=node_to_remove.server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True) + await manager.server_add(replace_cfg=replace_cfg, property_file={'dc': 'dc1', 'rack': 'rack1'}) @pytest.mark.asyncio @pytest.mark.parametrize("with_zero_token_node", [False, True]) @@ -686,21 +686,21 @@ async def test_remove_failure_then_replace(manager: ManagerClient, with_zero_tok servers['dc3'] = [await manager.server_add(config={'join_ring': False}, property_file={'dc': 'dc3', 'rack': 'rack3'})] cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE test WITH replication = {{ 'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 1 }} AND tablets = {{ 'initial': 1 }}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = { 'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 1 } AND tablets = { 'initial': 1 }") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - node_to_remove = servers['dc1'][0] - initiator_node = servers['dc2'][0] + node_to_remove = servers['dc1'][0] + initiator_node = servers['dc2'][0] - 
await manager.server_stop_gracefully(node_to_remove.server_id) + await manager.server_stop_gracefully(node_to_remove.server_id) - logger.info("Attempting removenode - expected to fail") - await manager.remove_node(initiator_node.server_id, server_id=node_to_remove.server_id, - expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (Unable to find new replica for tablet") + logger.info("Attempting removenode - expected to fail") + await manager.remove_node(initiator_node.server_id, server_id=node_to_remove.server_id, + expected_error="Removenode failed. See earlier errors (Rolled back: Failed to drain tablets: std::runtime_error (Unable to find new replica for tablet") - logger.info(f"Replacing {node_to_remove} with a new node") - replace_cfg = ReplaceConfig(replaced_id=node_to_remove.server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True) - await manager.server_add(replace_cfg=replace_cfg, property_file={'dc': 'dc1', 'rack': f'rack1'}) + logger.info(f"Replacing {node_to_remove} with a new node") + replace_cfg = ReplaceConfig(replaced_id=node_to_remove.server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True) + await manager.server_add(replace_cfg=replace_cfg, property_file={'dc': 'dc1', 'rack': 'rack1'}) @pytest.mark.asyncio @pytest.mark.parametrize("with_zero_token_node", [False, True]) @@ -722,38 +722,41 @@ async def test_replace_with_no_normal_token_owners_in_dc(manager: ManagerClient, servers['dc3'] = [await manager.server_add(config={'join_ring': False}, property_file={'dc': 'dc3', 'rack': 'rack3'})] cql = manager.get_cql() - await cql.run_async(f"CREATE KEYSPACE test WITH replication = {{ 'class': 'NetworkTopologyStrategy', 'dc1': 2, 'dc2': 1 }} AND tablets = {{ 'initial': 1 }}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = { 'class': 'NetworkTopologyStrategy', 'dc1': 2, 
'dc2': 1 } AND tablets = { 'initial': 1 }") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - stmt = cql.prepare("INSERT INTO test.test (pk, c) VALUES (?, ?)") - stmt.consistency_level = ConsistencyLevel.ALL - keys = range(256) - await asyncio.gather(*[cql.run_async(stmt, [k, k]) for k in keys]) + stmt = cql.prepare(f"INSERT INTO {ks}.test (pk, c) VALUES (?, ?)") + stmt.consistency_level = ConsistencyLevel.ALL + keys = range(256) + await asyncio.gather(*[cql.run_async(stmt, [k, k]) for k in keys]) - nodes_to_replace = servers['dc1'][0:2] - replaced_host_id = await manager.get_host_id(nodes_to_replace[1].server_id) + nodes_to_replace = servers['dc1'][0:2] + replaced_host_id = await manager.get_host_id(nodes_to_replace[1].server_id) - # Stop both token owners in dc1 to leave no token owners in the datacenter - for node in nodes_to_replace: - await manager.server_stop_gracefully(node.server_id) + # Stop both token owners in dc1 to leave no token owners in the datacenter + for node in nodes_to_replace: + await manager.server_stop_gracefully(node.server_id) - logger.info(f"Replacing {nodes_to_replace[0]} with a new node") - replace_cfg = ReplaceConfig(replaced_id=nodes_to_replace[0].server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True, - ignore_dead_nodes=[replaced_host_id]) - await manager.server_add(replace_cfg=replace_cfg, property_file={'dc': 'dc1', 'rack': f'rack1'}) + logger.info(f"Replacing {nodes_to_replace[0]} with a new node") + replace_cfg = ReplaceConfig(replaced_id=nodes_to_replace[0].server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True, + ignore_dead_nodes=[replaced_host_id]) + await manager.server_add(replace_cfg=replace_cfg, property_file={'dc': 'dc1', 'rack': 'rack1'}) - logger.info(f"Replacing {nodes_to_replace[1]} with a new node") - replace_cfg = ReplaceConfig(replaced_id=nodes_to_replace[1].server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True) - 
await manager.server_add(replace_cfg=replace_cfg, property_file={'dc': 'dc1', 'rack': f'rack1'}) + logger.info(f"Replacing {nodes_to_replace[1]} with a new node") + replace_cfg = ReplaceConfig(replaced_id=nodes_to_replace[1].server_id, reuse_ip_addr = False, use_host_id=True, wait_replaced_dead=True) + await manager.server_add(replace_cfg=replace_cfg, property_file={'dc': 'dc1', 'rack': 'rack1'}) - logger.info("Verifying data") - for node in servers['dc2']: - await manager.server_stop_gracefully(node.server_id) - query = SimpleStatement("SELECT * FROM test.test;", consistency_level=ConsistencyLevel.ONE) - rows = await cql.run_async(query) - assert len(rows) == len(keys) - for r in rows: - assert r.c == r.pk + logger.info("Verifying data") + for node in servers['dc2']: + await manager.server_stop_gracefully(node.server_id) + query = SimpleStatement(f"SELECT * FROM {ks}.test;", consistency_level=ConsistencyLevel.ONE) + rows = await cql.run_async(query) + assert len(rows) == len(keys) + for r in rows: + assert r.c == r.pk + + # For dropping the keyspace + await asyncio.gather(*[manager.server_start(node.server_id) for node in servers['dc2']]) @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -777,14 +780,14 @@ async def test_drop_keyspace_while_split(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) # create a table so that it has at least 2 tablets (and storage groups) per shard - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 4};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + ks = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 4};") + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - await 
manager.api.disable_autocompaction(servers[0].ip_addr, 'test') + await manager.api.disable_autocompaction(servers[0].ip_addr, ks) keys = range(2048) - await asyncio.gather(*[cql.run_async(f'INSERT INTO test.test (pk, c) VALUES ({k}, {k});') for k in keys]) - await manager.api.flush_keyspace(servers[0].ip_addr, 'test') + await asyncio.gather(*[cql.run_async(f'INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});') for k in keys]) + await manager.api.flush_keyspace(servers[0].ip_addr, ks) await manager.api.enable_injection(servers[0].ip_addr, 'truncate_compaction_disabled_wait', one_shot=False) await manager.api.enable_injection(servers[0].ip_addr, 'split_storage_groups_wait', one_shot=False) @@ -796,7 +799,7 @@ async def test_drop_keyspace_while_split(manager: ManagerClient): await s0_log.wait_for('split_storage_groups_wait: wait') # start a DROP and wait for it to disable compaction - drop_ks_task = cql.run_async('DROP KEYSPACE test;') + drop_ks_task = cql.run_async(f'DROP KEYSPACE {ks};') await s0_log.wait_for('truncate_compaction_disabled_wait: wait') # release split From 0b88ea97985d72cd80ab337f02b4ca697b4022fb Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 19:23:37 +0200 Subject: [PATCH 41/56] topology_custom/test_tablets2: test_schema_change_during_cleanup: drop unused check function Signed-off-by: Benny Halevy --- test/topology_custom/test_tablets2.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/test/topology_custom/test_tablets2.py b/test/topology_custom/test_tablets2.py index 178f201d1c..2d7338b9c0 100644 --- a/test/topology_custom/test_tablets2.py +++ b/test/topology_custom/test_tablets2.py @@ -1508,14 +1508,6 @@ async def test_schema_change_during_cleanup(manager: ManagerClient): keys = range(256) await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) - async def check(): - logger.info("Checking table") - rows = await cql.run_async("SELECT * FROM test.test;") - assert rows == 
expected_rows - assert len(rows) == len(keys) - for r in rows: - assert r.c == r.pk - s1_log = await manager.server_open_log(servers[0].server_id) s1_mark = await s1_log.mark() From 6b37d04aa903e492d4353efc6cc96cd618a442ec Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 19:23:37 +0200 Subject: [PATCH 42/56] topology_custom/test_tablets2: use *new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_tablets2.py | 1494 ++++++++++++------------- 1 file changed, 741 insertions(+), 753 deletions(-) diff --git a/test/topology_custom/test_tablets2.py b/test/topology_custom/test_tablets2.py index 2d7338b9c0..6f3b54eade 100644 --- a/test/topology_custom/test_tablets2.py +++ b/test/topology_custom/test_tablets2.py @@ -11,7 +11,7 @@ from test.pylib.rest_client import inject_error_one_shot, HTTPError, read_barrie from test.pylib.util import wait_for_cql_and_get_hosts, unique_name from test.pylib.tablets import get_tablet_replica, get_all_tablet_replicas from test.topology.conftest import skip_mode -from test.topology.util import reconnect_driver +from test.topology.util import reconnect_driver, create_new_test_keyspace, new_test_keyspace import pytest import asyncio @@ -41,7 +41,7 @@ async def disable_injection_on(manager, error_name, servers): errs = [manager.api.disable_injection(s.ip_addr, error_name) for s in servers] await asyncio.gather(*errs) -async def repair_on_node(manager: ManagerClient, server: ServerInfo, servers: list[ServerInfo], ranges: str = ''): +async def repair_on_node(manager: ManagerClient, server: ServerInfo, servers: list[ServerInfo], keyspace, table = "test", ranges: str = ''): node = server.ip_addr await manager.servers_see_each_other(servers) live_nodes_wanted = [s.ip_addr for s in servers] @@ -50,7 +50,7 @@ async def repair_on_node(manager: ManagerClient, server: ServerInfo, servers: li live_nodes.sort() assert live_nodes == live_nodes_wanted logger.info(f"Repair table on node {node} live_nodes={live_nodes} 
live_nodes_wanted={live_nodes_wanted}") - await manager.api.repair(node, "test", "test", ranges) + await manager.api.repair(node, keyspace, table, ranges) async def load_repair_history(cql, hosts): all_rows = [] @@ -102,59 +102,54 @@ async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(m # s0 should miss schema and tablet changes cql = await safe_server_stop_gracefully(manager, s0, reconnect=True) - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 100};") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 3} AND tablets = {'initial': 100}") as ks: + # force s0 to catch up later from the snapshot and not the raft log + await inject_error_one_shot_on(manager, 'raft_server_force_snapshot', not_s0) + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - # force s0 to catch up later from the snapshot and not the raft log - await inject_error_one_shot_on(manager, 'raft_server_force_snapshot', not_s0) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + keys = range(10) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, 1);") for k in keys]) - keys = range(10) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, 1);") for k in keys]) - - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(list(rows)) == len(keys) - for r in rows: - assert r.c == 1 - - manager.driver_close() - await manager.server_start(s0, wait_others=2) - await manager.driver_connect(server=servers[0]) - cql = manager.get_cql() - await wait_for_cql_and_get_hosts(cql, [servers[0]], time.time() + 60) - - # Trigger a schema change to invoke schema agreement waiting to make sure that s0 has the latest schema - await cql.run_async("CREATE KEYSPACE test_dummy WITH replication = {'class': 
'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, 2);", execution_profile='whitelist') - for k in keys]) - - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == len(keys) - for r in rows: - assert r.c == 2 - - conn_logger = logging.getLogger("conn_messages") - conn_logger.setLevel(logging.DEBUG) - try: - # Check that after rolling restart the tablet metadata is still there - await manager.rolling_restart(servers) - - cql = await reconnect_driver(manager) + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(list(rows)) == len(keys) + for r in rows: + assert r.c == 1 + manager.driver_close() + await manager.server_start(s0, wait_others=2) + await manager.driver_connect(server=servers[0]) + cql = manager.get_cql() await wait_for_cql_and_get_hosts(cql, [servers[0]], time.time() + 60) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, 3);", execution_profile='whitelist') - for k in keys]) + # Trigger a schema change to invoke schema agreement waiting to make sure that s0 has the latest schema + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as test_dummy: + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, 2);", execution_profile='whitelist') + for k in keys]) - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == len(keys) - for r in rows: - assert r.c == 3 - finally: - conn_logger.setLevel(logging.INFO) + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(rows) == len(keys) + for r in rows: + assert r.c == 2 - await cql.run_async("DROP KEYSPACE test;") - await cql.run_async("DROP KEYSPACE test_dummy;") + conn_logger = logging.getLogger("conn_messages") + conn_logger.setLevel(logging.DEBUG) + try: + # 
Check that after rolling restart the tablet metadata is still there + await manager.rolling_restart(servers) + + cql = await reconnect_driver(manager) + + await wait_for_cql_and_get_hosts(cql, [servers[0]], time.time() + 60) + + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, 3);", execution_profile='whitelist') + for k in keys]) + + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(rows) == len(keys) + for r in rows: + assert r.c == 3 + finally: + conn_logger.setLevel(logging.INFO) @pytest.mark.asyncio @@ -163,21 +158,19 @@ async def test_scans(manager: ManagerClient): servers = await manager.servers_add(3) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 8};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 8}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - keys = range(100) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + keys = range(100) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - rows = await cql.run_async("SELECT count(*) FROM test.test;") - assert rows[0].count == len(keys) + rows = await cql.run_async(f"SELECT count(*) FROM {ks}.test;") + assert rows[0].count == len(keys) - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == len(keys) - for r in rows: - assert r.c == r.pk - - await cql.run_async("DROP KEYSPACE test;") + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(rows) == len(keys) + for r in rows: + assert r.c == r.pk @pytest.mark.asyncio @@ -206,42 +199,40 @@ async def 
test_topology_changes(manager: ManagerClient): servers = await manager.servers_add(3) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 32};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 32}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - logger.info("Populating table") + logger.info("Populating table") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) - expected_rows = await cql.run_async("SELECT * FROM test.test;") + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) + expected_rows = await cql.run_async(f"SELECT * FROM {ks}.test;") - async def check(): - logger.info("Checking table") - rows = await cql.run_async("SELECT * FROM test.test;") - assert rows == expected_rows - assert len(rows) == len(keys) - for r in rows: - assert r.c == r.pk + async def check(): + logger.info("Checking table") + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert rows == expected_rows + assert len(rows) == len(keys) + for r in rows: + assert r.c == r.pk - await inject_error_on(manager, "tablet_allocator_shuffle", servers) + await inject_error_on(manager, "tablet_allocator_shuffle", servers) - logger.info("Adding new server") - await manager.server_add() + logger.info("Adding new server") + await manager.server_add() - await check() + await check() - logger.info("Adding new server") - await manager.server_add() + logger.info("Adding new server") + await manager.server_add() - await check() - time.sleep(5) # Give load balancer some time to do work - await check() + await check() + 
time.sleep(5) # Give load balancer some time to do work + await check() - await manager.decommission_node(servers[0].server_id) + await manager.decommission_node(servers[0].server_id) - await check() - - await cql.run_async("DROP KEYSPACE test;") + await check() async def get_two_servers_to_move_tablet(manager: ManagerClient): """ @@ -254,18 +245,18 @@ async def get_two_servers_to_move_tablet(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + ks = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") servers.append(await manager.server_add(cmdline=cmdline)) key = 7 # Whatever tablet_token = 0 # Doesn't matter since there is one tablet - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({key}, 0)") - rows = await cql.run_async("SELECT pk from test.test") + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({key}, 0)") + rows = await cql.run_async(f"SELECT pk from {ks}.test") assert len(list(rows)) == 1 - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) logger.info(f'{replica=}') s0_host_id = await manager.get_host_id(servers[0].server_id) @@ -278,12 +269,12 @@ async def get_two_servers_to_move_tablet(manager: ManagerClient): dst_shard = 0 - return (servers, cql, s0_host_id, s1_host_id, replica, tablet_token, dst_shard) + return (servers, cql, s0_host_id, s1_host_id, replica, tablet_token, dst_shard, ks) @pytest.mark.asyncio @skip_mode('release', 'error injections are 
not supported in release mode') async def test_streaming_rx_error_no_failed_message_with_fail_stream_plan(manager: ManagerClient): - servers, cql, s0_host_id, s1_host_id, replica, tablet_token, dst_shard = await get_two_servers_to_move_tablet(manager) + servers, cql, s0_host_id, s1_host_id, replica, tablet_token, dst_shard, ks = await get_two_servers_to_move_tablet(manager) await manager.api.enable_injection(servers[0].ip_addr, "stream_session_ignore_failed_message", one_shot=True) await manager.api.enable_injection(servers[1].ip_addr, "stream_session_ignore_failed_message", one_shot=True) @@ -293,7 +284,7 @@ async def test_streaming_rx_error_no_failed_message_with_fail_stream_plan(manage s1_mark = await s1_log.mark() migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token, timeout=30)) + manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token, timeout=30)) await s1_log.wait_for('stream_manager: Failed stream_session for stream_plan', from_mark=s1_mark) s1_mark = await s1_log.mark() @@ -305,26 +296,26 @@ async def test_streaming_rx_error_no_failed_message_with_fail_stream_plan(manage logger.info("Migration done") # Sanity test - rows = await cql.run_async("SELECT pk from test.test") + rows = await cql.run_async(f"SELECT pk from {ks}.test") assert len(list(rows)) == 1 - await cql.run_async("TRUNCATE test.test") - rows = await cql.run_async("SELECT pk from test.test") + await cql.run_async(f"TRUNCATE {ks}.test") + rows = await cql.run_async(f"SELECT pk from {ks}.test") assert len(list(rows)) == 0 # Verify that there is no data resurrection - rows = await cql.run_async("SELECT pk from test.test") + rows = await cql.run_async(f"SELECT pk from {ks}.test") assert len(list(rows)) == 0 # Verify that moving the tablet back works - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", s1_host_id, 
dst_shard, replica[0], replica[1], tablet_token) - rows = await cql.run_async("SELECT pk from test.test") + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", s1_host_id, dst_shard, replica[0], replica[1], tablet_token) + rows = await cql.run_async(f"SELECT pk from {ks}.test") assert len(list(rows)) == 0 @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def test_streaming_rx_error_no_failed_message_no_fail_stream_plan_hang(manager: ManagerClient): - servers, cql, s0_host_id, s1_host_id, replica, tablet_token, dst_shard = await get_two_servers_to_move_tablet(manager) + servers, cql, s0_host_id, s1_host_id, replica, tablet_token, dst_shard, ks = await get_two_servers_to_move_tablet(manager) await manager.api.enable_injection(servers[0].ip_addr, "stream_session_ignore_failed_message", one_shot=True) await manager.api.enable_injection(servers[1].ip_addr, "stream_session_ignore_failed_message", one_shot=True) @@ -337,7 +328,7 @@ async def test_streaming_rx_error_no_failed_message_no_fail_stream_plan_hang(man s1_mark = await s1_log.mark() migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token, timeout=10)) + manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token, timeout=10)) try: logger.info("Waiting for migration to finish") @@ -360,65 +351,65 @@ async def test_streaming_is_guarded_by_topology_guard(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 
'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - servers.append(await manager.server_add(cmdline=cmdline)) + servers.append(await manager.server_add(cmdline=cmdline)) - key = 7 # Whatever - tablet_token = 0 # Doesn't matter since there is one tablet - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({key}, 0)") - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == 1 + key = 7 # Whatever + tablet_token = 0 # Doesn't matter since there is one tablet + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({key}, 0)") + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == 1 - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) - s0_host_id = await manager.get_host_id(servers[0].server_id) - s1_host_id = await manager.get_host_id(servers[1].server_id) - dst_shard = 0 + s0_host_id = await manager.get_host_id(servers[0].server_id) + s1_host_id = await manager.get_host_id(servers[1].server_id) + dst_shard = 0 - await manager.api.enable_injection(servers[1].ip_addr, "stream_mutation_fragments", one_shot=True) - s1_log = await manager.server_open_log(servers[1].server_id) - s1_mark = await s1_log.mark() + await manager.api.enable_injection(servers[1].ip_addr, "stream_mutation_fragments", one_shot=True) + s1_log = await manager.server_open_log(servers[1].server_id) + s1_mark = await s1_log.mark() - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) - # Wait for the replica-side writer of streaming to reach a place where it 
already - # received writes from the leaving replica but haven't applied them yet. - # Once the writer reaches this place, it will wait for the message_injection() call below before proceeding. - # The place we block the writer in should not hold to erm or topology_guard because that will block the migration - # below and prevent test from proceeding. - await s1_log.wait_for('stream_mutation_fragments: waiting', from_mark=s1_mark) - s1_mark = await s1_log.mark() + # Wait for the replica-side writer of streaming to reach a place where it already + # received writes from the leaving replica but haven't applied them yet. + # Once the writer reaches this place, it will wait for the message_injection() call below before proceeding. + # The place we block the writer in should not hold to erm or topology_guard because that will block the migration + # below and prevent test from proceeding. + await s1_log.wait_for('stream_mutation_fragments: waiting', from_mark=s1_mark) + s1_mark = await s1_log.mark() - # Should cause streaming to fail and be retried while leaving behind the replica-side writer. - await manager.api.inject_disconnect(servers[0].ip_addr, servers[1].ip_addr) + # Should cause streaming to fail and be retried while leaving behind the replica-side writer. 
+ await manager.api.inject_disconnect(servers[0].ip_addr, servers[1].ip_addr) - logger.info("Waiting for migration to finish") - await migration_task - logger.info("Migration done") + logger.info("Waiting for migration to finish") + await migration_task + logger.info("Migration done") - # Sanity test - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == 1 + # Sanity test + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == 1 - await cql.run_async("TRUNCATE test.test") - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == 0 + await cql.run_async(f"TRUNCATE {ks}.test") + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == 0 - # Release abandoned streaming - await manager.api.message_injection(servers[1].ip_addr, "stream_mutation_fragments") - await s1_log.wait_for('stream_mutation_fragments: done', from_mark=s1_mark) + # Release abandoned streaming + await manager.api.message_injection(servers[1].ip_addr, "stream_mutation_fragments") + await s1_log.wait_for('stream_mutation_fragments: done', from_mark=s1_mark) - # Verify that there is no data resurrection - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == 0 + # Verify that there is no data resurrection + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == 0 - # Verify that moving the tablet back works - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", s1_host_id, dst_shard, replica[0], replica[1], tablet_token) - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == 0 + # Verify that moving the tablet back works + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", s1_host_id, dst_shard, replica[0], replica[1], tablet_token) + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == 0 @pytest.mark.asyncio @@ -435,66 +426,66 @@ async def 
test_table_dropped_during_streaming(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - await cql.run_async("CREATE TABLE test.test2 (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + await cql.run_async(f"CREATE TABLE {ks}.test2 (pk int PRIMARY KEY, c int);") - servers.append(await manager.server_add()) + servers.append(await manager.server_add()) - logger.info("Populating tables") - key = 7 # Whatever - value = 3 # Whatever - tablet_token = 0 # Doesn't matter since there is one tablet - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({key}, {value})") - await cql.run_async(f"INSERT INTO test.test2 (pk, c) VALUES ({key}, {value})") - rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == 1 - rows = await cql.run_async("SELECT pk from test.test2") - assert len(list(rows)) == 1 + logger.info("Populating tables") + key = 7 # Whatever + value = 3 # Whatever + tablet_token = 0 # Doesn't matter since there is one tablet + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({key}, {value})") + await cql.run_async(f"INSERT INTO {ks}.test2 (pk, c) VALUES ({key}, {value})") + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == 1 + rows = await cql.run_async(f"SELECT pk from {ks}.test2") + assert len(list(rows)) == 1 - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) - 
await manager.api.enable_injection(servers[1].ip_addr, "stream_mutation_fragments", one_shot=True) - s1_log = await manager.server_open_log(servers[1].server_id) - s1_mark = await s1_log.mark() + await manager.api.enable_injection(servers[1].ip_addr, "stream_mutation_fragments", one_shot=True) + s1_log = await manager.server_open_log(servers[1].server_id) + s1_mark = await s1_log.mark() - logger.info("Starting tablet migration") - s1_host_id = await manager.get_host_id(servers[1].server_id) - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, 0, tablet_token)) + logger.info("Starting tablet migration") + s1_host_id = await manager.get_host_id(servers[1].server_id) + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, 0, tablet_token)) - # Wait for the replica-side writer of streaming to reach a place where it already - # received writes from the leaving replica but haven't applied them yet. - # Once the writer reaches this place, it will wait for the message_injection() call below before proceeding. - # We want to drop the table while streaming is deep in the process, where it will attempt to apply writes - # to the dropped table. - await s1_log.wait_for('stream_mutation_fragments: waiting', from_mark=s1_mark) + # Wait for the replica-side writer of streaming to reach a place where it already + # received writes from the leaving replica but haven't applied them yet. + # Once the writer reaches this place, it will wait for the message_injection() call below before proceeding. + # We want to drop the table while streaming is deep in the process, where it will attempt to apply writes + # to the dropped table. + await s1_log.wait_for('stream_mutation_fragments: waiting', from_mark=s1_mark) - # Streaming blocks table drop, so we can't wait here. 
- drop_task = cql.run_async("DROP TABLE test.test") + # Streaming blocks table drop, so we can't wait here. + drop_task = cql.run_async(f"DROP TABLE {ks}.test") - # Release streaming as late as possible to increase probability of drop causing problems. - await s1_log.wait_for('Dropping', from_mark=s1_mark) + # Release streaming as late as possible to increase probability of drop causing problems. + await s1_log.wait_for('Dropping', from_mark=s1_mark) - # Unblock streaming - await manager.api.message_injection(servers[1].ip_addr, "stream_mutation_fragments") - await drop_task + # Unblock streaming + await manager.api.message_injection(servers[1].ip_addr, "stream_mutation_fragments") + await drop_task - logger.info("Waiting for migration to finish") - try: - await migration_task - except HTTPError as e: - assert 'Tablet map not found' in e.message + logger.info("Waiting for migration to finish") + try: + await migration_task + except HTTPError as e: + assert 'Tablet map not found' in e.message - logger.info("Verifying that moving the other tablet works") - replica = await get_tablet_replica(manager, servers[0], 'test', 'test2', tablet_token) - s0_host_id = await manager.get_host_id(servers[0].server_id) - assert replica[0] == s0_host_id - await manager.api.move_tablet(servers[0].ip_addr, "test", "test2", replica[0], replica[1], s1_host_id, 0, tablet_token) + logger.info("Verifying that moving the other tablet works") + replica = await get_tablet_replica(manager, servers[0], ks, 'test2', tablet_token) + s0_host_id = await manager.get_host_id(servers[0].server_id) + assert replica[0] == s0_host_id + await manager.api.move_tablet(servers[0].ip_addr, ks, "test2", replica[0], replica[1], s1_host_id, 0, tablet_token) - logger.info("Verifying tablet replica") - replica = await get_tablet_replica(manager, servers[0], 'test', 'test2', tablet_token) - assert replica == (s1_host_id, 0) + logger.info("Verifying tablet replica") + replica = await get_tablet_replica(manager, 
servers[0], ks, 'test2', tablet_token) + assert replica == (s1_host_id, 0) @pytest.mark.repair @pytest.mark.asyncio @@ -509,49 +500,49 @@ async def test_tablet_repair(manager: ManagerClient): await inject_error_on(manager, "tablet_allocator_shuffle", servers) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', " - "'replication_factor': 2} AND tablets = {'initial': 32};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', " + "'replication_factor': 2} AND tablets = {'initial': 32}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - logger.info("Populating table") + logger.info("Populating table") - keys = range(256) + keys = range(256) - stmt = cql.prepare("INSERT INTO test.test (pk, c) VALUES (?, ?)") - stmt.consistency_level = ConsistencyLevel.ONE + stmt = cql.prepare(f"INSERT INTO {ks}.test (pk, c) VALUES (?, ?)") + stmt.consistency_level = ConsistencyLevel.ONE - # Repair runs concurrently with tablet shuffling which exercises issues with serialization - # of repair and tablet migration. - # - # We do it 30 times because it's been experimentally shown to be enough to trigger the issue with high probability. - # Lack of proper synchronization would manifest as repair failure with the following cause: - # - # failed_because=std::runtime_error (multishard_writer: No shards for token 7505809055260144771 of test.test) - # - # ...which indicates that repair tried to stream data to a node which is no longer a tablet replica. - repair_cycles = 30 - for i in range(repair_cycles): - # Write concurrently with repair to increase the chance of repair having some discrepancy to resolve and send writes. 
- inserts_future = asyncio.gather(*[cql.run_async(stmt, [k, i]) for k in keys]) + # Repair runs concurrently with tablet shuffling which exercises issues with serialization + # of repair and tablet migration. + # + # We do it 30 times because it's been experimentally shown to be enough to trigger the issue with high probability. + # Lack of proper synchronization would manifest as repair failure with the following cause: + # + # failed_because=std::runtime_error (multishard_writer: No shards for token 7505809055260144771 of test.test) + # + # ...which indicates that repair tried to stream data to a node which is no longer a tablet replica. + repair_cycles = 30 + for i in range(repair_cycles): + # Write concurrently with repair to increase the chance of repair having some discrepancy to resolve and send writes. + inserts_future = asyncio.gather(*[cql.run_async(stmt, [k, i]) for k in keys]) - # Disable in the background so that repair is started with migrations in progress. - # We need to disable balancing so that repair which blocks on migrations eventually gets unblocked. - # Otherwise, shuffling would keep the topology busy forever. - disable_balancing_future = asyncio.create_task(manager.api.disable_tablet_balancing(servers[0].ip_addr)) + # Disable in the background so that repair is started with migrations in progress. + # We need to disable balancing so that repair which blocks on migrations eventually gets unblocked. + # Otherwise, shuffling would keep the topology busy forever. 
+ disable_balancing_future = asyncio.create_task(manager.api.disable_tablet_balancing(servers[0].ip_addr)) - await repair_on_node(manager, servers[0], servers) + await repair_on_node(manager, servers[0], servers, ks) - await inserts_future - await disable_balancing_future - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + await inserts_future + await disable_balancing_future + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - key_count = len(keys) - stmt = cql.prepare("SELECT * FROM test.test;") - stmt.consistency_level = ConsistencyLevel.ALL - rows = await cql.run_async(stmt) - assert len(rows) == key_count - for r in rows: - assert r.c == repair_cycles - 1 + key_count = len(keys) + stmt = cql.prepare(f"SELECT * FROM {ks}.test;") + stmt.consistency_level = ConsistencyLevel.ALL + rows = await cql.run_async(stmt) + assert len(rows) == key_count + for r in rows: + assert r.c == repair_cycles - 1 # Reproducer for race between split and repair: https://github.com/scylladb/scylladb/issues/19378 # Verifies repair will not complete with sstables that still require split, causing split @@ -572,57 +563,57 @@ async def test_concurrent_tablet_repair_and_split(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', " - "'replication_factor': 2} AND tablets = {'initial': 32};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', " + "'replication_factor': 2} AND tablets = {'initial': 32}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - logger.info("Populating table") + logger.info("Populating table") - keys = range(5000) # Enough keys to trigger repair digest mismatch with a high chance. 
- stmt = cql.prepare("INSERT INTO test.test (pk, c) VALUES (?, ?)") - stmt.consistency_level = ConsistencyLevel.ONE + keys = range(5000) # Enough keys to trigger repair digest mismatch with a high chance. + stmt = cql.prepare(f"INSERT INTO {ks}.test (pk, c) VALUES (?, ?)") + stmt.consistency_level = ConsistencyLevel.ONE - await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers) + await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers) - s0_log = await manager.server_open_log(servers[0].server_id) - s0_mark = await s0_log.mark() + s0_log = await manager.server_open_log(servers[0].server_id) + s0_mark = await s0_log.mark() - await asyncio.gather(*[cql.run_async(stmt, [k, -1]) for k in keys]) + await asyncio.gather(*[cql.run_async(stmt, [k, -1]) for k in keys]) - # split decision is sstable size based, so data must be flushed first - for server in servers: - await manager.api.flush_keyspace(server.ip_addr, "test") + # split decision is sstable size based, so data must be flushed first + for server in servers: + await manager.api.flush_keyspace(server.ip_addr, ks) - await manager.api.enable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone", False) - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + await manager.api.enable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone", False) + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - logger.info("Waiting for split prepare...") - await s0_log.wait_for('Setting split ready sequence number to', from_mark=s0_mark) - s0_mark = await s0_log.mark() - logger.info("Waited for split prepare") + logger.info("Waiting for split prepare...") + await s0_log.wait_for('Setting split ready sequence number to', from_mark=s0_mark) + s0_mark = await s0_log.mark() + logger.info("Waited for split prepare") - # Balancer is re-enabled later for split execution - await 
asyncio.create_task(manager.api.disable_tablet_balancing(servers[0].ip_addr)) + # Balancer is re-enabled later for split execution + await asyncio.create_task(manager.api.disable_tablet_balancing(servers[0].ip_addr)) - # Write concurrently with repair to increase the chance of repair having some discrepancy to resolve and send writes. - inserts_future = asyncio.gather(*[cql.run_async(stmt, [k, 1]) for k in keys]) + # Write concurrently with repair to increase the chance of repair having some discrepancy to resolve and send writes. + inserts_future = asyncio.gather(*[cql.run_async(stmt, [k, 1]) for k in keys]) - await repair_on_node(manager, servers[0], servers) + await repair_on_node(manager, servers[0], servers, ks) - await inserts_future + await inserts_future - logger.info("Waiting for split execute...") - await manager.api.disable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone") - await manager.api.enable_tablet_balancing(servers[0].ip_addr) - await s0_log.wait_for('Detected tablet split for table', from_mark=s0_mark) - await inject_error_one_shot_on(manager, "tablet_split_finalization_postpone", servers) - logger.info("Waited for split execute...") + logger.info("Waiting for split execute...") + await manager.api.disable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone") + await manager.api.enable_tablet_balancing(servers[0].ip_addr) + await s0_log.wait_for('Detected tablet split for table', from_mark=s0_mark) + await inject_error_one_shot_on(manager, "tablet_split_finalization_postpone", servers) + logger.info("Waited for split execute...") - key_count = len(keys) - stmt = cql.prepare("SELECT * FROM test.test;") - stmt.consistency_level = ConsistencyLevel.ALL - rows = await cql.run_async(stmt) - assert len(rows) == key_count + key_count = len(keys) + stmt = cql.prepare(f"SELECT * FROM {ks}.test;") + stmt.consistency_level = ConsistencyLevel.ALL + rows = await cql.run_async(stmt) + assert len(rows) == key_count 
@pytest.mark.repair @pytest.mark.asyncio @@ -636,33 +627,33 @@ async def test_tablet_missing_data_repair(manager: ManagerClient): await manager.server_add(cmdline=cmdline)] cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', " - "'replication_factor': 3} AND tablets = {'initial': 32};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', " + "'replication_factor': 3} AND tablets = {'initial': 32}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - keys_list = [range(0, 100), range(100, 200), range(200, 300)] - keys_for_server = dict([(s.server_id, keys_list[idx]) for idx, s in enumerate(servers)]) - keys = range(0, 300) + keys_list = [range(0, 100), range(100, 200), range(200, 300)] + keys_for_server = dict([(s.server_id, keys_list[idx]) for idx, s in enumerate(servers)]) + keys = range(0, 300) - async def insert_with_down(down_server): - logger.info(f"Stopped server {down_server.server_id}") - logger.info(f"Insert into server {down_server.server_id}") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") - for k in keys_for_server[down_server.server_id]]) + async def insert_with_down(down_server): + logger.info(f"Stopped server {down_server.server_id}") + logger.info(f"Insert into server {down_server.server_id}") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") + for k in keys_for_server[down_server.server_id]]) - cql = await safe_rolling_restart(manager, servers, with_down=insert_with_down) + cql = await safe_rolling_restart(manager, servers, with_down=insert_with_down) - await repair_on_node(manager, servers[0], servers) + await repair_on_node(manager, servers[0], servers, ks) - async def check_with_down(down_node): - logger.info("Checking table") - query 
= SimpleStatement("SELECT * FROM test.test;", consistency_level=ConsistencyLevel.ONE) - rows = await cql.run_async(query) - assert len(rows) == len(keys) - for r in rows: - assert r.c == r.pk + async def check_with_down(down_node): + logger.info("Checking table") + query = SimpleStatement(f"SELECT * FROM {ks}.test;", consistency_level=ConsistencyLevel.ONE) + rows = await cql.run_async(query) + assert len(rows) == len(keys) + for r in rows: + assert r.c == r.pk - cql = await safe_rolling_restart(manager, servers, with_down=check_with_down) + cql = await safe_rolling_restart(manager, servers, with_down=check_with_down) @pytest.mark.repair @@ -675,24 +666,21 @@ async def test_tablet_repair_history(manager: ManagerClient): tablets = 8 cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', " - "'replication_factor': {}}} AND tablets = {{'initial': {}}};".format(rf, tablets)) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int) WITH tombstone_gc = {'mode':'repair'};") + async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} AND tablets = {{'initial': {tablets}}}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH tombstone_gc = {{'mode':'repair'}};") - logger.info("Populating table") + logger.info("Populating table") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - logging.info(f'Got hosts={hosts}'); + hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + logging.info(f'Got hosts={hosts}'); - await repair_on_node(manager, servers[0], servers) + await repair_on_node(manager, 
servers[0], servers, ks) - all_rows = await load_repair_history(cql, hosts) - assert len(all_rows) == rf * tablets - - await cql.run_async("DROP KEYSPACE test;") + all_rows = await load_repair_history(cql, hosts) + assert len(all_rows) == rf * tablets @pytest.mark.repair @pytest.mark.asyncio @@ -705,34 +693,31 @@ async def test_tablet_repair_ranges_selection(manager: ManagerClient): nr_ranges = 0; cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', " - "'replication_factor': {}}} AND tablets = {{'initial': {}}};".format(rf, tablets)) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int) WITH tombstone_gc = {'mode':'repair'};") + async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} AND tablets = {{'initial': {tablets}}}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH tombstone_gc = {{'mode':'repair'}};") - logger.info("Populating table") + logger.info("Populating table") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - logging.info(f'Got hosts={hosts}'); + hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + logging.info(f'Got hosts={hosts}'); - await repair_on_node(manager, servers[0], servers, ranges='-4611686018427387905:-1,4611686018427387903:9223372036854775807') - nr_ranges = nr_ranges + 2 - await repair_on_node(manager, servers[0], servers, ranges='-2000:-1000,1000:2000') - nr_ranges = nr_ranges + 2 - await repair_on_node(manager, servers[0], servers, ranges='3000:-3000') - # The wrap around range (3000, -3000] will produce the following intersection range 
- # range=(minimum token,-4611686018427387905] ranges_specified={(3000,+inf), (-inf, -3000]} intersection_ranges=(minimum token,-4611686018427387905] - # range=(-4611686018427387905,-1] ranges_specified={(3000,+inf), (-inf, -3000]} intersection_ranges=(-4611686018427387905,-3000] - # range=(-1,4611686018427387903] ranges_specified={(3000,+inf), (-inf, -3000]} intersection_ranges=(3000,4611686018427387903] - # range=(4611686018427387903,9223372036854775807] ranges_specified={(3000,+inf), (-inf, -3000]} intersection_ranges=(4611686018427387903,9223372036854775807] - nr_ranges = nr_ranges + 4 + await repair_on_node(manager, servers[0], servers, ks, ranges='-4611686018427387905:-1,4611686018427387903:9223372036854775807') + nr_ranges = nr_ranges + 2 + await repair_on_node(manager, servers[0], servers, ks, ranges='-2000:-1000,1000:2000') + nr_ranges = nr_ranges + 2 + await repair_on_node(manager, servers[0], servers, ks, ranges='3000:-3000') + # The wrap around range (3000, -3000] will produce the following intersection range + # range=(minimum token,-4611686018427387905] ranges_specified={(3000,+inf), (-inf, -3000]} intersection_ranges=(minimum token,-4611686018427387905] + # range=(-4611686018427387905,-1] ranges_specified={(3000,+inf), (-inf, -3000]} intersection_ranges=(-4611686018427387905,-3000] + # range=(-1,4611686018427387903] ranges_specified={(3000,+inf), (-inf, -3000]} intersection_ranges=(3000,4611686018427387903] + # range=(4611686018427387903,9223372036854775807] ranges_specified={(3000,+inf), (-inf, -3000]} intersection_ranges=(4611686018427387903,9223372036854775807] + nr_ranges = nr_ranges + 4 - all_rows = await load_repair_history(cql, hosts) - assert len(all_rows) == rf * nr_ranges; - - await cql.run_async("DROP KEYSPACE test;") + all_rows = await load_repair_history(cql, hosts) + assert len(all_rows) == rf * nr_ranges; @pytest.mark.asyncio async def test_tablet_cleanup(manager: ManagerClient): @@ -748,62 +733,62 @@ async def 
test_tablet_cleanup(manager: ManagerClient): n_partitions = 1000 await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) await manager.servers_see_each_other(servers) - await cql.run_async("CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': {}}};".format(n_tablets)) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY);") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk) VALUES ({k});") for k in range(1000)]) + async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': {n_tablets}}}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY);") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk) VALUES ({k});") for k in range(1000)]) - logger.info("Start second node") - servers.append(await manager.server_add()) + logger.info("Start second node") + servers.append(await manager.server_add()) - s0_host_id = await manager.get_host_id(servers[0].server_id) - s1_host_id = await manager.get_host_id(servers[1].server_id) + s0_host_id = await manager.get_host_id(servers[0].server_id) + s1_host_id = await manager.get_host_id(servers[1].server_id) - logger.info("Read system.tablets") - tablet_replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - assert len(tablet_replicas) == n_tablets + logger.info("Read system.tablets") + tablet_replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + assert len(tablet_replicas) == n_tablets - # Randomly select half of all tablets. - sample = random.sample(tablet_replicas, n_tablets // 2) - moved_tokens = [x.last_token for x in sample] - moved_src = [x.replicas[0] for x in sample] - moved_dst = [(s1_host_id, random.choice([0, 1])) for _ in sample] + # Randomly select half of all tablets. 
+ sample = random.sample(tablet_replicas, n_tablets // 2) + moved_tokens = [x.last_token for x in sample] + moved_src = [x.replicas[0] for x in sample] + moved_dst = [(s1_host_id, random.choice([0, 1])) for _ in sample] - # Migrate the selected tablets to second node. - logger.info("Migrate half of all tablets to second node") - for t, s, d in zip(moved_tokens, moved_src, moved_dst): - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", *s, *d, t) + # Migrate the selected tablets to second node. + logger.info("Migrate half of all tablets to second node") + for t, s, d in zip(moved_tokens, moved_src, moved_dst): + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", *s, *d, t) - # Sanity check. All data we inserted should be still there. - assert n_partitions == (await cql.run_async("SELECT COUNT(*) FROM test.test"))[0].count + # Sanity check. All data we inserted should be still there. + assert n_partitions == (await cql.run_async(f"SELECT COUNT(*) FROM {ks}.test"))[0].count - # Wipe data on second node. - logger.info("Wipe data on second node") - await manager.server_stop_gracefully(servers[1].server_id, timeout=120) - await manager.server_wipe_sstables(servers[1].server_id, "test", "test") - await manager.server_start(servers[1].server_id) - await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - await manager.servers_see_each_other(servers) - partitions_after_loss = (await cql.run_async("SELECT COUNT(*) FROM test.test"))[0].count - assert partitions_after_loss < n_partitions + # Wipe data on second node. 
+ logger.info("Wipe data on second node") + await manager.server_stop_gracefully(servers[1].server_id, timeout=120) + await manager.server_wipe_sstables(servers[1].server_id, ks, "test") + await manager.server_start(servers[1].server_id) + await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + await manager.servers_see_each_other(servers) + partitions_after_loss = (await cql.run_async(f"SELECT COUNT(*) FROM {ks}.test"))[0].count + assert partitions_after_loss < n_partitions - # Migrate all tablets back to their original position. - # Check that this doesn't resurrect cleaned data. - logger.info("Migrate the migrated tablets back") - for t, s, d in zip(moved_tokens, moved_dst, moved_src): - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", *s, *d, t) - assert partitions_after_loss == (await cql.run_async("SELECT COUNT(*) FROM test.test"))[0].count + # Migrate all tablets back to their original position. + # Check that this doesn't resurrect cleaned data. + logger.info("Migrate the migrated tablets back") + for t, s, d in zip(moved_tokens, moved_dst, moved_src): + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", *s, *d, t) + assert partitions_after_loss == (await cql.run_async(f"SELECT COUNT(*) FROM {ks}.test"))[0].count - # Kill and restart first node. - # Check that this doesn't resurrect cleaned data. - logger.info("Brutally restart first node") - await manager.server_stop(servers[0].server_id) - await manager.server_start(servers[0].server_id) - hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - await manager.servers_see_each_other(servers) - assert partitions_after_loss == (await cql.run_async("SELECT COUNT(*) FROM test.test"))[0].count + # Kill and restart first node. + # Check that this doesn't resurrect cleaned data. 
+ logger.info("Brutally restart first node") + await manager.server_stop(servers[0].server_id) + await manager.server_start(servers[0].server_id) + hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + await manager.servers_see_each_other(servers) + assert partitions_after_loss == (await cql.run_async(f"SELECT COUNT(*) FROM {ks}.test"))[0].count - # Bonus: check that commitlog_cleanups doesn't have any garbage after restart. - assert 0 == (await cql.run_async("SELECT COUNT(*) FROM system.commitlog_cleanups", host=hosts[0]))[0].count + # Bonus: check that commitlog_cleanups doesn't have any garbage after restart. + assert 0 == (await cql.run_async("SELECT COUNT(*) FROM system.commitlog_cleanups", host=hosts[0]))[0].count @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -817,44 +802,44 @@ async def test_tablet_cleanup_failure(manager: ManagerClient): n_partitions = 1000 await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) await manager.servers_see_each_other(servers) - await cql.run_async("CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': {}}};".format(n_tablets)) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY);") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk) VALUES ({k});") for k in range(n_partitions)]) + async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': {n_tablets}}}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY);") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk) VALUES ({k});") for k in range(n_partitions)]) - await inject_error_one_shot_on(manager, "tablet_cleanup_failure", servers) + await inject_error_one_shot_on(manager, "tablet_cleanup_failure", servers) - s0_log = await 
manager.server_open_log(servers[0].server_id) - s0_mark = await s0_log.mark() + s0_log = await manager.server_open_log(servers[0].server_id) + s0_mark = await s0_log.mark() - servers.append(await manager.server_add()) + servers.append(await manager.server_add()) - tablet_token = 0 - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) - s1_host_id = await manager.get_host_id(servers[1].server_id) - dst_shard = 0 - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) + tablet_token = 0 + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) + s1_host_id = await manager.get_host_id(servers[1].server_id) + dst_shard = 0 + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) - logger.info("Waiting for injected cleanup failure...") - await s0_log.wait_for('Cleanup failed for tablet', from_mark=s0_mark) + logger.info("Waiting for injected cleanup failure...") + await s0_log.wait_for('Cleanup failed for tablet', from_mark=s0_mark) - logger.info("Waiting for cleanup success on retry...") - await s0_log.wait_for('Cleaned up tablet .* of table test.test successfully.', from_mark=s0_mark) + logger.info("Waiting for cleanup success on retry...") + await s0_log.wait_for(f'Cleaned up tablet .* of table {ks}.test successfully.', from_mark=s0_mark) - logger.info("Waiting for cleanup success on retry...") - await s0_log.wait_for('updating topology state: Finished tablet migration', from_mark=s0_mark) + logger.info("Waiting for cleanup success on retry...") + await s0_log.wait_for('updating topology state: Finished tablet migration', from_mark=s0_mark) - logger.info("Waiting for migration task...") - await migration_task + logger.info("Waiting for migration task...") + await migration_task - assert n_partitions 
== (await cql.run_async("SELECT COUNT(*) FROM test.test"))[0].count + assert n_partitions == (await cql.run_async(f"SELECT COUNT(*) FROM {ks}.test"))[0].count - node_workdir = await manager.server_get_workdir(servers[0].server_id) - table_dir = glob.glob(os.path.join(node_workdir, "data", "test", "test-*"))[0] - logger.info(f"Table dir: {table_dir}") - ssts = glob.glob(os.path.join(table_dir, "*-Data.db")) - logger.info("Guarantee source node of migration left no sstables undeleted") - assert len(ssts) == 0 + node_workdir = await manager.server_get_workdir(servers[0].server_id) + table_dir = glob.glob(os.path.join(node_workdir, "data", ks, "test-*"))[0] + logger.info(f"Table dir: {table_dir}") + ssts = glob.glob(os.path.join(table_dir, "*-Data.db")) + logger.info("Guarantee source node of migration left no sstables undeleted") + assert len(ssts) == 0 @pytest.mark.asyncio async def test_tablet_resharding(manager: ManagerClient): @@ -867,9 +852,9 @@ async def test_tablet_resharding(manager: ManagerClient): cql = manager.get_cql() n_tablets = 32 n_partitions = 1000 - await cql.run_async(f"CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': {n_tablets}}};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY);") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk) VALUES ({k});") for k in range(n_partitions)]) + ks = await create_new_test_keyspace(cql, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': {n_tablets}}}") + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY);") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk) VALUES ({k});") for k in range(n_partitions)]) await manager.server_stop_gracefully(server.server_id, timeout=120) await manager.server_update_cmdline(server.server_id, ['--smp=2']) @@ -907,58 +892,58 @@ async def test_tablet_split(manager: 
ManagerClient, injection_error: str): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - # enough to trigger multiple splits with max size of 1024 bytes. - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + # enough to trigger multiple splits with max size of 1024 bytes. + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - async def check(): - logger.info("Checking table") - cql = manager.get_cql() - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == len(keys) - for r in rows: - assert r.c == r.pk + async def check(): + logger.info("Checking table") + cql = manager.get_cql() + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(rows) == len(keys) + for r in rows: + assert r.c == r.pk - await check() + await check() - await manager.api.flush_keyspace(servers[0].ip_addr, "test") + await manager.api.flush_keyspace(servers[0].ip_addr, ks) - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count == 1 + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count == 1 - logger.info("Adding new server") - servers.append(await manager.server_add(cmdline=cmdline)) + logger.info("Adding new server") + servers.append(await manager.server_add(cmdline=cmdline)) - # Increases the chance of tablet migration 
concurrent with split - await inject_error_one_shot_on(manager, "tablet_allocator_shuffle", servers) - await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers) + # Increases the chance of tablet migration concurrent with split + await inject_error_one_shot_on(manager, "tablet_allocator_shuffle", servers) + await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers) - s1_log = await manager.server_open_log(servers[0].server_id) - s1_mark = await s1_log.mark() + s1_log = await manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() - await manager.api.enable_injection(servers[0].ip_addr, injection_error, one_shot=True) - compaction_task = asyncio.create_task(manager.api.keyspace_compaction(servers[0].ip_addr, "test")) - await s1_log.wait_for(f"{injection_error}: waiting", from_mark=s1_mark) + await manager.api.enable_injection(servers[0].ip_addr, injection_error, one_shot=True) + compaction_task = asyncio.create_task(manager.api.keyspace_compaction(servers[0].ip_addr, ks)) + await s1_log.wait_for(f"{injection_error}: waiting", from_mark=s1_mark) - # Now there's a split and migration need, so they'll potentially run concurrently. - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + # Now there's a split and migration need, so they'll potentially run concurrently. 
+ await manager.api.enable_tablet_balancing(servers[0].ip_addr) - await check() - time.sleep(5) # Give load balancer some time to do work + await check() + time.sleep(5) # Give load balancer some time to do work - await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) + await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) - await check() + await check() - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count > 1 + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count > 1 - await manager.api.message_injection(servers[0].ip_addr, injection_error) - await s1_log.wait_for(f"{injection_error}: released", from_mark=s1_mark) - await compaction_task + await manager.api.message_injection(servers[0].ip_addr, injection_error) + await s1_log.wait_for(f"{injection_error}: released", from_mark=s1_mark) + await compaction_task @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -980,59 +965,59 @@ async def test_correctness_of_tablet_split_finalization_after_restart(manager: M }, cmdline=cmdline)) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int) WITH compaction = {'class': 'NullCompactionStrategy'};") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH compaction = {{'class': 'NullCompactionStrategy'}};") - # enough to trigger multiple splits with max size of 1024 bytes. 
- keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + # enough to trigger multiple splits with max size of 1024 bytes. + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - async def check(): - logger.info("Checking table") - cql = manager.get_cql() - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == len(keys) - for r in rows: - assert r.c == r.pk + async def check(): + logger.info("Checking table") + cql = manager.get_cql() + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(rows) == len(keys) + for r in rows: + assert r.c == r.pk - await check() + await check() - for server in servers: - await manager.api.flush_keyspace(server.ip_addr, "test") + for server in servers: + await manager.api.flush_keyspace(server.ip_addr, ks) - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count == 2 + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count == 2 - await manager.api.enable_injection(servers[0].ip_addr, "tablet_load_stats_refresh_before_rebalancing", one_shot=False) + await manager.api.enable_injection(servers[0].ip_addr, "tablet_load_stats_refresh_before_rebalancing", one_shot=False) - s1_log = await manager.server_open_log(servers[0].server_id) - s1_mark = await s1_log.mark() + s1_log = await manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() - await manager.api.enable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone", one_shot=False) - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + await manager.api.enable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone", one_shot=False) + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - await s1_log.wait_for('Finalizing resize decision for table', 
from_mark=s1_mark) + await s1_log.wait_for('Finalizing resize decision for table', from_mark=s1_mark) - # Delays refresh of tablet stats, so balancer works with whichever it got last. - await manager.api.disable_injection(servers[0].ip_addr, "tablet_load_stats_refresh_before_rebalancing") - await manager.api.disable_injection(servers[0].ip_addr, "short_tablet_stats_refresh_interval") - time.sleep(1) - await manager.api.disable_tablet_balancing(servers[0].ip_addr) + # Delays refresh of tablet stats, so balancer works with whichever it got last. + await manager.api.disable_injection(servers[0].ip_addr, "tablet_load_stats_refresh_before_rebalancing") + await manager.api.disable_injection(servers[0].ip_addr, "short_tablet_stats_refresh_interval") + time.sleep(1) + await manager.api.disable_tablet_balancing(servers[0].ip_addr) - await manager.server_stop_gracefully(servers[1].server_id, timeout=120) - await manager.server_start(servers[1].server_id) - await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - await manager.servers_see_each_other(servers) + await manager.server_stop_gracefully(servers[1].server_id, timeout=120) + await manager.server_start(servers[1].server_id) + await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + await manager.servers_see_each_other(servers) - await manager.api.disable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone") - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + await manager.api.disable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone") + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) + await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count > 2 + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count > 2 - await 
check() + await check() @pytest.mark.parametrize("injection_error", ["foreach_compaction_group_wait", "major_compaction_wait"]) @pytest.mark.asyncio @@ -1045,51 +1030,51 @@ async def test_concurrent_tablet_migration_and_major(manager: ManagerClient, inj await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - async def check(): - logger.info("Checking table") - cql = manager.get_cql() - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == len(keys) - for r in rows: - assert r.c == r.pk + async def check(): + logger.info("Checking table") + cql = manager.get_cql() + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(rows) == len(keys) + for r in rows: + assert r.c == r.pk - await check() + await check() - await manager.api.flush_keyspace(servers[0].ip_addr, "test") + await manager.api.flush_keyspace(servers[0].ip_addr, ks) - logger.info("Adding new server") - servers.append(await manager.server_add(cmdline=cmdline)) - s1_host_id = await manager.get_host_id(servers[1].server_id) + logger.info("Adding new server") + servers.append(await manager.server_add(cmdline=cmdline)) + s1_host_id = await manager.get_host_id(servers[1].server_id) - s1_log = await 
manager.server_open_log(servers[0].server_id) - s1_mark = await s1_log.mark() + s1_log = await manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() - await manager.api.enable_injection(servers[0].ip_addr, injection_error, one_shot=True) - logger.info("Started major compaction") - compaction_task = asyncio.create_task(manager.api.keyspace_compaction(servers[0].ip_addr, "test")) - await s1_log.wait_for(f"{injection_error}: waiting", from_mark=s1_mark) + await manager.api.enable_injection(servers[0].ip_addr, injection_error, one_shot=True) + logger.info("Started major compaction") + compaction_task = asyncio.create_task(manager.api.keyspace_compaction(servers[0].ip_addr, ks)) + await s1_log.wait_for(f"{injection_error}: waiting", from_mark=s1_mark) - tablet_replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') + tablet_replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') - t = tablet_replicas[0] - logger.info("Migrating tablet") - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", *t.replicas[0], *(s1_host_id, 0), t.last_token) + t = tablet_replicas[0] + logger.info("Migrating tablet") + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", *t.replicas[0], *(s1_host_id, 0), t.last_token) - await manager.api.message_injection(servers[0].ip_addr, injection_error) - await s1_log.wait_for(f"{injection_error}: released", from_mark=s1_mark) - await compaction_task + await manager.api.message_injection(servers[0].ip_addr, injection_error) + await s1_log.wait_for(f"{injection_error}: released", from_mark=s1_mark) + await compaction_task - if injection_error == "major_compaction_wait": - logger.info("Check that major was successfully aborted on migration") - await s1_log.wait_for("Compaction for test/test was stopped due to: tablet cleanup", from_mark=s1_mark) + if injection_error == "major_compaction_wait": + logger.info("Check that major was successfully aborted on migration") + 
await s1_log.wait_for(f"Compaction for {ks}/test was stopped due to: tablet cleanup", from_mark=s1_mark) - await check() + await check() @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -1102,32 +1087,32 @@ async def test_concurrent_table_drop_and_major(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - await manager.api.flush_keyspace(servers[0].ip_addr, "test") + await manager.api.flush_keyspace(servers[0].ip_addr, ks) - s1_log = await manager.server_open_log(servers[0].server_id) - s1_mark = await s1_log.mark() + s1_log = await manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() - await manager.api.enable_injection(servers[0].ip_addr, injection_error, one_shot=True) - logger.info("Started major compaction") - compaction_task = asyncio.create_task(manager.api.keyspace_compaction(servers[0].ip_addr, "test")) - await s1_log.wait_for(f"{injection_error}: waiting", from_mark=s1_mark) + await manager.api.enable_injection(servers[0].ip_addr, injection_error, one_shot=True) + logger.info("Started major compaction") + compaction_task = asyncio.create_task(manager.api.keyspace_compaction(servers[0].ip_addr, ks)) + await 
s1_log.wait_for(f"{injection_error}: waiting", from_mark=s1_mark) - logger.info("Dropping table") - await cql.run_async("DROP TABLE test.test") + logger.info("Dropping table") + await cql.run_async(f"DROP TABLE {ks}.test") - await manager.api.message_injection(servers[0].ip_addr, injection_error) - await s1_log.wait_for(f"{injection_error}: released", from_mark=s1_mark) - await compaction_task + await manager.api.message_injection(servers[0].ip_addr, injection_error) + await s1_log.wait_for(f"{injection_error}: released", from_mark=s1_mark) + await compaction_task - if injection_error == "major_compaction_wait": - logger.info("Check that major was successfully aborted on migration") - await s1_log.wait_for("ongoing compactions for table test.test .* due to table removal", from_mark=s1_mark) + if injection_error == "major_compaction_wait": + logger.info("Check that major was successfully aborted on migration") + await s1_log.wait_for(f"ongoing compactions for table {ks}.test .* due to table removal", from_mark=s1_mark) async def assert_tablet_count_metric_value_for_shards(manager: ManagerClient, server: ServerInfo, expected_count_per_shard: list[int]): tablet_count_metric_name = "scylla_tablets_count" @@ -1190,80 +1175,80 @@ async def test_tablet_count_metric_per_shard(manager: ManagerClient): # When two tables are created cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE testing WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1};") - await cql.run_async("CREATE TABLE testing.mytable1 (col1 timestamp, col2 text, col3 blob, PRIMARY KEY (col1));") - await cql.run_async("CREATE TABLE testing.mytable2 (col1 timestamp, col2 text, col3 blob, PRIMARY KEY (col1));") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.mytable1 (col1 timestamp, col2 text, col3 blob, PRIMARY KEY (col1));") + await 
cql.run_async(f"CREATE TABLE {ks}.mytable2 (col1 timestamp, col2 text, col3 blob, PRIMARY KEY (col1));") - # Then tablet count metric for each shard depicts the actual state - tables = { "testing": ["mytable1", "mytable2"] } - expected_count_per_shard_for_host_0 = await get_tablet_count_per_shard_for_host(manager, servers[0], tables, shards_count) - await assert_tablet_count_metric_value_for_shards(manager, servers[0], expected_count_per_shard_for_host_0) + # Then tablet count metric for each shard depicts the actual state + tables = { ks: ["mytable1", "mytable2"] } + expected_count_per_shard_for_host_0 = await get_tablet_count_per_shard_for_host(manager, servers[0], tables, shards_count) + await assert_tablet_count_metric_value_for_shards(manager, servers[0], expected_count_per_shard_for_host_0) - expected_count_per_shard_for_host_1 = await get_tablet_count_per_shard_for_host(manager, servers[1], tables, shards_count) - await assert_tablet_count_metric_value_for_shards(manager, servers[1], expected_count_per_shard_for_host_1) + expected_count_per_shard_for_host_1 = await get_tablet_count_per_shard_for_host(manager, servers[1], tables, shards_count) + await assert_tablet_count_metric_value_for_shards(manager, servers[1], expected_count_per_shard_for_host_1) - # When third table is created - await cql.run_async("CREATE TABLE testing.mytable3 (col1 timestamp, col2 text, col3 blob, PRIMARY KEY (col1));") + # When third table is created + await cql.run_async(f"CREATE TABLE {ks}.mytable3 (col1 timestamp, col2 text, col3 blob, PRIMARY KEY (col1));") - # Then tablet count metric for each shard depicts the actual state - tables = { "testing": ["mytable1", "mytable2", "mytable3"] } - expected_count_per_shard_for_host_0 = await get_tablet_count_per_shard_for_host(manager, servers[0], tables, shards_count) - await assert_tablet_count_metric_value_for_shards(manager, servers[0], expected_count_per_shard_for_host_0) + # Then tablet count metric for each shard depicts the actual 
state + tables = { ks: ["mytable1", "mytable2", "mytable3"] } + expected_count_per_shard_for_host_0 = await get_tablet_count_per_shard_for_host(manager, servers[0], tables, shards_count) + await assert_tablet_count_metric_value_for_shards(manager, servers[0], expected_count_per_shard_for_host_0) - expected_count_per_shard_for_host_1 = await get_tablet_count_per_shard_for_host(manager, servers[1], tables, shards_count) - await assert_tablet_count_metric_value_for_shards(manager, servers[1], expected_count_per_shard_for_host_1) + expected_count_per_shard_for_host_1 = await get_tablet_count_per_shard_for_host(manager, servers[1], tables, shards_count) + await assert_tablet_count_metric_value_for_shards(manager, servers[1], expected_count_per_shard_for_host_1) - # When one of tables is dropped - await cql.run_async("DROP TABLE testing.mytable2;") + # When one of tables is dropped + await cql.run_async(f"DROP TABLE {ks}.mytable2;") - # Then tablet count metric for each shard depicts the actual state - tables = { "testing": ["mytable1", "mytable3"] } - expected_count_per_shard_for_host_0 = await get_tablet_count_per_shard_for_host(manager, servers[0], tables, shards_count) - await assert_tablet_count_metric_value_for_shards(manager, servers[0], expected_count_per_shard_for_host_0) + # Then tablet count metric for each shard depicts the actual state + tables = { ks: ["mytable1", "mytable3"] } + expected_count_per_shard_for_host_0 = await get_tablet_count_per_shard_for_host(manager, servers[0], tables, shards_count) + await assert_tablet_count_metric_value_for_shards(manager, servers[0], expected_count_per_shard_for_host_0) - expected_count_per_shard_for_host_1 = await get_tablet_count_per_shard_for_host(manager, servers[1], tables, shards_count) - await assert_tablet_count_metric_value_for_shards(manager, servers[1], expected_count_per_shard_for_host_1) + expected_count_per_shard_for_host_1 = await get_tablet_count_per_shard_for_host(manager, servers[1], tables, 
shards_count) + await assert_tablet_count_metric_value_for_shards(manager, servers[1], expected_count_per_shard_for_host_1) - # And when moving tablets from one shard of src_host to (dest_host, shard_3) - shard_id_to_move = get_shard_that_has_tablets(expected_count_per_shard_for_host_0) - if shard_id_to_move != -1: - src_server = servers[0] - dest_server = servers[1] - src_expected_count_per_shard = expected_count_per_shard_for_host_0 - dest_expected_count_per_shard = expected_count_per_shard_for_host_1 - else: - shard_id_to_move = get_shard_that_has_tablets(expected_count_per_shard_for_host_1) - src_server = servers[1] - dest_server = servers[0] - src_expected_count_per_shard = expected_count_per_shard_for_host_1 - dest_expected_count_per_shard = expected_count_per_shard_for_host_0 + # And when moving tablets from one shard of src_host to (dest_host, shard_3) + shard_id_to_move = get_shard_that_has_tablets(expected_count_per_shard_for_host_0) + if shard_id_to_move != -1: + src_server = servers[0] + dest_server = servers[1] + src_expected_count_per_shard = expected_count_per_shard_for_host_0 + dest_expected_count_per_shard = expected_count_per_shard_for_host_1 + else: + shard_id_to_move = get_shard_that_has_tablets(expected_count_per_shard_for_host_1) + src_server = servers[1] + dest_server = servers[0] + src_expected_count_per_shard = expected_count_per_shard_for_host_1 + dest_expected_count_per_shard = expected_count_per_shard_for_host_0 - tokens_on_shard_to_move = { - "mytable1" : await get_tablet_tokens_from_host_on_shard(manager, src_server, "testing", "mytable1", shard_id_to_move), - "mytable3" : await get_tablet_tokens_from_host_on_shard(manager, src_server, "testing", "mytable3", shard_id_to_move) - } + tokens_on_shard_to_move = { + "mytable1" : await get_tablet_tokens_from_host_on_shard(manager, src_server, ks, "mytable1", shard_id_to_move), + "mytable3" : await get_tablet_tokens_from_host_on_shard(manager, src_server, ks, "mytable3", shard_id_to_move) + } 
- count_of_tokens_on_src_shard_to_move = len(tokens_on_shard_to_move["mytable1"]) + len(tokens_on_shard_to_move["mytable3"]) - assert count_of_tokens_on_src_shard_to_move > 0 + count_of_tokens_on_src_shard_to_move = len(tokens_on_shard_to_move["mytable1"]) + len(tokens_on_shard_to_move["mytable3"]) + assert count_of_tokens_on_src_shard_to_move > 0 - src_host_id = await manager.get_host_id(src_server.server_id) - dest_host_id = await manager.get_host_id(dest_server.server_id) - for table_name, tokens in tokens_on_shard_to_move.items(): - for token in tokens: - await manager.api.move_tablet(node_ip=src_server.ip_addr, ks="testing", table=table_name, src_host=src_host_id, src_shard=shard_id_to_move, dst_host=dest_host_id, dst_shard=3, token=token) + src_host_id = await manager.get_host_id(src_server.server_id) + dest_host_id = await manager.get_host_id(dest_server.server_id) + for table_name, tokens in tokens_on_shard_to_move.items(): + for token in tokens: + await manager.api.move_tablet(node_ip=src_server.ip_addr, ks=ks, table=table_name, src_host=src_host_id, src_shard=shard_id_to_move, dst_host=dest_host_id, dst_shard=3, token=token) - # And when ensuring that local tablet metadata on the queried node reflects the finalized tablet movement - await read_barrier(manager.api, servers[0].ip_addr) - await read_barrier(manager.api, servers[1].ip_addr) + # And when ensuring that local tablet metadata on the queried node reflects the finalized tablet movement + await read_barrier(manager.api, servers[0].ip_addr) + await read_barrier(manager.api, servers[1].ip_addr) - # Then tablet count metric is adjusted to depict that situation on src_host - all tablets from selected shard have been moved - src_expected_count_per_shard[shard_id_to_move] = 0 - await assert_tablet_count_metric_value_for_shards(manager, src_server, src_expected_count_per_shard) + # Then tablet count metric is adjusted to depict that situation on src_host - all tablets from selected shard have been moved + 
src_expected_count_per_shard[shard_id_to_move] = 0 + await assert_tablet_count_metric_value_for_shards(manager, src_server, src_expected_count_per_shard) - # And then tablet count metric is increased on dest_host - tablets have been moved to shard_3 - dest_expected_count_per_shard[3] += count_of_tokens_on_src_shard_to_move - await assert_tablet_count_metric_value_for_shards(manager, dest_server, dest_expected_count_per_shard) + # And then tablet count metric is increased on dest_host - tablets have been moved to shard_3 + dest_expected_count_per_shard[3] += count_of_tokens_on_src_shard_to_move + await assert_tablet_count_metric_value_for_shards(manager, dest_server, dest_expected_count_per_shard) @pytest.mark.parametrize("primary_replica_only", [False, True]) async def test_tablet_load_and_stream(manager: ManagerClient, primary_replica_only): @@ -1279,17 +1264,18 @@ async def test_tablet_load_and_stream(manager: ManagerClient, primary_replica_on cql = manager.get_cql() - async def create_table(ks_name : str, tablet_count : int): + async def create_table(tablet_count : int) -> str: # Creates multiple tablets in the same shard - await cql.run_async(f"CREATE KEYSPACE {ks_name} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}" \ + ks_name = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}" \ f" AND tablets = {{ 'initial': {tablet_count} }};") await cql.run_async(f"CREATE TABLE {ks_name}.test (pk int PRIMARY KEY, c int);") + return ks_name - await create_table("test", 5) # 5 is rounded up to next power-of-two + ks = await create_table(5) # 5 is rounded up to next power-of-two # Populate tablets keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) async def check(ks_name: str): 
logger.info("Checking table") @@ -1299,19 +1285,19 @@ async def test_tablet_load_and_stream(manager: ManagerClient, primary_replica_on for r in rows: assert r.c == r.pk - await manager.api.flush_keyspace(servers[0].ip_addr, "test") - await check("test") + await manager.api.flush_keyspace(servers[0].ip_addr, ks) + await check(ks) node_workdir = await manager.server_get_workdir(servers[0].server_id) - await create_table("test2", 16) + ks2 = await create_table(16) cql = await safe_server_stop_gracefully(manager, servers[0].server_id) - table_dir = glob.glob(os.path.join(node_workdir, "data", "test", "test-*"))[0] + table_dir = glob.glob(os.path.join(node_workdir, "data", ks, "test-*"))[0] logger.info(f"Table dir: {table_dir}") - dst_table_dir = glob.glob(os.path.join(node_workdir, "data", "test2", "test-*"))[0] + dst_table_dir = glob.glob(os.path.join(node_workdir, "data", ks2, "test-*"))[0] logger.info(f"Dst table dir: {dst_table_dir}") def move_sstables_to_upload(table_dir: str, dst_table_dir: str): @@ -1329,7 +1315,7 @@ async def test_tablet_load_and_stream(manager: ManagerClient, primary_replica_on cql = manager.get_cql() await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - rows = await cql.run_async("SELECT * FROM test.test BYPASS CACHE;") + rows = await cql.run_async(f"SELECT * FROM {ks}.test BYPASS CACHE;") assert len(rows) == 0 await manager.api.disable_tablet_balancing(servers[0].ip_addr) @@ -1341,11 +1327,13 @@ async def test_tablet_load_and_stream(manager: ManagerClient, primary_replica_on await manager.api.enable_tablet_balancing(servers[0].ip_addr) - await manager.api.load_new_sstables(servers[0].ip_addr, "test2", "test", primary_replica_only) + await manager.api.load_new_sstables(servers[0].ip_addr, ks2, "test", primary_replica_only) time.sleep(1) - await check("test2") + await check(ks2) + + await asyncio.gather(*[cql.run_async(f"drop keyspace {i}") for i in [ks, ks2]]) @pytest.mark.asyncio async def 
test_storage_service_api_uneven_ownership_keyspace_and_table_params_used(manager: ManagerClient): @@ -1356,31 +1344,31 @@ async def test_storage_service_api_uneven_ownership_keyspace_and_table_params_us # When table is created with initial tablets set to 1 cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE testing WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE testing.mytable1 (col1 timestamp, col2 text, col3 blob, PRIMARY KEY (col1));") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.mytable1 (col1 timestamp, col2 text, col3 blob, PRIMARY KEY (col1));") - # And when ownership for this table is queried - actual_ownerships = await manager.api.get_ownership(servers[0].ip_addr, "testing", "mytable1") + # And when ownership for this table is queried + actual_ownerships = await manager.api.get_ownership(servers[0].ip_addr, ks, "mytable1") - # Then ensure that returned ownerships is 0.0 and 1.0 (which node gets 0.0 and 1.0 is unspecified) - expected_ips = {servers[0].ip_addr, servers[1].ip_addr} - expected_ownerships = [0.0, 1.0] - delta = 0.0001 - already_verified = set() + # Then ensure that returned ownerships is 0.0 and 1.0 (which node gets 0.0 and 1.0 is unspecified) + expected_ips = {servers[0].ip_addr, servers[1].ip_addr} + expected_ownerships = [0.0, 1.0] + delta = 0.0001 + already_verified = set() - sorted_actual_ownerships = sorted(actual_ownerships, key=lambda e: e["value"]) - assert len(sorted_actual_ownerships) == len(expected_ownerships) + sorted_actual_ownerships = sorted(actual_ownerships, key=lambda e: e["value"]) + assert len(sorted_actual_ownerships) == len(expected_ownerships) - for i in range(0, len(sorted_actual_ownerships)): - entry = sorted_actual_ownerships[i] - actual_ip = 
entry["key"] - actual_ownership = float(entry["value"]) + for i in range(0, len(sorted_actual_ownerships)): + entry = sorted_actual_ownerships[i] + actual_ip = entry["key"] + actual_ownership = float(entry["value"]) - assert actual_ip in expected_ips - assert actual_ip not in already_verified - assert actual_ownership == pytest.approx(expected_ownerships[i], abs=delta) + assert actual_ip in expected_ips + assert actual_ip not in already_verified + assert actual_ownership == pytest.approx(expected_ownerships[i], abs=delta) - already_verified.add(actual_ip) + already_verified.add(actual_ip) @pytest.mark.asyncio async def test_tablet_storage_freeing(manager: ManagerClient): @@ -1393,35 +1381,35 @@ async def test_tablet_storage_freeing(manager: ManagerClient): logger.info("Create a table with two tablets and populate it with a moderate amount of data.") n_tablets = 2 n_partitions = 1000 - await cql.run_async("CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': {}}};".format(n_tablets)) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, v text) WITH compression = {'sstable_compression': ''};") - insert_stmt = cql.prepare("INSERT INTO test.test (pk, v) VALUES (?, ?);") - payload = "a"*10000 + async with new_test_keyspace(manager, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': {n_tablets}}}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, v text) WITH compression = {{'sstable_compression': ''}};") + insert_stmt = cql.prepare(f"INSERT INTO {ks}.test (pk, v) VALUES (?, ?);") + payload = "a"*10000 - max_concurrency = 100 - for batch in itertools.batched(range(n_partitions), max_concurrency): - await asyncio.gather(*[cql.run_async(insert_stmt, [k, payload]) for k in batch]) - await manager.api.keyspace_flush(servers[0].ip_addr, "test") + max_concurrency = 100 + for batch in 
itertools.batched(range(n_partitions), max_concurrency): + await asyncio.gather(*[cql.run_async(insert_stmt, [k, payload]) for k in batch]) + await manager.api.keyspace_flush(servers[0].ip_addr, ks) - logger.info("Start second node.") - servers.append(await manager.server_add()) - s1_host_id = await manager.get_host_id(servers[1].server_id) + logger.info("Start second node.") + servers.append(await manager.server_add()) + s1_host_id = await manager.get_host_id(servers[1].server_id) - logger.info("Check the table's disk usage on first node.") - size_before = await manager.server_get_sstables_disk_usage(servers[0].server_id, "test", "test") - assert size_before > n_partitions * len(payload) + logger.info("Check the table's disk usage on first node.") + size_before = await manager.server_get_sstables_disk_usage(servers[0].server_id, ks, "test") + assert size_before > n_partitions * len(payload) - logger.info("Read system.tablets.") - tablet_replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - assert len(tablet_replicas) == n_tablets + logger.info("Read system.tablets.") + tablet_replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + assert len(tablet_replicas) == n_tablets - logger.info("Migrate one of the two tablets from the first node to the second node.") - t = tablet_replicas[0] - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", *t.replicas[0], *(s1_host_id, 0), t.last_token) + logger.info("Migrate one of the two tablets from the first node to the second node.") + t = tablet_replicas[0] + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", *t.replicas[0], *(s1_host_id, 0), t.last_token) - logger.info("Verify that the table's disk usage on first node shrunk by about half.") - size_after = await manager.server_get_sstables_disk_usage(servers[0].server_id, "test", "test") - assert size_before * 0.33 < size_after < size_before * 0.66 + logger.info("Verify that the table's disk usage on first 
node shrunk by about half.") + size_after = await manager.server_get_sstables_disk_usage(servers[0].server_id, ks, "test") + assert size_before * 0.33 < size_after < size_before * 0.66 @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -1432,63 +1420,63 @@ async def test_tombstone_gc_disabled_on_pending_replica(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 4};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int) WITH gc_grace_seconds = 0;") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 4}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH gc_grace_seconds = 0;") - servers.append(await manager.server_add()) + servers.append(await manager.server_add()) - key = 7 # Whatever - tablet_token = 0 # Doesn't matter since there is one tablet - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({key}, 1) USING timestamp 9") - rows = await cql.run_async("SELECT pk from test.test") - assert len(rows) == 1 + key = 7 # Whatever + tablet_token = 0 # Doesn't matter since there is one tablet + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({key}, 1) USING timestamp 9") + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(rows) == 1 - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) - s0_host_id = await manager.get_host_id(servers[0].server_id) - s1_host_id = await manager.get_host_id(servers[1].server_id) - dst_shard = 0 + s0_host_id = await manager.get_host_id(servers[0].server_id) + s1_host_id = await 
manager.get_host_id(servers[1].server_id) + dst_shard = 0 - await manager.api.enable_injection(servers[1].ip_addr, "stream_mutation_fragments", one_shot=True) - s1_log = await manager.server_open_log(servers[1].server_id) - s1_mark = await s1_log.mark() + await manager.api.enable_injection(servers[1].ip_addr, "stream_mutation_fragments", one_shot=True) + s1_log = await manager.server_open_log(servers[1].server_id) + s1_mark = await s1_log.mark() - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) - await s1_log.wait_for('stream_mutation_fragments: waiting', from_mark=s1_mark) - s1_mark = await s1_log.mark() + await s1_log.wait_for('stream_mutation_fragments: waiting', from_mark=s1_mark) + s1_mark = await s1_log.mark() - # write a tombstone with timestamp X to DB - await cql.run_async(f'DELETE FROM test.test USING timestamp 10 WHERE pk = {key}') + # write a tombstone with timestamp X to DB + await cql.run_async(f'DELETE FROM {ks}.test USING timestamp 10 WHERE pk = {key}') - # flush both servers - for s in servers: - await manager.api.flush_keyspace(s.ip_addr, "test") + # flush both servers + for s in servers: + await manager.api.flush_keyspace(s.ip_addr, ks) - await asyncio.sleep(1) + await asyncio.sleep(1) - # major compact both servers - for s in servers: - await manager.api.keyspace_compaction(s.ip_addr, "test") + # major compact both servers + for s in servers: + await manager.api.keyspace_compaction(s.ip_addr, ks) - # write backdated data to test.test with timestamp X-1 with the same key as the tombstone - await cql.run_async(f'INSERT INTO test.test (pk, c) VALUES ({key}, 0) USING timestamp 9') + # write backdated data to test.test with timestamp X-1 with the same key as the tombstone + await 
cql.run_async(f'INSERT INTO {ks}.test (pk, c) VALUES ({key}, 0) USING timestamp 9') - # release streaming - await manager.api.message_injection(servers[1].ip_addr, "stream_mutation_fragments") - await s1_log.wait_for('stream_mutation_fragments: done', from_mark=s1_mark) + # release streaming + await manager.api.message_injection(servers[1].ip_addr, "stream_mutation_fragments") + await s1_log.wait_for('stream_mutation_fragments: done', from_mark=s1_mark) - logger.info("Waiting for migration to finish") - await migration_task - logger.info("Migration done") + logger.info("Waiting for migration to finish") + await migration_task + logger.info("Migration done") - for s in servers: - await manager.api.flush_keyspace(s.ip_addr, "test") + for s in servers: + await manager.api.flush_keyspace(s.ip_addr, ks) - # verify result - rows = await cql.run_async(f'SELECT pk, c FROM test.test WHERE pk = {key};') - assert len(rows) == 0 + # verify result + rows = await cql.run_async(f'SELECT pk, c FROM {ks}.test WHERE pk = {key};') + assert len(rows) == 0 @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -1500,37 +1488,37 @@ async def test_schema_change_during_cleanup(manager: ManagerClient): await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - logger.info("Populating table") + logger.info("Populating table") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) 
+ keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - s1_log = await manager.server_open_log(servers[0].server_id) - s1_mark = await s1_log.mark() + s1_log = await manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() - logger.info("Start second node.") - servers.append(await manager.server_add()) - s1_host_id = await manager.get_host_id(servers[1].server_id) + logger.info("Start second node.") + servers.append(await manager.server_add()) + s1_host_id = await manager.get_host_id(servers[1].server_id) - await inject_error_on(manager, "delay_tablet_compaction_groups_cleanup", servers) + await inject_error_on(manager, "delay_tablet_compaction_groups_cleanup", servers) - logger.info("Read system.tablets.") - tablet_replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - assert len(tablet_replicas) == 1 + logger.info("Read system.tablets.") + tablet_replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + assert len(tablet_replicas) == 1 - logger.info("Migrating one tablet to another node.") - t = tablet_replicas[0] - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", *t.replicas[0], *(s1_host_id, 0), t.last_token)) + logger.info("Migrating one tablet to another node.") + t = tablet_replicas[0] + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", *t.replicas[0], *(s1_host_id, 0), t.last_token)) - logger.info("Waiting for log") - await s1_log.wait_for('Initiating tablet cleanup of', from_mark=s1_mark, timeout=120) - time.sleep(1) - await cql.run_async("ALTER TABLE test.test WITH gc_grace_seconds = 0;") - await migration_task + logger.info("Waiting for log") + await s1_log.wait_for('Initiating tablet cleanup of', from_mark=s1_mark, timeout=120) + time.sleep(1) + await cql.run_async(f"ALTER TABLE {ks}.test WITH gc_grace_seconds = 0;") + 
await migration_task @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -1548,73 +1536,73 @@ async def test_tombstone_gc_correctness_during_tablet_split(manager: ManagerClie await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int) WITH gc_grace_seconds=0;") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH gc_grace_seconds=0;") - await manager.api.disable_autocompaction(servers[0].ip_addr, "test") + await manager.api.disable_autocompaction(servers[0].ip_addr, ks) - keys = range(100) + keys = range(100) - logger.info("Generating sstable with shadowed data") - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) - await manager.api.flush_keyspace(servers[0].ip_addr, "test") + logger.info("Generating sstable with shadowed data") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) + await manager.api.flush_keyspace(servers[0].ip_addr, ks) - logger.info("Generating another sstable with tombstones") - await asyncio.gather(*[cql.run_async(f"DELETE FROM test.test WHERE pk={k};") for k in keys]) - await manager.api.flush_keyspace(servers[0].ip_addr, "test") + logger.info("Generating another sstable with tombstones") + await asyncio.gather(*[cql.run_async(f"DELETE FROM {ks}.test WHERE pk={k};") for k in keys]) + await manager.api.flush_keyspace(servers[0].ip_addr, ks) - async def assert_empty_table(): - cql = manager.get_cql() - rows = await cql.run_async("SELECT * FROM 
test.test BYPASS CACHE;") - assert len(rows) == 0 + async def assert_empty_table(): + cql = manager.get_cql() + rows = await cql.run_async(f"SELECT * FROM {ks}.test BYPASS CACHE;") + assert len(rows) == 0 - await assert_empty_table() + await assert_empty_table() - await manager.api.flush_keyspace(servers[0].ip_addr, "test") + await manager.api.flush_keyspace(servers[0].ip_addr, ks) - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count == 1 + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count == 1 - await manager.api.enable_injection(servers[0].ip_addr, "tablet_load_stats_refresh_before_rebalancing", one_shot=False) - await manager.api.enable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone", one_shot=False) + await manager.api.enable_injection(servers[0].ip_addr, "tablet_load_stats_refresh_before_rebalancing", one_shot=False) + await manager.api.enable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone", one_shot=False) - s1_log = await manager.server_open_log(servers[0].server_id) - s1_mark = await s1_log.mark() + s1_log = await manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() - # Waits for tombstones to be expired. - time.sleep(1) + # Waits for tombstones to be expired. 
+ time.sleep(1) - await manager.api.enable_injection(servers[0].ip_addr, "split_sstable_rewrite", one_shot=False) + await manager.api.enable_injection(servers[0].ip_addr, "split_sstable_rewrite", one_shot=False) - logger.info("Enable balancing so split will be emitted") - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + logger.info("Enable balancing so split will be emitted") + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - logger.info("Waits for split of sstable containing expired tombstones") - await s1_log.wait_for(f"split_sstable_rewrite: waiting", from_mark=s1_mark) - s1_mark = await s1_log.mark() - await manager.api.message_injection(servers[0].ip_addr, "split_sstable_rewrite") - await s1_log.wait_for(f"split_sstable_rewrite: released", from_mark=s1_mark) + logger.info("Waits for split of sstable containing expired tombstones") + await s1_log.wait_for(f"split_sstable_rewrite: waiting", from_mark=s1_mark) + s1_mark = await s1_log.mark() + await manager.api.message_injection(servers[0].ip_addr, "split_sstable_rewrite") + await s1_log.wait_for(f"split_sstable_rewrite: released", from_mark=s1_mark) - logger.info("Pause split of sstable containing deleted data") - await s1_log.wait_for(f"split_sstable_rewrite: waiting", from_mark=s1_mark) - s1_mark = await s1_log.mark() + logger.info("Pause split of sstable containing deleted data") + await s1_log.wait_for(f"split_sstable_rewrite: waiting", from_mark=s1_mark) + s1_mark = await s1_log.mark() - logger.info("Force compaction of split sstable containing expired tombstone") - await manager.api.stop_compaction(servers[0].ip_addr, "SPLIT") - await manager.api.keyspace_compaction(servers[0].ip_addr, "test") + logger.info("Force compaction of split sstable containing expired tombstone") + await manager.api.stop_compaction(servers[0].ip_addr, "SPLIT") + await manager.api.keyspace_compaction(servers[0].ip_addr, ks) - await s1_log.wait_for(f"split_sstable_rewrite: released", from_mark=s1_mark) 
+ await s1_log.wait_for(f"split_sstable_rewrite: released", from_mark=s1_mark) - await manager.api.disable_injection(servers[0].ip_addr, "split_sstable_rewrite") + await manager.api.disable_injection(servers[0].ip_addr, "split_sstable_rewrite") - await manager.api.disable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone") - await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) + await manager.api.disable_injection(servers[0].ip_addr, "tablet_split_finalization_postpone") + await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count > 1 + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count > 1 - logger.info("Verify data is not resurrected") - await assert_empty_table() + logger.info("Verify data is not resurrected") + await assert_empty_table() async def create_cluster(manager: ManagerClient, num_dcs: int, num_racks: int, nodes_per_rack: int) -> dict[ServerNum, ServerInfo]: logger.debug(f"Creating cluster: num_dcs={num_dcs} num_racks={num_racks} nodes_per_rack={nodes_per_rack}") @@ -1639,7 +1627,7 @@ class TestContext: @asynccontextmanager async def create_and_populate_table(manager: ManagerClient, rf: int = 3, initial_tablets: int = 64, num_keys: int = 0): - ks = unique_name() + ks = "" table = unique_name() if not num_keys: num_keys = initial_tablets * 4 @@ -1648,7 +1636,7 @@ async def create_and_populate_table(manager: ManagerClient, rf: int = 3, initial cql = manager.get_cql() try: - await cql.run_async(f"CREATE KEYSPACE {ks} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} AND tablets = {{'initial': {initial_tablets}}}") + ks = await create_new_test_keyspace(cql, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}} AND tablets = {{'initial': {initial_tablets}}}") await cql.run_async(f"CREATE TABLE 
{ks}.{table} (pk int PRIMARY KEY, c int)") await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.{table} (pk, c) VALUES ({k}, 1);") for k in range(num_keys)]) yield TestContext(ks, table, rf, initial_tablets, num_keys) From e59aca66bfd9c14cad6a345b080d72b67b30f6cc Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 43/56] topology_custom/test_tablets_cql: use new_test_keyspace And create_new_test_keyspace when we need drop to be explicit. Signed-off-by: Benny Halevy --- test/topology_custom/test_tablets_cql.py | 90 ++++++++++++------------ 1 file changed, 46 insertions(+), 44 deletions(-) diff --git a/test/topology_custom/test_tablets_cql.py b/test/topology_custom/test_tablets_cql.py index 5979e316ee..8d24868c73 100644 --- a/test/topology_custom/test_tablets_cql.py +++ b/test/topology_custom/test_tablets_cql.py @@ -12,7 +12,7 @@ from cassandra.protocol import InvalidRequest from test.pylib.manager_client import ManagerClient from test.pylib.rest_client import inject_error_one_shot from test.topology.conftest import skip_mode -from test.topology.util import disable_schema_agreement_wait +from test.topology.util import disable_schema_agreement_wait, create_new_test_keyspace, new_test_keyspace logger = logging.getLogger(__name__) @@ -30,10 +30,10 @@ async def test_alter_dropped_tablets_keyspace(manager: ManagerClient) -> None: logger.info("starting a second node (the follower)") servers += [await manager.server_add(config=config)] - await manager.get_cql().run_async("create keyspace ks with " + ks = await create_new_test_keyspace(manager.get_cql(), "with " "replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} and " "tablets = {'enabled': true}") - await manager.get_cql().run_async("create table ks.t (pk int primary key)") + await manager.get_cql().run_async(f"create table {ks}.t (pk int primary key)") logger.info(f"injecting wait-after-topology-coordinator-gets-event into the leader node 
{servers[0]}") injection_handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, @@ -43,7 +43,7 @@ async def test_alter_dropped_tablets_keyspace(manager: ManagerClient) -> None: res = await manager.get_cql().run_async("select data_center from system.local") # ALTER tablets KS only accepts a specific DC, it rejects the generic 'replication_factor' tag this_dc = res[0].data_center - await manager.get_cql().run_async("alter keyspace ks " + await manager.get_cql().run_async(f"alter keyspace {ks} " f"with replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': 1}}") # by creating a task this way we ensure it's immediately executed, but we won't wait until it's completed @@ -56,16 +56,16 @@ async def test_alter_dropped_tablets_keyspace(manager: ManagerClient) -> None: logger.info(f"dropping KS from the follower node {servers[1]} so that the leader, which hangs on injected sleep, " f"wakes up with the drop applied") host = manager.get_cql().cluster.metadata.get_host(servers[1].ip_addr) - await manager.get_cql().run_async("drop keyspace ks", host=host) + await manager.get_cql().run_async(f"drop keyspace {ks}", host=host) logger.info("Waking up the leader to continue processing ALTER with KS that doesn't exist (has been just dropped)") await injection_handler.message() matches = await leader_log_file.grep("topology change coordinator fiber got error " - "data_dictionary::no_such_keyspace \(Can't find a keyspace ks\)") + f"data_dictionary::no_such_keyspace \(Can't find a keyspace {ks}\)") assert not matches - with pytest.raises(InvalidRequest, match="Can't ALTER keyspace ks, keyspace doesn't exist") as e: + with pytest.raises(InvalidRequest, match=f"Can't ALTER keyspace {ks}, keyspace doesn't exist") as e: await task @pytest.mark.asyncio @@ -81,51 +81,53 @@ async def test_alter_tablets_keyspace_concurrent_modification(manager: ManagerCl logger.info("starting a second node (the follower)") servers += [await manager.server_add(config=config)] - await 
manager.get_cql().run_async("create keyspace ks with " + async with new_test_keyspace(manager, "with " "replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} and " - "tablets = {'initial': 2}") - await manager.get_cql().run_async("create table ks.t (pk int primary key)") + "tablets = {'initial': 2}") as ks: + await manager.get_cql().run_async(f"create table {ks}.t (pk int primary key)") - logger.info(f"injecting wait-before-committing-rf-change-event into the leader node {servers[0]}") - injection_handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, - 'wait-before-committing-rf-change-event') + logger.info(f"injecting wait-before-committing-rf-change-event into the leader node {servers[0]}") + injection_handler = await inject_error_one_shot(manager.api, servers[0].ip_addr, + 'wait-before-committing-rf-change-event') - # ALTER tablets KS only accepts a specific DC, it rejects the generic 'replication_factor' tag - res = await manager.get_cql().run_async("select data_center from system.local") - this_dc = res[0].data_center + # ALTER tablets KS only accepts a specific DC, it rejects the generic 'replication_factor' tag + res = await manager.get_cql().run_async("select data_center from system.local") + this_dc = res[0].data_center - async def alter_tablets_ks_without_waiting_to_complete(): - logger.info("scheduling ALTER KS to change the RF from 1 to 2") - await manager.get_cql().run_async("alter keyspace ks " - f"with replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': 2}}") + async def alter_tablets_ks_without_waiting_to_complete(): + logger.info("scheduling ALTER KS to change the RF from 1 to 2") + await manager.get_cql().run_async(f"alter keyspace {ks} " + f"with replication = {{'class': 'NetworkTopologyStrategy', '{this_dc}': 2}}") - # by creating a task this way we ensure it's immediately executed, - # but we don't want to wait until the task is completed here, - # because we want to do something else in the 
meantime - task = asyncio.create_task(alter_tablets_ks_without_waiting_to_complete()) + # by creating a task this way we ensure it's immediately executed, + # but we don't want to wait until the task is completed here, + # because we want to do something else in the meantime + task = asyncio.create_task(alter_tablets_ks_without_waiting_to_complete()) - logger.info(f"waiting for the leader node {servers[0]} to start handling the keyspace-rf-change request") - leader_log_file = await manager.server_open_log(servers[0].server_id) - await leader_log_file.wait_for("wait-before-committing-rf-change-event: waiting", timeout=10) + logger.info(f"waiting for the leader node {servers[0]} to start handling the keyspace-rf-change request") + leader_log_file = await manager.server_open_log(servers[0].server_id) + await leader_log_file.wait_for("wait-before-committing-rf-change-event: waiting", timeout=10) - logger.info(f"creating another keyspace from the follower node {servers[1]} so that the leader, which hangs on injected sleep, " - f"wakes up with a changed schema") - host = manager.get_cql().cluster.metadata.get_host(servers[1].ip_addr) - with disable_schema_agreement_wait(manager.get_cql()): - await manager.get_cql().run_async("create keyspace ks2 with " - "replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} " - "and tablets = {'enabled': true}", host=host) + logger.info(f"creating another keyspace from the follower node {servers[1]} so that the leader, which hangs on injected sleep, " + f"wakes up with a changed schema") + host = manager.get_cql().cluster.metadata.get_host(servers[1].ip_addr) + with disable_schema_agreement_wait(manager.get_cql()): + ks2 = await create_new_test_keyspace(manager.get_cql(), "with " + "replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} " + "and tablets = {'enabled': true}", host=host) - logger.info("waking up the leader to continue processing ALTER on a changed schema, which should cause a 
retry") - await injection_handler.message() + logger.info("waking up the leader to continue processing ALTER on a changed schema, which should cause a retry") + await injection_handler.message() - logger.info("waiting for ALTER to complete") - await task + logger.info("waiting for ALTER to complete") + await task - # ensure that the concurrent modification error really did take place - matches = await leader_log_file.grep("topology change coordinator fiber got group0_concurrent_modification") - assert matches + # ensure that the concurrent modification error really did take place + matches = await leader_log_file.grep("topology change coordinator fiber got group0_concurrent_modification") + assert matches - # ensure that the ALTER has eventually succeeded and we changed RF from 1 to 2 - res = manager.get_cql().execute(f"SELECT * FROM system_schema.keyspaces WHERE keyspace_name = 'ks'") - assert res[0].replication[this_dc] == '2' + # ensure that the ALTER has eventually succeeded and we changed RF from 1 to 2 + res = manager.get_cql().execute(f"SELECT * FROM system_schema.keyspaces WHERE keyspace_name = '{ks}'") + assert res[0].replication[this_dc] == '2' + + await manager.get_cql().run_async(f"drop keyspace {ks2}") From 5ff31539122f258bcc7376b4db35773e18bb79da Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 44/56] topology_custom/test_tablets_intranode: use new_test_keyspace Signed-off-by: Benny Halevy --- .../topology_custom/test_tablets_intranode.py | 138 +++++++++--------- 1 file changed, 68 insertions(+), 70 deletions(-) diff --git a/test/topology_custom/test_tablets_intranode.py b/test/topology_custom/test_tablets_intranode.py index cefd63607f..ba3b77950e 100644 --- a/test/topology_custom/test_tablets_intranode.py +++ b/test/topology_custom/test_tablets_intranode.py @@ -12,7 +12,7 @@ from test.pylib.rest_client import inject_error from test.pylib.util import wait_for_cql_and_get_hosts, start_writes from 
test.pylib.tablets import get_tablet_replica, get_all_tablet_replicas from test.topology.conftest import skip_mode -from test.topology.util import reconnect_driver +from test.topology.util import new_test_keyspace import pytest import asyncio @@ -43,26 +43,26 @@ async def test_intranode_migration(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - finish_writes = await start_writes(cql, "test", "test") + finish_writes = await start_writes(cql, ks, "test") - tablet_token = 0 # Doesn't matter since there is one tablet - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) + tablet_token = 0 # Doesn't matter since there is one tablet + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) - s0_host_id = await manager.get_host_id(servers[0].server_id) - src_shard = replica[1] - dst_shard = src_shard ^ 1 + s0_host_id = await manager.get_host_id(servers[0].server_id) + src_shard = replica[1] + dst_shard = src_shard ^ 1 - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], src_shard, replica[0], dst_shard, tablet_token) + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], src_shard, replica[0], dst_shard, tablet_token) - key_count = await finish_writes() + key_count = await finish_writes() - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == key_count - for r in rows: - assert r.c == r.pk + rows = await 
cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(rows) == key_count + for r in rows: + assert r.c == r.pk @pytest.mark.asyncio @@ -79,43 +79,42 @@ async def test_crash_during_intranode_migration(manager: ManagerClient): cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}" - " AND tablets = {'initial': 4};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 4}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - finish_writes = await start_writes(cql, "test", "test", ignore_errors=True) + finish_writes = await start_writes(cql, ks, "test", ignore_errors=True) - tablet_token = 0 # Choose one tablet, any of them - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) + tablet_token = 0 # Choose one tablet, any of them + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) - src_shard = replica[1] - dst_shard = src_shard ^ 1 + src_shard = replica[1] + dst_shard = src_shard ^ 1 - await manager.api.enable_injection(servers[0].ip_addr, 'crash-in-tablet-write-both-read-new', one_shot=True) + await manager.api.enable_injection(servers[0].ip_addr, 'crash-in-tablet-write-both-read-new', one_shot=True) - migration_task = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, "test", "test", - replica[0], src_shard, replica[0], dst_shard, tablet_token)) + migration_task = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "test", + replica[0], src_shard, replica[0], dst_shard, tablet_token)) - s0_logs = await manager.server_open_log(servers[0].server_id) - await s0_logs.wait_for('crash-in-tablet-write-both-read-new hit') - await manager.server_stop(servers[0].server_id) - await 
manager.server_start(servers[0].server_id) - await wait_for_cql_and_get_hosts(manager.cql, servers, time.time() + 60) + s0_logs = await manager.server_open_log(servers[0].server_id) + await s0_logs.wait_for('crash-in-tablet-write-both-read-new hit') + await manager.server_stop(servers[0].server_id) + await manager.server_start(servers[0].server_id) + await wait_for_cql_and_get_hosts(manager.cql, servers, time.time() + 60) - # Wait for the tablet migration to finish - await manager.api.quiesce_topology(servers[0].ip_addr) + # Wait for the tablet migration to finish + await manager.api.quiesce_topology(servers[0].ip_addr) - try: - await migration_task - except: - pass + try: + await migration_task + except: + pass - key_count = await finish_writes() + key_count = await finish_writes() - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == key_count - for r in rows: - assert r.c == r.pk + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(rows) == key_count + for r in rows: + assert r.c == r.pk @pytest.mark.asyncio @@ -147,39 +146,38 @@ async def test_cross_shard_migration(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}" - " AND tablets = {'initial': 2};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - finish_writes = await start_writes(cql, "test", "test") + finish_writes = await start_writes(cql, ks, "test") - tablet0_token = -1 - tablet1_token = 1 - replica0 = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet0_token) - replica1 = await 
get_tablet_replica(manager, servers[0], 'test', 'test', tablet1_token) + tablet0_token = -1 + tablet1_token = 1 + replica0 = await get_tablet_replica(manager, servers[0], ks, 'test', tablet0_token) + replica1 = await get_tablet_replica(manager, servers[0], ks, 'test', tablet1_token) - s0_host_id = await manager.get_host_id(servers[0].server_id) - s1_host_id = await manager.get_host_id(servers[1].server_id) + s0_host_id = await manager.get_host_id(servers[0].server_id) + s1_host_id = await manager.get_host_id(servers[1].server_id) - # Place tablets on non-zero shards so that defaulted shard (0) is never the right shard. - # This is to catch the problem when sharder (incorrectly) thinks that tablet does not have - # any replica on the current host and assigns shard 0 to it in shard_for_read(). - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica0[0], replica0[1], s0_host_id, 1, tablet0_token) - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica1[0], replica1[1], s1_host_id, 1, tablet1_token) + # Place tablets on non-zero shards so that defaulted shard (0) is never the right shard. + # This is to catch the problem when sharder (incorrectly) thinks that tablet does not have + # any replica on the current host and assigns shard 0 to it in shard_for_read(). + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica0[0], replica0[1], s0_host_id, 1, tablet0_token) + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica1[0], replica1[1], s1_host_id, 1, tablet1_token) - # Put whole token ring into migration so that all requests hit the migration path. Half of them - # will be coordinated by the owning host, half will be coordinated by the non-owning host. 
- migration0 = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, "test", "test", - s0_host_id, 1, s1_host_id, 1, tablet0_token)) - migration1 = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, "test", "test", - s1_host_id, 1, s0_host_id, 1, tablet1_token)) + # Put whole token ring into migration so that all requests hit the migration path. Half of them + # will be coordinated by the owning host, half will be coordinated by the non-owning host. + migration0 = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "test", + s0_host_id, 1, s1_host_id, 1, tablet0_token)) + migration1 = asyncio.create_task(manager.api.move_tablet(servers[0].ip_addr, ks, "test", + s1_host_id, 1, s0_host_id, 1, tablet1_token)) - await migration0 - await migration1 + await migration0 + await migration1 - key_count = await finish_writes() + key_count = await finish_writes() - rows = await cql.run_async("SELECT * FROM test.test;") - assert len(rows) == key_count - for r in rows: - assert r.c == r.pk + rows = await cql.run_async(f"SELECT * FROM {ks}.test;") + assert len(rows) == key_count + for r in rows: + assert r.c == r.pk From 20f7eda16e2e32d348d89d49f7cb8be5611e8b30 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 45/56] topology_custom/test_tablets_merge: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_tablets_merge.py | 385 +++++++++++---------- 1 file changed, 193 insertions(+), 192 deletions(-) diff --git a/test/topology_custom/test_tablets_merge.py b/test/topology_custom/test_tablets_merge.py index 2cdf0e0e55..9298f1d253 100644 --- a/test/topology_custom/test_tablets_merge.py +++ b/test/topology_custom/test_tablets_merge.py @@ -10,6 +10,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.rest_client import inject_error_one_shot, HTTPError, read_barrier from test.pylib.tablets import get_all_tablet_replicas from test.topology.conftest import 
skip_mode +from test.topology.util import new_test_keyspace import pytest import asyncio @@ -62,131 +63,131 @@ async def test_tablet_merge_simple(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") - # Initial average table size of 400k (1 tablet), so triggers some splits. - total_keys = 200 - keys = range(total_keys) - def populate(keys): - insert = cql.prepare(f"INSERT INTO test.test(pk, c) VALUES(?, ?)") - for pk in keys: - value = random.randbytes(2000) - cql.execute(insert, [pk, value]) - populate(keys) + # Initial average table size of 400k (1 tablet), so triggers some splits. 
+ total_keys = 200 + keys = range(total_keys) + def populate(keys): + insert = cql.prepare(f"INSERT INTO {ks}.test(pk, c) VALUES(?, ?)") + for pk in keys: + value = random.randbytes(2000) + cql.execute(insert, [pk, value]) + populate(keys) - async def check(): - logger.info("Checking table") - cql = manager.get_cql() - rows = await cql.run_async("SELECT * FROM test.test BYPASS CACHE;") - assert len(rows) == len(keys) + async def check(): + logger.info("Checking table") + cql = manager.get_cql() + rows = await cql.run_async(f"SELECT * FROM {ks}.test BYPASS CACHE;") + assert len(rows) == len(keys) - await check() + await check() - await manager.api.flush_keyspace(servers[0].ip_addr, "test") + await manager.api.flush_keyspace(servers[0].ip_addr, ks) - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count == 1 + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count == 1 - logger.info("Adding new server") - servers.append(await manager.server_add(cmdline=cmdline)) - s1_host_id = await manager.get_host_id(servers[1].server_id) + logger.info("Adding new server") + servers.append(await manager.server_add(cmdline=cmdline)) + s1_host_id = await manager.get_host_id(servers[1].server_id) - # Increases the chance of tablet migration concurrent with split - await inject_error_one_shot_on(manager, "tablet_allocator_shuffle", servers) - await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers) + # Increases the chance of tablet migration concurrent with split + await inject_error_one_shot_on(manager, "tablet_allocator_shuffle", servers) + await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers) - s1_log = await manager.server_open_log(servers[0].server_id) - s1_mark = await s1_log.mark() + s1_log = await manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() - # Now there's a split and migration need, so they'll 
potentially run concurrently. - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + # Now there's a split and migration need, so they'll potentially run concurrently. + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - await check() - time.sleep(2) # Give load balancer some time to do work + await check() + time.sleep(2) # Give load balancer some time to do work - await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) + await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) - await check() + await check() - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count > 1 + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count > 1 - # Allow shuffling of tablet replicas to make co-location work harder - async def shuffle(): - await inject_error_on(manager, "tablet_allocator_shuffle", servers) - time.sleep(2) - await disable_injection_on(manager, "tablet_allocator_shuffle", servers) + # Allow shuffling of tablet replicas to make co-location work harder + async def shuffle(): + await inject_error_on(manager, "tablet_allocator_shuffle", servers) + time.sleep(2) + await disable_injection_on(manager, "tablet_allocator_shuffle", servers) - await shuffle() + await shuffle() - # This will allow us to simulate some balancing after co-location with shuffling, to make sure that - # balancer won't break co-location. - await inject_error_on(manager, "tablet_merge_completion_bypass", servers) + # This will allow us to simulate some balancing after co-location with shuffling, to make sure that + # balancer won't break co-location. + await inject_error_on(manager, "tablet_merge_completion_bypass", servers) - # Shrinks table significantly, forcing merge. 
- delete_keys = range(total_keys - 1) - await asyncio.gather(*[cql.run_async(f"DELETE FROM test.test WHERE pk={k};") for k in delete_keys]) - keys = range(total_keys - 1, total_keys) + # Shrinks table significantly, forcing merge. + delete_keys = range(total_keys - 1) + await asyncio.gather(*[cql.run_async(f"DELETE FROM {ks}.test WHERE pk={k};") for k in delete_keys]) + keys = range(total_keys - 1, total_keys) - # To avoid race of major with migration - await manager.api.disable_tablet_balancing(servers[0].ip_addr) + # To avoid race of major with migration + await manager.api.disable_tablet_balancing(servers[0].ip_addr) - for server in servers: - await manager.api.flush_keyspace(server.ip_addr, "test") - await manager.api.keyspace_compaction(server.ip_addr, "test") - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + for server in servers: + await manager.api.flush_keyspace(server.ip_addr, ks) + await manager.api.keyspace_compaction(server.ip_addr, ks) + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - await s1_log.wait_for("Emitting resize decision of type merge", from_mark=s1_mark) - # Waits for balancer to co-locate sibling tablets - await s1_log.wait_for("All sibling tablets are co-located") - # Do some shuffling to make sure balancer works with co-located tablets - await shuffle() + await s1_log.wait_for("Emitting resize decision of type merge", from_mark=s1_mark) + # Waits for balancer to co-locate sibling tablets + await s1_log.wait_for("All sibling tablets are co-located") + # Do some shuffling to make sure balancer works with co-located tablets + await shuffle() - old_tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - s1_mark = await s1_log.mark() + old_tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + s1_mark = await s1_log.mark() - await inject_error_on(manager, "replica_merge_completion_wait", servers) - await disable_injection_on(manager, "tablet_merge_completion_bypass", 
servers) + await inject_error_on(manager, "replica_merge_completion_wait", servers) + await disable_injection_on(manager, "tablet_merge_completion_bypass", servers) - await s1_log.wait_for('Detected tablet merge for table', from_mark=s1_mark) + await s1_log.wait_for('Detected tablet merge for table', from_mark=s1_mark) - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count < old_tablet_count - await check() + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count < old_tablet_count + await check() - # Reproduces https://github.com/scylladb/scylladb/issues/21867 that could cause compaction group - # to be destroyed without being stopped first. - # That's done by: - # 1) Migrating a tablet to another node, and putting an artificial delay in cleanup stage when stopping groups - # 2) Force tablet split, causing new groups to be added in a tablet being cleaned up - # Without the fix, new groups are added to tablet being migrated away and never closed, potentially - # resulting in an use-after-free. - keys = range(total_keys) - populate(keys) - # Migrates a tablet to another node and put artificial delay on cleanup stage - await manager.api.enable_injection(servers[0].ip_addr, "delay_tablet_compaction_groups_cleanup", one_shot=True) - tablet_replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - assert len(tablet_replicas) > 0 - t = tablet_replicas[0] - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", *t.replicas[0], *(s1_host_id, 0), t.last_token)) - # Trigger split - for server in servers: - await manager.api.flush_keyspace(server.ip_addr, "test") - try: - await migration_task - except: - # move_tablet() fails if tablet is already in transit. - # forgive if balancer decided to migrate the target tablet post split. 
- pass + # Reproduces https://github.com/scylladb/scylladb/issues/21867 that could cause compaction group + # to be destroyed without being stopped first. + # That's done by: + # 1) Migrating a tablet to another node, and putting an artificial delay in cleanup stage when stopping groups + # 2) Force tablet split, causing new groups to be added in a tablet being cleaned up + # Without the fix, new groups are added to tablet being migrated away and never closed, potentially + # resulting in an use-after-free. + keys = range(total_keys) + populate(keys) + # Migrates a tablet to another node and put artificial delay on cleanup stage + await manager.api.enable_injection(servers[0].ip_addr, "delay_tablet_compaction_groups_cleanup", one_shot=True) + tablet_replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + assert len(tablet_replicas) > 0 + t = tablet_replicas[0] + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", *t.replicas[0], *(s1_host_id, 0), t.last_token)) + # Trigger split + for server in servers: + await manager.api.flush_keyspace(server.ip_addr, ks) + try: + await migration_task + except: + # move_tablet() fails if tablet is already in transit. + # forgive if balancer decided to migrate the target tablet post split. + pass - await s1_log.wait_for('Merge completion fiber finished', from_mark=s1_mark) + await s1_log.wait_for('Merge completion fiber finished', from_mark=s1_mark) - for server in servers: - await manager.api.flush_keyspace(server.ip_addr, "test") - await manager.api.keyspace_compaction(server.ip_addr, "test") - await check() + for server in servers: + await manager.api.flush_keyspace(server.ip_addr, ks) + await manager.api.keyspace_compaction(server.ip_addr, ks) + await check() # Multiple cycles of split and merge, with topology changes in parallel and RF > 1. 
@pytest.mark.asyncio @@ -209,127 +210,127 @@ async def test_tablet_split_and_merge_with_concurrent_topology_changes(manager: await manager.server_add(config=config, cmdline=cmdline)] cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") - async def perform_topology_ops(): - logger.info("Topology ops in background") - server_id_to_decommission = servers[-1].server_id - logger.info("Decommissioning old server with id {}".format(server_id_to_decommission)) - await manager.decommission_node(server_id_to_decommission) - servers.pop() - logger.info("Adding new server") - servers.append(await manager.server_add(cmdline=cmdline)) - logger.info("Completed topology ops") + async def perform_topology_ops(): + logger.info("Topology ops in background") + server_id_to_decommission = servers[-1].server_id + logger.info("Decommissioning old server with id {}".format(server_id_to_decommission)) + await manager.decommission_node(server_id_to_decommission) + servers.pop() + logger.info("Adding new server") + servers.append(await manager.server_add(cmdline=cmdline)) + logger.info("Completed topology ops") - for cycle in range(2): - logger.info("Running split-merge cycle #{}".format(cycle)) + for cycle in range(2): + logger.info("Running split-merge cycle #{}".format(cycle)) - await manager.api.disable_tablet_balancing(servers[0].ip_addr) + await manager.api.disable_tablet_balancing(servers[0].ip_addr) - logger.info("Inserting data") - # Initial 
average table size of (400k + metadata_overhead). Enough to trigger a few splits. - total_keys = 200 - keys = range(total_keys) - insert = cql.prepare(f"INSERT INTO test.test(pk, c) VALUES(?, ?)") - for pk in keys: - value = random.randbytes(2000) - cql.execute(insert, [pk, value]) + logger.info("Inserting data") + # Initial average table size of (400k + metadata_overhead). Enough to trigger a few splits. + total_keys = 200 + keys = range(total_keys) + insert = cql.prepare(f"INSERT INTO {ks}.test(pk, c) VALUES(?, ?)") + for pk in keys: + value = random.randbytes(2000) + cql.execute(insert, [pk, value]) - async def check(): - logger.info("Checking table") - cql = manager.get_cql() - rows = await cql.run_async("SELECT * FROM test.test BYPASS CACHE;") - assert len(rows) == len(keys) + async def check(): + logger.info("Checking table") + cql = manager.get_cql() + rows = await cql.run_async(f"SELECT * FROM {ks}.test BYPASS CACHE;") + assert len(rows) == len(keys) - await check() + await check() - logger.info("Flushing keyspace") - for server in servers: - await manager.api.flush_keyspace(server.ip_addr, "test") + logger.info("Flushing keyspace") + for server in servers: + await manager.api.flush_keyspace(server.ip_addr, ks) - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') - # Increases the chance of tablet migration concurrent with split - await inject_error_on(manager, "tablet_allocator_shuffle", servers) - await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers) + # Increases the chance of tablet migration concurrent with split + await inject_error_on(manager, "tablet_allocator_shuffle", servers) + await inject_error_on(manager, "tablet_load_stats_refresh_before_rebalancing", servers) - s1_log = await manager.server_open_log(servers[0].server_id) - s1_mark = await s1_log.mark() + s1_log = await 
manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() - logger.info("Enabling balancing") - # Now there's a split and migration need, so they'll potentially run concurrently. - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + logger.info("Enabling balancing") + # Now there's a split and migration need, so they'll potentially run concurrently. + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - topology_ops_task = asyncio.create_task(perform_topology_ops()) + topology_ops_task = asyncio.create_task(perform_topology_ops()) - await check() + await check() - logger.info("Waiting for split") - await disable_injection_on(manager, "tablet_allocator_shuffle", servers) - await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) + logger.info("Waiting for split") + await disable_injection_on(manager, "tablet_allocator_shuffle", servers) + await s1_log.wait_for('Detected tablet split for table', from_mark=s1_mark) - logger.info("Waiting for topology ops") - await topology_ops_task + logger.info("Waiting for topology ops") + await topology_ops_task - await check() + await check() - old_tablet_count = tablet_count - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count > old_tablet_count - logger.info("Split increased number of tablets from {} to {}".format(old_tablet_count, tablet_count)) + old_tablet_count = tablet_count + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count > old_tablet_count + logger.info("Split increased number of tablets from {} to {}".format(old_tablet_count, tablet_count)) - # Allow shuffling of tablet replicas to make co-location work harder - await inject_error_on(manager, "tablet_allocator_shuffle", servers) - # This will allow us to simulate some balancing after co-location with shuffling, to make sure that - # balancer won't break co-location. 
- await inject_error_on(manager, "tablet_merge_completion_bypass", servers) + # Allow shuffling of tablet replicas to make co-location work harder + await inject_error_on(manager, "tablet_allocator_shuffle", servers) + # This will allow us to simulate some balancing after co-location with shuffling, to make sure that + # balancer won't break co-location. + await inject_error_on(manager, "tablet_merge_completion_bypass", servers) - logger.info("Deleting data") - # Delete almost all keys, enough to trigger a few merges. - delete_keys = range(total_keys - 1) - await asyncio.gather(*[cql.run_async(f"DELETE FROM test.test WHERE pk={k};") for k in delete_keys]) - keys = range(total_keys - 1, total_keys) + logger.info("Deleting data") + # Delete almost all keys, enough to trigger a few merges. + delete_keys = range(total_keys - 1) + await asyncio.gather(*[cql.run_async(f"DELETE FROM {ks}.test WHERE pk={k};") for k in delete_keys]) + keys = range(total_keys - 1, total_keys) - await disable_injection_on(manager, "tablet_allocator_shuffle", servers) + await disable_injection_on(manager, "tablet_allocator_shuffle", servers) - # To avoid race of major with migration - await manager.api.disable_tablet_balancing(servers[0].ip_addr) + # To avoid race of major with migration + await manager.api.disable_tablet_balancing(servers[0].ip_addr) - logger.info("Flushing keyspace and performing major") - for server in servers: - await manager.api.flush_keyspace(server.ip_addr, "test") - await manager.api.keyspace_compaction(server.ip_addr, "test") - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + logger.info("Flushing keyspace and performing major") + for server in servers: + await manager.api.flush_keyspace(server.ip_addr, ks) + await manager.api.keyspace_compaction(server.ip_addr, ks) + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - logger.info("Waiting for merge decision") - await s1_log.wait_for("Emitting resize decision of type merge", 
from_mark=s1_mark) - # Waits for balancer to co-locate sibling tablets - await s1_log.wait_for("All sibling tablets are co-located") - # Do some shuffling to make sure balancer works with co-located tablets - await inject_error_on(manager, "tablet_allocator_shuffle", servers) + logger.info("Waiting for merge decision") + await s1_log.wait_for("Emitting resize decision of type merge", from_mark=s1_mark) + # Waits for balancer to co-locate sibling tablets + await s1_log.wait_for("All sibling tablets are co-located") + # Do some shuffling to make sure balancer works with co-located tablets + await inject_error_on(manager, "tablet_allocator_shuffle", servers) - old_tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') + old_tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') - topology_ops_task = asyncio.create_task(perform_topology_ops()) + topology_ops_task = asyncio.create_task(perform_topology_ops()) - await inject_error_on(manager, "replica_merge_completion_wait", servers) - await disable_injection_on(manager, "tablet_merge_completion_bypass", servers) - await disable_injection_on(manager, "tablet_allocator_shuffle", servers) + await inject_error_on(manager, "replica_merge_completion_wait", servers) + await disable_injection_on(manager, "tablet_merge_completion_bypass", servers) + await disable_injection_on(manager, "tablet_allocator_shuffle", servers) - await s1_log.wait_for('Detected tablet merge for table', from_mark=s1_mark) - await s1_log.wait_for('Merge completion fiber finished', from_mark=s1_mark) + await s1_log.wait_for('Detected tablet merge for table', from_mark=s1_mark) + await s1_log.wait_for('Merge completion fiber finished', from_mark=s1_mark) - logger.info("Waiting for topology ops") - await topology_ops_task + logger.info("Waiting for topology ops") + await topology_ops_task - tablet_count = await get_tablet_count(manager, servers[0], 'test', 'test') - assert tablet_count < old_tablet_count - 
logger.info("Merge decreased number of tablets from {} to {}".format(old_tablet_count, tablet_count)) - await check() + tablet_count = await get_tablet_count(manager, servers[0], ks, 'test') + assert tablet_count < old_tablet_count + logger.info("Merge decreased number of tablets from {} to {}".format(old_tablet_count, tablet_count)) + await check() - logger.info("Flushing keyspace and performing major") - for server in servers: - await manager.api.flush_keyspace(server.ip_addr, "test") - await manager.api.keyspace_compaction(server.ip_addr, "test") - await check() + logger.info("Flushing keyspace and performing major") + for server in servers: + await manager.api.flush_keyspace(server.ip_addr, ks) + await manager.api.keyspace_compaction(server.ip_addr, ks) + await check() From f30e4c6917fb64571786fd03d648f51fb1195955 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 46/56] topology_custom/test_tablets_migration: use new_test_keyspace Signed-off-by: Benny Halevy --- .../topology_custom/test_tablets_migration.py | 502 +++++++++--------- 1 file changed, 253 insertions(+), 249 deletions(-) diff --git a/test/topology_custom/test_tablets_migration.py b/test/topology_custom/test_tablets_migration.py index f203679b7e..075f228300 100644 --- a/test/topology_custom/test_tablets_migration.py +++ b/test/topology_custom/test_tablets_migration.py @@ -8,7 +8,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.rest_client import HTTPError, read_barrier from test.pylib.tablets import get_tablet_replica, get_all_tablet_replicas from test.topology.conftest import skip_mode -from test.topology.util import wait_for_cql_and_get_hosts +from test.topology.util import wait_for_cql_and_get_hosts, new_test_keyspace, reconnect_driver import time import pytest import logging @@ -39,59 +39,59 @@ async def test_tablet_transition_sanity(manager: ManagerClient, action): cql = manager.get_cql() - await cql.run_async("CREATE 
KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - logger.info(f"Tablet is on [{replicas}]") - assert len(replicas) == 1 and len(replicas[0].replicas) == 2 - old_replica = replicas[0].replicas[0] - replicas = [ r[0] for r in replicas[0].replicas ] - for h in host_ids: - if h not in replicas: - new_replica = (h, 0) - break - else: - assert False, "Cannot find node without replica" - - if action == 'move': - logger.info(f"Move tablet {old_replica[0]} -> {new_replica[0]}") - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) - if action == 'add_replica': - logger.info(f"Adding replica to tablet, host {new_replica[0]}") - await manager.api.add_tablet_replica(servers[0].ip_addr, "test", "test", new_replica[0], new_replica[1], 0) - if action == 'del_replica': - logger.info(f"Deleting replica from tablet, host {old_replica[0]}") - await manager.api.del_tablet_replica(servers[0].ip_addr, "test", "test", old_replica[0], old_replica[1], 0) - - replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - logger.info(f"Tablet is now on [{replicas}]") - assert len(replicas) == 1 - replicas = [ r[0] for r in replicas[0].replicas ] - if action == 'move': - assert 
len(replicas) == 2 - assert new_replica[0] in replicas - assert old_replica[0] not in replicas - if action == 'add_replica': - assert len(replicas) == 3 - assert old_replica[0] in replicas - assert new_replica[0] in replicas - if action == 'del_replica': - assert len(replicas) == 1 - assert old_replica[0] not in replicas - - for h, s in zip(host_ids, servers): - host = await wait_for_cql_and_get_hosts(cql, [s], time.time() + 30) - if h != host_ids[0]: - await read_barrier(manager.api, host[0].address) # host-0 did the barrier in get_all_tablet_replicas above - res = await cql.run_async("SELECT COUNT(*) FROM MUTATION_FRAGMENTS(test.test)", host=host[0]) - logger.info(f"Host {h} reports {res} as mutation fragments count") - if h in replicas: - assert res[0].count != 0 + replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + logger.info(f"Tablet is on [{replicas}]") + assert len(replicas) == 1 and len(replicas[0].replicas) == 2 + old_replica = replicas[0].replicas[0] + replicas = [ r[0] for r in replicas[0].replicas ] + for h in host_ids: + if h not in replicas: + new_replica = (h, 0) + break else: - assert res[0].count == 0 + assert False, "Cannot find node without replica" + + if action == 'move': + logger.info(f"Move tablet {old_replica[0]} -> {new_replica[0]}") + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) + if action == 'add_replica': + logger.info(f"Adding replica to tablet, host {new_replica[0]}") + await manager.api.add_tablet_replica(servers[0].ip_addr, ks, "test", new_replica[0], new_replica[1], 0) + if action == 'del_replica': + logger.info(f"Deleting replica from tablet, host {old_replica[0]}") + await manager.api.del_tablet_replica(servers[0].ip_addr, ks, "test", old_replica[0], old_replica[1], 0) + + replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + logger.info(f"Tablet is now on [{replicas}]") + assert len(replicas) == 1 + 
replicas = [ r[0] for r in replicas[0].replicas ] + if action == 'move': + assert len(replicas) == 2 + assert new_replica[0] in replicas + assert old_replica[0] not in replicas + if action == 'add_replica': + assert len(replicas) == 3 + assert old_replica[0] in replicas + assert new_replica[0] in replicas + if action == 'del_replica': + assert len(replicas) == 1 + assert old_replica[0] not in replicas + + for h, s in zip(host_ids, servers): + host = await wait_for_cql_and_get_hosts(cql, [s], time.time() + 30) + if h != host_ids[0]: + await read_barrier(manager.api, host[0].address) # host-0 did the barrier in get_all_tablet_replicas above + res = await cql.run_async(f"SELECT COUNT(*) FROM MUTATION_FRAGMENTS({ks}.test)", host=host[0]) + logger.info(f"Host {h} reports {res} as mutation fragments count") + if h in replicas: + assert res[0].count != 0 + else: + assert res[0].count == 0 @pytest.mark.parametrize("fail_replica", ["source", "destination"]) @@ -118,131 +118,135 @@ async def test_node_failure_during_tablet_migration(manager: ManagerClient, fail await make_server() cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1}") - await make_server() - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2} AND tablets = {'initial': 1}") as ks: + await make_server() + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) - await make_server() - - if fail_stage in ["cleanup_target", "revert_migration"]: - # we'll stop 2 servers, group0 quorum should be there - # - # it seems that we need five nodes to have three remaining, but - # when removing the 1st node it 
will be marked as non-voter so to - # remove the 2nd node just two remaining will be enough - # - # also this extra node will be used to call removenode on - # removing the 1st node will wait for the operation to go through - # raft log, and it will not finish before tablet migration. An - # attempt to remove the 2nd node, to make cleanup_target stage - # go ahead, will step on the legacy API lock on storage_service, - # so we need to ask some other node to do it + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) await make_server() - logger.info(f"Cluster is [{host_ids}]") + if fail_stage in ["cleanup_target", "revert_migration"]: + # we'll stop 2 servers, group0 quorum should be there + # + # it seems that we need five nodes to have three remaining, but + # when removing the 1st node it will be marked as non-voter so to + # remove the 2nd node just two remaining will be enough + # + # also this extra node will be used to call removenode on + # removing the 1st node will wait for the operation to go through + # raft log, and it will not finish before tablet migration. 
An + # attempt to remove the 2nd node, to make cleanup_target stage + # go ahead, will step on the legacy API lock on storage_service, + # so we need to ask some other node to do it + await make_server() - replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - logger.info(f"Tablet is on [{replicas}]") - assert len(replicas) == 1 and len(replicas[0].replicas) == 2 + logger.info(f"Cluster is [{host_ids}]") - last_token = replicas[0].last_token - old_replica = None - for r in replicas[0].replicas: - assert r[0] != host_ids[2], "Tablet got migrated to node2" - if r[0] == host_ids[1]: - old_replica = r - assert old_replica is not None - new_replica = (host_ids[2], 0) - logger.info(f"Moving tablet {old_replica} -> {new_replica}") + replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + logger.info(f"Tablet is on [{replicas}]") + assert len(replicas) == 1 and len(replicas[0].replicas) == 2 - class node_failer: - def __init__(self, stage, replica): - self.stage = stage - self.replica = replica - self.fail_idx = 1 if self.replica == "source" else 2 + last_token = replicas[0].last_token + old_replica = None + for r in replicas[0].replicas: + assert r[0] != host_ids[2], "Tablet got migrated to node2" + if r[0] == host_ids[1]: + old_replica = r + assert old_replica is not None + new_replica = (host_ids[2], 0) + logger.info(f"Moving tablet {old_replica} -> {new_replica}") - async def setup(self): - logger.info(f"Will fail {self.stage}") - if self.stage == "streaming": - await manager.api.enable_injection(servers[2].ip_addr, "stream_mutation_fragments", one_shot=True) - self.log = await manager.server_open_log(servers[2].server_id) - self.mark = await self.log.mark() - elif self.stage in [ "allow_write_both_read_old", "write_both_read_old", "write_both_read_new", "use_new", "end_migration", "do_revert_migration" ]: - await manager.api.enable_injection(servers[self.fail_idx].ip_addr, "raft_topology_barrier_and_drain_fail", 
one_shot=False, - parameters={'keyspace': 'test', 'table': 'test', 'last_token': last_token, 'stage': self.stage.removeprefix('do_')}) - self.log = await manager.server_open_log(servers[self.fail_idx].server_id) - self.mark = await self.log.mark() - elif self.stage == "cleanup": - await manager.api.enable_injection(servers[self.fail_idx].ip_addr, "cleanup_tablet_crash", one_shot=True) - self.log = await manager.server_open_log(servers[self.fail_idx].server_id) - self.mark = await self.log.mark() - elif self.stage == "cleanup_target": - assert self.fail_idx == 2 - self.stream_fail = node_failer('streaming', 'source') - await self.stream_fail.setup() - self.cleanup_fail = node_failer('cleanup', 'destination') - await self.cleanup_fail.setup() - elif self.stage == "revert_migration": - self.wbro_fail = node_failer('write_both_read_old', 'source' if self.replica == 'destination' else 'destination') - await self.wbro_fail.setup() - self.revert_fail = node_failer('do_revert_migration', self.replica) - await self.revert_fail.setup() - else: - assert False, f"Unknown stage {self.stage}" + class node_failer: + def __init__(self, stage, replica, ks): + self.stage = stage + self.replica = replica + self.fail_idx = 1 if self.replica == "source" else 2 + self.ks = ks - async def wait(self): - logger.info(f"Wait for {self.stage} to happen") - if self.stage == "streaming": - await self.log.wait_for('stream_mutation_fragments: waiting', from_mark=self.mark) - elif self.stage in [ "allow_write_both_read_old", "write_both_read_old", "write_both_read_new", "use_new", "end_migration", "do_revert_migration" ]: - await self.log.wait_for('raft_topology_cmd: barrier handler waits', from_mark=self.mark); - elif self.stage == "cleanup": - await self.log.wait_for('Crashing tablet cleanup', from_mark=self.mark) - elif self.stage == "cleanup_target": - await self.stream_fail.wait() - self.stream_stop_task = asyncio.create_task(self.stream_fail.stop()) - await self.cleanup_fail.wait() - elif 
self.stage == "revert_migration": - await self.wbro_fail.wait() - self.wbro_fail_task = asyncio.create_task(self.wbro_fail.stop()) - await self.revert_fail.wait() - else: - assert False + async def setup(self): + logger.info(f"Will fail {self.stage}") + if self.stage == "streaming": + await manager.api.enable_injection(servers[2].ip_addr, "stream_mutation_fragments", one_shot=True) + self.log = await manager.server_open_log(servers[2].server_id) + self.mark = await self.log.mark() + elif self.stage in [ "allow_write_both_read_old", "write_both_read_old", "write_both_read_new", "use_new", "end_migration", "do_revert_migration" ]: + await manager.api.enable_injection(servers[self.fail_idx].ip_addr, "raft_topology_barrier_and_drain_fail", one_shot=False, + parameters={'keyspace': self.ks, 'table': 'test', 'last_token': last_token, 'stage': self.stage.removeprefix('do_')}) + self.log = await manager.server_open_log(servers[self.fail_idx].server_id) + self.mark = await self.log.mark() + elif self.stage == "cleanup": + await manager.api.enable_injection(servers[self.fail_idx].ip_addr, "cleanup_tablet_crash", one_shot=True) + self.log = await manager.server_open_log(servers[self.fail_idx].server_id) + self.mark = await self.log.mark() + elif self.stage == "cleanup_target": + assert self.fail_idx == 2 + self.stream_fail = node_failer('streaming', 'source', ks) + await self.stream_fail.setup() + self.cleanup_fail = node_failer('cleanup', 'destination', ks) + await self.cleanup_fail.setup() + elif self.stage == "revert_migration": + self.wbro_fail = node_failer('write_both_read_old', 'source' if self.replica == 'destination' else 'destination', ks) + await self.wbro_fail.setup() + self.revert_fail = node_failer('do_revert_migration', self.replica, ks) + await self.revert_fail.setup() + else: + assert False, f"Unknown stage {self.stage}" - async def stop(self, via=0): - if self.stage == "cleanup_target": - await self.cleanup_fail.stop(via=3) # removenode of source is 
happening via node0 already - await self.stream_stop_task - return - if self.stage == "revert_migration": - await self.revert_fail.stop(via=3) - await self.wbro_fail_task - return + async def wait(self): + logger.info(f"Wait for {self.stage} to happen") + if self.stage == "streaming": + await self.log.wait_for('stream_mutation_fragments: waiting', from_mark=self.mark) + elif self.stage in [ "allow_write_both_read_old", "write_both_read_old", "write_both_read_new", "use_new", "end_migration", "do_revert_migration" ]: + await self.log.wait_for('raft_topology_cmd: barrier handler waits', from_mark=self.mark); + elif self.stage == "cleanup": + await self.log.wait_for('Crashing tablet cleanup', from_mark=self.mark) + elif self.stage == "cleanup_target": + await self.stream_fail.wait() + self.stream_stop_task = asyncio.create_task(self.stream_fail.stop()) + await self.cleanup_fail.wait() + elif self.stage == "revert_migration": + await self.wbro_fail.wait() + self.wbro_fail_task = asyncio.create_task(self.wbro_fail.stop()) + await self.revert_fail.wait() + else: + assert False - logger.info(f"Stop {self.replica} {host_ids[self.fail_idx]}") - await manager.server_stop(servers[self.fail_idx].server_id) - logger.info(f"Remove {self.replica} {host_ids[self.fail_idx]} via {host_ids[via]}") - await manager.remove_node(servers[via].server_id, servers[self.fail_idx].server_id) - logger.info(f"Done with {self.replica} {host_ids[self.fail_idx]}") + async def stop(self, via=0): + if self.stage == "cleanup_target": + await self.cleanup_fail.stop(via=3) # removenode of source is happening via node0 already + await self.stream_stop_task + return + if self.stage == "revert_migration": + await self.revert_fail.stop(via=3) + await self.wbro_fail_task + return + + logger.info(f"Stop {self.replica} {host_ids[self.fail_idx]}") + await manager.server_stop(servers[self.fail_idx].server_id) + logger.info(f"Remove {self.replica} {host_ids[self.fail_idx]} via {host_ids[via]}") + await 
manager.remove_node(servers[via].server_id, servers[self.fail_idx].server_id) + logger.info(f"Done with {self.replica} {host_ids[self.fail_idx]}") - failer = node_failer(fail_stage, fail_replica) - await failer.setup() - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0)) - await failer.wait() - await failer.stop() + failer = node_failer(fail_stage, fail_replica, ks) + await failer.setup() + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0)) + await failer.wait() + await failer.stop() - logger.info("Done, waiting for migration to finish") - await migration_task + logger.info("Done, waiting for migration to finish") + await migration_task - replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - logger.info(f"Tablet is now on [{replicas}]") - assert len(replicas) == 1 - for r in replicas[0].replicas: - assert r[0] != host_ids[failer.fail_idx] + replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + logger.info(f"Tablet is now on [{replicas}]") + assert len(replicas) == 1 + for r in replicas[0].replicas: + assert r[0] != host_ids[failer.fail_idx] + + # For dropping the keyspace after the node failure + await reconnect_driver(manager) @pytest.mark.asyncio async def test_tablet_back_and_forth_migration(manager: ManagerClient): @@ -258,39 +262,39 @@ async def test_tablet_back_and_forth_migration(manager: ManagerClient): await manager.api.disable_tablet_balancing(s.ip_addr) async def assert_rows(num): - res = await cql.run_async(f"SELECT * FROM test.test") + res = await cql.run_async(f"SELECT * FROM {ks}.test") assert len(res) == num await make_server() cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} 
AND tablets = {'initial': 1}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - await make_server() + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + await make_server() - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({1}, {1});") - await assert_rows(1) + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({1}, {1});") + await assert_rows(1) - replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') - logger.info(f"Tablet is on [{replicas}]") - assert len(replicas) == 1 and len(replicas[0].replicas) == 1 + replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') + logger.info(f"Tablet is on [{replicas}]") + assert len(replicas) == 1 and len(replicas[0].replicas) == 1 - old_replica = replicas[0].replicas[0] - assert old_replica[0] != host_ids[1] - new_replica = (host_ids[1], 0) + old_replica = replicas[0].replicas[0] + assert old_replica[0] != host_ids[1] + new_replica = (host_ids[1], 0) - logger.info(f"Moving tablet {old_replica} -> {new_replica}") - manager.api.move_tablet(servers[0].ip_addr, "test", "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) + logger.info(f"Moving tablet {old_replica} -> {new_replica}") + manager.api.move_tablet(servers[0].ip_addr, ks, "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) - await assert_rows(1) - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({2}, {2});") - await assert_rows(2) + await assert_rows(1) + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({2}, {2});") + await assert_rows(2) - logger.info(f"Moving tablet {new_replica} -> {old_replica}") - manager.api.move_tablet(servers[0].ip_addr, "test", "test", new_replica[0], new_replica[1], old_replica[0], old_replica[1], 0) + 
logger.info(f"Moving tablet {new_replica} -> {old_replica}") + manager.api.move_tablet(servers[0].ip_addr, ks, "test", new_replica[0], new_replica[1], old_replica[0], old_replica[1], 0) - await assert_rows(2) - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({3}, {3});") - await assert_rows(3) + await assert_rows(2) + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({3}, {3});") + await assert_rows(3) @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -304,87 +308,87 @@ async def test_staging_backlog_is_preserved_with_file_based_streaming(manager: M await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - await cql.run_async("CREATE MATERIALIZED VIEW test.mv1 AS \ - SELECT * FROM test.test WHERE pk IS NOT NULL AND c IS NOT NULL \ - PRIMARY KEY (c, pk);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1};") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.mv1 AS \ + SELECT * FROM {ks}.test WHERE pk IS NOT NULL AND c IS NOT NULL \ + PRIMARY KEY (c, pk);") - logger.info("Populating single tablet") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + logger.info("Populating single tablet") + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - await manager.api.flush_keyspace(servers[0].ip_addr, "test") + await manager.api.flush_keyspace(servers[0].ip_addr, ks) - # check - async def check(expected): - 
rows = await cql.run_async("SELECT pk from test.test") - assert len(list(rows)) == len(expected) - await check(keys) + # check + async def check(expected): + rows = await cql.run_async(f"SELECT pk from {ks}.test") + assert len(list(rows)) == len(expected) + await check(keys) - logger.info("Adding new server") - servers.append(await manager.server_add(config=cfg)) + logger.info("Adding new server") + servers.append(await manager.server_add(config=cfg)) - async def get_table_dir(manager, server_id): - node_workdir = await manager.server_get_workdir(server_id) - return glob.glob(os.path.join(node_workdir, "data", "test", "test-*"))[0] + async def get_table_dir(manager, server_id): + node_workdir = await manager.server_get_workdir(server_id) + return glob.glob(os.path.join(node_workdir, "data", ks, "test-*"))[0] - s0_table_dir = await get_table_dir(manager, servers[0].server_id) - logger.info(f"Table dir in server 0: {s0_table_dir}") + s0_table_dir = await get_table_dir(manager, servers[0].server_id) + logger.info(f"Table dir in server 0: {s0_table_dir}") - s1_table_dir = await get_table_dir(manager, servers[1].server_id) - logger.info(f"Table dir in server 1: {s1_table_dir}") + s1_table_dir = await get_table_dir(manager, servers[1].server_id) + logger.info(f"Table dir in server 1: {s1_table_dir}") - # Explicitly close the driver to avoid reconnections if scylla fails to update gossiper state on shutdown. - # It's a problem until https://github.com/scylladb/scylladb/issues/15356 is fixed. - manager.driver_close() - cql = None - await manager.server_stop_gracefully(servers[0].server_id) + # Explicitly close the driver to avoid reconnections if scylla fails to update gossiper state on shutdown. + # It's a problem until https://github.com/scylladb/scylladb/issues/15356 is fixed. 
+ manager.driver_close() + cql = None + await manager.server_stop_gracefully(servers[0].server_id) - def move_sstables_to_staging(table_dir: str): - table_staging_dir = os.path.join(table_dir, "staging") - logger.info(f"Moving sstables to staging dir: {table_staging_dir}") - for sst in glob.glob(os.path.join(table_dir, "*-Data.db")): - for src_path in glob.glob(os.path.join(table_dir, sst.removesuffix("-Data.db") + "*")): - dst_path = os.path.join(table_staging_dir, os.path.basename(src_path)) - logger.info(f"Moving sstable file {src_path} to {dst_path}") - os.rename(src_path, dst_path) + def move_sstables_to_staging(table_dir: str): + table_staging_dir = os.path.join(table_dir, "staging") + logger.info(f"Moving sstables to staging dir: {table_staging_dir}") + for sst in glob.glob(os.path.join(table_dir, "*-Data.db")): + for src_path in glob.glob(os.path.join(table_dir, sst.removesuffix("-Data.db") + "*")): + dst_path = os.path.join(table_staging_dir, os.path.basename(src_path)) + logger.info(f"Moving sstable file {src_path} to {dst_path}") + os.rename(src_path, dst_path) - def sstable_count_in_staging(table_dir: str): - table_staging_dir = os.path.join(table_dir, "staging") - return len(glob.glob(os.path.join(table_staging_dir, "*-Data.db"))) + def sstable_count_in_staging(table_dir: str): + table_staging_dir = os.path.join(table_dir, "staging") + return len(glob.glob(os.path.join(table_staging_dir, "*-Data.db"))) - move_sstables_to_staging(s0_table_dir) - s0_sstables_in_staging = sstable_count_in_staging(s0_table_dir) + move_sstables_to_staging(s0_table_dir) + s0_sstables_in_staging = sstable_count_in_staging(s0_table_dir) - await manager.server_start(servers[0].server_id) - cql = manager.get_cql() - await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + await manager.server_start(servers[0].server_id) + cql = manager.get_cql() + await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - tablet_token = 0 # Doesn't matter since there is one 
tablet - replica = await get_tablet_replica(manager, servers[0], 'test', 'test', tablet_token) - s1_host_id = await manager.get_host_id(servers[1].server_id) - dst_shard = 0 + tablet_token = 0 # Doesn't matter since there is one tablet + replica = await get_tablet_replica(manager, servers[0], ks, 'test', tablet_token) + s1_host_id = await manager.get_host_id(servers[1].server_id) + dst_shard = 0 - migration_task = asyncio.create_task( - manager.api.move_tablet(servers[0].ip_addr, "test", "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) + migration_task = asyncio.create_task( + manager.api.move_tablet(servers[0].ip_addr, ks, "test", replica[0], replica[1], s1_host_id, dst_shard, tablet_token)) - logger.info("Waiting for migration to finish") - await migration_task - logger.info("Migration done") + logger.info("Waiting for migration to finish") + await migration_task + logger.info("Migration done") - # FIXME: After https://github.com/scylladb/scylladb/issues/19149 is fixed, we can check that view updates complete - # after migration and then check for base-view consistency. By the time being, we only check that backlog is - # transferred by looking at staging directory. + # FIXME: After https://github.com/scylladb/scylladb/issues/19149 is fixed, we can check that view updates complete + # after migration and then check for base-view consistency. By the time being, we only check that backlog is + # transferred by looking at staging directory. 
- s1_sstables_in_staging = sstable_count_in_staging(s1_table_dir) - logger.info(f"SSTable count in staging dir of server 1: {s1_sstables_in_staging}") + s1_sstables_in_staging = sstable_count_in_staging(s1_table_dir) + logger.info(f"SSTable count in staging dir of server 1: {s1_sstables_in_staging}") - logger.info("Allowing view update generator to progress again") - for server in servers: - manager.api.disable_injection(server.ip_addr, 'view_update_generator_consume_staging_sstable') + logger.info("Allowing view update generator to progress again") + for server in servers: + manager.api.disable_injection(server.ip_addr, 'view_update_generator_consume_staging_sstable') - assert s0_sstables_in_staging > 0 - assert s0_sstables_in_staging == s1_sstables_in_staging + assert s0_sstables_in_staging > 0 + assert s0_sstables_in_staging == s1_sstables_in_staging - await check(keys) + await check(keys) From 96d327fb83b5a8d6d54eaec4b4aba4cc298a08d5 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 47/56] topology_custom/test_tablets_removenode: use create_new_test_keyspace Signed-off-by: Benny Halevy --- .../test_tablets_removenode.py | 65 ++++++++++--------- 1 file changed, 33 insertions(+), 32 deletions(-) diff --git a/test/topology_custom/test_tablets_removenode.py b/test/topology_custom/test_tablets_removenode.py index f342645198..b4fdc13cca 100644 --- a/test/topology_custom/test_tablets_removenode.py +++ b/test/topology_custom/test_tablets_removenode.py @@ -14,12 +14,13 @@ import logging from test.pylib.scylla_cluster import ReplaceConfig from test.pylib.util import start_writes +from test.topology.util import create_new_test_keyspace logger = logging.getLogger(__name__) -async def create_keyspace(cql, name, initial_tablets, rf): - await cql.run_async(f"CREATE KEYSPACE {name} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}}" +async def create_keyspace(cql, initial_tablets, rf): + return 
await create_new_test_keyspace(cql, f"WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': {rf}}}" f" AND tablets = {{'initial': {initial_tablets}}};") @@ -40,25 +41,25 @@ async def test_replace(manager: ManagerClient): cql = manager.get_cql() - await create_keyspace(cql, "test", 32, rf=1) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + ks1 = await create_keyspace(cql, 32, rf=1) + await cql.run_async(f"CREATE TABLE {ks1}.test (pk int PRIMARY KEY, c int);") # We want RF=2 table to validate that quorum reads work after replacing node finishes # bootstrap which indicates that bootstrap waits for rebuilt. # Otherwise, some reads would fail to find a quorum. - await create_keyspace(cql, "test2", 32, rf=2) - await cql.run_async("CREATE TABLE test2.test (pk int PRIMARY KEY, c int);") + ks2 = await create_keyspace(cql, 32, rf=2) + await cql.run_async(f"CREATE TABLE {ks2}.test (pk int PRIMARY KEY, c int);") - await create_keyspace(cql, "test3", 32, rf=3) - await cql.run_async("CREATE TABLE test3.test (pk int PRIMARY KEY, c int);") - await cql.run_async("CREATE TABLE test3.test2 (pk int PRIMARY KEY, c int);") + ks3 = await create_keyspace(cql, 32, rf=3) + await cql.run_async(f"CREATE TABLE {ks3}.test (pk int PRIMARY KEY, c int);") + await cql.run_async(f"CREATE TABLE {ks3}.test2 (pk int PRIMARY KEY, c int);") logger.info("Populating table") keys = range(256) - await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) - await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test2.test (pk, c) VALUES ({k}, {k});") for k in keys]) - await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test3.test (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO {ks1}.test (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO {ks2}.test (pk, c) VALUES ({k}, {k});") for 
k in keys]) + await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO {ks3}.test (pk, c) VALUES ({k}, {k});") for k in keys]) async def check_ks(ks): logger.info(f"Checking {ks}") @@ -71,8 +72,8 @@ async def test_replace(manager: ManagerClient): async def check(): # RF=1 keyspace will experience data loss so don't check it. # We include it in the test only to check that the system doesn't crash. - await check_ks("test2") - await check_ks("test3") + await check_ks(ks2) + await check_ks(ks3) await check() @@ -80,7 +81,7 @@ async def test_replace(manager: ManagerClient): # See https://github.com/scylladb/scylladb/issues/16527 await manager.api.disable_tablet_balancing(servers[0].ip_addr) - finish_writes = await start_writes(cql, "test3", "test2") + finish_writes = await start_writes(cql, ks3, "test2") logger.info('Replacing a node') await manager.server_stop(servers[0].server_id) @@ -89,7 +90,7 @@ async def test_replace(manager: ManagerClient): servers = servers[1:] key_count = await finish_writes() - stmt = SimpleStatement("SELECT * FROM test3.test2;", consistency_level=ConsistencyLevel.QUORUM) + stmt = SimpleStatement(f"SELECT * FROM {ks3}.test2;", consistency_level=ConsistencyLevel.QUORUM) rows = await cql.run_async(stmt, all_pages=True) assert len(rows) == key_count for r in rows: @@ -105,7 +106,7 @@ async def test_replace(manager: ManagerClient): await manager.server_not_sees_other_server(servers[1].ip_addr, servers[0].ip_addr) await manager.server_not_sees_other_server(servers[2].ip_addr, servers[0].ip_addr) - await check_ks("test3") + await check_ks(ks3) @pytest.mark.asyncio @@ -119,37 +120,37 @@ async def test_removenode(manager: ManagerClient): cql = manager.get_cql() # RF=1 - await create_keyspace(cql, "test", 32, rf=1) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + ks1 = await create_keyspace(cql, 32, rf=1) + await cql.run_async(f"CREATE TABLE {ks1}.test (pk int PRIMARY KEY, c int);") # RF=2 - await create_keyspace(cql, 
"test2", 32, rf=2) - await cql.run_async("CREATE TABLE test2.test (pk int PRIMARY KEY, c int);") + ks2 = await create_keyspace(cql, 32, rf=2) + await cql.run_async(f"CREATE TABLE {ks2}.test (pk int PRIMARY KEY, c int);") # RF=3 - await create_keyspace(cql, "test3", 32, rf=3) - await cql.run_async("CREATE TABLE test3.test (pk int PRIMARY KEY, c int);") + ks3 = await create_keyspace(cql, 32, rf=3) + await cql.run_async(f"CREATE TABLE {ks3}.test (pk int PRIMARY KEY, c int);") logger.info("Populating table") keys = range(256) - await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) - await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test2.test (pk, c) VALUES ({k}, {k});") for k in keys]) - await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test3.test (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO {ks1}.test (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO {ks2}.test (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO {ks3}.test (pk, c) VALUES ({k}, {k});") for k in keys]) async def check(): # RF=1 table "test" will experience data loss so don't check it. # We include it to check that the system doesn't crash. 
logger.info("Checking table test2") - query = SimpleStatement("SELECT * FROM test2.test;", consistency_level=ConsistencyLevel.ONE) + query = SimpleStatement(f"SELECT * FROM {ks2}.test;", consistency_level=ConsistencyLevel.ONE) rows = await cql.run_async(query) assert len(rows) == len(keys) for r in rows: assert r.c == r.pk logger.info("Checking table test3") - query = SimpleStatement("SELECT * FROM test3.test;", consistency_level=ConsistencyLevel.ONE) + query = SimpleStatement(f"SELECT * FROM {ks3}.test;", consistency_level=ConsistencyLevel.ONE) rows = await cql.run_async(query) assert len(rows) == len(keys) for r in rows: @@ -182,17 +183,17 @@ async def test_removenode_with_ignored_node(manager: ManagerClient): cql = manager.get_cql() - await create_keyspace(cql, "test", 32, rf=3) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + ks = await create_keyspace(cql, 32, rf=3) + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") logger.info("Populating table") keys = range(512) - await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[run_async_cl_all(cql, f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) async def check(): logger.info("Checking") - query = SimpleStatement("SELECT * FROM test.test;", consistency_level=ConsistencyLevel.ONE) + query = SimpleStatement(f"SELECT * FROM {ks}.test;", consistency_level=ConsistencyLevel.ONE) rows = await cql.run_async(query) assert len(rows) == len(keys) for r in rows: From 16ef78075c090124b114b6ffd0887040c170e4a0 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 48/56] topology_custom/test_topology_failure_recovery: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_topology_failure_recovery.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git 
a/test/topology_custom/test_topology_failure_recovery.py b/test/topology_custom/test_topology_failure_recovery.py index d69e917d17..334c9df2cd 100644 --- a/test/topology_custom/test_topology_failure_recovery.py +++ b/test/topology_custom/test_topology_failure_recovery.py @@ -6,6 +6,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.scylla_cluster import ReplaceConfig from test.topology.conftest import skip_mode +from test.topology.util import new_test_keyspace import pytest import logging import asyncio @@ -26,21 +27,18 @@ async def test_tablet_drain_failure_during_decommission(manager: ManagerClient): marks = [await log.mark() for log in logs] cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 32};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 32}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") - logger.info("Populating table") + logger.info("Populating table") - keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + keys = range(256) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) - await inject_error_on(manager, "stream_tablet_fail_on_drain", servers) + await inject_error_on(manager, "stream_tablet_fail_on_drain", servers) - await manager.decommission_node(servers[2].server_id, expected_error="Decommission failed. 
See earlier errors") - - matches = [await log.grep("raft_topology - rollback.*after decommissioning failure, moving transition state to rollback to normal", - from_mark=mark) for log, mark in zip(logs, marks)] - assert sum(len(x) for x in matches) == 1 - - await cql.run_async("DROP KEYSPACE test;") + await manager.decommission_node(servers[2].server_id, expected_error="Decommission failed. See earlier errors") + matches = [await log.grep("raft_topology - rollback.*after decommissioning failure, moving transition state to rollback to normal", + from_mark=mark) for log, mark in zip(logs, marks)] + assert sum(len(x) for x in matches) == 1 From 2d4af012813900f3dffa6716c187fdc4a736b9ab Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 49/56] topology_custom/test_truncate_with_tablets: use new_test_keyspace Signed-off-by: Benny Halevy --- .../test_truncate_with_tablets.py | 238 +++++++++--------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/test/topology_custom/test_truncate_with_tablets.py b/test/topology_custom/test_truncate_with_tablets.py index dd8c70ed04..19f8152047 100644 --- a/test/topology_custom/test_truncate_with_tablets.py +++ b/test/topology_custom/test_truncate_with_tablets.py @@ -9,7 +9,7 @@ from cassandra.cluster import TruncateError from cassandra.policies import FallthroughRetryPolicy from test.pylib.manager_client import ManagerClient from test.topology.conftest import skip_mode -from test.topology.util import get_topology_coordinator +from test.topology.util import get_topology_coordinator, new_test_keyspace from test.pylib.tablets import get_all_tablet_replicas from test.pylib.util import wait_for_cql_and_get_hosts import time @@ -34,31 +34,31 @@ async def test_truncate_while_migration(manager: ManagerClient): cql = manager.get_cql() # Create a keyspace with tablets and initial_tablets == 2, then insert data - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 
'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") - await cql.run_async('CREATE TABLE test.test (pk int PRIMARY KEY, c int);') + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks: + await cql.run_async(f'CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);') - keys = range(1024) - await asyncio.gather(*[cql.run_async(f'INSERT INTO test.test (pk, c) VALUES ({k}, {k});') for k in keys]) + keys = range(1024) + await asyncio.gather(*[cql.run_async(f'INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});') for k in keys]) - # Add a node to the cluster. This will cause the tablet load balancer to migrate one tablet to the new node - servers.append(await manager.server_add(config=cfg)) + # Add a node to the cluster. This will cause the tablet load balancer to migrate one tablet to the new node + servers.append(await manager.server_add(config=cfg)) - # Wait for tablet streaming to start - pending_node = servers[1] - pending_log = await manager.server_open_log(pending_node.server_id) + # Wait for tablet streaming to start + pending_node = servers[1] + pending_log = await manager.server_open_log(pending_node.server_id) - await pending_log.wait_for('migration_streaming_wait: start') - await manager.api.message_injection(pending_node.ip_addr, 'migration_streaming_wait') + await pending_log.wait_for('migration_streaming_wait: start') + await manager.api.message_injection(pending_node.ip_addr, 'migration_streaming_wait') - # Do a TRUNCATE TABLE while the tablet is being streamed - await cql.run_async('TRUNCATE TABLE test.test') + # Do a TRUNCATE TABLE while the tablet is being streamed + await cql.run_async(f'TRUNCATE TABLE {ks}.test') - # Wait for streaming to complete - await pending_log.wait_for('raft_topology - Streaming for tablet migration of.*successful') + # Wait for streaming to complete + await pending_log.wait_for('raft_topology - 
Streaming for tablet migration of.*successful') - # Check if we have any data - row = await cql.run_async(SimpleStatement('SELECT COUNT(*) FROM test.test', consistency_level=ConsistencyLevel.ALL)) - assert row[0].count == 0 + # Check if we have any data + row = await cql.run_async(SimpleStatement(f'SELECT COUNT(*) FROM {ks}.test', consistency_level=ConsistencyLevel.ALL)) + assert row[0].count == 0 async def get_raft_leader_and_log(manager: ManagerClient, servers): @@ -89,37 +89,37 @@ async def test_truncate_with_concurrent_drop(manager: ManagerClient): hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) # Create a keyspace with tablets and initial_tablets == 2, then insert data - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") - await cql.run_async('CREATE TABLE test.test (pk int PRIMARY KEY, c int);') + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks: + await cql.run_async(f'CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);') - keys = range(1024) - await asyncio.gather(*[cql.run_async(f'INSERT INTO test.test (pk, c) VALUES ({k}, {k});') for k in keys]) + keys = range(1024) + await asyncio.gather(*[cql.run_async(f'INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});') for k in keys]) - (raft_leader, raft_leader_log) = await get_raft_leader_and_log(manager, servers) + (raft_leader, raft_leader_log) = await get_raft_leader_and_log(manager, servers) - if raft_leader == servers[0]: - trunc_host = hosts[1] - drop_host = hosts[2] - elif raft_leader == servers[1]: - trunc_host = hosts[0] - drop_host = hosts[2] - elif raft_leader == servers[2]: - trunc_host = hosts[0] - drop_host = hosts[1] - else: - assert False, 'Unable to determine raft leader' + if raft_leader == servers[0]: + trunc_host = hosts[1] + drop_host = hosts[2] + elif raft_leader 
== servers[1]: + trunc_host = hosts[0] + drop_host = hosts[2] + elif raft_leader == servers[2]: + trunc_host = hosts[0] + drop_host = hosts[1] + else: + assert False, 'Unable to determine raft leader' - # Start a TRUNCATE in the background - trunc_future = cql.run_async('TRUNCATE TABLE test.test', host=trunc_host) - # Wait for the topology coordinator to reach a point wher it is about to start sending the truncate RPCs - await raft_leader_log.wait_for('truncate_table_wait: start') - # Execute DROP TABLE - await cql.run_async('DROP TABLE test.test', host=drop_host) - # Release TRUNCATE table in topology coordinator - await manager.api.message_injection(raft_leader.ip_addr, 'truncate_table_wait') - # Check we received an error - with pytest.raises(InvalidRequest, match='unconfigured table test'): - await trunc_future + # Start a TRUNCATE in the background + trunc_future = cql.run_async(f'TRUNCATE TABLE {ks}.test', host=trunc_host) + # Wait for the topology coordinator to reach a point wher it is about to start sending the truncate RPCs + await raft_leader_log.wait_for('truncate_table_wait: start') + # Execute DROP TABLE + await cql.run_async(f'DROP TABLE {ks}.test', host=drop_host) + # Release TRUNCATE table in topology coordinator + await manager.api.message_injection(raft_leader.ip_addr, 'truncate_table_wait') + # Check we received an error + with pytest.raises(InvalidRequest, match='unconfigured table test'): + await trunc_future @pytest.mark.asyncio @@ -138,36 +138,36 @@ async def test_truncate_while_node_restart(manager: ManagerClient): hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) # Create a keyspace with tablets and initial_tablets == 2, then insert data - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") - await cql.run_async('CREATE TABLE test.test (pk int PRIMARY KEY, c int);') + async with new_test_keyspace(manager, "WITH 
replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks: + await cql.run_async(f'CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);') - keys = range(1024) - await asyncio.gather(*[cql.run_async(f'INSERT INTO test.test (pk, c) VALUES ({k}, {k});') for k in keys]) + keys = range(1024) + await asyncio.gather(*[cql.run_async(f'INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});') for k in keys]) - (raft_leader, raft_leader_log) = await get_raft_leader_and_log(manager, servers) + (raft_leader, raft_leader_log) = await get_raft_leader_and_log(manager, servers) - # Decide which node to restart; select a node with a replica but not the raft leader - tablet_replicas = await get_all_tablet_replicas(manager, raft_leader, 'test', 'test') - replica_hosts = [tr.replicas[0][0] for tr in tablet_replicas] - for s in servers: - if s != raft_leader: - host_id = await manager.get_host_id(s.server_id) - if host_id in replica_hosts: - restart_node = s - break + # Decide which node to restart; select a node with a replica but not the raft leader + tablet_replicas = await get_all_tablet_replicas(manager, raft_leader, ks, 'test') + replica_hosts = [tr.replicas[0][0] for tr in tablet_replicas] + for s in servers: + if s != raft_leader: + host_id = await manager.get_host_id(s.server_id) + if host_id in replica_hosts: + restart_node = s + break - # Shutdown the node containing a replica - await manager.server_stop_gracefully(restart_node.server_id) - # Start truncating in the background - trunc_future = cql.run_async('TRUNCATE TABLE test.test', host=hosts[0]) - # Restart the node - await manager.server_start(restart_node.server_id) - # Wait for truncate to complete - await trunc_future + # Shutdown the node containing a replica + await manager.server_stop_gracefully(restart_node.server_id) + # Start truncating in the background + trunc_future = cql.run_async(f'TRUNCATE TABLE {ks}.test', host=hosts[0]) + # Restart the node + await 
manager.server_start(restart_node.server_id) + # Wait for truncate to complete + await trunc_future - # Check if truncate was successful - row = await cql.run_async(SimpleStatement('SELECT COUNT(*) FROM test.test', consistency_level=ConsistencyLevel.ALL)) - assert row[0].count == 0 + # Check if truncate was successful + row = await cql.run_async(SimpleStatement(f'SELECT COUNT(*) FROM {ks}.test', consistency_level=ConsistencyLevel.ALL)) + assert row[0].count == 0 @pytest.mark.asyncio @@ -185,35 +185,35 @@ async def test_truncate_with_coordinator_crash(manager: ManagerClient): hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) # Create a keyspace with tablets and initial_tablets == 2, then insert data - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") - await cql.run_async('CREATE TABLE test.test (pk int PRIMARY KEY, c int);') + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") as ks: + await cql.run_async(f'CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);') - keys = range(1024) - await asyncio.gather(*[cql.run_async(f'INSERT INTO test.test (pk, c) VALUES ({k}, {k});') for k in keys]) + keys = range(1024) + await asyncio.gather(*[cql.run_async(f'INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});') for k in keys]) - (raft_leader, raft_leader_log) = await get_raft_leader_and_log(manager, servers) + (raft_leader, raft_leader_log) = await get_raft_leader_and_log(manager, servers) - if raft_leader == servers[0]: - trunc_host = hosts[1] - else: - trunc_host = hosts[0] + if raft_leader == servers[0]: + trunc_host = hosts[1] + else: + trunc_host = hosts[0] - # Enable injection to crash the raft leader after truncate cleared the session ID - await manager.api.enable_injection(raft_leader.ip_addr, 'truncate_crash_after_session_clear', one_shot=False) + 
# Enable injection to crash the raft leader after truncate cleared the session ID + await manager.api.enable_injection(raft_leader.ip_addr, 'truncate_crash_after_session_clear', one_shot=False) - # Start a TRUNCATE in the background - trunc_future = cql.run_async('TRUNCATE TABLE test.test', host=trunc_host) - # Wait for the topology coordinator to crash - await raft_leader_log.wait_for('truncate_crash_after_session_clear hit, killing the node') - await manager.server_stop(raft_leader.server_id) - # Restart the crashed node - await manager.server_start(raft_leader.server_id) - # Wait for truncate to complete - await trunc_future + # Start a TRUNCATE in the background + trunc_future = cql.run_async(f'TRUNCATE TABLE {ks}.test', host=trunc_host) + # Wait for the topology coordinator to crash + await raft_leader_log.wait_for('truncate_crash_after_session_clear hit, killing the node') + await manager.server_stop(raft_leader.server_id) + # Restart the crashed node + await manager.server_start(raft_leader.server_id) + # Wait for truncate to complete + await trunc_future - # Check if we have any data - row = await cql.run_async(SimpleStatement('SELECT COUNT(*) FROM test.test', consistency_level=ConsistencyLevel.ALL)) - assert row[0].count == 0 + # Check if we have any data + row = await cql.run_async(SimpleStatement(f'SELECT COUNT(*) FROM {ks}.test', consistency_level=ConsistencyLevel.ALL)) + assert row[0].count == 0 @pytest.mark.asyncio @@ -231,38 +231,38 @@ async def test_truncate_while_truncate_already_waiting(manager: ManagerClient): cql = manager.get_cql() # Create a keyspace with tablets and initial_tablets == 2, then insert data - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 2}") - await cql.run_async('CREATE TABLE test.test (pk int PRIMARY KEY, c int);') + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 
'replication_factor': 1} AND tablets = {'initial': 2}") as ks: + await cql.run_async(f'CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);') - keys = range(1024) - await asyncio.gather(*[cql.run_async(f'INSERT INTO test.test (pk, c) VALUES ({k}, {k});') for k in keys]) + keys = range(1024) + await asyncio.gather(*[cql.run_async(f'INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});') for k in keys]) - # Add a node to the cluster. This will cause the load balancer to migrate one tablet to the new node - servers.append(await manager.server_add(config=cfg)) + # Add a node to the cluster. This will cause the load balancer to migrate one tablet to the new node + servers.append(await manager.server_add(config=cfg)) - hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - s1_log = await manager.server_open_log(servers[1].server_id) + hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + s1_log = await manager.server_open_log(servers[1].server_id) - # Wait for tablet streaming to start - await s1_log.wait_for('migration_streaming_wait: start') + # Wait for tablet streaming to start + await s1_log.wait_for('migration_streaming_wait: start') - # Run a truncate which will quickly time out, but the truncate fiber remains alive - # Do not attempt to retry automatically (hense the FallthroughRetryPolicy) - with pytest.raises((TruncateError), match='Timeout during TRUNCATE TABLE of test.test'): - await cql.run_async(SimpleStatement('TRUNCATE TABLE test.test USING TIMEOUT 100ms', retry_policy=FallthroughRetryPolicy())) + # Run a truncate which will quickly time out, but the truncate fiber remains alive + # Do not attempt to retry automatically (hense the FallthroughRetryPolicy) + with pytest.raises((TruncateError), match=f'Timeout during TRUNCATE TABLE of {ks}.test'): + await cql.run_async(SimpleStatement(f'TRUNCATE TABLE {ks}.test USING TIMEOUT 100ms', retry_policy=FallthroughRetryPolicy())) - # Run another truncate on the same table while the 
timedout one is still waiting - truncate_future = cql.run_async('TRUNCATE TABLE test.test', host=hosts[1]) + # Run another truncate on the same table while the timedout one is still waiting + truncate_future = cql.run_async(f'TRUNCATE TABLE {ks}.test', host=hosts[1]) - # Make sure the second truncate re-used the existing global topology request - await s1_log.wait_for('Ongoing TRUNCATE for table test.test') + # Make sure the second truncate re-used the existing global topology request + await s1_log.wait_for(f'Ongoing TRUNCATE for table {ks}.test') - # Release streaming - await manager.api.message_injection(servers[1].ip_addr, 'migration_streaming_wait') + # Release streaming + await manager.api.message_injection(servers[1].ip_addr, 'migration_streaming_wait') - # Wait for the joined truncate to complete - await truncate_future + # Wait for the joined truncate to complete + await truncate_future - # Check if we have any data - row = await cql.run_async(SimpleStatement('SELECT COUNT(*) FROM test.test', consistency_level=ConsistencyLevel.ALL)) - assert row[0].count == 0 + # Check if we have any data + row = await cql.run_async(SimpleStatement(f'SELECT COUNT(*) FROM {ks}.test', consistency_level=ConsistencyLevel.ALL)) + assert row[0].count == 0 From b810791fbb85334115d1a21d00267b588e8d6651 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 50/56] topology_custom/test_view_build_status: use new_test_keyspace Signed-off-by: Benny Halevy --- .../topology_custom/test_view_build_status.py | 94 +++++++++---------- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/test/topology_custom/test_view_build_status.py b/test/topology_custom/test_view_build_status.py index 5865dcb179..8c11b565c4 100644 --- a/test/topology_custom/test_view_build_status.py +++ b/test/topology_custom/test_view_build_status.py @@ -12,7 +12,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.scylla_cluster import 
ReplaceConfig from test.pylib.internal_types import ServerInfo from test.topology.util import trigger_snapshot, wait_until_topology_upgrade_finishes, enter_recovery_state, reconnect_driver, \ - delete_raft_topology_state, delete_raft_data_and_upgrade_state, wait_until_upgrade_finishes, wait_for + delete_raft_topology_state, delete_raft_data_and_upgrade_state, wait_until_upgrade_finishes, wait_for, create_new_test_keyspace from test.topology.conftest import skip_mode from cassandra import ConsistencyLevel from cassandra.query import SimpleStatement @@ -22,15 +22,13 @@ from cassandra.protocol import InvalidRequest logger = logging.getLogger(__name__) async def create_keyspace(cql): - ks_name = 'ks' - await cql.run_async(f"CREATE KEYSPACE {ks_name} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}}") - return ks_name + return await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1}") -async def create_table(cql): - await cql.run_async(f"CREATE TABLE ks.t (p int, c int, PRIMARY KEY (p, c))") +async def create_table(cql, ks): + await cql.run_async(f"CREATE TABLE {ks}.t (p int, c int, PRIMARY KEY (p, c))") -async def create_mv(cql, view_name): - await cql.run_async(f"CREATE MATERIALIZED VIEW ks.{view_name} AS SELECT * FROM ks.t WHERE c IS NOT NULL and p IS NOT NULL PRIMARY KEY (c, p)") +async def create_mv(cql, ks, view_name): + await cql.run_async(f"CREATE MATERIALIZED VIEW {ks}.{view_name} AS SELECT * FROM {ks}.t WHERE c IS NOT NULL and p IS NOT NULL PRIMARY KEY (c, p)") async def get_view_builder_version(cql, **kwargs): result = await cql.run_async("SELECT value FROM system.scylla_local WHERE key='view_builder_version'", **kwargs) @@ -72,11 +70,11 @@ async def test_view_build_status_v2_table(manager: ManagerClient): v = await get_view_builder_version(cql, host=h) assert v == 2 - await create_keyspace(cql) - await create_table(cql) - await create_mv(cql, "vt") + ks = await 
create_keyspace(cql) + await create_table(cql, ks) + await create_mv(cql, ks, "vt") - await asyncio.gather(*(wait_for_view_v2(cql, 'ks', 'vt', node_count, host=h) for h in hosts)) + await asyncio.gather(*(wait_for_view_v2(cql, ks, 'vt', node_count, host=h) for h in hosts)) # The table system_distributed.view_build_status is set to be a virtual table reading # from system.view_build_status_v2, so verify that reading from each of them provides @@ -107,16 +105,16 @@ async def test_view_build_status_virtual_table(manager: ManagerClient): await wait_for(view_is_built, deadline) ks_name = await create_keyspace(cql) - await create_table(cql) + await create_table(cql, ks_name) await assert_v1_eq_v2() - await create_mv(cql, 'vt1') + await create_mv(cql, ks_name, 'vt1') await asyncio.gather(*(wait_for_view_on_host(cql, 'vt1', node_count, h) for h in hosts)) await assert_v1_eq_v2() assert len(await select_v2()) == node_count - await create_mv(cql, 'vt2') + await create_mv(cql, ks_name, 'vt2') await asyncio.gather(*(wait_for_view_on_host(cql, 'vt2', node_count, h) for h in hosts)) await assert_v1_eq_v2() assert len(await select_v2()) == node_count * 2 @@ -163,11 +161,11 @@ async def test_view_build_status_snapshot(manager: ManagerClient): servers = await manager.servers_add(3) cql, _ = await manager.get_ready_cql(servers) - await create_keyspace(cql) - await create_table(cql) + ks = await create_keyspace(cql) + await create_table(cql, ks) - await create_mv(cql, "vt1") - await create_mv(cql, "vt2") + await create_mv(cql, ks, "vt1") + await create_mv(cql, ks, "vt2") for s in servers: await manager.driver_connect(server=s) @@ -215,9 +213,9 @@ async def test_view_build_status_migration_to_v2(request, manager: ManagerClient status = await manager.api.raft_topology_upgrade_status(host.address) assert status == "not_upgraded" - await create_keyspace(cql) - await create_table(cql) - await create_mv(cql, "vt1") + ks = await create_keyspace(cql) + await create_table(cql, ks) + await 
create_mv(cql, ks, "vt1") # Verify we're using v1 now v = await get_view_builder_version(cql) @@ -239,8 +237,8 @@ async def test_view_build_status_migration_to_v2(request, manager: ManagerClient await asyncio.gather(*(wait_for(lambda: view_builder_is_v2(cql, host=h), time.time() + 60) for h in hosts)) # Check that new writes are written to the v2 table - await create_mv(cql, "vt2") - await asyncio.gather(*(wait_for_view_v2(cql, "ks", "vt2", 3, host=h) for h in hosts)) + await create_mv(cql, ks, "vt2") + await asyncio.gather(*(wait_for_view_v2(cql, ks, "vt2", 3, host=h) for h in hosts)) await wait_for_row_count(cql, "system.view_build_status_v2", 6, hosts[0]) @@ -267,13 +265,13 @@ async def test_view_build_status_migration_to_v2_with_write_during_migration(req status = await manager.api.raft_topology_upgrade_status(host.address) assert status == "not_upgraded" - await create_keyspace(cql) - await create_table(cql) + ks = await create_keyspace(cql) + await create_table(cql, ks) inj_insert = "view_builder_pause_add_new_view" await manager.api.enable_injection(servers[1].ip_addr, inj_insert, one_shot=True) - await create_mv(cql, "vt1") + await create_mv(cql, ks, "vt1") # pause the migration between reading the old table and writing to the new table, so we have # a time window where new writes may be lost. @@ -300,7 +298,7 @@ async def test_view_build_status_migration_to_v2_with_write_during_migration(req await asyncio.gather(*(wait_for(lambda: view_builder_is_v2(cql, host=h), time.time() + 60) for h in hosts)) - await asyncio.gather(*(wait_for_view_v2(cql, 'ks', 'vt1', 3, host=h) for h in hosts)) + await asyncio.gather(*(wait_for_view_v2(cql, ks, 'vt1', 3, host=h) for h in hosts)) # Migrate the view_build_status table to v2 while there is an 'old' write operation in progress. 
# The migration should wait for the old operations to complete before continuing, otherwise @@ -325,13 +323,13 @@ async def test_view_build_status_migration_to_v2_barrier(request, manager: Manag status = await manager.api.raft_topology_upgrade_status(host.address) assert status == "not_upgraded" - await create_keyspace(cql) - await create_table(cql) + ks = await create_keyspace(cql) + await create_table(cql, ks) # Create MV and delay the write operation to the old table inj_insert = "view_builder_pause_add_new_view" await manager.api.enable_injection(servers[1].ip_addr, inj_insert, one_shot=True) - await create_mv(cql, "vt1") + await create_mv(cql, ks, "vt1") # The upgrade should perform a barrier and wait for the delayed operation to complete before continuing. logging.info("Triggering upgrade to raft topology") @@ -349,7 +347,7 @@ async def test_view_build_status_migration_to_v2_barrier(request, manager: Manag await asyncio.gather(*(wait_for(lambda: view_builder_is_v2(cql, host=h), time.time() + 60) for h in hosts)) - await asyncio.gather(*(wait_for_view_v2(cql, 'ks', 'vt1', 3, host=h) for h in hosts)) + await asyncio.gather(*(wait_for_view_v2(cql, ks, 'vt1', 3, host=h) for h in hosts)) # Test that when removing a node from the cluster, we clean its rows from # the view build status table. 
@@ -359,10 +357,10 @@ async def test_view_build_status_cleanup_on_remove_node(manager: ManagerClient): servers = await manager.servers_add(node_count) cql, hosts = await manager.get_ready_cql(servers) - await create_keyspace(cql) - await create_table(cql) - await create_mv(cql, "vt1") - await create_mv(cql, "vt2") + ks = await create_keyspace(cql) + await create_table(cql, ks) + await create_mv(cql, ks, "vt1") + await create_mv(cql, ks, "vt2") await wait_for_row_count(cql, "system.view_build_status_v2", node_count*2, hosts[0]) @@ -383,10 +381,10 @@ async def test_view_build_status_with_replace_node(manager: ManagerClient): servers = await manager.servers_add(node_count) cql, hosts = await manager.get_ready_cql(servers) - await create_keyspace(cql) - await create_table(cql) - await create_mv(cql, "vt1") - await create_mv(cql, "vt2") + ks = await create_keyspace(cql) + await create_table(cql, ks) + await create_mv(cql, ks, "vt1") + await create_mv(cql, ks, "vt2") await wait_for_row_count(cql, "system.view_build_status_v2", node_count*2, hosts[1]) @@ -446,8 +444,8 @@ async def test_view_build_status_migration_to_v2_with_cleanup(request, manager: # Create a view. This will insert 4 entries to the view build status table, one for each node. ks_name = await create_keyspace(cql) - await create_table(cql) - await create_mv(cql, "vt1") + await create_table(cql, ks_name) + await create_mv(cql, ks_name, "vt1") await wait_for_view_v1(cql, "vt1", 4) @@ -457,7 +455,7 @@ async def test_view_build_status_migration_to_v2_with_cleanup(request, manager: # This row should get cleaned during migration. s0_host_id = await manager.get_host_id(servers[0].server_id) await cql.run_async(f"INSERT INTO system_distributed.view_build_status(keyspace_name, view_name, host_id, status) \ - VALUES ('ks', 'view_doesnt_exist', {s0_host_id}, 'SUCCESS')") + VALUES ('{ks_name}', 'view_doesnt_exist', {s0_host_id}, 'SUCCESS')") # Remove the last node. 
the entry for this node in the view build status remains and it # corresponds now to an unknown node. The migration should remove it. @@ -509,9 +507,9 @@ async def test_migration_on_existing_raft_topology(request, manager: ManagerClie logging.info("Waiting until driver connects to every server") cql, hosts = await manager.get_ready_cql(servers) - await create_keyspace(cql) - await create_table(cql) - await create_mv(cql, "vt1") + ks = await create_keyspace(cql) + await create_table(cql, ks) + await create_mv(cql, ks, "vt1") # Verify we're using v1 now v = await get_view_builder_version(cql) @@ -533,8 +531,8 @@ async def test_migration_on_existing_raft_topology(request, manager: ManagerClie await asyncio.gather(*(wait_for(lambda: view_builder_is_v2(cql, host=h), time.time() + 60) for h in hosts)) # Check that new writes are written to the v2 table - await create_mv(cql, "vt2") - await asyncio.gather(*(wait_for_view_v2(cql, "ks", "vt2", 3, host=h) for h in hosts)) + await create_mv(cql, ks, "vt2") + await asyncio.gather(*(wait_for_view_v2(cql, ks, "vt2", 3, host=h) for h in hosts)) await wait_for_row_count(cql, "system.view_build_status_v2", 6, hosts[0]) From 46b1850f0c3443d23cdde80d2d3c630e0ad2ffeb Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 51/56] topology_custom/test_zero_token_nodes_multidc: use create_new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_custom/test_zero_token_nodes_multidc.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/topology_custom/test_zero_token_nodes_multidc.py b/test/topology_custom/test_zero_token_nodes_multidc.py index b5d833d1cf..3f5309438b 100644 --- a/test/topology_custom/test_zero_token_nodes_multidc.py +++ b/test/topology_custom/test_zero_token_nodes_multidc.py @@ -13,6 +13,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.util import unique_name from test.topology.conftest import cluster_con +from 
test.topology.util import create_new_test_keyspace @pytest.mark.asyncio @@ -47,11 +48,11 @@ async def test_zero_token_nodes_multidc_basic(manager: ManagerClient, zero_token ks_names = list[str]() logging.info('Trying to create tables for different replication factors') for rf in range(3): - ks_names.append(unique_name()) failed = False - await dc2_cql.run_async(f"""CREATE KEYSPACE {ks_names[rf]} WITH replication = + ks_name = await create_new_test_keyspace(dc2_cql, f"""WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 2, 'dc2': {rf}}} AND tablets = {{ 'enabled': true }}""") + ks_names.append(ks_name) try: await dc2_cql.run_async(f'CREATE TABLE {ks_names[rf]}.tbl (pk int PRIMARY KEY, v int)') except Exception: From 0564e95c512e85e3663ceea2ce4ae2c4118178c4 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 52/56] topology_custom/test_zero_token_nodes_no_replication: use create_new_test_keyspace Signed-off-by: Benny Halevy --- .../topology_custom/test_zero_token_nodes_no_replication.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/topology_custom/test_zero_token_nodes_no_replication.py b/test/topology_custom/test_zero_token_nodes_no_replication.py index 381cbef6f1..536a492911 100644 --- a/test/topology_custom/test_zero_token_nodes_no_replication.py +++ b/test/topology_custom/test_zero_token_nodes_no_replication.py @@ -13,6 +13,7 @@ from cassandra.query import SimpleStatement from test.pylib.manager_client import ManagerClient from test.pylib.util import unique_name from test.topology.conftest import cluster_con +from test.topology.util import create_new_test_keyspace @pytest.mark.asyncio @@ -40,11 +41,10 @@ async def test_zero_token_nodes_no_replication(manager: ManagerClient): if tablets_enabled and replication_strategy != 'NetworkTopologyStrategy': continue - ks_name = unique_name() - ks_names.append(ks_name) - await cql_b.run_async(f"""CREATE KEYSPACE 
{ks_name} WITH replication = + ks_name = await create_new_test_keyspace(cql_b, f"""WITH replication = {{'class': '{replication_strategy}', 'replication_factor': 2}} AND tablets = {{ 'enabled': {str(tablets_enabled).lower()} }}""") + ks_names.append(ks_name) await cql_b.run_async(f'CREATE TABLE {ks_name}.tbl (pk int PRIMARY KEY, v int)') for i in range(100): insert_query = f'INSERT INTO {ks_name}.tbl (pk, v) VALUES ({i}, {i})' From 12f85ce57cf5f1e080740afbda714d92d0f5f3e3 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 53/56] topology_tasks/test_node_ops_tasks: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_tasks/test_node_ops_tasks.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/test/topology_tasks/test_node_ops_tasks.py b/test/topology_tasks/test_node_ops_tasks.py index 204513a2f1..0bebdfe7ca 100644 --- a/test/topology_tasks/test_node_ops_tasks.py +++ b/test/topology_tasks/test_node_ops_tasks.py @@ -12,6 +12,7 @@ from test.pylib.rest_client import InjectionHandler, inject_error_one_shot from test.pylib.scylla_cluster import ReplaceConfig from test.pylib.util import wait_for from test.topology_tasks.task_manager_client import TaskManagerClient +from test.topology.util import new_test_keyspace import asyncio import logging @@ -204,17 +205,17 @@ async def test_node_ops_tasks_tree(manager: ManagerClient): assert module_name in await tm.list_modules(servers[0].ip_addr), "node_ops module wasn't registered" cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({1}, {1});") - await cql.run_async(f"TRUNCATE test.test;") + async with new_test_keyspace(manager, "WITH replication = {'class': 
'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as ks: + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({1}, {1});") + await cql.run_async(f"TRUNCATE {ks}.test;") - servers, vt_ids = await check_bootstrap_tasks_tree(tm, module_name, servers) - servers, vt_ids = await check_replace_tasks_tree(manager, tm, module_name, servers, vt_ids) - servers, vt_ids = await check_rebuild_tasks_tree(manager, tm, module_name, servers, vt_ids) - servers, vt_ids = await check_remove_node_tasks_tree(manager, tm, module_name, servers, vt_ids) - servers, vt_ids = await check_decommission_tasks_tree(manager, tm, module_name, servers, vt_ids) + servers, vt_ids = await check_bootstrap_tasks_tree(tm, module_name, servers) + servers, vt_ids = await check_replace_tasks_tree(manager, tm, module_name, servers, vt_ids) + servers, vt_ids = await check_rebuild_tasks_tree(manager, tm, module_name, servers, vt_ids) + servers, vt_ids = await check_remove_node_tasks_tree(manager, tm, module_name, servers, vt_ids) + servers, vt_ids = await check_decommission_tasks_tree(manager, tm, module_name, servers, vt_ids) @pytest.mark.asyncio async def test_node_ops_tasks_ttl(manager: ManagerClient): From 9829b1594ff71c82ff4d5bc623d531db2cbc5779 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 19 Jan 2025 08:52:50 +0200 Subject: [PATCH 54/56] topology_tasks/test_tablet_tasks: use new_test_keyspace Signed-off-by: Benny Halevy --- test/topology_tasks/test_tablet_tasks.py | 214 +++++++++++------------ 1 file changed, 107 insertions(+), 107 deletions(-) diff --git a/test/topology_tasks/test_tablet_tasks.py b/test/topology_tasks/test_tablet_tasks.py index 02455f6e57..2e9eeed157 100644 --- a/test/topology_tasks/test_tablet_tasks.py +++ b/test/topology_tasks/test_tablet_tasks.py @@ -14,6 +14,7 @@ from test.pylib.manager_client import ManagerClient from test.pylib.repair import 
create_table_insert_data_for_repair, get_tablet_task_id from test.pylib.tablets import get_all_tablet_replicas from test.topology.conftest import skip_mode +from test.topology.util import create_new_test_keyspace, new_test_keyspace from test.topology_custom.test_tablets2 import inject_error_on from test.topology_tasks.task_manager_client import TaskManagerClient from test.topology_tasks.task_manager_types import TaskStatus, TaskStats @@ -30,9 +31,9 @@ async def message_injection(manager: ManagerClient, servers: list[ServerInfo], i for server in servers: await manager.api.message_injection(server.ip_addr, injection) -async def wait_tasks_created(tm: TaskManagerClient, server: ServerInfo, module_name: str, expected_number: int, type: str, table: Optional[str] = None): +async def wait_tasks_created(tm: TaskManagerClient, server: ServerInfo, module_name: str, expected_number: int, type: str, keyspace: str, table: Optional[str] = None): async def get_tasks(): - tasks = [task for task in await tm.list_tasks(server.ip_addr, module_name) if task.kind == "cluster" and task.type == type and task.keyspace == "test"] + tasks = [task for task in await tm.list_tasks(server.ip_addr, module_name) if task.kind == "cluster" and task.type == type and task.keyspace == keyspace] return [task for task in tasks if not table or table == task.table] tasks = await get_tasks() @@ -40,7 +41,7 @@ async def wait_tasks_created(tm: TaskManagerClient, server: ServerInfo, module_n tasks = await get_tasks() return tasks -def check_task_status(status: TaskStatus, states: list[str], type: str, scope: str, abortable: bool, keyspace: str = "test", table: str = "test", possible_child_num: list[int] = [0]): +def check_task_status(status: TaskStatus, states: list[str], type: str, scope: str, abortable: bool, keyspace: str, table: str = "test", possible_child_num: list[int] = [0]): assert status.scope == scope assert status.kind == "cluster" assert status.type == type @@ -50,9 +51,9 @@ def 
check_task_status(status: TaskStatus, states: list[str], type: str, scope: s assert len(status.children_ids) in possible_child_num assert status.state in states -async def check_and_abort_repair_task(manager: ManagerClient, tm: TaskManagerClient, servers: list[ServerInfo], module_name: str): +async def check_and_abort_repair_task(manager: ManagerClient, tm: TaskManagerClient, servers: list[ServerInfo], module_name: str, keyspace: str = "test"): # Wait until user repair task is created. - repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair") + repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", keyspace=keyspace) task = repair_tasks[0] assert task.scope == "table" @@ -62,14 +63,14 @@ async def check_and_abort_repair_task(manager: ManagerClient, tm: TaskManagerCli status = await tm.get_task_status(servers[0].ip_addr, task.task_id) - check_task_status(status, ["created", "running"], "user_repair", "table", True) + check_task_status(status, ["created", "running"], "user_repair", "table", True, keyspace) log = await manager.server_open_log(servers[0].server_id) mark = await log.mark() async def wait_for_task(): status_wait = await tm.wait_for_task(servers[0].ip_addr, task.task_id) - check_task_status(status_wait, ["done"], "user_repair", "table", True) + check_task_status(status_wait, ["done"], "user_repair", "table", True, keyspace) async def abort_task(): await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark) @@ -83,6 +84,7 @@ async def test_tablet_repair_task(manager: ManagerClient): module_name = "tablets" tm = TaskManagerClient(manager.api) + # FIXME: use unique_name for keyspace servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered" @@ -94,16 +96,16 @@ async def test_tablet_repair_task(manager: ManagerClient): await 
asyncio.gather(repair_task(), check_and_abort_repair_task(manager, tm, servers, module_name)) -async def check_repair_task_list(tm: TaskManagerClient, servers: list[ServerInfo], module_name: str): +async def check_repair_task_list(tm: TaskManagerClient, servers: list[ServerInfo], module_name: str, keyspace: str = "test"): def get_task_with_id(repair_tasks, task_id): tasks_with_id1 = [task for task in repair_tasks if task.task_id == task_id] assert len(tasks_with_id1) == 1 return tasks_with_id1[0] # Wait until user repair tasks are created. - repair_tasks0 = await wait_tasks_created(tm, servers[0], module_name, len(servers), "user_repair") - repair_tasks1 = await wait_tasks_created(tm, servers[1], module_name, len(servers), "user_repair") - repair_tasks2 = await wait_tasks_created(tm, servers[2], module_name, len(servers), "user_repair") + repair_tasks0 = await wait_tasks_created(tm, servers[0], module_name, len(servers), "user_repair", keyspace=keyspace) + repair_tasks1 = await wait_tasks_created(tm, servers[1], module_name, len(servers), "user_repair", keyspace=keyspace) + repair_tasks2 = await wait_tasks_created(tm, servers[2], module_name, len(servers), "user_repair", keyspace=keyspace) assert len(repair_tasks0) == len(repair_tasks1), f"Different number of repair virtual tasks on nodes {servers[0].server_id} and {servers[1].server_id}" assert len(repair_tasks0) == len(repair_tasks2), f"Different number of repair virtual tasks on nodes {servers[0].server_id} and {servers[2].server_id}" @@ -176,7 +178,7 @@ async def test_tablet_repair_task_children(manager: ManagerClient): async def check_children(): # Wait until user repair task is created. 
- repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair") + repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", "test") status = await tm.wait_for_task(servers[0].ip_addr, repair_tasks[0].task_id) assert len(status.children_ids) == 1 @@ -198,38 +200,38 @@ async def prepare_migration_test(manager: ManagerClient): await make_server() cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);") + ks = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int);") await make_server() - await cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({1}, {1});") + await cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({1}, {1});") - return (servers, host_ids) + return (ks, servers, host_ids) @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def test_tablet_migration_task(manager: ManagerClient): module_name = "tablets" tm = TaskManagerClient(manager.api) - servers, host_ids = await prepare_migration_test(manager) + ks, servers, host_ids = await prepare_migration_test(manager) injection = "handle_tablet_migration_end_migration" async def move_tablet(old_replica, new_replica): await manager.api.enable_injection(servers[0].ip_addr, injection, False) - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) async def check(type): # Wait until migration task is created. 
- migration_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, type) + migration_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, type, keyspace=ks) assert len(migration_tasks) == 1 status = await tm.get_task_status(servers[0].ip_addr, migration_tasks[0].task_id) - check_task_status(status, ["created", "running"], type, "tablet", False) + check_task_status(status, ["created", "running"], type, "tablet", False, keyspace=ks) await manager.api.disable_injection(servers[0].ip_addr, injection) - replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') + replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') assert len(replicas) == 1 and len(replicas[0].replicas) == 1 intranode_migration_src = replicas[0].replicas[0] @@ -246,16 +248,16 @@ async def test_tablet_migration_task(manager: ManagerClient): async def test_tablet_migration_task_list(manager: ManagerClient): module_name = "tablets" tm = TaskManagerClient(manager.api) - servers, host_ids = await prepare_migration_test(manager) + ks, servers, host_ids = await prepare_migration_test(manager) injection = "handle_tablet_migration_end_migration" async def move_tablet(server, old_replica, new_replica): - await manager.api.move_tablet(server.ip_addr, "test", "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) + await manager.api.move_tablet(server.ip_addr, ks, "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) async def check_migration_task_list(type: str): # Wait until migration tasks are created. 
- migration_tasks0 = await wait_tasks_created(tm, servers[0], module_name, 1, type) - migration_tasks1 = await wait_tasks_created(tm, servers[1], module_name, 1, type) + migration_tasks0 = await wait_tasks_created(tm, servers[0], module_name, 1, type, keyspace=ks) + migration_tasks1 = await wait_tasks_created(tm, servers[1], module_name, 1, type, keyspace=ks) assert len(migration_tasks0) == len(migration_tasks1), f"Different number of migration virtual tasks on nodes {servers[0].server_id} and {servers[1].server_id}" assert len(migration_tasks0) == 1, f"Wrong number of migration virtual tasks" @@ -270,11 +272,11 @@ async def test_tablet_migration_task_list(manager: ManagerClient): assert task.kind == "cluster" assert task.scope == "tablet" assert task.table == "test" - assert task.keyspace == "test" + assert task.keyspace == ks await disable_injection(manager, servers, injection) - replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') + replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') assert len(replicas) == 1 and len(replicas[0].replicas) == 1 intranode_migration_src = replicas[0].replicas[0] @@ -293,17 +295,17 @@ async def test_tablet_migration_task_list(manager: ManagerClient): async def test_tablet_migration_task_failed(manager: ManagerClient): module_name = "tablets" tm = TaskManagerClient(manager.api) - servers, host_ids = await prepare_migration_test(manager) + ks, servers, host_ids = await prepare_migration_test(manager) wait_injection = "stream_tablet_wait" throw_injection = "stream_tablet_move_to_cleanup" async def move_tablet(old_replica, new_replica): - await manager.api.move_tablet(servers[0].ip_addr, "test", "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) + await manager.api.move_tablet(servers[0].ip_addr, ks, "test", old_replica[0], old_replica[1], new_replica[0], new_replica[1], 0) async def wait_for_task(task_id, type): status = await tm.wait_for_task(servers[0].ip_addr, 
task_id) - check_task_status(status, ["failed"], type, "tablet", False) + check_task_status(status, ["failed"], type, "tablet", False, keyspace=ks) async def resume_migration(log, mark): await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark) @@ -311,7 +313,7 @@ async def test_tablet_migration_task_failed(manager: ManagerClient): async def check(type, log, mark): # Wait until migration task is created. - migration_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, type) + migration_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, type, keyspace=ks) assert len(migration_tasks) == 1 await asyncio.gather(wait_for_task(migration_tasks[0].task_id, type), resume_migration(log, mark)) @@ -322,7 +324,7 @@ async def test_tablet_migration_task_failed(manager: ManagerClient): log = await manager.server_open_log(servers[0].server_id) mark = await log.mark() - replicas = await get_all_tablet_replicas(manager, servers[0], 'test', 'test') + replicas = await get_all_tablet_replicas(manager, servers[0], ks, 'test') assert len(replicas) == 1 and len(replicas[0].replicas) == 1 src = replicas[0].replicas[0] @@ -336,6 +338,7 @@ async def test_repair_task_info_is_none_when_no_running_repair(manager: ManagerC tm = TaskManagerClient(manager.api) token = -1 + # FIXME: use unique_name for keyspace servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered" @@ -350,7 +353,7 @@ async def test_repair_task_info_is_none_when_no_running_repair(manager: ManagerC await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) async def wait_and_check_none(): - task = (await wait_tasks_created(tm, servers[0], module_name, 1,"user_repair"))[0] + task = (await wait_tasks_created(tm, servers[0], module_name, 1,"user_repair", keyspace="test"))[0] await disable_injection(manager, servers, 
"repair_tablet_fail_on_rpc_call") status = await tm.wait_for_task(servers[0].ip_addr, task.task_id) await check_none() @@ -399,34 +402,33 @@ async def test_tablet_resize_task(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - keyspace = "test" table1 = "test1" table2 = "test2" - await cql.run_async(f"CREATE KEYSPACE {keyspace} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': 1}};") - await cql.run_async(f"CREATE TABLE {keyspace}.{table1} (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") - await cql.run_async(f"CREATE TABLE {keyspace}.{table2} (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as keyspace: + await cql.run_async(f"CREATE TABLE {keyspace}.{table1} (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") + await cql.run_async(f"CREATE TABLE {keyspace}.{table2} (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") - total_keys = 60 - keys = range(total_keys) - await prepare_split(manager, servers[0], keyspace, table1, keys) - await enable_tablet_balancing_and_wait(manager, servers[0], "Detected tablet split for table") - await wait_tasks_created(tm, servers[0], module_name, 0, "split", table1) + total_keys = 60 + keys = range(total_keys) + await prepare_split(manager, servers[0], keyspace, table1, keys) + await enable_tablet_balancing_and_wait(manager, servers[0], "Detected tablet split for table") + await wait_tasks_created(tm, servers[0], module_name, 0, "split", keyspace, table1) - await prepare_split(manager, servers[0], keyspace, table2, keys) - await prepare_merge(manager, servers[0], keyspace, table1, keys[:-1]) - await 
manager.api.keyspace_compaction(servers[0].ip_addr, "test") + await prepare_split(manager, servers[0], keyspace, table2, keys) + await prepare_merge(manager, servers[0], keyspace, table1, keys[:-1]) + await manager.api.keyspace_compaction(servers[0].ip_addr, keyspace) - injection = "tablet_split_finalization_postpone" - await enable_injection(manager, servers, injection) - await manager.api.enable_tablet_balancing(servers[0].ip_addr) + injection = "tablet_split_finalization_postpone" + await enable_injection(manager, servers, injection) + await manager.api.enable_tablet_balancing(servers[0].ip_addr) - async def wait_and_check_status(server, type, keyspace, table): - task = (await wait_tasks_created(tm, server, module_name, 1, type, table))[0] - status = await tm.get_task_status(server.ip_addr, task.task_id) - check_task_status(status, ["running"], type, "table", False, keyspace, table, [0, 1, 2]) + async def wait_and_check_status(server, type, keyspace, table): + task = (await wait_tasks_created(tm, server, module_name, 1, type, keyspace, table))[0] + status = await tm.get_task_status(server.ip_addr, task.task_id) + check_task_status(status, ["running"], type, "table", False, keyspace, table, [0, 1, 2]) - await wait_and_check_status(servers[0], "split", keyspace, table2) - await wait_and_check_status(servers[0], "merge", keyspace, table1) + await wait_and_check_status(servers[0], "split", keyspace, table2) + await wait_and_check_status(servers[0], "merge", keyspace, table1) @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') @@ -443,50 +445,49 @@ async def test_tablet_resize_list(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - keyspace = "test" table1 = "test1" - await cql.run_async(f"CREATE KEYSPACE {keyspace} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': 1}};") - await cql.run_async(f"CREATE 
TABLE {keyspace}.{table1} (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as keyspace: + await cql.run_async(f"CREATE TABLE {keyspace}.{table1} (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") - total_keys = 60 - keys = range(total_keys) - await prepare_split(manager, servers[0], keyspace, table1, keys) + total_keys = 60 + keys = range(total_keys) + await prepare_split(manager, servers[0], keyspace, table1, keys) - servers.append(await manager.server_add(cmdline=cmdline, config={ - 'error_injections_at_startup': ['short_tablet_stats_refresh_interval'] - })) + servers.append(await manager.server_add(cmdline=cmdline, config={ + 'error_injections_at_startup': ['short_tablet_stats_refresh_interval'] + })) - s1_log = await manager.server_open_log(servers[0].server_id) - s1_mark = await s1_log.mark() + s1_log = await manager.server_open_log(servers[0].server_id) + s1_mark = await s1_log.mark() - injection = "tablet_split_finalization_postpone" - compaction_injection = "split_sstable_rewrite" - await enable_injection(manager, servers, injection) - await manager.api.enable_injection(servers[0].ip_addr, compaction_injection, one_shot=True) + injection = "tablet_split_finalization_postpone" + compaction_injection = "split_sstable_rewrite" + await enable_injection(manager, servers, injection) + await manager.api.enable_injection(servers[0].ip_addr, compaction_injection, one_shot=True) - await manager.api.enable_tablet_balancing(servers[0].ip_addr) - task0 = (await wait_tasks_created(tm, servers[0], module_name, 1, "split", table1))[0] - task1 = (await wait_tasks_created(tm, servers[1], module_name, 1, "split", table1))[0] + await manager.api.enable_tablet_balancing(servers[0].ip_addr) + task0 = (await wait_tasks_created(tm, servers[0], module_name, 1, 
"split", keyspace, table1))[0] + task1 = (await wait_tasks_created(tm, servers[1], module_name, 1, "split", keyspace, table1))[0] - assert task0.task_id == task1.task_id + assert task0.task_id == task1.task_id - for task in [task0, task1]: - assert task.state == "running" - assert task.type == "split" - assert task.kind == "cluster" - assert task.scope == "table" - assert task.table == table1 - assert task.keyspace == keyspace + for task in [task0, task1]: + assert task.state == "running" + assert task.type == "split" + assert task.kind == "cluster" + assert task.scope == "table" + assert task.table == table1 + assert task.keyspace == keyspace - await s1_log.wait_for("split_sstable_rewrite: waiting", from_mark=s1_mark) - await manager.api.message_injection(servers[0].ip_addr, "split_sstable_rewrite") + await s1_log.wait_for("split_sstable_rewrite: waiting", from_mark=s1_mark) + await manager.api.message_injection(servers[0].ip_addr, "split_sstable_rewrite") - status1 = await tm.get_task_status(servers[1].ip_addr, task0.task_id) - status0 = await tm.get_task_status(servers[0].ip_addr, task0.task_id) - assert len(status0.children_ids) == 2 - assert status0.children_ids == status1.children_ids + status1 = await tm.get_task_status(servers[1].ip_addr, task0.task_id) + status0 = await tm.get_task_status(servers[0].ip_addr, task0.task_id) + assert len(status0.children_ids) == 2 + assert status0.children_ids == status1.children_ids - await disable_injection(manager, servers, injection) + await disable_injection(manager, servers, injection) @pytest.mark.asyncio @@ -505,35 +506,34 @@ async def test_tablet_resize_revoked(manager: ManagerClient): await manager.api.disable_tablet_balancing(servers[0].ip_addr) cql = manager.get_cql() - keyspace = "test" table1 = "test1" - await cql.run_async(f"CREATE KEYSPACE {keyspace} WITH replication = {{'class': 'NetworkTopologyStrategy', 'replication_factor': 1}} AND tablets = {{'initial': 1}};") - await cql.run_async(f"CREATE TABLE 
{keyspace}.{table1} (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") + async with new_test_keyspace(manager, "WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 1} AND tablets = {'initial': 1}") as keyspace: + await cql.run_async(f"CREATE TABLE {keyspace}.{table1} (pk int PRIMARY KEY, c blob) WITH gc_grace_seconds=0 AND bloom_filter_fp_chance=1;") - total_keys = 60 - keys = range(total_keys) - await prepare_split(manager, servers[0], keyspace, table1, keys) + total_keys = 60 + keys = range(total_keys) + await prepare_split(manager, servers[0], keyspace, table1, keys) - injection = "tablet_split_finalization_postpone" - await enable_injection(manager, servers, injection) + injection = "tablet_split_finalization_postpone" + await enable_injection(manager, servers, injection) - await manager.api.enable_tablet_balancing(servers[0].ip_addr) - task0 = (await wait_tasks_created(tm, servers[0], module_name, 1, "split", table1))[0] + await manager.api.enable_tablet_balancing(servers[0].ip_addr) + task0 = (await wait_tasks_created(tm, servers[0], module_name, 1, "split", keyspace, table1))[0] - log = await manager.server_open_log(servers[0].server_id) - mark = await log.mark() + log = await manager.server_open_log(servers[0].server_id) + mark = await log.mark() - async def revoke_resize(log, mark): - await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark) - await asyncio.gather(*[cql.run_async(f"DELETE FROM {keyspace}.{table1} WHERE pk={k};") for k in keys]) + async def revoke_resize(log, mark): + await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark) + await asyncio.gather(*[cql.run_async(f"DELETE FROM {keyspace}.{table1} WHERE pk={k};") for k in keys]) - await manager.api.flush_keyspace(servers[0].ip_addr, keyspace) + await manager.api.flush_keyspace(servers[0].ip_addr, keyspace) - async def wait_for_task(task_id): - status = 
await tm.wait_for_task(servers[0].ip_addr, task_id) - check_task_status(status, ["suspended"], "split", "table", False, keyspace, table1, [0, 1, 2]) + async def wait_for_task(task_id): + status = await tm.wait_for_task(servers[0].ip_addr, task_id) + check_task_status(status, ["suspended"], "split", "table", False, keyspace, table1, [0, 1, 2]) - await asyncio.gather(revoke_resize(log, mark), wait_for_task(task0.task_id)) + await asyncio.gather(revoke_resize(log, mark), wait_for_task(task0.task_id)) @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') From cbe79b20f72ab0d27a01d1931b0a861aa7517ff2 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 5 Feb 2025 09:07:10 +0200 Subject: [PATCH 55/56] test/repair: create_table_insert_data_for_repair: create keyspace with unique name and return it to the caller Signed-off-by: Benny Halevy --- test/pylib/repair.py | 18 +++++--- .../test_tablet_repair_scheduler.py | 38 +++++++-------- test/topology_tasks/test_tablet_tasks.py | 46 +++++++++---------- 3 files changed, 52 insertions(+), 50 deletions(-) diff --git a/test/pylib/repair.py b/test/pylib/repair.py index 1c5c2cc109..f346692f3a 100644 --- a/test/pylib/repair.py +++ b/test/pylib/repair.py @@ -4,7 +4,11 @@ # SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 # -from test.pylib.util import wait_for_cql_and_get_hosts +from test.pylib.internal_types import ServerInfo +from test.pylib.util import wait_for_cql_and_get_hosts, Host +from test.topology.util import create_new_test_keyspace + +from cassandra.cluster import Session as CassandraSession import asyncio import time @@ -40,7 +44,7 @@ async def load_tablet_repair_task_infos(cql, host, table_id): return repair_task_infos -async def create_table_insert_data_for_repair(manager, rf = 3 , tablets = 8, fast_stats_refresh = True, nr_keys = 256, disable_flush_cache_time = False): +async def create_table_insert_data_for_repair(manager, rf = 3 , tablets = 8, 
fast_stats_refresh = True, nr_keys = 256, disable_flush_cache_time = False) -> (list[ServerInfo], CassandraSession, list[Host], str, str): if fast_stats_refresh: config = {'error_injections_at_startup': ['short_tablet_stats_refresh_interval']} else: @@ -49,15 +53,15 @@ async def create_table_insert_data_for_repair(manager, rf = 3 , tablets = 8, fas config.update({'repair_hints_batchlog_flush_cache_time_in_ms': 0}) servers = [await manager.server_add(config=config), await manager.server_add(config=config), await manager.server_add(config=config)] cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {{'class': 'NetworkTopologyStrategy', " + ks = await create_new_test_keyspace(cql, "WITH replication = {{'class': 'NetworkTopologyStrategy', " "'replication_factor': {}}} AND tablets = {{'initial': {}}};".format(rf, tablets)) - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int) WITH tombstone_gc = {'mode':'repair'};") + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH tombstone_gc = {{'mode':'repair'}};") keys = range(nr_keys) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) logging.info(f'Got hosts={hosts}'); - table_id = await manager.get_table_id("test", "test") - return (servers, cql, hosts, table_id) + table_id = await manager.get_table_id(ks, "test") + return (servers, cql, hosts, ks, table_id) async def get_tablet_task_id(cql, host, table_id, token): rows = await cql.run_async(f"SELECT last_token, repair_task_info from system.tablets where table_id = {table_id}", host=host) diff --git a/test/topology_custom/test_tablet_repair_scheduler.py b/test/topology_custom/test_tablet_repair_scheduler.py index 56ced30e8d..c28fd8b916 100644 --- 
a/test/topology_custom/test_tablet_repair_scheduler.py +++ b/test/topology_custom/test_tablet_repair_scheduler.py @@ -40,11 +40,11 @@ async def guarantee_repair_time_next_second(): @pytest.mark.asyncio async def test_tablet_manual_repair(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False, disable_flush_cache_time=True) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False, disable_flush_cache_time=True) token = -1 start = time.time() - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) duration = time.time() - start map1 = await load_tablet_repair_time(cql, hosts[0:1], table_id) logging.info(f'map1={map1} duration={duration}') @@ -52,7 +52,7 @@ async def test_tablet_manual_repair(manager: ManagerClient): await guarantee_repair_time_next_second() start = time.time() - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) duration = time.time() - start map2 = await load_tablet_repair_time(cql, hosts[0:1], table_id) logging.info(f'map2={map2} duration={duration}') @@ -65,7 +65,7 @@ async def test_tablet_manual_repair(manager: ManagerClient): @pytest.mark.asyncio async def test_tombstone_gc_insert_flush(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False, disable_flush_cache_time=True) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False, disable_flush_cache_time=True) token = "all" logs = [] for s in servers: @@ -73,7 +73,7 @@ async def test_tombstone_gc_insert_flush(manager: ManagerClient): await manager.api.set_logger_level(s.ip_addr, "tablets", "debug") logs.append(await 
manager.server_open_log(s.server_id)) - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) timeout = 600 deadline = time.time() + timeout @@ -96,14 +96,14 @@ async def test_tombstone_gc_insert_flush(manager: ManagerClient): @pytest.mark.asyncio async def test_tablet_manual_repair_all_tokens(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False, disable_flush_cache_time=True) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False, disable_flush_cache_time=True) token = "all" now = datetime.datetime.utcnow() map1 = await load_tablet_repair_time(cql, hosts[0:1], table_id) await guarantee_repair_time_next_second() - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) map2 = await load_tablet_repair_time(cql, hosts[0:1], table_id) logging.info(f'{map1=} {map2=}') assert len(map1) == len(map2) @@ -115,10 +115,10 @@ async def test_tablet_manual_repair_all_tokens(manager: ManagerClient): @pytest.mark.asyncio async def test_tablet_manual_repair_async(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False) token = "-1" log = await manager.server_open_log(servers[0].server_id) - res = await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token, await_completion=False) + res = await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, await_completion=False) tablet_task_id = res['tablet_task_id'] logging.info(f"{tablet_task_id=}") res = await log.grep(rf'.*Issued tablet repair by API request 
table_id={table_id}.*tablet_task_id={tablet_task_id}.*') @@ -128,7 +128,7 @@ async def test_tablet_manual_repair_async(manager: ManagerClient): @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def test_tablet_manual_repair_reject_parallel_requests(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager, fast_stats_refresh=False) token = -1 await inject_error_on(manager, "tablet_repair_add_delay_in_ms", servers, params={'value':'3000'}) @@ -141,7 +141,7 @@ async def test_tablet_manual_repair_reject_parallel_requests(manager: ManagerCli async def run_repair(state): try: - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) state.ok = state.ok + 1 except Exception as e: logging.info(f"Got exception as expected: {e}") @@ -157,24 +157,24 @@ async def test_tablet_manual_repair_reject_parallel_requests(manager: ManagerCli @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def test_tablet_repair_error_and_retry(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager) # Repair should finish with one time error injection token = -1 await inject_error_one_shot_on(manager, "repair_tablet_fail_on_rpc_call", servers) - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) await inject_error_off(manager, "repair_tablet_fail_on_rpc_call", servers) @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def 
test_tablet_repair_error_not_finish(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager) token = -1 # Repair should not finish with error await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers) try: - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token, timeout=10) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, timeout=10) assert False # Check the tablet repair is not supposed to finish except TimeoutError: logger.info("Repair timeout as expected") @@ -183,13 +183,13 @@ async def test_tablet_repair_error_not_finish(manager: ManagerClient): @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def test_tablet_repair_error_delete(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager) token = -1 async def repair_task(): await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers) # Check failed repair request can be deleted - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token, timeout=900) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, timeout=900) async def del_repair_task(): tablet_task_id = None @@ -219,7 +219,7 @@ def get_repair_row_from_disk(server): @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def test_tablet_repair_hosts_filter(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager) hosts_filter = f"{hosts[0].host_id},{hosts[1].host_id}" row_num_before = [get_repair_row_from_disk(server) for server in 
servers] @@ -227,7 +227,7 @@ async def test_tablet_repair_hosts_filter(manager: ManagerClient): token = -1 async def repair_task(): await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers) - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token, hosts_filter=hosts_filter) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, hosts_filter=hosts_filter) async def check_filter(): tablet_task_id = None diff --git a/test/topology_tasks/test_tablet_tasks.py b/test/topology_tasks/test_tablet_tasks.py index 2e9eeed157..874dce4852 100644 --- a/test/topology_tasks/test_tablet_tasks.py +++ b/test/topology_tasks/test_tablet_tasks.py @@ -51,13 +51,13 @@ def check_task_status(status: TaskStatus, states: list[str], type: str, scope: s assert len(status.children_ids) in possible_child_num assert status.state in states -async def check_and_abort_repair_task(manager: ManagerClient, tm: TaskManagerClient, servers: list[ServerInfo], module_name: str, keyspace: str = "test"): +async def check_and_abort_repair_task(manager: ManagerClient, tm: TaskManagerClient, servers: list[ServerInfo], module_name: str, keyspace: str): # Wait until user repair task is created. 
repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", keyspace=keyspace) task = repair_tasks[0] assert task.scope == "table" - assert task.keyspace == "test" + assert task.keyspace == keyspace assert task.table == "test" assert task.state in ["created", "running"] @@ -84,19 +84,18 @@ async def test_tablet_repair_task(manager: ManagerClient): module_name = "tablets" tm = TaskManagerClient(manager.api) - # FIXME: use unique_name for keyspace - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager) assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered" async def repair_task(): token = -1 # Keep retring tablet repair. await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers) - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) - await asyncio.gather(repair_task(), check_and_abort_repair_task(manager, tm, servers, module_name)) + await asyncio.gather(repair_task(), check_and_abort_repair_task(manager, tm, servers, module_name, ks)) -async def check_repair_task_list(tm: TaskManagerClient, servers: list[ServerInfo], module_name: str, keyspace: str = "test"): +async def check_repair_task_list(tm: TaskManagerClient, servers: list[ServerInfo], module_name: str, keyspace: str): def get_task_with_id(repair_tasks, task_id): tasks_with_id1 = [task for task in repair_tasks if task.task_id == task_id] assert len(tasks_with_id1) == 1 @@ -123,7 +122,7 @@ async def check_repair_task_list(tm: TaskManagerClient, servers: list[ServerInfo assert task.type == "user_repair" assert task.kind == "cluster" assert task.scope == "table" - assert task.keyspace == "test" + assert task.keyspace == keyspace await tm.abort_task(servers[0].ip_addr, task0.task_id) @@ -133,23 +132,23 @@ async def 
test_tablet_repair_task_list(manager: ManagerClient): module_name = "tablets" tm = TaskManagerClient(manager.api) - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager) assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered" # Create other tables. - await cql.run_async("CREATE TABLE test.test2 (pk int PRIMARY KEY, c int) WITH tombstone_gc = {'mode':'repair'};") - await cql.run_async("CREATE TABLE test.test3 (pk int PRIMARY KEY, c int) WITH tombstone_gc = {'mode':'repair'};") + await cql.run_async(f"CREATE TABLE {ks}.test2 (pk int PRIMARY KEY, c int) WITH tombstone_gc = {{'mode':'repair'}};") + await cql.run_async(f"CREATE TABLE {ks}.test3 (pk int PRIMARY KEY, c int) WITH tombstone_gc = {{'mode':'repair'}};") keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test2 (pk, c) VALUES ({k}, {k});") for k in keys]) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test3 (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test2 (pk, c) VALUES ({k}, {k});") for k in keys]) + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test3 (pk, c) VALUES ({k}, {k});") for k in keys]) async def run_repair(server_id, table_name): token = -1 - await manager.api.tablet_repair(servers[server_id].ip_addr, "test", table_name, token) + await manager.api.tablet_repair(servers[server_id].ip_addr, ks, table_name, token) await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers) - await asyncio.gather(run_repair(0, "test"), run_repair(1, "test2"), run_repair(2, "test3"), check_repair_task_list(tm, servers, module_name)) + await asyncio.gather(run_repair(0, "test"), run_repair(1, "test2"), run_repair(2, "test3"), check_repair_task_list(tm, servers, module_name, ks)) @pytest.mark.asyncio @skip_mode('release', 'error injections are not 
supported in release mode') @@ -158,7 +157,7 @@ async def test_tablet_repair_task_children(manager: ManagerClient): tm = TaskManagerClient(manager.api) injection = "repair_tablet_repair_task_impl_run" - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager) for server in servers: tm.set_task_ttl(server.ip_addr, 3600) assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered" @@ -170,7 +169,7 @@ async def test_tablet_repair_task_children(manager: ManagerClient): token = -1 # Keep retring tablet repair. await inject_error_on(manager, injection, servers) - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) async def resume_repair(): await log.wait_for('tablet_virtual_task: wait until tablet operation is finished', from_mark=mark) @@ -178,7 +177,7 @@ async def test_tablet_repair_task_children(manager: ManagerClient): async def check_children(): # Wait until user repair task is created. 
- repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", "test") + repair_tasks = await wait_tasks_created(tm, servers[0], module_name, 1, "user_repair", ks) status = await tm.wait_for_task(servers[0].ip_addr, repair_tasks[0].task_id) assert len(status.children_ids) == 1 @@ -338,8 +337,7 @@ async def test_repair_task_info_is_none_when_no_running_repair(manager: ManagerC tm = TaskManagerClient(manager.api) token = -1 - # FIXME: use unique_name for keyspace - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager) assert module_name in await tm.list_modules(servers[0].ip_addr), "tablets module wasn't registered" async def check_none(): @@ -350,10 +348,10 @@ async def test_repair_task_info_is_none_when_no_running_repair(manager: ManagerC async def repair_task(): await enable_injection(manager, servers, "repair_tablet_fail_on_rpc_call") - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) async def wait_and_check_none(): - task = (await wait_tasks_created(tm, servers[0], module_name, 1,"user_repair", keyspace="test"))[0] + task = (await wait_tasks_created(tm, servers[0], module_name, 1,"user_repair", keyspace=ks))[0] await disable_injection(manager, servers, "repair_tablet_fail_on_rpc_call") status = await tm.wait_for_task(servers[0].ip_addr, task.task_id) await check_none() @@ -538,13 +536,13 @@ async def test_tablet_resize_revoked(manager: ManagerClient): @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def test_tablet_task_sees_latest_state(manager: ManagerClient): - servers, cql, hosts, table_id = await create_table_insert_data_for_repair(manager) + servers, cql, hosts, ks, table_id = await create_table_insert_data_for_repair(manager) token = -1 async def repair_task(): 
await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers) # Check failed repair request can be deleted - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token) async def del_repair_task(): tablet_task_id = None From cc281ff88dabe45d64af62d0033762aabaae2b46 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 5 Feb 2025 09:07:10 +0200 Subject: [PATCH 56/56] test_tablet_repair_scheduler: prepare_multi_dc_repair: use create_new_test_keyspace and return the keyspace unique name to the caller. Signed-off-by: Benny Halevy --- .../test_tablet_repair_scheduler.py | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/test/topology_custom/test_tablet_repair_scheduler.py b/test/topology_custom/test_tablet_repair_scheduler.py index c28fd8b916..0ea8bd86d5 100644 --- a/test/topology_custom/test_tablet_repair_scheduler.py +++ b/test/topology_custom/test_tablet_repair_scheduler.py @@ -4,11 +4,15 @@ # SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 # +from test.pylib.internal_types import ServerInfo from test.pylib.manager_client import ManagerClient -from test.pylib.util import wait_for_cql_and_get_hosts +from test.pylib.util import wait_for_cql_and_get_hosts, Host from test.topology.conftest import skip_mode from test.pylib.repair import load_tablet_repair_time, create_table_insert_data_for_repair, get_tablet_task_id, load_tablet_repair_task_infos from test.pylib.rest_client import inject_error_one_shot, read_barrier +from test.topology.util import create_new_test_keyspace + +from cassandra.cluster import Session as CassandraSession import pytest import asyncio @@ -247,24 +251,24 @@ async def test_tablet_repair_hosts_filter(manager: ManagerClient): assert row_num_before[1] < row_num_after[1] assert row_num_before[2] == row_num_after[2] -async def prepare_multi_dc_repair(manager): +async def 
prepare_multi_dc_repair(manager) -> tuple[list[ServerInfo], CassandraSession, list[Host], str, str]: servers = [await manager.server_add(property_file = {'dc': 'DC1', 'rack' : 'R1'}), await manager.server_add(property_file = {'dc': 'DC1', 'rack' : 'R1'}), await manager.server_add(property_file = {'dc': 'DC2', 'rack' : 'R2'})] cql = manager.get_cql() - await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', " + ks = await create_new_test_keyspace(cql, "WITH replication = {'class': 'NetworkTopologyStrategy', " "'DC1': 2, 'DC2': 1} AND tablets = {'initial': 8};") - await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int) WITH tombstone_gc = {'mode':'repair'};") + await cql.run_async(f"CREATE TABLE {ks}.test (pk int PRIMARY KEY, c int) WITH tombstone_gc = {{'mode':'repair'}};") keys = range(256) - await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys]) - table_id = await manager.get_table_id("test", "test") + await asyncio.gather(*[cql.run_async(f"INSERT INTO {ks}.test (pk, c) VALUES ({k}, {k});") for k in keys]) + table_id = await manager.get_table_id(ks, "test") hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) - return (servers, cql, hosts, table_id) + return (servers, cql, hosts, ks, table_id) @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def test_tablet_repair_dcs_filter(manager: ManagerClient): - servers, cql, hosts, table_id = await prepare_multi_dc_repair(manager) + servers, cql, hosts, ks, table_id = await prepare_multi_dc_repair(manager) dcs_filter = "DC1" row_num_before = [get_repair_row_from_disk(server) for server in servers] @@ -272,7 +276,7 @@ async def test_tablet_repair_dcs_filter(manager: ManagerClient): token = -1 async def repair_task(): await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers) - await manager.api.tablet_repair(servers[0].ip_addr, 
"test", "test", token, dcs_filter=dcs_filter) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, dcs_filter=dcs_filter) async def check_filter(): tablet_task_id = None @@ -295,7 +299,7 @@ async def test_tablet_repair_dcs_filter(manager: ManagerClient): @pytest.mark.asyncio @skip_mode('release', 'error injections are not supported in release mode') async def test_tablet_repair_hosts_and_dcs_filter(manager: ManagerClient): - servers, cql, hosts, table_id = await prepare_multi_dc_repair(manager) + servers, cql, hosts, ks, table_id = await prepare_multi_dc_repair(manager) dcs_filter = "DC1,DC2" hosts_filter = f"{hosts[0].host_id},{hosts[2].host_id}" @@ -304,7 +308,7 @@ async def test_tablet_repair_hosts_and_dcs_filter(manager: ManagerClient): token = -1 async def repair_task(): await inject_error_on(manager, "repair_tablet_fail_on_rpc_call", servers) - await manager.api.tablet_repair(servers[0].ip_addr, "test", "test", token, hosts_filter=hosts_filter, dcs_filter=dcs_filter) + await manager.api.tablet_repair(servers[0].ip_addr, ks, "test", token, hosts_filter=hosts_filter, dcs_filter=dcs_filter) async def check_filter(): tablet_task_id = None