Merge 'repair: handle no_such_keyspace in repair preparation phase' from Aleksandra Martyniuk

Currently, data sync repair handles most no_such_keyspace exceptions,
but not those raised in the preparation phase, where the exception can be
thrown during make_global_effective_replication_map.

Skip the keyspace repair if no_such_keyspace is thrown during preparation.

Fixes: #22073.

Requires backport to 6.1 and 6.2, as both contain the bug.

Closes scylladb/scylladb#22473

* github.com:scylladb/scylladb:
  test: add test to check if repair handles no_such_keyspace
  repair: handle keyspace dropped
This commit is contained in:
Avi Kivity
2025-01-28 13:42:38 +02:00
3 changed files with 28 additions and 1 deletions

View File

@@ -657,6 +657,8 @@ future<> global_vnode_effective_replication_map::get_keyspace_erms(sharded<repli
// all under the lock.
auto lk = co_await db.get_shared_token_metadata().get_lock();
auto erm = db.find_keyspace(keyspace_name).get_vnode_effective_replication_map();
utils::get_local_injector().inject("get_keyspace_erms_throw_no_such_keyspace",
[&keyspace_name] { throw data_dictionary::no_such_keyspace{keyspace_name}; });
auto ring_version = erm->get_token_metadata().get_ring_version();
_erms[0] = make_foreign(std::move(erm));
co_await coroutine::parallel_for_each(std::views::iota(1u, smp::count), [this, &sharded_db, keyspace_name, ring_version] (unsigned shard) -> future<> {

View File

@@ -1499,7 +1499,16 @@ future<> repair::data_sync_repair_task_impl::run() {
auto& keyspace = _status.keyspace;
auto& sharded_db = rs.get_db();
auto& db = sharded_db.local();
auto germs = make_lw_shared(co_await locator::make_global_effective_replication_map(sharded_db, keyspace));
auto germs_fut = co_await coroutine::as_future(locator::make_global_effective_replication_map(sharded_db, keyspace));
if (germs_fut.failed()) {
auto ex = germs_fut.get_exception();
if (try_catch<data_dictionary::no_such_keyspace>(ex)) {
rlogger.warn("sync data: keyspace {} does not exist, skipping", keyspace);
co_return;
}
co_await coroutine::return_exception_ptr(std::move(ex));
}
auto germs = make_lw_shared(germs_fut.get());
auto id = get_repair_uniq_id();

View File

@@ -259,3 +259,19 @@ async def test_repair_abort(manager):
await manager.api.client.get_json(f"/task_manager/wait_task/{id}", host=servers[0].ip_addr)
statuses = await manager.api.client.get_json(f"/task_manager/task_status_recursive/{id}", host=servers[0].ip_addr)
assert all([status["state"] == "failed" for status in statuses])
@pytest.mark.asyncio
@skip_mode('release', 'error injections are not supported in release mode')
async def test_keyspace_drop_during_data_sync_repair(manager):
cfg = {
'enable_tablets': False,
'error_injections_at_startup': ['get_keyspace_erms_throw_no_such_keyspace']
}
await manager.server_add(config=cfg)
cql = manager.get_cql()
cql.execute("CREATE KEYSPACE ks WITH replication = {'class': 'NetworkTopologyStrategy', 'replication_factor': 2}")
cql.execute("CREATE TABLE ks.tbl (pk int, ck int, PRIMARY KEY (pk, ck)) WITH tombstone_gc = {'mode': 'repair'}")
await manager.server_add(config=cfg)