From ae422fdf69180fddc2d5a2d32eea23afbe81658c Mon Sep 17 00:00:00 2001 From: xuchang Date: Tue, 23 Jan 2024 09:20:31 +0800 Subject: [PATCH 1/2] repair: accelerate repair load_history time Use `parallel_for_each_table` instead of `for_each_table_gently` in `repair_service::load_history`, with a parallelism of 16 per shard, to reduce bootstrap time. --- repair/row_level.cc | 4 +++- repair/row_level.hh | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/repair/row_level.cc b/repair/row_level.cc index 2af6cedc68..c443b4a4ca 100644 --- a/repair/row_level.cc +++ b/repair/row_level.cc @@ -3229,11 +3229,13 @@ future<> repair_service::cleanup_history(tasks::task_id repair_id) { } future<> repair_service::load_history() { - co_await get_db().local().get_tables_metadata().for_each_table_gently(coroutine::lambda([&] (table_id table_uuid, lw_shared_ptr table) -> future<> { + co_await get_db().local().get_tables_metadata().parallel_for_each_table(coroutine::lambda([&] (table_id table_uuid, lw_shared_ptr table) -> future<> { auto shard = unsigned(table_uuid.uuid().get_most_significant_bits()) % smp::count; if (shard != this_shard_id()) { co_return; } + auto permit = co_await seastar::get_units(_load_parallelism_semaphore, 1); + rlogger.info("Loading repair history for keyspace={}, table={}, table_uuid={}", table->schema()->ks_name(), table->schema()->cf_name(), table_uuid); co_await _sys_ks.local().get_repair_history(table_uuid, [this] (const auto& entry) -> future<> { diff --git a/repair/row_level.hh b/repair/row_level.hh index 45e85c92db..4e0b916e64 100644 --- a/repair/row_level.hh +++ b/repair/row_level.hh @@ -108,6 +108,7 @@ class repair_service : public seastar::peering_sharded_service { size_t _max_repair_memory; seastar::semaphore _memory_sem; + seastar::named_semaphore _load_parallelism_semaphore = {16, named_semaphore_exception_factory{"Load repair history parallelism"}}; future<> init_ms_handlers(); future<> uninit_ms_handlers(); From
9b675d1fe4b117a6911722072ce3faadeddc6bce Mon Sep 17 00:00:00 2001 From: xuchang Date: Mon, 29 Jan 2024 09:44:57 +0800 Subject: [PATCH 2/2] repair: resolve load_history shard load skew Use uuid_xor_to_uint32 instead of table_uuid's most_significant_bits to reduce hash conflicts when assigning tables to shards. --- repair/row_level.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/repair/row_level.cc b/repair/row_level.cc index c443b4a4ca..cb9408e6eb 100644 --- a/repair/row_level.cc +++ b/repair/row_level.cc @@ -3230,7 +3230,7 @@ future<> repair_service::cleanup_history(tasks::task_id repair_id) { future<> repair_service::load_history() { co_await get_db().local().get_tables_metadata().parallel_for_each_table(coroutine::lambda([&] (table_id table_uuid, lw_shared_ptr table) -> future<> { - auto shard = unsigned(table_uuid.uuid().get_most_significant_bits()) % smp::count; + auto shard = utils::uuid_xor_to_uint32(table_uuid.uuid()) % smp::count; if (shard != this_shard_id()) { co_return; }