repair: Prevent CPU stall during cross-shard row copy and destruction

When handling `repair_stream_cmd::end_of_current_rows`, passing the
foreign list directly to `put_row_diff_handler` triggered a massive
synchronous deep copy on the destination shard. Additionally, destroying
the list triggered a synchronous deallocation on the source shard. This
blocked the reactor and triggered the CPU stall detector.

This commit fixes the issue by introducing `clone_gently()` to copy the
list elements one by one, and leveraging the existing
`utils::clear_gently()` to destroy them. Both utilize
`seastar::coroutine::maybe_yield()` to allow the reactor to breathe
during large cross-shard transfers and cleanups.

Fixes SCYLLADB-403

Closes scylladb/scylladb#28979
This commit is contained in:
Asias He
2026-03-10 14:42:50 +08:00
committed by Botond Dénes
parent 035aa90d4b
commit 6cb263bab0

View File

@@ -2362,6 +2362,15 @@ static future<> repair_get_row_diff_with_rpc_stream_process_op_slow_path(
}
}
// Builds a copy of `rows` one element at a time instead of copying the
// whole container in a single synchronous step.
//
// After each element is copied the coroutine awaits
// seastar::coroutine::maybe_yield(), so copying a large container does not
// have to run as one uninterrupted task.
//
// `rows` is held by reference across suspension points, so the caller must
// keep it alive until the returned future resolves (e.g. by co_await-ing
// the call directly).
//
// Returns the copied container.
static future<repair_rows_on_wire> clone_gently(const repair_rows_on_wire& rows) {
    repair_rows_on_wire copied_rows;
    for (auto it = rows.begin(); it != rows.end(); ++it) {
        copied_rows.push_back(*it);
        // Preemption point between element copies.
        co_await seastar::coroutine::maybe_yield();
    }
    co_return copied_rows;
}
static future<> repair_put_row_diff_with_rpc_stream_process_op(
sharded<repair_service>& repair,
locator::host_id from,
@@ -2388,7 +2397,9 @@ static future<> repair_put_row_diff_with_rpc_stream_process_op(
co_await rm->put_row_diff_handler(std::move(*fp));
rm->set_repair_state_for_local_node(repair_state::put_row_diff_with_rpc_stream_finished);
} else {
co_await rm->put_row_diff_handler(*fp);
// Gently clone to avoid copy stall on destination shard
repair_rows_on_wire local_rows = co_await clone_gently(*fp);
co_await seastar::when_all_succeed(rm->put_row_diff_handler(std::move(local_rows)), utils::clear_gently(fp));
rm->set_repair_state_for_local_node(repair_state::put_row_diff_with_rpc_stream_finished);
}
});