repair: Prevent CPU stall during cross-shard row copy and destruction

When handling `repair_stream_cmd::end_of_current_rows`, passing the foreign list directly to `put_row_diff_handler` triggered a massive synchronous deep copy on the destination shard. Additionally, destroying the list triggered a synchronous deallocation on the source shard. Both operations blocked the reactor and triggered the CPU stall detector. This commit fixes the issue by introducing `clone_gently()` to copy the list elements one by one, and by leveraging the existing `utils::clear_gently()` to destroy them. Both utilize `seastar::coroutine::maybe_yield()` so that the reactor can run other tasks during large cross-shard transfers and cleanups.

Fixes SCYLLADB-403
Closes scylladb/scylladb#28979
2026-03-10 14:42:50 +08:00
parent 035aa90d4b
commit 6cb263bab0
1 changed files with 12 additions and 1 deletions
--- a/repair/row_level.cc
+++ b/repair/row_level.cc
@@ -2362,6 +2362,15 @@ static future<> repair_get_row_diff_with_rpc_stream_process_op_slow_path(
    }
 }

+static future<repair_rows_on_wire> clone_gently(const repair_rows_on_wire& rows) {
+    repair_rows_on_wire copy; // filled one element at a time, never in a single bulk copy
+    for (auto it = rows.begin(); it != rows.end(); ++it) {
+        copy.push_back(*it);
+        co_await seastar::coroutine::maybe_yield(); // preemption point after each copied element
+    }
+    co_return copy;
+}
+
 static future<> repair_put_row_diff_with_rpc_stream_process_op(
        sharded<repair_service>& repair,
        locator::host_id from,
@@ -2388,7 +2397,9 @@ static future<> repair_put_row_diff_with_rpc_stream_process_op(
                co_await rm->put_row_diff_handler(std::move(*fp));
                rm->set_repair_state_for_local_node(repair_state::put_row_diff_with_rpc_stream_finished);
            } else {
-                co_await rm->put_row_diff_handler(*fp);
+                // Gently clone to avoid copy stall on destination shard
+                repair_rows_on_wire local_rows = co_await clone_gently(*fp);
+                co_await seastar::when_all_succeed(rm->put_row_diff_handler(std::move(local_rows)), utils::clear_gently(fp));
                rm->set_repair_state_for_local_node(repair_state::put_row_diff_with_rpc_stream_finished);
            }
        });