From 7056b7ee9aa9055b9d53d08b779f79d16a3d4b77 Mon Sep 17 00:00:00 2001 From: Asias He Date: Thu, 18 May 2023 09:22:08 +0800 Subject: [PATCH] repair: Log nodes down during repair in case of failed repair This helps users figure out whether a repair failed because a peer node was down during the repair. For example: ``` WARN [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: Repair 1026 out of 1026 ranges, keyspace=ks2a, table={test_table, tb}, range=(9203128250168517738,+inf), peers={127.0.0.2}, live_peers={}, status=skipped_no_live_peers INFO [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: stats: repair_reason=repair, keyspace=ks2a, tables={test_table, tb}, ranges_nr=513, round_nr=0, round_nr_fast_path_already_synced=0, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=0, tx_hashes_nr=0, rx_hashes_nr=0, duration=0 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={}, row_from_disk_nr={}, row_from_disk_bytes_per_sec={} MiB/s, row_from_disk_rows_per_sec={} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={} WARN [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: 1026 out of 1026 ranges failed, keyspace=ks2a, tables={test_table, tb}, repair_reason=repair, nodes_down_during_repair={127.0.0.2} WARN [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: repair_tracker run failed: std::runtime_error ({shard 0: std::runtime_error (repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: 1026 out of 1026 ranges failed, keyspace=ks2a, tables={test_table, tb}, repair_reason=repair, nodes_down_during_repair={127.0.0.2})}) ``` In addition, change `status=skipped` to `status=skipped_no_live_peers` to make it clearer. 
Closes #13928 --- repair/repair.cc | 15 ++++++++++++--- repair/task_manager_module.hh | 1 + 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/repair/repair.cc b/repair/repair.cc index 56380c192f..9ec40f7ffd 100644 --- a/repair/repair.cc +++ b/repair/repair.cc @@ -591,8 +591,10 @@ void repair::shard_repair_task_impl::check_failed_ranges() { rlogger.info("repair[{}]: stats: repair_reason={}, keyspace={}, tables={}, ranges_nr={}, {}", global_repair_id.uuid(), _reason, _status.keyspace, table_names(), ranges.size(), _stats.get_stats()); if (nr_failed_ranges) { - rlogger.warn("repair[{}]: failed - {} out of {} ranges failed", global_repair_id.uuid(), nr_failed_ranges, ranges_size()); - throw std::runtime_error(format("repair[{}] on failed to repair {} out of {} ranges", global_repair_id.uuid(), nr_failed_ranges, ranges_size())); + auto msg = format("repair[{}]: {} out of {} ranges failed, keyspace={}, tables={}, repair_reason={}, nodes_down_during_repair={}", + global_repair_id.uuid(), nr_failed_ranges, ranges_size(), _status.keyspace, table_names(), _reason, nodes_down); + rlogger.warn("{}", msg); + throw std::runtime_error(msg); } else { if (dropped_tables.size()) { rlogger.warn("repair[{}]: completed successfully, keyspace={}, ignoring dropped tables={}", global_repair_id.uuid(), _status.keyspace, dropped_tables); @@ -630,6 +632,7 @@ future<> repair::shard_repair_task_impl::repair_range(const dht::token_range& ra auto it = std::find(live_neighbors.begin(), live_neighbors.end(), node); if (it == live_neighbors.end()) { nr_failed_ranges++; + nodes_down.insert(node); auto status = format("failed: mandatory neighbor={} is not alive", node); rlogger.error("repair[{}]: Repair {} out of {} ranges, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}", global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status); @@ -641,7 +644,13 @@ future<> 
repair::shard_repair_task_impl::repair_range(const dht::token_range& ra } if (live_neighbors.size() != neighbors.size()) { nr_failed_ranges++; - auto status = live_neighbors.empty() ? "skipped" : "partial"; + std::unordered_set live_neighbors_set(live_neighbors.begin(), live_neighbors.end()); + for (auto& node : neighbors) { + if (!live_neighbors_set.contains(node)) { + nodes_down.insert(node); + } + } + auto status = live_neighbors.empty() ? "skipped_no_live_peers" : "partial"; rlogger.warn("repair[{}]: Repair {} out of {} ranges, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}", global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status); if (live_neighbors.empty()) { diff --git a/repair/task_manager_module.hh b/repair/task_manager_module.hh index 234ccafab5..f843de2333 100644 --- a/repair/task_manager_module.hh +++ b/repair/task_manager_module.hh @@ -120,6 +120,7 @@ public: repair_stats _stats; std::unordered_set dropped_tables; bool _hints_batchlog_flushed = false; + std::unordered_set nodes_down; public: shard_repair_task_impl(tasks::task_manager::module_ptr module, tasks::task_id id,