repair: Log nodes down during repair in case of failed repair

This helps users figure out whether a repair failed because a peer node
was down during the repair.

For example:

```
WARN  [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: Repair
1026 out of 1026 ranges, keyspace=ks2a, table={test_table, tb},
range=(9203128250168517738,+inf), peers={127.0.0.2}, live_peers={},
status=skipped_no_live_peers

INFO  [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: stats:
repair_reason=repair, keyspace=ks2a, tables={test_table, tb}, ranges_nr=513,
round_nr=0, round_nr_fast_path_already_synced=0,
round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=0,
tx_hashes_nr=0, rx_hashes_nr=0, duration=0 seconds, tx_row_nr=0, rx_row_nr=0,
tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={}, row_from_disk_nr={},
row_from_disk_bytes_per_sec={} MiB/s, row_from_disk_rows_per_sec={} Rows/s,
tx_row_nr_peer={}, rx_row_nr_peer={}

WARN  [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: 1026 out
of 1026 ranges failed, keyspace=ks2a, tables={test_table, tb},
repair_reason=repair, nodes_down_during_repair={127.0.0.2}

WARN  [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]:
repair_tracker run failed: std::runtime_error ({shard 0: std::runtime_error
(repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: 1026 out of 1026 ranges failed,
keyspace=ks2a, tables={test_table, tb}, repair_reason=repair,
nodes_down_during_repair={127.0.0.2})})
```

In addition, change `status=skipped` to `status=skipped_no_live_peers`
to make it clearer.

Closes #13928
This commit is contained in:
Asias He
2023-05-18 09:22:08 +08:00
committed by Botond Dénes
parent f45976730c
commit 7056b7ee9a
2 changed files with 13 additions and 3 deletions

View File

@@ -591,8 +591,10 @@ void repair::shard_repair_task_impl::check_failed_ranges() {
rlogger.info("repair[{}]: stats: repair_reason={}, keyspace={}, tables={}, ranges_nr={}, {}",
global_repair_id.uuid(), _reason, _status.keyspace, table_names(), ranges.size(), _stats.get_stats());
if (nr_failed_ranges) {
rlogger.warn("repair[{}]: failed - {} out of {} ranges failed", global_repair_id.uuid(), nr_failed_ranges, ranges_size());
throw std::runtime_error(format("repair[{}] on failed to repair {} out of {} ranges", global_repair_id.uuid(), nr_failed_ranges, ranges_size()));
auto msg = format("repair[{}]: {} out of {} ranges failed, keyspace={}, tables={}, repair_reason={}, nodes_down_during_repair={}",
global_repair_id.uuid(), nr_failed_ranges, ranges_size(), _status.keyspace, table_names(), _reason, nodes_down);
rlogger.warn("{}", msg);
throw std::runtime_error(msg);
} else {
if (dropped_tables.size()) {
rlogger.warn("repair[{}]: completed successfully, keyspace={}, ignoring dropped tables={}", global_repair_id.uuid(), _status.keyspace, dropped_tables);
@@ -630,6 +632,7 @@ future<> repair::shard_repair_task_impl::repair_range(const dht::token_range& ra
auto it = std::find(live_neighbors.begin(), live_neighbors.end(), node);
if (it == live_neighbors.end()) {
nr_failed_ranges++;
nodes_down.insert(node);
auto status = format("failed: mandatory neighbor={} is not alive", node);
rlogger.error("repair[{}]: Repair {} out of {} ranges, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
@@ -641,7 +644,13 @@ future<> repair::shard_repair_task_impl::repair_range(const dht::token_range& ra
}
if (live_neighbors.size() != neighbors.size()) {
nr_failed_ranges++;
auto status = live_neighbors.empty() ? "skipped" : "partial";
std::unordered_set<gms::inet_address> live_neighbors_set(live_neighbors.begin(), live_neighbors.end());
for (auto& node : neighbors) {
if (!live_neighbors_set.contains(node)) {
nodes_down.insert(node);
}
}
auto status = live_neighbors.empty() ? "skipped_no_live_peers" : "partial";
rlogger.warn("repair[{}]: Repair {} out of {} ranges, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
if (live_neighbors.empty()) {

View File

@@ -120,6 +120,7 @@ public:
repair_stats _stats;
std::unordered_set<sstring> dropped_tables;
bool _hints_batchlog_flushed = false;
std::unordered_set<gms::inet_address> nodes_down;
public:
shard_repair_task_impl(tasks::task_manager::module_ptr module,
tasks::task_id id,