repair: Log nodes down during repair in case of failed repair
This helps users figure out whether a repair failed because a peer node
was down during the repair.
For example:
```
WARN [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: Repair
1026 out of 1026 ranges, keyspace=ks2a, table={test_table, tb},
range=(9203128250168517738,+inf), peers={127.0.0.2}, live_peers={},
status=skipped_no_live_peers
INFO [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: stats:
repair_reason=repair, keyspace=ks2a, tables={test_table, tb}, ranges_nr=513,
round_nr=0, round_nr_fast_path_already_synced=0,
round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=0,
tx_hashes_nr=0, rx_hashes_nr=0, duration=0 seconds, tx_row_nr=0, rx_row_nr=0,
tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={}, row_from_disk_nr={},
row_from_disk_bytes_per_sec={} MiB/s, row_from_disk_rows_per_sec={} Rows/s,
tx_row_nr_peer={}, rx_row_nr_peer={}
WARN [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: 1026 out
of 1026 ranges failed, keyspace=ks2a, tables={test_table, tb},
repair_reason=repair, nodes_down_during_repair={127.0.0.2}
WARN [shard 0] repair - repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]:
repair_tracker run failed: std::runtime_error ({shard 0: std::runtime_error
(repair[ec2e9646-918e-4345-99ab-fa07aa1f17de]: 1026 out of 1026 ranges failed,
keyspace=ks2a, tables={test_table, tb}, repair_reason=repair,
nodes_down_during_repair={127.0.0.2})})
```
In addition, change `status=skipped` to `status=skipped_no_live_peers`
to make the reason for skipping clearer.
Closes #13928
This commit is contained in:
@@ -591,8 +591,10 @@ void repair::shard_repair_task_impl::check_failed_ranges() {
|
||||
rlogger.info("repair[{}]: stats: repair_reason={}, keyspace={}, tables={}, ranges_nr={}, {}",
|
||||
global_repair_id.uuid(), _reason, _status.keyspace, table_names(), ranges.size(), _stats.get_stats());
|
||||
if (nr_failed_ranges) {
|
||||
rlogger.warn("repair[{}]: failed - {} out of {} ranges failed", global_repair_id.uuid(), nr_failed_ranges, ranges_size());
|
||||
throw std::runtime_error(format("repair[{}] on failed to repair {} out of {} ranges", global_repair_id.uuid(), nr_failed_ranges, ranges_size()));
|
||||
auto msg = format("repair[{}]: {} out of {} ranges failed, keyspace={}, tables={}, repair_reason={}, nodes_down_during_repair={}",
|
||||
global_repair_id.uuid(), nr_failed_ranges, ranges_size(), _status.keyspace, table_names(), _reason, nodes_down);
|
||||
rlogger.warn("{}", msg);
|
||||
throw std::runtime_error(msg);
|
||||
} else {
|
||||
if (dropped_tables.size()) {
|
||||
rlogger.warn("repair[{}]: completed successfully, keyspace={}, ignoring dropped tables={}", global_repair_id.uuid(), _status.keyspace, dropped_tables);
|
||||
@@ -630,6 +632,7 @@ future<> repair::shard_repair_task_impl::repair_range(const dht::token_range& ra
|
||||
auto it = std::find(live_neighbors.begin(), live_neighbors.end(), node);
|
||||
if (it == live_neighbors.end()) {
|
||||
nr_failed_ranges++;
|
||||
nodes_down.insert(node);
|
||||
auto status = format("failed: mandatory neighbor={} is not alive", node);
|
||||
rlogger.error("repair[{}]: Repair {} out of {} ranges, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
|
||||
global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
|
||||
@@ -641,7 +644,13 @@ future<> repair::shard_repair_task_impl::repair_range(const dht::token_range& ra
|
||||
}
|
||||
if (live_neighbors.size() != neighbors.size()) {
|
||||
nr_failed_ranges++;
|
||||
auto status = live_neighbors.empty() ? "skipped" : "partial";
|
||||
std::unordered_set<gms::inet_address> live_neighbors_set(live_neighbors.begin(), live_neighbors.end());
|
||||
for (auto& node : neighbors) {
|
||||
if (!live_neighbors_set.contains(node)) {
|
||||
nodes_down.insert(node);
|
||||
}
|
||||
}
|
||||
auto status = live_neighbors.empty() ? "skipped_no_live_peers" : "partial";
|
||||
rlogger.warn("repair[{}]: Repair {} out of {} ranges, keyspace={}, table={}, range={}, peers={}, live_peers={}, status={}",
|
||||
global_repair_id.uuid(), ranges_index, ranges_size(), _status.keyspace, table_names(), range, neighbors, live_neighbors, status);
|
||||
if (live_neighbors.empty()) {
|
||||
|
||||
@@ -120,6 +120,7 @@ public:
|
||||
repair_stats _stats;
|
||||
std::unordered_set<sstring> dropped_tables;
|
||||
bool _hints_batchlog_flushed = false;
|
||||
std::unordered_set<gms::inet_address> nodes_down;
|
||||
public:
|
||||
shard_repair_task_impl(tasks::task_manager::module_ptr module,
|
||||
tasks::task_id id,
|
||||
|
||||
Reference in New Issue
Block a user