Merge 'Basic tablet repair support' from Asias He

This patch adds basic tablet repair support.

Below is an example showing how tablet repairs works. The `nodetool
repair -pr` cmd was performed on all the nodes, which makes sure no duplication
repair work will be performed and each tablet will be repaired exactly once.

Three nodes in the cluster. RF = 2. 16 initial tablets.

Tablets:
```
cqlsh> SELECT  * FROM system.tablets;

 keyspace_name | table_id                             | last_token           | table_name | tablet_count | new_replicas | replicas                                                                               | session | stage
---------------+--------------------------------------+----------------------+------------+--------------+--------------+----------------------------------------------------------------------------------------+---------+-------
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 | -8070450532247928833 |  standard1 |           16 |         null | [(951cb5bc-5749-481a-9645-4dd0f624f24a, 6), (2dd3808d-6601-4483-b081-adf41ef094e5, 5)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 | -6917529027641081857 |  standard1 |           16 |         null | [(2dd3808d-6601-4483-b081-adf41ef094e5, 0), (19caaeb3-d754-4704-a998-840df53eb54c, 5)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 | -5764607523034234881 |  standard1 |           16 |         null | [(19caaeb3-d754-4704-a998-840df53eb54c, 2), (2dd3808d-6601-4483-b081-adf41ef094e5, 3)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 | -4611686018427387905 |  standard1 |           16 |         null | [(951cb5bc-5749-481a-9645-4dd0f624f24a, 5), (2dd3808d-6601-4483-b081-adf41ef094e5, 4)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 | -3458764513820540929 |  standard1 |           16 |         null | [(19caaeb3-d754-4704-a998-840df53eb54c, 1), (951cb5bc-5749-481a-9645-4dd0f624f24a, 0)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 | -2305843009213693953 |  standard1 |           16 |         null | [(951cb5bc-5749-481a-9645-4dd0f624f24a, 7), (2dd3808d-6601-4483-b081-adf41ef094e5, 1)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 | -1152921504606846977 |  standard1 |           16 |         null | [(19caaeb3-d754-4704-a998-840df53eb54c, 7), (951cb5bc-5749-481a-9645-4dd0f624f24a, 1)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 |                   -1 |  standard1 |           16 |         null | [(951cb5bc-5749-481a-9645-4dd0f624f24a, 2), (2dd3808d-6601-4483-b081-adf41ef094e5, 7)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 |  1152921504606846975 |  standard1 |           16 |         null | [(951cb5bc-5749-481a-9645-4dd0f624f24a, 6), (19caaeb3-d754-4704-a998-840df53eb54c, 2)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 |  2305843009213693951 |  standard1 |           16 |         null | [(2dd3808d-6601-4483-b081-adf41ef094e5, 5), (951cb5bc-5749-481a-9645-4dd0f624f24a, 7)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 |  3458764513820540927 |  standard1 |           16 |         null | [(2dd3808d-6601-4483-b081-adf41ef094e5, 1), (19caaeb3-d754-4704-a998-840df53eb54c, 3)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 |  4611686018427387903 |  standard1 |           16 |         null | [(2dd3808d-6601-4483-b081-adf41ef094e5, 7), (951cb5bc-5749-481a-9645-4dd0f624f24a, 1)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 |  5764607523034234879 |  standard1 |           16 |         null | [(19caaeb3-d754-4704-a998-840df53eb54c, 6), (2dd3808d-6601-4483-b081-adf41ef094e5, 2)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 |  6917529027641081855 |  standard1 |           16 |         null | [(19caaeb3-d754-4704-a998-840df53eb54c, 5), (951cb5bc-5749-481a-9645-4dd0f624f24a, 3)] |    null |  null
           ks1 | 3ffadad0-a552-11ee-bc15-66412bbb6978 |  8070450532247928831 |  standard1 |           16 |         null | [(2dd3808d-6601-4483-b081-adf41ef094e5, 0), (19caaeb3-d754-4704-a998-840df53eb54c, 7)] |    null |  null
```

node1:
```
$nodetool repair -p 7199 -pr ks1 standard1
[shard 0:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c]: starting user-requested repair for keyspace ks1, repair id 6, options {{trace -> false}, {primaryRange -> true}, {columnFamilies -> standard1}, {jobThreads -> 1}, {incremental -> false}, {parallelism -> parallel}}
[shard 0:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c] Repair 1 out of 5 tablets: table=ks1.standard1 tablet_id=2 range=(-6917529027641081857,-5764607523034234881] replicas={19caaeb3-d754-4704-a998-840df53eb54c:2, 2dd3808d-6601-4483-b081-adf41ef094e5:3} primary_replica_only=true
[shard 2:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.07399633 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7174440}, {127.0.0.2, 7174440}}, row_from_disk_nr={{127.0.0.1, 15330}, {127.0.0.2, 15330}}, row_from_disk_bytes_per_sec={{127.0.0.1, 92.4651}, {127.0.0.2, 92.4651}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 207172}, {127.0.0.2, 207172}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c] Repair 2 out of 5 tablets: table=ks1.standard1 tablet_id=4 range=(-4611686018427387905,-3458764513820540929] replicas={19caaeb3-d754-4704-a998-840df53eb54c:1, 951cb5bc-5749-481a-9645-4dd0f624f24a:0} primary_replica_only=true
[shard 1:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.07302664 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7195032}, {127.0.0.3, 7195032}}, row_from_disk_nr={{127.0.0.1, 15374}, {127.0.0.3, 15374}}, row_from_disk_bytes_per_sec={{127.0.0.1, 93.9618}, {127.0.0.3, 93.9618}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 210526}, {127.0.0.3, 210526}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c] Repair 3 out of 5 tablets: table=ks1.standard1 tablet_id=6 range=(-2305843009213693953,-1152921504606846977] replicas={19caaeb3-d754-4704-a998-840df53eb54c:7, 951cb5bc-5749-481a-9645-4dd0f624f24a:1} primary_replica_only=true
[shard 7:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.06781354 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7095816}, {127.0.0.3, 7095816}}, row_from_disk_nr={{127.0.0.1, 15162}, {127.0.0.3, 15162}}, row_from_disk_bytes_per_sec={{127.0.0.1, 99.7898}, {127.0.0.3, 99.7898}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 223584}, {127.0.0.3, 223584}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c] Repair 4 out of 5 tablets: table=ks1.standard1 tablet_id=12 range=(4611686018427387903,5764607523034234879] replicas={19caaeb3-d754-4704-a998-840df53eb54c:6, 2dd3808d-6601-4483-b081-adf41ef094e5:2} primary_replica_only=true
[shard 6:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.06793772 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7150572}, {127.0.0.2, 7150572}}, row_from_disk_nr={{127.0.0.1, 15279}, {127.0.0.2, 15279}}, row_from_disk_bytes_per_sec={{127.0.0.1, 100.376}, {127.0.0.2, 100.376}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 224897}, {127.0.0.2, 224897}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c] Repair 5 out of 5 tablets: table=ks1.standard1 tablet_id=13 range=(5764607523034234879,6917529027641081855] replicas={19caaeb3-d754-4704-a998-840df53eb54c:5, 951cb5bc-5749-481a-9645-4dd0f624f24a:3} primary_replica_only=true
[shard 5:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.068579935 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7129512}, {127.0.0.3, 7129512}}, row_from_disk_nr={{127.0.0.1, 15234}, {127.0.0.3, 15234}}, row_from_disk_bytes_per_sec={{127.0.0.1, 99.1432}, {127.0.0.3, 99.1432}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 222135}, {127.0.0.3, 222135}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[f7ac8fb6-8e49-4b31-8c7d-0d493064977c]: Finished user-requested repair for tablet keyspace=ks1 tables={standard1} repair_id=6 duration=0.352379s
```

node2:
```
$nodetool repair -p 7200 -pr ks1 standard1
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e]: starting user-requested repair for keyspace ks1, repair id 1, options {{trace -> false}, {primaryRange -> true}, {columnFamilies -> standard1}, {jobThreads -> 1}, {incremental -> false}, {parallelism -> parallel}}
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e] Repair 1 out of 6 tablets: table=ks1.standard1 tablet_id=1 range=(-8070450532247928833,-6917529027641081857] replicas={2dd3808d-6601-4483-b081-adf41ef094e5:0, 19caaeb3-d754-4704-a998-840df53eb54c:5} primary_replica_only=true
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.07016466 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7212816}, {127.0.0.2, 7212816}}, row_from_disk_nr={{127.0.0.1, 15412}, {127.0.0.2, 15412}}, row_from_disk_bytes_per_sec={{127.0.0.1, 98.0362}, {127.0.0.2, 98.0362}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 219655}, {127.0.0.2, 219655}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e] Repair 2 out of 6 tablets: table=ks1.standard1 tablet_id=9 range=(1152921504606846975,2305843009213693951] replicas={2dd3808d-6601-4483-b081-adf41ef094e5:5, 951cb5bc-5749-481a-9645-4dd0f624f24a:7} primary_replica_only=true
[shard 5:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.07180758 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.2, 7236216}, {127.0.0.3, 7236216}}, row_from_disk_nr={{127.0.0.2, 15462}, {127.0.0.3, 15462}}, row_from_disk_bytes_per_sec={{127.0.0.2, 96.104}, {127.0.0.3, 96.104}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.2, 215325}, {127.0.0.3, 215325}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e] Repair 3 out of 6 tablets: table=ks1.standard1 tablet_id=10 range=(2305843009213693951,3458764513820540927] replicas={2dd3808d-6601-4483-b081-adf41ef094e5:1, 19caaeb3-d754-4704-a998-840df53eb54c:3} primary_replica_only=true
[shard 1:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.06772773 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7039188}, {127.0.0.2, 7039188}}, row_from_disk_nr={{127.0.0.1, 15041}, {127.0.0.2, 15041}}, row_from_disk_bytes_per_sec={{127.0.0.1, 99.1188}, {127.0.0.2, 99.1188}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 222080}, {127.0.0.2, 222080}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e] Repair 4 out of 6 tablets: table=ks1.standard1 tablet_id=11 range=(3458764513820540927,4611686018427387903] replicas={2dd3808d-6601-4483-b081-adf41ef094e5:7, 951cb5bc-5749-481a-9645-4dd0f624f24a:1} primary_replica_only=true
[shard 7:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.07025768 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.2, 7229664}, {127.0.0.3, 7229664}}, row_from_disk_nr={{127.0.0.2, 15448}, {127.0.0.3, 15448}}, row_from_disk_bytes_per_sec={{127.0.0.2, 98.1351}, {127.0.0.3, 98.1351}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.2, 219876}, {127.0.0.3, 219876}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e] Repair 5 out of 6 tablets: table=ks1.standard1 tablet_id=14 range=(6917529027641081855,8070450532247928831] replicas={2dd3808d-6601-4483-b081-adf41ef094e5:0, 19caaeb3-d754-4704-a998-840df53eb54c:7} primary_replica_only=true
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.0719635 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7225452}, {127.0.0.2, 7225452}}, row_from_disk_nr={{127.0.0.1, 15439}, {127.0.0.2, 15439}}, row_from_disk_bytes_per_sec={{127.0.0.1, 95.7531}, {127.0.0.2, 95.7531}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 214539}, {127.0.0.2, 214539}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e] Repair 6 out of 6 tablets: table=ks1.standard1 tablet_id=15 range=(8070450532247928831,9223372036854775807] replicas={2dd3808d-6601-4483-b081-adf41ef094e5:4, 19caaeb3-d754-4704-a998-840df53eb54c:3} primary_replica_only=true
[shard 4:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.0691715 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7122960}, {127.0.0.2, 7122960}}, row_from_disk_nr={{127.0.0.1, 15220}, {127.0.0.2, 15220}}, row_from_disk_bytes_per_sec={{127.0.0.1, 98.2049}, {127.0.0.2, 98.2049}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 220033}, {127.0.0.2, 220033}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[5c805f0c-4ff2-4c5c-88df-bb318d559e0e]: Finished user-requested repair for tablet keyspace=ks1 tables={standard1} repair_id=1 duration=0.42178s
```
node3:
```
$nodetool repair -p 7300 -pr ks1 standard1
[shard 0:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6]: starting user-requested repair for keyspace ks1, repair id 1, options {{trace -> false}, {primaryRange -> true}, {columnFamilies -> standard1}, {jobThreads -> 1}, {incremental -> false}, {parallelism -> parallel}}
[shard 0:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6] Repair 1 out of 5 tablets: table=ks1.standard1 tablet_id=0 range=(minimum token,-8070450532247928833] replicas={951cb5bc-5749-481a-9645-4dd0f624f24a:6, 2dd3808d-6601-4483-b081-adf41ef094e5:5} primary_replica_only=true
[shard 6:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.07126866 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.2, 7133256}, {127.0.0.3, 7133256}}, row_from_disk_nr={{127.0.0.2, 15242}, {127.0.0.3, 15242}}, row_from_disk_bytes_per_sec={{127.0.0.2, 95.4529}, {127.0.0.3, 95.4529}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.2, 213867}, {127.0.0.3, 213867}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6] Repair 2 out of 5 tablets: table=ks1.standard1 tablet_id=3 range=(-5764607523034234881,-4611686018427387905] replicas={951cb5bc-5749-481a-9645-4dd0f624f24a:5, 2dd3808d-6601-4483-b081-adf41ef094e5:4} primary_replica_only=true
[shard 5:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.0701025 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.2, 7138404}, {127.0.0.3, 7138404}}, row_from_disk_nr={{127.0.0.2, 15253}, {127.0.0.3, 15253}}, row_from_disk_bytes_per_sec={{127.0.0.2, 97.1108}, {127.0.0.3, 97.1108}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.2, 217581}, {127.0.0.3, 217581}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6] Repair 3 out of 5 tablets: table=ks1.standard1 tablet_id=5 range=(-3458764513820540929,-2305843009213693953] replicas={951cb5bc-5749-481a-9645-4dd0f624f24a:7, 2dd3808d-6601-4483-b081-adf41ef094e5:1} primary_replica_only=true
[shard 7:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.06859512 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.2, 7171632}, {127.0.0.3, 7171632}}, row_from_disk_nr={{127.0.0.2, 15324}, {127.0.0.3, 15324}}, row_from_disk_bytes_per_sec={{127.0.0.2, 99.7068}, {127.0.0.3, 99.7068}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.2, 223398}, {127.0.0.3, 223398}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6] Repair 4 out of 5 tablets: table=ks1.standard1 tablet_id=7 range=(-1152921504606846977,-1] replicas={951cb5bc-5749-481a-9645-4dd0f624f24a:2, 2dd3808d-6601-4483-b081-adf41ef094e5:7} primary_replica_only=true
[shard 2:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.06975318 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.2, 7105176}, {127.0.0.3, 7105176}}, row_from_disk_nr={{127.0.0.2, 15182}, {127.0.0.3, 15182}}, row_from_disk_bytes_per_sec={{127.0.0.2, 97.1429}, {127.0.0.3, 97.1429}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.2, 217653}, {127.0.0.3, 217653}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6] Repair 5 out of 5 tablets: table=ks1.standard1 tablet_id=8 range=(-1,1152921504606846975] replicas={951cb5bc-5749-481a-9645-4dd0f624f24a:6, 19caaeb3-d754-4704-a998-840df53eb54c:2} primary_replica_only=true
[shard 6:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6]: stats: repair_reason=repair, keyspace=ks1, tables={standard1}, ranges_nr=1, round_nr=2, round_nr_fast_path_already_synced=2, round_nr_fast_path_same_combined_hashes=0, round_nr_slow_path=0, rpc_call_nr=6, tx_hashes_nr=0, rx_hashes_nr=0, duration=0.070810474 seconds, tx_row_nr=0, rx_row_nr=0, tx_row_bytes=0, rx_row_bytes=0, row_from_disk_bytes={{127.0.0.1, 7023276}, {127.0.0.3, 7023276}}, row_from_disk_nr={{127.0.0.1, 15007}, {127.0.0.3, 15007}}, row_from_disk_bytes_per_sec={{127.0.0.1, 94.5894}, {127.0.0.3, 94.5894}} MiB/s, row_from_disk_rows_per_sec={{127.0.0.1, 211932}, {127.0.0.3, 211932}} Rows/s, tx_row_nr_peer={}, rx_row_nr_peer={}
[shard 0:strm] repair - repair[350b97f3-f06e-470f-9164-43997a4f82a6]: Finished user-requested repair for tablet keyspace=ks1 tables={standard1} repair_id=1 duration=0.351395s
```

Fixes #16599

Closes scylladb/scylladb#16600

* github.com:scylladb/scylladb:
  test: Add test_tablet_missing_data_repair
  test: Add test_tablet_repair
  test: Allow timeout in server_stop_gracefully
  test: Increase STOP_TIMEOUT_SECONDS
  repair: Wire tablet repair with the user repair request
  repair: Pass raft_address_map to repair service
  repair: Add host2ip_t type
  repair: Add finished user-requested log for vnode table too
  repair: Log error in the rpc_stream_handler
  repair: Make row_level repair work with tablet
  repair: Add get_dst_shard_id
  repair: Add shard to repair_node_state
  repair: Add shard map to repair_neighbors
This commit is contained in:
Botond Dénes
2024-01-18 09:12:59 +02:00
13 changed files with 650 additions and 218 deletions

View File

@@ -1571,7 +1571,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
// both)
supervisor::notify("starting repair service");
auto max_memory_repair = memory::stats().total_memory() * 0.1;
repair.start(std::ref(gossiper), std::ref(messaging), std::ref(db), std::ref(proxy), std::ref(bm), std::ref(sys_dist_ks), std::ref(sys_ks), std::ref(view_update_generator), std::ref(task_manager), std::ref(mm), max_memory_repair).get();
repair.start(std::ref(gossiper), std::ref(messaging), std::ref(db), std::ref(proxy), std::ref(raft_address_map), std::ref(bm), std::ref(sys_dist_ks), std::ref(sys_ks), std::ref(view_update_generator), std::ref(task_manager), std::ref(mm), max_memory_repair).get();
auto stop_repair_service = defer_verbose_shutdown("repair service", [&repair] {
repair.stop().get();
});

View File

@@ -1013,11 +1013,11 @@ future<> messaging_service::unregister_stream_mutation_fragments() {
template<class SinkType, class SourceType>
future<std::tuple<rpc::sink<SinkType>, rpc::source<SourceType>>>
do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shared_ptr<messaging_service::rpc_protocol_client_wrapper> rpc_client, std::unique_ptr<messaging_service::rpc_protocol_wrapper>& rpc) {
do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shard_id dst_shard_id, shared_ptr<messaging_service::rpc_protocol_client_wrapper> rpc_client, std::unique_ptr<messaging_service::rpc_protocol_wrapper>& rpc) {
using value_type = std::tuple<rpc::sink<SinkType>, rpc::source<SourceType>>;
auto sink = co_await rpc_client->make_stream_sink<netw::serializer, SinkType>();
auto rpc_handler = rpc->make_client<rpc::source<SourceType> (uint32_t, rpc::sink<SinkType>)>(verb);
auto source_fut = co_await coroutine::as_future(rpc_handler(*rpc_client, repair_meta_id, sink));
auto rpc_handler = rpc->make_client<rpc::source<SourceType> (uint32_t, shard_id, rpc::sink<SinkType>)>(verb);
auto source_fut = co_await coroutine::as_future(rpc_handler(*rpc_client, repair_meta_id, dst_shard_id, sink));
if (source_fut.failed()) {
auto ex = source_fut.get_exception();
try {
@@ -1032,20 +1032,20 @@ do_make_sink_source(messaging_verb verb, uint32_t repair_meta_id, shared_ptr<mes
// Wrapper for REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM
future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>>
messaging_service::make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id) {
messaging_service::make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, msg_addr id) {
auto verb = messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM;
if (is_shutting_down()) {
return make_exception_future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>>(rpc::closed_error());
}
auto rpc_client = get_rpc_client(verb, id);
return do_make_sink_source<repair_hash_with_cmd, repair_row_on_wire_with_cmd>(verb, repair_meta_id, std::move(rpc_client), rpc());
return do_make_sink_source<repair_hash_with_cmd, repair_row_on_wire_with_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
}
rpc::sink<repair_row_on_wire_with_cmd> messaging_service::make_sink_for_repair_get_row_diff_with_rpc_stream(rpc::source<repair_hash_with_cmd>& source) {
return source.make_sink<netw::serializer, repair_row_on_wire_with_cmd>();
}
void messaging_service::register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func) {
void messaging_service::register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt, rpc::source<repair_hash_with_cmd> source)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
}
future<> messaging_service::unregister_repair_get_row_diff_with_rpc_stream() {
@@ -1054,20 +1054,20 @@ future<> messaging_service::unregister_repair_get_row_diff_with_rpc_stream() {
// Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>>
messaging_service::make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id) {
messaging_service::make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, msg_addr id) {
auto verb = messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM;
if (is_shutting_down()) {
return make_exception_future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>>(rpc::closed_error());
}
auto rpc_client = get_rpc_client(verb, id);
return do_make_sink_source<repair_row_on_wire_with_cmd, repair_stream_cmd>(verb, repair_meta_id, std::move(rpc_client), rpc());
return do_make_sink_source<repair_row_on_wire_with_cmd, repair_stream_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
}
rpc::sink<repair_stream_cmd> messaging_service::make_sink_for_repair_put_row_diff_with_rpc_stream(rpc::source<repair_row_on_wire_with_cmd>& source) {
return source.make_sink<netw::serializer, repair_stream_cmd>();
}
void messaging_service::register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func) {
void messaging_service::register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt, rpc::source<repair_row_on_wire_with_cmd> source)>&& func) {
register_handler(this, messaging_verb::REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM, std::move(func));
}
future<> messaging_service::unregister_repair_put_row_diff_with_rpc_stream() {
@@ -1076,20 +1076,20 @@ future<> messaging_service::unregister_repair_put_row_diff_with_rpc_stream() {
// Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>>
messaging_service::make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, msg_addr id) {
messaging_service::make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, msg_addr id) {
auto verb = messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM;
if (is_shutting_down()) {
return make_exception_future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>>(rpc::closed_error());
}
auto rpc_client = get_rpc_client(verb, id);
return do_make_sink_source<repair_stream_cmd, repair_hash_with_cmd>(verb, repair_meta_id, std::move(rpc_client), rpc());
return do_make_sink_source<repair_stream_cmd, repair_hash_with_cmd>(verb, repair_meta_id, dst_cpu_id, std::move(rpc_client), rpc());
}
rpc::sink<repair_hash_with_cmd> messaging_service::make_sink_for_repair_get_full_row_hashes_with_rpc_stream(rpc::source<repair_stream_cmd>& source) {
return source.make_sink<netw::serializer, repair_hash_with_cmd>();
}
void messaging_service::register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func) {
void messaging_service::register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt, rpc::source<repair_stream_cmd> source)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM, std::move(func));
}
future<> messaging_service::unregister_repair_get_full_row_hashes_with_rpc_stream() {
@@ -1276,101 +1276,101 @@ future<table_schema_version> messaging_service::send_schema_check(msg_addr dst,
}
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
void messaging_service::register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_shard_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(func));
}
future<> messaging_service::unregister_repair_get_full_row_hashes() {
return unregister_handler(messaging_verb::REPAIR_GET_FULL_ROW_HASHES);
}
future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id) {
return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id);
future<repair_hash_set> messaging_service::send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id, shard_id dst_shard_id) {
return send_message<future<repair_hash_set>>(this, messaging_verb::REPAIR_GET_FULL_ROW_HASHES, std::move(id), repair_meta_id, dst_shard_id);
}
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
void messaging_service::register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func) {
void messaging_service::register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary, rpc::optional<shard_id> dst_shard_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_COMBINED_ROW_HASH, std::move(func));
}
future<> messaging_service::unregister_repair_get_combined_row_hash() {
return unregister_handler(messaging_verb::REPAIR_GET_COMBINED_ROW_HASH);
}
future<get_combined_row_hash_response> messaging_service::send_repair_get_combined_row_hash(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary) {
return send_message<future<get_combined_row_hash_response>>(this, messaging_verb::REPAIR_GET_COMBINED_ROW_HASH, std::move(id), repair_meta_id, std::move(common_sync_boundary));
future<get_combined_row_hash_response> messaging_service::send_repair_get_combined_row_hash(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary, shard_id dst_shard_id) {
return send_message<future<get_combined_row_hash_response>>(this, messaging_verb::REPAIR_GET_COMBINED_ROW_HASH, std::move(id), repair_meta_id, std::move(common_sync_boundary), dst_shard_id);
}
void messaging_service::register_repair_get_sync_boundary(std::function<future<get_sync_boundary_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary)>&& func) {
void messaging_service::register_repair_get_sync_boundary(std::function<future<get_sync_boundary_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary, rpc::optional<shard_id> dst_shard_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_SYNC_BOUNDARY, std::move(func));
}
future<> messaging_service::unregister_repair_get_sync_boundary() {
return unregister_handler(messaging_verb::REPAIR_GET_SYNC_BOUNDARY);
}
future<get_sync_boundary_response> messaging_service::send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary) {
return send_message<future<get_sync_boundary_response>>(this, messaging_verb::REPAIR_GET_SYNC_BOUNDARY, std::move(id), repair_meta_id, std::move(skipped_sync_boundary));
future<get_sync_boundary_response> messaging_service::send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary, shard_id dst_shard_id) {
return send_message<future<get_sync_boundary_response>>(this, messaging_verb::REPAIR_GET_SYNC_BOUNDARY, std::move(id), repair_meta_id, std::move(skipped_sync_boundary), dst_shard_id);
}
// Wrapper for REPAIR_GET_ROW_DIFF
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func) {
void messaging_service::register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows, rpc::optional<shard_id> dst_shard_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(func));
}
future<> messaging_service::unregister_repair_get_row_diff() {
return unregister_handler(messaging_verb::REPAIR_GET_ROW_DIFF);
}
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows) {
return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows);
future<repair_rows_on_wire> messaging_service::send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows, shard_id dst_shard_id) {
return send_message<future<repair_rows_on_wire>>(this, messaging_verb::REPAIR_GET_ROW_DIFF, std::move(id), repair_meta_id, std::move(set_diff), needs_all_rows, dst_shard_id);
}
// Wrapper for REPAIR_PUT_ROW_DIFF
void messaging_service::register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func) {
void messaging_service::register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff, rpc::optional<shard_id> dst_shard_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_PUT_ROW_DIFF, std::move(func));
}
future<> messaging_service::unregister_repair_put_row_diff() {
return unregister_handler(messaging_verb::REPAIR_PUT_ROW_DIFF);
}
future<> messaging_service::send_repair_put_row_diff(msg_addr id, uint32_t repair_meta_id, repair_rows_on_wire row_diff) {
return send_message<void>(this, messaging_verb::REPAIR_PUT_ROW_DIFF, std::move(id), repair_meta_id, std::move(row_diff));
future<> messaging_service::send_repair_put_row_diff(msg_addr id, uint32_t repair_meta_id, repair_rows_on_wire row_diff, shard_id dst_shard_id) {
return send_message<void>(this, messaging_verb::REPAIR_PUT_ROW_DIFF, std::move(id), repair_meta_id, std::move(row_diff), dst_shard_id);
}
// Wrapper for REPAIR_ROW_LEVEL_START
void messaging_service::register_repair_row_level_start(std::function<future<repair_row_level_start_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason, rpc::optional<gc_clock::time_point> compaction_time)>&& func) {
void messaging_service::register_repair_row_level_start(std::function<future<repair_row_level_start_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason, rpc::optional<gc_clock::time_point> compaction_time, rpc::optional<shard_id> dst_shard_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(func));
}
future<> messaging_service::unregister_repair_row_level_start() {
return unregister_handler(messaging_verb::REPAIR_ROW_LEVEL_START);
}
future<rpc::optional<repair_row_level_start_response>> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, streaming::stream_reason reason, gc_clock::time_point compaction_time) {
return send_message<rpc::optional<repair_row_level_start_response>>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name), std::move(schema_version), reason, compaction_time);
future<rpc::optional<repair_row_level_start_response>> messaging_service::send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, streaming::stream_reason reason, gc_clock::time_point compaction_time, shard_id dst_shard_id) {
return send_message<rpc::optional<repair_row_level_start_response>>(this, messaging_verb::REPAIR_ROW_LEVEL_START, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, std::move(remote_partitioner_name), std::move(schema_version), reason, compaction_time, dst_shard_id);
}
// Wrapper for REPAIR_ROW_LEVEL_STOP
void messaging_service::register_repair_row_level_stop(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range)>&& func) {
void messaging_service::register_repair_row_level_stop(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, rpc::optional<shard_id> dst_shard_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_ROW_LEVEL_STOP, std::move(func));
}
future<> messaging_service::unregister_repair_row_level_stop() {
return unregister_handler(messaging_verb::REPAIR_ROW_LEVEL_STOP);
}
future<> messaging_service::send_repair_row_level_stop(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range) {
return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_STOP, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range));
future<> messaging_service::send_repair_row_level_stop(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, shard_id dst_shard_id) {
return send_message<void>(this, messaging_verb::REPAIR_ROW_LEVEL_STOP, std::move(id), repair_meta_id, std::move(keyspace_name), std::move(cf_name), std::move(range), dst_shard_id);
}
// Wrapper for REPAIR_GET_ESTIMATED_PARTITIONS
void messaging_service::register_repair_get_estimated_partitions(std::function<future<uint64_t> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func) {
void messaging_service::register_repair_get_estimated_partitions(std::function<future<uint64_t> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_shard_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_GET_ESTIMATED_PARTITIONS, std::move(func));
}
future<> messaging_service::unregister_repair_get_estimated_partitions() {
return unregister_handler(messaging_verb::REPAIR_GET_ESTIMATED_PARTITIONS);
}
future<uint64_t> messaging_service::send_repair_get_estimated_partitions(msg_addr id, uint32_t repair_meta_id) {
return send_message<future<uint64_t>>(this, messaging_verb::REPAIR_GET_ESTIMATED_PARTITIONS, std::move(id), repair_meta_id);
future<uint64_t> messaging_service::send_repair_get_estimated_partitions(msg_addr id, uint32_t repair_meta_id, shard_id dst_shard_id) {
return send_message<future<uint64_t>>(this, messaging_verb::REPAIR_GET_ESTIMATED_PARTITIONS, std::move(id), repair_meta_id, dst_shard_id);
}
// Wrapper for REPAIR_SET_ESTIMATED_PARTITIONS
void messaging_service::register_repair_set_estimated_partitions(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, uint64_t estimated_partitions)>&& func) {
void messaging_service::register_repair_set_estimated_partitions(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, uint64_t estimated_partitions, rpc::optional<shard_id> dst_shard_id)>&& func) {
register_handler(this, messaging_verb::REPAIR_SET_ESTIMATED_PARTITIONS, std::move(func));
}
future<> messaging_service::unregister_repair_set_estimated_partitions() {
return unregister_handler(messaging_verb::REPAIR_SET_ESTIMATED_PARTITIONS);
}
future<> messaging_service::send_repair_set_estimated_partitions(msg_addr id, uint32_t repair_meta_id, uint64_t estimated_partitions) {
return send_message<void>(this, messaging_verb::REPAIR_SET_ESTIMATED_PARTITIONS, std::move(id), repair_meta_id, estimated_partitions);
future<> messaging_service::send_repair_set_estimated_partitions(msg_addr id, uint32_t repair_meta_id, uint64_t estimated_partitions, shard_id dst_shard_id) {
return send_message<void>(this, messaging_verb::REPAIR_SET_ESTIMATED_PARTITIONS, std::move(id), repair_meta_id, estimated_partitions, dst_shard_id);
}
// Wrapper for REPAIR_GET_DIFF_ALGORITHMS

View File

@@ -383,21 +383,21 @@ public:
future<std::tuple<rpc::sink<frozen_mutation_fragment, streaming::stream_mutation_fragments_cmd>, rpc::source<int32_t>>> make_sink_and_source_for_stream_mutation_fragments(table_schema_version schema_id, streaming::plan_id plan_id, table_id cf_id, uint64_t estimated_partitions, streaming::stream_reason reason, service::session_id session, msg_addr id);
// Wrapper for REPAIR_GET_ROW_DIFF_WITH_RPC_STREAM
future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>> make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
future<std::tuple<rpc::sink<repair_hash_with_cmd>, rpc::source<repair_row_on_wire_with_cmd>>> make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, msg_addr id);
rpc::sink<repair_row_on_wire_with_cmd> make_sink_for_repair_get_row_diff_with_rpc_stream(rpc::source<repair_hash_with_cmd>& source);
void register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_hash_with_cmd> source)>&& func);
void register_repair_get_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_row_on_wire_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt, rpc::source<repair_hash_with_cmd> source)>&& func);
future<> unregister_repair_get_row_diff_with_rpc_stream();
// Wrapper for REPAIR_PUT_ROW_DIFF_WITH_RPC_STREAM
future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>> make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
future<std::tuple<rpc::sink<repair_row_on_wire_with_cmd>, rpc::source<repair_stream_cmd>>> make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, msg_addr id);
rpc::sink<repair_stream_cmd> make_sink_for_repair_put_row_diff_with_rpc_stream(rpc::source<repair_row_on_wire_with_cmd>& source);
void register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source)>&& func);
void register_repair_put_row_diff_with_rpc_stream(std::function<future<rpc::sink<repair_stream_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt, rpc::source<repair_row_on_wire_with_cmd> source)>&& func);
future<> unregister_repair_put_row_diff_with_rpc_stream();
// Wrapper for REPAIR_GET_FULL_ROW_HASHES_WITH_RPC_STREAM
future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>> make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, msg_addr id);
future<std::tuple<rpc::sink<repair_stream_cmd>, rpc::source<repair_hash_with_cmd>>> make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(uint32_t repair_meta_id, shard_id dst_cpu_id, msg_addr id);
rpc::sink<repair_hash_with_cmd> make_sink_for_repair_get_full_row_hashes_with_rpc_stream(rpc::source<repair_stream_cmd>& source);
void register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::source<repair_stream_cmd> source)>&& func);
void register_repair_get_full_row_hashes_with_rpc_stream(std::function<future<rpc::sink<repair_hash_with_cmd>> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt, rpc::source<repair_stream_cmd> source)>&& func);
future<> unregister_repair_get_full_row_hashes_with_rpc_stream();
void register_stream_mutation_done(std::function<future<> (const rpc::client_info& cinfo, streaming::plan_id plan_id, dht::token_range_vector ranges, table_id cf_id, unsigned dst_cpu_id)>&& func);
@@ -409,49 +409,49 @@ public:
future<> unregister_complete_message();
// Wrapper for REPAIR_GET_FULL_ROW_HASHES
void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
void register_repair_get_full_row_hashes(std::function<future<repair_hash_set> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id)>&& func);
future<> unregister_repair_get_full_row_hashes();
future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id);
future<repair_hash_set> send_repair_get_full_row_hashes(msg_addr id, uint32_t repair_meta_id, shard_id dst_cpu_id);
// Wrapper for REPAIR_GET_COMBINED_ROW_HASH
void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary)>&& func);
void register_repair_get_combined_row_hash(std::function<future<get_combined_row_hash_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary, rpc::optional<shard_id> dst_cpu_id)>&& func);
future<> unregister_repair_get_combined_row_hash();
future<get_combined_row_hash_response> send_repair_get_combined_row_hash(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary);
future<get_combined_row_hash_response> send_repair_get_combined_row_hash(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> common_sync_boundary, shard_id dst_cpu_id);
// Wrapper for REPAIR_GET_SYNC_BOUNDARY
void register_repair_get_sync_boundary(std::function<future<get_sync_boundary_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary)>&& func);
void register_repair_get_sync_boundary(std::function<future<get_sync_boundary_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary, rpc::optional<shard_id> dst_cpu_id)>&& func);
future<> unregister_repair_get_sync_boundary();
future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary);
future<get_sync_boundary_response> send_repair_get_sync_boundary(msg_addr id, uint32_t repair_meta_id, std::optional<repair_sync_boundary> skipped_sync_boundary, shard_id dst_cpu_id);
// Wrapper for REPAIR_GET_ROW_DIFF
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows)>&& func);
void register_repair_get_row_diff(std::function<future<repair_rows_on_wire> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows, rpc::optional<shard_id> dst_cpu_id)>&& func);
future<> unregister_repair_get_row_diff();
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows);
future<repair_rows_on_wire> send_repair_get_row_diff(msg_addr id, uint32_t repair_meta_id, repair_hash_set set_diff, bool needs_all_rows, shard_id dst_cpu_id);
// Wrapper for REPAIR_PUT_ROW_DIFF
void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff)>&& func);
void register_repair_put_row_diff(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, repair_rows_on_wire row_diff, rpc::optional<shard_id> dst_cpu_id)>&& func);
future<> unregister_repair_put_row_diff();
future<> send_repair_put_row_diff(msg_addr id, uint32_t repair_meta_id, repair_rows_on_wire row_diff);
future<> send_repair_put_row_diff(msg_addr id, uint32_t repair_meta_id, repair_rows_on_wire row_diff, shard_id dst_cpu_id);
// Wrapper for REPAIR_ROW_LEVEL_START
void register_repair_row_level_start(std::function<future<repair_row_level_start_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason, rpc::optional<gc_clock::time_point> compaction_time)>&& func);
void register_repair_row_level_start(std::function<future<repair_row_level_start_response> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, rpc::optional<streaming::stream_reason> reason, rpc::optional<gc_clock::time_point> compaction_time, rpc::optional<shard_id> dst_cpu_id)>&& func);
future<> unregister_repair_row_level_start();
future<rpc::optional<repair_row_level_start_response>> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, streaming::stream_reason reason, gc_clock::time_point compaction_time);
future<rpc::optional<repair_row_level_start_response>> send_repair_row_level_start(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed, unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version, streaming::stream_reason reason, gc_clock::time_point compaction_time, shard_id dst_cpu_id);
// Wrapper for REPAIR_ROW_LEVEL_STOP
void register_repair_row_level_stop(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range)>&& func);
void register_repair_row_level_stop(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, rpc::optional<shard_id> dst_cpu_id)>&& func);
future<> unregister_repair_row_level_stop();
future<> send_repair_row_level_stop(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range);
future<> send_repair_row_level_stop(msg_addr id, uint32_t repair_meta_id, sstring keyspace_name, sstring cf_name, dht::token_range range, shard_id dst_cpu_id);
// Wrapper for REPAIR_GET_ESTIMATED_PARTITIONS
void register_repair_get_estimated_partitions(std::function<future<uint64_t> (const rpc::client_info& cinfo, uint32_t repair_meta_id)>&& func);
void register_repair_get_estimated_partitions(std::function<future<uint64_t> (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id)>&& func);
future<> unregister_repair_get_estimated_partitions();
future<uint64_t> send_repair_get_estimated_partitions(msg_addr id, uint32_t repair_meta_id);
future<uint64_t> send_repair_get_estimated_partitions(msg_addr id, uint32_t repair_meta_id, shard_id dst_cpu_id);
// Wrapper for REPAIR_SET_ESTIMATED_PARTITIONS
void register_repair_set_estimated_partitions(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, uint64_t estimated_partitions)>&& func);
void register_repair_set_estimated_partitions(std::function<future<> (const rpc::client_info& cinfo, uint32_t repair_meta_id, uint64_t estimated_partitions, rpc::optional<shard_id> dst_cpu_id)>&& func);
future<> unregister_repair_set_estimated_partitions();
future<> send_repair_set_estimated_partitions(msg_addr id, uint32_t repair_meta_id, uint64_t estimated_partitions);
future<> send_repair_set_estimated_partitions(msg_addr id, uint32_t repair_meta_id, uint64_t estimated_partitions, shard_id dst_cpu_id);
// Wrapper for REPAIR_GET_DIFF_ALGORITHMS
void register_repair_get_diff_algorithms(std::function<future<std::vector<row_level_diff_detect_algorithm>> (const rpc::client_info& cinfo)>&& func);

View File

@@ -22,6 +22,7 @@
#include "utils/hashers.hh"
#include "locator/network_topology_strategy.hh"
#include "service/migration_manager.hh"
#include "service/storage_service.hh"
#include "partition_range_compat.hh"
#include "gms/feature_service.hh"
@@ -42,6 +43,7 @@
#include <cfloat>
#include <algorithm>
#include <atomic>
#include "idl/position_in_partition.dist.hh"
#include "idl/partition_checksum.dist.hh"
@@ -211,6 +213,17 @@ void remove_item(Collection& c, T& item) {
}
}
repair_neighbors::repair_neighbors(std::vector<gms::inet_address> nodes, std::vector<shard_id> shards)
: all(std::move(nodes))
{
if (all.size() != shards.size()) {
throw std::runtime_error("The number of shards and nodes do not match");
}
for (int i = 0; i < all.size(); i++) {
shard_map[all[i]] = shards[i];
}
}
// Return all of the neighbors with whom we share the provided range.
static std::vector<gms::inet_address> get_neighbors(
const locator::effective_replication_map& erm,
@@ -538,33 +551,6 @@ future<uint64_t> estimate_partitions(seastar::sharded<replica::database>& db, co
);
}
static
const dht::sharder&
get_sharder_for_tables(seastar::sharded<replica::database>& db, const sstring& keyspace, const std::vector<table_id>& table_ids) {
schema_ptr last_s;
for (size_t idx = 0 ; idx < table_ids.size(); idx++) {
schema_ptr s;
if (const auto* cf = find_column_family_if_exists(db.local(), table_ids[idx])) {
s = cf->schema();
} else {
continue;
}
if (last_s && last_s->get_sharder() != s->get_sharder()) {
throw std::runtime_error(
format("All tables repaired together have to have the same sharding logic. "
"Different sharding logic found: {} (for table {}) and {} (for table {})",
last_s->get_sharder(), last_s->cf_name(),
s->get_sharder(), s->cf_name()));
}
last_s = std::move(s);
}
if (!last_s) {
throw std::runtime_error(format("Failed to find sharder for keyspace={}, tables={}, no table in this keyspace",
keyspace, table_ids));
}
return last_s->get_sharder();
}
repair::shard_repair_task_impl::shard_repair_task_impl(tasks::task_manager::module_ptr module,
tasks::task_id id,
const sstring& keyspace,
@@ -586,7 +572,6 @@ repair::shard_repair_task_impl::shard_repair_task_impl(tasks::task_manager::modu
, messaging(repair.get_messaging().container())
, mm(repair.get_migration_manager())
, gossiper(repair.get_gossiper())
, sharder(get_sharder_for_tables(db, keyspace, table_ids_))
, erm(std::move(erm_))
, ranges(ranges_)
, cfs(get_table_names(db.local(), table_ids_))
@@ -1083,12 +1068,6 @@ future<int> repair_service::do_repair_start(sstring keyspace, std::unordered_map
get_repair_module().check_in_shutdown();
auto& sharded_db = get_db();
auto& db = sharded_db.local();
auto germs = make_lw_shared(co_await locator::make_global_effective_replication_map(sharded_db, keyspace));
auto& erm = germs->get();
auto& topology = erm.get_token_metadata().get_topology();
auto my_address = topology.my_address();
repair_options options(options_map);
// Note: Cassandra can, in some cases, decide immediately that there is
// nothing to repair, and return 0. "nodetool repair" prints in this case
@@ -1098,6 +1077,80 @@ future<int> repair_service::do_repair_start(sstring keyspace, std::unordered_map
auto id = _repair_module->new_repair_uniq_id();
rlogger.info("repair[{}]: starting user-requested repair for keyspace {}, repair id {}, options {}", id.uuid(), keyspace, id.id, options_map);
repair_options options(options_map);
std::vector<sstring> cfs =
options.column_families.size() ? options.column_families : list_column_families(db, keyspace);
if (cfs.empty()) {
rlogger.info("repair[{}]: completed successfully: no tables to repair", id.uuid());
co_return id.id;
}
{
// Repair tables in table name order. Later we could repair in other
// orders, e.g., smaller tables first.
std::sort(cfs.begin(), cfs.end());
size_t nr_tablet_table = 0;
size_t nr_vnode_table = 0;
bool is_tablet = false;
for (auto& table_name : cfs) {
auto& t = db.find_column_family(keyspace, table_name);
if (t.uses_tablets()) {
nr_tablet_table++;
} else {
nr_vnode_table++;
}
}
if (nr_tablet_table != 0) {
if (nr_vnode_table != 0) {
throw std::runtime_error("Mixed vnode table and tablet table");
}
is_tablet = true;
}
if (is_tablet) {
// Reject unsupported options for tablet repair
if (!options.ranges.empty()) {
throw std::runtime_error("The ranges option is not supported for tablet repair");
}
if (!options.hosts.empty()) {
throw std::runtime_error("The hosts option is not supported for tablet repair");
}
if (!options.ignore_nodes.empty()) {
throw std::runtime_error("The ignore_nodes option is not supported for tablet repair");
}
if (!options.data_centers.empty()) {
throw std::runtime_error("The dataCenters option is not supported for tablet repair");
}
if (!options.start_token.empty()) {
throw std::runtime_error("The startToken option is not supported for tablet repair");
}
if (!options.end_token.empty()) {
throw std::runtime_error("The endToken option is not supported for tablet repair");
}
if (options.small_table_optimization) {
throw std::runtime_error("The small_table_optimization option is not supported for tablet repair");
}
auto host2ip = [&addr_map = _addr_map] (locator::host_id host) -> future<gms::inet_address> {
auto ip = addr_map.local().find(raft::server_id(host.uuid()));
if (!ip) {
throw std::runtime_error(format("Could not get ip address for host {} from raft_address_map", host));
}
co_return *ip;
};
bool primary_replica_only = options.primary_range;
co_await repair_tablets(id, keyspace, cfs, host2ip, primary_replica_only);
co_return id.id;
}
}
auto germs = make_lw_shared(co_await locator::make_global_effective_replication_map(sharded_db, keyspace));
auto& erm = germs->get();
auto& topology = erm.get_token_metadata().get_topology();
auto my_address = erm.get_topology().my_address();
if (erm.get_replication_strategy().get_type() == locator::replication_strategy_type::local) {
rlogger.info("repair[{}]: completed successfully: nothing to repair for keyspace {} with local replication strategy", id.uuid(), keyspace);
co_return id.id;
@@ -1185,13 +1238,6 @@ future<int> repair_service::do_repair_start(sstring keyspace, std::unordered_map
ranges = std::move(intersections);
}
std::vector<sstring> cfs =
options.column_families.size() ? options.column_families : list_column_families(db, keyspace);
if (cfs.empty()) {
rlogger.info("repair[{}]: completed successfully: no tables to repair", id.uuid());
co_return id.id;
}
auto small_table_optimization = options.small_table_optimization;
if (small_table_optimization) {
auto range = dht::token_range(dht::token_range::bound(dht::minimum_token(), false), dht::token_range::bound(dht::maximum_token(), false));
@@ -1214,6 +1260,7 @@ future<> repair::user_requested_repair_task_impl::run() {
return module->run(id, [this, &rs, &db, id, keyspace = _status.keyspace, germs = std::move(_germs),
&cfs = _cfs, &ranges = _ranges, hosts = std::move(_hosts), data_centers = std::move(_data_centers), ignore_nodes = std::move(_ignore_nodes)] () mutable {
auto uuid = node_ops_id{id.uuid().uuid()};
auto start_time = std::chrono::steady_clock::now();
bool needs_flush_before_repair = false;
if (db.features().tombstone_gc_options) {
@@ -1340,6 +1387,8 @@ future<> repair::user_requested_repair_task_impl::run() {
}
return make_ready_future<>();
}).get();
auto duration = std::chrono::duration<float>(std::chrono::steady_clock::now() - start_time);
rlogger.info("repair[{}]: Finished user-requested repair for vnode keyspace={} tables={} repair_id={} duration={}", id.uuid(), keyspace, cfs, id.id, duration);
}).handle_exception([id, &rs] (std::exception_ptr ep) {
rlogger.warn("repair[{}]: user-requested repair failed: {}", id.uuid(), ep);
// If abort was requested, throw abort_requested_exception instead of wrapped exceptions from all shards,
@@ -1994,6 +2043,159 @@ future<> repair_service::replace_with_repair(locator::token_metadata_ptr tmptr,
co_return co_await do_rebuild_replace_with_repair(std::move(cloned_tmptr), std::move(op), myloc.dc, reason, std::move(ignore_nodes));
}
// Repair all tablets belong to this node for the given table
future<> repair_service::repair_tablets(repair_uniq_id rid, sstring keyspace_name, std::vector<sstring> table_names, host2ip_t host2ip, bool primary_replica_only) {
std::vector<tablet_repair_task_meta> task_metas;
for (auto& table_name : table_names) {
lw_shared_ptr<replica::table> t;
try {
t = _db.local().find_column_family(keyspace_name, table_name).shared_from_this();
} catch (replica::no_such_column_family& e) {
rlogger.debug("repair[{}] Table {}.{} does not exist anymore", rid.uuid(), keyspace_name, table_name);
continue;
}
if (!t->uses_tablets()) {
throw std::runtime_error(format("repair[{}] Table {}.{} is not a tablet table", rid.uuid(), keyspace_name, table_name));
}
table_id tid = t->schema()->id();
// FIXME: we need to wait for current tablet movement and disable future tablet movment
auto erm = t->get_effective_replication_map();
auto& tmap = erm->get_token_metadata_ptr()->tablets().get_tablet_map(tid);
struct repair_tablet_meta {
locator::tablet_id id;
dht::token_range range;
locator::host_id master_host_id;
shard_id master_shard_id;
locator::tablet_replica_set replicas;
};
std::vector<repair_tablet_meta> metas;
auto myhostid = erm->get_token_metadata_ptr()->get_my_id();
auto myip = erm->get_topology().my_address();
co_await tmap.for_each_tablet([&] (locator::tablet_id id, const locator::tablet_info& info) {
auto range = tmap.get_token_range(id);
auto& replicas = info.replicas;
bool found = false;
shard_id master_shard_id;
// Repair all tablets belong to this node
for (auto& r : replicas) {
if (r.host == myhostid) {
master_shard_id = r.shard;
found = true;
break;
}
if (primary_replica_only) {
break;
}
}
if (found) {
metas.push_back(repair_tablet_meta{id, range, myhostid, master_shard_id, replicas});
}
});
size_t nr = 0;
for (auto& m : metas) {
nr++;
rlogger.debug("repair[{}] Collect {} out of {} tablets: table={}.{} tablet_id={} range={} replicas={} primary_replica_only={}",
rid.uuid(), nr, metas.size(), keyspace_name, table_name, m.id, m.range, m.replicas, primary_replica_only);
std::vector<gms::inet_address> nodes;
auto master_shard_id = m.master_shard_id;
auto range = m.range;
std::vector<shard_id> shards;
for (auto& r : m.replicas) {
auto shard = r.shard;
auto ip = co_await host2ip(r.host);
if (r.host != myhostid) {
rlogger.debug("repair[{}] Repair get neighbors table={}.{} hostid={} shard={} ip={} myip={} myhostid={}",
rid.uuid(), keyspace_name, table_name, r.host, shard, ip, myip, myhostid);
shards.push_back(shard);
nodes.push_back(ip);
}
}
task_metas.push_back(tablet_repair_task_meta{keyspace_name, table_name, tid, master_shard_id, range, repair_neighbors(nodes, shards), m.replicas});
}
}
auto task = co_await _repair_module->make_and_start_task<repair::tablet_repair_task_impl>({}, rid, keyspace_name, table_names, streaming::stream_reason::repair, std::move(task_metas));
}
future<> repair::tablet_repair_task_impl::run() {
auto m = dynamic_pointer_cast<repair::task_manager_module>(_module);
auto& rs = m->get_repair_service();
auto& sharded_db = rs.get_db();
auto& db = sharded_db.local();
auto id = get_repair_uniq_id();
auto keyspace = _keyspace;
rlogger.debug("repair[{}]: Repair tablet for keyspace={} tables={} status=started", id.uuid(), _keyspace, _tables);
co_await m->run(id, [this, &rs, id] () mutable {
// This runs inside a seastar thread
auto start_time = std::chrono::steady_clock::now();
auto parent_data = get_repair_uniq_id().task_info;
std::atomic<int> idx{1};
rs.container().invoke_on_all([&idx, id, metas = _metas, parent_data, reason = _reason] (repair_service& rs) -> future<> {
for (auto& m : metas) {
if (m.master_shard_id != this_shard_id()) {
continue;
}
auto nr = idx.fetch_add(1);
rlogger.info("repair[{}] Repair {} out of {} tablets: table={}.{} range={} replicas={}",
id.uuid(), nr, metas.size(), m.keyspace_name, m.table_name, m.range, m.replicas);
lw_shared_ptr<replica::table> t;
try {
t = rs._db.local().find_column_family(m.tid).shared_from_this();
} catch (replica::no_such_column_family& e) {
rlogger.debug("repair[{}] Table {}.{} does not exist anymore", id.uuid(), m.keyspace_name, m.table_name);
continue;
}
auto erm = t->get_effective_replication_map();
if (rs.get_repair_module().is_aborted(id.uuid())) {
throw abort_requested_exception();
}
std::unordered_map<dht::token_range, repair_neighbors> neighbors;
neighbors[m.range] = m.neighbors;
dht::token_range_vector ranges = {m.range};
std::vector<table_id> table_ids = {m.tid};
auto data_centers = std::vector<sstring>();
auto hosts = std::vector<sstring>();
auto ignore_nodes = std::unordered_set<gms::inet_address>();
bool hints_batchlog_flushed = false;
bool small_table_optimization = false;
auto ranges_parallelism = std::nullopt;
auto task_impl_ptr = seastar::make_shared<repair::shard_repair_task_impl>(rs._repair_module, tasks::task_id::create_random_id(),
m.keyspace_name, rs, erm, std::move(ranges), std::move(table_ids), id, std::move(data_centers), std::move(hosts),
std::move(ignore_nodes), reason, hints_batchlog_flushed, small_table_optimization, ranges_parallelism);
task_impl_ptr->neighbors = std::move(neighbors);
auto task = co_await rs._repair_module->make_task(std::move(task_impl_ptr), parent_data);
task->start();
co_await task->done();
}
}).get();
auto duration = std::chrono::duration<float>(std::chrono::steady_clock::now() - start_time);
rlogger.info("repair[{}]: Finished user-requested repair for tablet keyspace={} tables={} repair_id={} tablets_repaired={} duration={}",
id.uuid(), _keyspace, _tables, id.id, _metas.size(), duration);
}).then([id, keyspace] {
rlogger.debug("repair[{}]: Repair tablet for keyspace={} status=succeeded", id.uuid(), keyspace);
}).handle_exception([&db, id, keyspace, &rs] (std::exception_ptr ep) {
if (!db.has_keyspace(keyspace)) {
rlogger.warn("repair[{}]: Repair tablet for keyspace={}, status=failed: keyspace does not exist any more, ignoring it, {}", id.uuid(), keyspace, ep);
return make_ready_future<>();
}
rlogger.warn("repair[{}]: Repair tablet for keyspace={} status=failed: {}", id.uuid(), keyspace, ep);
rs.get_repair_module().check_in_shutdown();
return make_exception_future<>(ep);
});
}
future<std::optional<double>> repair::tablet_repair_task_impl::expected_total_workload() const {
auto sz = _metas.size();
co_return sz ? std::make_optional<double>(sz) : std::nullopt;
}
std::optional<double> repair::tablet_repair_task_impl::expected_children_number() const {
return _metas.size();
}
node_ops_cmd_category categorize_node_ops_cmd(node_ops_cmd cmd) noexcept {
switch (cmd) {
case node_ops_cmd::removenode_prepare:

View File

@@ -80,6 +80,11 @@ struct repair_uniq_id {
};
std::ostream& operator<<(std::ostream& os, const repair_uniq_id& x);
// If the repair master sets the dst_cpu_id to repair_unspecified_shard. It
// means the repair master does not choose shard id for the repair follower.
// The repair follower should choose the shard id itself.
constexpr shard_id repair_unspecified_shard = shard_id(-1);
// NOTE: repair_start() can be run on any node, but starts a node-global
// operation.
// repair_start() starts the requested repair on this node. It returns an
@@ -134,6 +139,7 @@ class repair_neighbors {
public:
std::vector<gms::inet_address> all;
std::vector<gms::inet_address> mandatory;
std::unordered_map<gms::inet_address, shard_id> shard_map;
repair_neighbors() = default;
explicit repair_neighbors(std::vector<gms::inet_address> a)
: all(std::move(a)) {
@@ -142,6 +148,7 @@ public:
: all(std::move(a))
, mandatory(std::move(m)) {
}
repair_neighbors(std::vector<gms::inet_address> nodes, std::vector<shard_id> shards);
};
future<uint64_t> estimate_partitions(seastar::sharded<replica::database>& db, const sstring& keyspace,
@@ -253,6 +260,16 @@ struct repair_flush_hints_batchlog_request {
struct repair_flush_hints_batchlog_response {
};
struct tablet_repair_task_meta {
sstring keyspace_name;
sstring table_name;
table_id tid;
shard_id master_shard_id;
dht::token_range range;
repair_neighbors neighbors;
locator::tablet_replica_set replicas;
};
namespace std {
template<>

View File

@@ -47,6 +47,7 @@
#include <seastar/coroutine/all.hh>
#include "db/system_keyspace.hh"
#include "service/storage_proxy.hh"
#include "service/raft/raft_address_map.hh"
#include "db/batchlog_manager.hh"
#include "idl/position_in_partition.dist.hh"
#include "idl/partition_checksum.dist.hh"
@@ -67,6 +68,16 @@ extern logging::logger rlogger;
static bool inject_rpc_stream_error = false;
static shard_id get_dst_shard_id(uint32_t src_cpu_id, const rpc::optional<shard_id>& dst_cpu_id_opt) {
uint32_t dst_cpu_id = 0;
if (dst_cpu_id_opt && *dst_cpu_id_opt != repair_unspecified_shard) {
dst_cpu_id = *dst_cpu_id_opt;
} else {
dst_cpu_id = src_cpu_id % smp::count;
}
return dst_cpu_id;
}
enum class repair_state : uint16_t {
unknown,
row_level_start_started,
@@ -100,7 +111,10 @@ enum class repair_state : uint16_t {
struct repair_node_state {
gms::inet_address node;
repair_state state = repair_state::unknown;
// The shard that repair instance runs on
shard_id shard;
explicit repair_node_state(gms::inet_address n) : node(n) { }
explicit repair_node_state(gms::inet_address n, shard_id s) : node(n), shard(s) { }
};
// Wraps sink and source objects for repair master or repair follower nodes.
@@ -109,7 +123,7 @@ struct repair_node_state {
template<class SinkType, class SourceType>
class sink_source_for_repair {
uint32_t _repair_meta_id;
using get_sink_source_fn_type = std::function<future<std::tuple<rpc::sink<SinkType>, rpc::source<SourceType>>> (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr)>;
using get_sink_source_fn_type = std::function<future<std::tuple<rpc::sink<SinkType>, rpc::source<SourceType>>> (uint32_t repair_meta_id, std::optional<shard_id> dst_cpu_id, netw::messaging_service::msg_addr addr)>;
using sink_type = std::reference_wrapper<rpc::sink<SinkType>>;
using source_type = std::reference_wrapper<rpc::source<SourceType>>;
// The vectors below store sink and source object for peer nodes.
@@ -128,7 +142,7 @@ public:
void mark_source_closed(unsigned node_idx) {
_sources_closed[node_idx] = true;
}
future<std::tuple<sink_type, source_type>> get_sink_source(gms::inet_address remote_node, unsigned node_idx) {
future<std::tuple<sink_type, source_type>> get_sink_source(gms::inet_address remote_node, unsigned node_idx, std::optional<shard_id> dst_cpu_id) {
using value_type = std::tuple<sink_type, source_type>;
if (_sinks[node_idx] && _sources[node_idx]) {
return make_ready_future<value_type>(value_type(_sinks[node_idx].value(), _sources[node_idx].value()));
@@ -136,7 +150,7 @@ public:
if (_sinks[node_idx] || _sources[node_idx]) {
return make_exception_future<value_type>(std::runtime_error(format("sink or source is missing for node {}", remote_node)));
}
return _fn(_repair_meta_id, netw::messaging_service::msg_addr(remote_node)).then_unpack([this, node_idx] (rpc::sink<SinkType> sink, rpc::source<SourceType> source) mutable {
return _fn(_repair_meta_id, dst_cpu_id, netw::messaging_service::msg_addr(remote_node)).then_unpack([this, node_idx] (rpc::sink<SinkType> sink, rpc::source<SourceType> source) mutable {
_sinks[node_idx].emplace(std::move(sink));
_sources[node_idx].emplace(std::move(source));
return make_ready_future<value_type>(value_type(_sinks[node_idx].value(), _sources[node_idx].value()));
@@ -497,12 +511,15 @@ void repair_writer_impl::create_writer(lw_shared_ptr<repair_writer> w) {
replica::table& t = _db.local().find_column_family(_schema->id());
rlogger.debug("repair_writer: keyspace={}, table={}, estimated_partitions={}", w->schema()->ks_name(), w->schema()->cf_name(), w->get_estimated_partitions());
service::frozen_topology_guard topo_guard = service::null_topology_guard; // FIXME: propagate
_writer_done = mutation_writer::distribute_reader_and_consume_on_shards(_schema, _schema->get_sharder(), std::move(_queue_reader),
// The sharder is valid only when the erm is valid. Keep a reference of the erm to keep the sharder valid.
auto erm = t.get_effective_replication_map();
auto& sharder = erm->get_sharder(*(w->schema()));
_writer_done = mutation_writer::distribute_reader_and_consume_on_shards(_schema, sharder, std::move(_queue_reader),
streaming::make_streaming_consumer(sstables::repair_origin, _db, _sys_dist_ks, _view_update_generator, w->get_estimated_partitions(), _reason, is_offstrategy_supported(_reason), topo_guard),
t.stream_in_progress()).then([w] (uint64_t partitions) {
t.stream_in_progress()).then([w, erm] (uint64_t partitions) {
rlogger.debug("repair_writer: keyspace={}, table={}, managed to write partitions={} to sstable",
w->schema()->ks_name(), w->schema()->cf_name(), partitions);
}).handle_exception([w] (std::exception_ptr ep) {
}).handle_exception([w, erm] (std::exception_ptr ep) {
rlogger.warn("repair_writer: keyspace={}, table={}, multishard_writer failed: {}",
w->schema()->ks_name(), w->schema()->cf_name(), ep);
w->queue().abort(ep);
@@ -775,6 +792,7 @@ private:
std::optional<shared_future<>> _stopped;
repair_hasher _repair_hasher;
gc_clock::time_point _compaction_time;
bool _is_tablet;
public:
std::vector<repair_node_state>& all_nodes() {
return _all_node_states;
@@ -829,6 +847,7 @@ public:
shard_config master_node_shard_config,
inet_address_vector_replica_set all_live_peer_nodes,
size_t nr_peer_nodes,
std::vector<std::optional<shard_id>> all_live_peer_shards,
row_level_repair* row_level_repair_ptr,
gc_clock::time_point compaction_time)
: _rs(rs)
@@ -854,29 +873,37 @@ public:
, _nr_peer_nodes(nr_peer_nodes)
, _repair_writer(make_repair_writer(_schema, _permit, _reason, _db, _sys_dist_ks, _view_update_generator))
, _sink_source_for_get_full_row_hashes(_repair_meta_id, _nr_peer_nodes,
[&rs] (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr) {
return rs.get_messaging().make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(repair_meta_id, addr);
[&rs] (uint32_t repair_meta_id, std::optional<shard_id> dst_cpu_id_opt, netw::messaging_service::msg_addr addr) {
auto dst_cpu_id = dst_cpu_id_opt.value_or(repair_unspecified_shard);
rlogger.debug("get_full_row_hashes: repair_meta_id={} dst_cpu_id={}", repair_meta_id, dst_cpu_id);
return rs.get_messaging().make_sink_and_source_for_repair_get_full_row_hashes_with_rpc_stream(repair_meta_id, dst_cpu_id, addr);
})
, _sink_source_for_get_row_diff(_repair_meta_id, _nr_peer_nodes,
[&rs] (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr) {
return rs.get_messaging().make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(repair_meta_id, addr);
[&rs] (uint32_t repair_meta_id, std::optional<shard_id> dst_cpu_id_opt, netw::messaging_service::msg_addr addr) {
auto dst_cpu_id = dst_cpu_id_opt.value_or(repair_unspecified_shard);
rlogger.debug("get_row_diff: repair_meta_id={} dst_cpu_id={}", repair_meta_id, dst_cpu_id);
return rs.get_messaging().make_sink_and_source_for_repair_get_row_diff_with_rpc_stream(repair_meta_id, dst_cpu_id, addr);
})
, _sink_source_for_put_row_diff(_repair_meta_id, _nr_peer_nodes,
[&rs] (uint32_t repair_meta_id, netw::messaging_service::msg_addr addr) {
return rs.get_messaging().make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(repair_meta_id, addr);
[&rs] (uint32_t repair_meta_id, std::optional<shard_id> dst_cpu_id_opt, netw::messaging_service::msg_addr addr) {
auto dst_cpu_id = dst_cpu_id_opt.value_or(repair_unspecified_shard);
rlogger.debug("put_row_diff: repair_meta_id={} dst_cpu_id={}", repair_meta_id, dst_cpu_id);
return rs.get_messaging().make_sink_and_source_for_repair_put_row_diff_with_rpc_stream(repair_meta_id, dst_cpu_id, addr);
})
, _row_level_repair_ptr(row_level_repair_ptr)
, _repair_hasher(_seed, _schema)
, _compaction_time(compaction_time)
, _is_tablet(_cf.uses_tablets())
{
if (master) {
add_to_repair_meta_for_masters(*this);
} else {
add_to_repair_meta_for_followers(*this);
}
_all_node_states.push_back(repair_node_state(_myip));
for (auto& node : all_live_peer_nodes) {
_all_node_states.push_back(repair_node_state(node));
assert(all_live_peer_shards.size() == all_live_peer_nodes.size());
_all_node_states.push_back(repair_node_state(_myip, this_shard_id()));
for (int i = 0; i < all_live_peer_nodes.size(); i++) {
_all_node_states.push_back(repair_node_state(all_live_peer_nodes[i], all_live_peer_shards[i].value_or(repair_unspecified_shard)));
}
}
@@ -897,10 +924,16 @@ public:
inet_address_vector_replica_set all_live_peer_nodes,
gc_clock::time_point compaction_time)
: repair_meta(rs, cf, std::move(s), std::move(permit), std::move(range), algo, max_row_buf_size, seed, master, repair_meta_id, reason,
std::move(master_node_shard_config), std::move(all_live_peer_nodes), 1, nullptr, compaction_time)
std::move(master_node_shard_config), std::move(all_live_peer_nodes), 1, {std::nullopt}, nullptr, compaction_time)
{
}
public:
std::optional<shard_id> get_peer_node_dst_cpu_id(uint32_t peer_node_idx) {
assert(peer_node_idx + 1 < all_nodes().size());
return all_nodes()[peer_node_idx + 1].shard;
}
public:
future<> clear_gently() noexcept {
co_await utils::clear_gently(_peer_row_hash_sets);
@@ -1004,7 +1037,7 @@ private:
future<uint64_t> get_estimated_partitions() {
return with_gate(_gate, [this] {
if (_repair_master || _same_sharding_config) {
if (_repair_master || _same_sharding_config || _is_tablet) {
return do_estimate_partitions_on_local_shard();
} else {
return do_with(dht::selective_token_range_sharder(_remote_sharder, _range, _master_node_shard_config.shard), uint64_t(0), uint64_t(0), [this] (auto& sharder, auto& partitions_sum, auto& subranges) mutable {
@@ -1045,8 +1078,9 @@ private:
bool is_same_sharding_config() {
rlogger.debug("is_same_sharding_config: remote_shard={}, remote_shard_count={}, remote_ignore_msb={}",
_master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb);
return _schema->get_sharder().shard_count() == _master_node_shard_config.shard_count
&& _schema->get_sharder().sharding_ignore_msb() == _master_node_shard_config.ignore_msb
auto& sharder = _cf.get_effective_replication_map()->get_sharder(*_schema);
return sharder.shard_count() == _master_node_shard_config.shard_count
&& sharder.sharding_ignore_msb() == _master_node_shard_config.ignore_msb
&& this_shard_id() == _master_node_shard_config.shard;
}
@@ -1117,7 +1151,7 @@ private:
_master_node_shard_config.shard,
_seed,
std::invoke([this]() {
if (_repair_master || _same_sharding_config) {
if (_repair_master || _same_sharding_config || _is_tablet) {
rlogger.debug("repair_reader: meta_id={}, _repair_master={}, _same_sharding_config={},"
"read_strategy {} is chosen",
_repair_meta_id, _repair_master, _same_sharding_config,
@@ -1437,12 +1471,12 @@ public:
// RPC API
// Return the hashes of the rows in _working_row_buf
future<repair_hash_set>
get_full_row_hashes(gms::inet_address remote_node) {
get_full_row_hashes(gms::inet_address remote_node, shard_id dst_cpu_id) {
if (remote_node == _myip) {
return get_full_row_hashes_handler();
}
return _messaging.send_repair_get_full_row_hashes(msg_addr(remote_node),
_repair_meta_id).then([this, remote_node] (repair_hash_set hashes) {
_repair_meta_id, dst_cpu_id).then([this, remote_node] (repair_hash_set hashes) {
rlogger.debug("Got full hashes from peer={}, nr_hashes={}", remote_node, hashes.size());
_metrics.rx_hashes_nr += hashes.size();
stats().rx_hashes_nr += hashes.size();
@@ -1492,12 +1526,12 @@ private:
public:
future<repair_hash_set>
get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx) {
get_full_row_hashes_with_rpc_stream(gms::inet_address remote_node, unsigned node_idx, shard_id dst_cpu_id) {
if (remote_node == _myip) {
return get_full_row_hashes_handler();
}
auto current_hashes = make_lw_shared<repair_hash_set>();
return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx).then_unpack(
return _sink_source_for_get_full_row_hashes.get_sink_source(remote_node, node_idx, dst_cpu_id).then_unpack(
[this, current_hashes, remote_node, node_idx]
(rpc::sink<repair_stream_cmd>& sink, rpc::source<repair_hash_with_cmd>& source) mutable {
auto source_op = get_full_row_hashes_source_op(current_hashes, remote_node, node_idx, source);
@@ -1521,12 +1555,12 @@ public:
// RPC API
// Return the combined hashes of the current working row buf
future<get_combined_row_hash_response>
get_combined_row_hash(std::optional<repair_sync_boundary> common_sync_boundary, gms::inet_address remote_node) {
get_combined_row_hash(std::optional<repair_sync_boundary> common_sync_boundary, gms::inet_address remote_node, shard_id dst_cpu_id) {
if (remote_node == _myip) {
return get_combined_row_hash_handler(common_sync_boundary);
}
return _messaging.send_repair_get_combined_row_hash(msg_addr(remote_node),
_repair_meta_id, common_sync_boundary).then([this] (get_combined_row_hash_response resp) {
_repair_meta_id, common_sync_boundary, dst_cpu_id).then([this] (get_combined_row_hash_response resp) {
stats().rpc_call_nr++;
stats().rx_hashes_nr++;
_metrics.rx_hashes_nr++;
@@ -1548,7 +1582,7 @@ public:
// RPC API
future<>
repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range, table_schema_version schema_version, streaming::stream_reason reason, gc_clock::time_point compaction_time) {
repair_row_level_start(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range, table_schema_version schema_version, streaming::stream_reason reason, gc_clock::time_point compaction_time, shard_id dst_cpu_id) {
if (remote_node == _myip) {
return make_ready_future<>();
}
@@ -1562,7 +1596,7 @@ public:
return _messaging.send_repair_row_level_start(msg_addr(remote_node),
_repair_meta_id, ks_name, cf_name, std::move(range), _algo, _max_row_buf_size, _seed,
_master_node_shard_config.shard, _master_node_shard_config.shard_count, _master_node_shard_config.ignore_msb,
remote_partitioner_name, std::move(schema_version), reason, compaction_time).then([ks_name, cf_name] (rpc::optional<repair_row_level_start_response> resp) {
remote_partitioner_name, std::move(schema_version), reason, compaction_time, dst_cpu_id).then([ks_name, cf_name] (rpc::optional<repair_row_level_start_response> resp) {
if (resp && resp->status == repair_row_level_start_status::no_such_column_family) {
return make_exception_future<>(replica::no_such_column_family(ks_name, cf_name));
} else {
@@ -1587,13 +1621,13 @@ public:
}
// RPC API
future<> repair_row_level_stop(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range) {
future<> repair_row_level_stop(gms::inet_address remote_node, sstring ks_name, sstring cf_name, dht::token_range range, shard_id dst_cpu_id) {
if (remote_node == _myip) {
return stop();
}
stats().rpc_call_nr++;
return _messaging.send_repair_row_level_stop(msg_addr(remote_node),
_repair_meta_id, std::move(ks_name), std::move(cf_name), std::move(range));
_repair_meta_id, std::move(ks_name), std::move(cf_name), std::move(range), dst_cpu_id);
}
// RPC handler
@@ -1609,12 +1643,12 @@ public:
}
// RPC API
future<uint64_t> repair_get_estimated_partitions(gms::inet_address remote_node) {
future<uint64_t> repair_get_estimated_partitions(gms::inet_address remote_node, shard_id dst_cpu_id) {
if (remote_node == _myip) {
return get_estimated_partitions();
}
stats().rpc_call_nr++;
return _messaging.send_repair_get_estimated_partitions(msg_addr(remote_node), _repair_meta_id);
return _messaging.send_repair_get_estimated_partitions(msg_addr(remote_node), _repair_meta_id, dst_cpu_id);
}
@@ -1629,12 +1663,12 @@ public:
}
// RPC API
future<> repair_set_estimated_partitions(gms::inet_address remote_node, uint64_t estimated_partitions) {
future<> repair_set_estimated_partitions(gms::inet_address remote_node, uint64_t estimated_partitions, shard_id dst_cpu_id) {
if (remote_node == _myip) {
return set_estimated_partitions(estimated_partitions);
}
stats().rpc_call_nr++;
return _messaging.send_repair_set_estimated_partitions(msg_addr(remote_node), _repair_meta_id, estimated_partitions);
return _messaging.send_repair_set_estimated_partitions(msg_addr(remote_node), _repair_meta_id, estimated_partitions, dst_cpu_id);
}
@@ -1650,12 +1684,12 @@ public:
// RPC API
// Return the largest sync point contained in the _row_buf , current _row_buf checksum, and the _row_buf size
future<get_sync_boundary_response>
get_sync_boundary(gms::inet_address remote_node, std::optional<repair_sync_boundary> skipped_sync_boundary) {
get_sync_boundary(gms::inet_address remote_node, std::optional<repair_sync_boundary> skipped_sync_boundary, shard_id dst_cpu_id) {
if (remote_node == _myip) {
return get_sync_boundary_handler(skipped_sync_boundary);
}
stats().rpc_call_nr++;
return _messaging.send_repair_get_sync_boundary(msg_addr(remote_node), _repair_meta_id, skipped_sync_boundary);
return _messaging.send_repair_get_sync_boundary(msg_addr(remote_node), _repair_meta_id, skipped_sync_boundary, dst_cpu_id);
}
// RPC handler
@@ -1670,7 +1704,7 @@ public:
// RPC API
// Return rows in the _working_row_buf with hash within the given sef_diff
// Must run inside a seastar thread
void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx) {
void get_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, unsigned node_idx, shard_id dst_cpu_id) {
if (needs_all_rows || !set_diff.empty()) {
if (remote_node == _myip) {
return;
@@ -1683,7 +1717,7 @@ public:
}
stats().rpc_call_nr++;
repair_rows_on_wire rows = _messaging.send_repair_get_row_diff(msg_addr(remote_node),
_repair_meta_id, std::move(set_diff), bool(needs_all_rows)).get0();
_repair_meta_id, std::move(set_diff), bool(needs_all_rows), dst_cpu_id).get0();
if (!rows.empty()) {
apply_rows_on_master_in_thread(std::move(rows), remote_node, update_working_row_buf::yes, update_peer_row_hash_sets::no, node_idx);
}
@@ -1691,13 +1725,13 @@ public:
}
// Must run inside a seastar thread
void get_row_diff_and_update_peer_row_hash_sets(gms::inet_address remote_node, unsigned node_idx) {
void get_row_diff_and_update_peer_row_hash_sets(gms::inet_address remote_node, unsigned node_idx, shard_id dst_cpu_id) {
if (remote_node == _myip) {
return;
}
stats().rpc_call_nr++;
repair_rows_on_wire rows = _messaging.send_repair_get_row_diff(msg_addr(remote_node),
_repair_meta_id, {}, bool(needs_all_rows_t::yes)).get0();
_repair_meta_id, {}, bool(needs_all_rows_t::yes), dst_cpu_id).get0();
if (!rows.empty()) {
apply_rows_on_master_in_thread(std::move(rows), remote_node, update_working_row_buf::yes, update_peer_row_hash_sets::yes, node_idx);
}
@@ -1774,7 +1808,8 @@ public:
needs_all_rows_t needs_all_rows,
update_peer_row_hash_sets update_hash_set,
gms::inet_address remote_node,
unsigned node_idx) {
unsigned node_idx,
shard_id dst_cpu_id) {
if (needs_all_rows || !set_diff.empty()) {
if (remote_node == _myip) {
return;
@@ -1786,7 +1821,7 @@ public:
_metrics.tx_hashes_nr += set_diff.size();
}
stats().rpc_call_nr++;
auto f = _sink_source_for_get_row_diff.get_sink_source(remote_node, node_idx).get0();
auto f = _sink_source_for_get_row_diff.get_sink_source(remote_node, node_idx, dst_cpu_id).get0();
rpc::sink<repair_hash_with_cmd>& sink = std::get<0>(f);
rpc::source<repair_row_on_wire_with_cmd>& source = std::get<1>(f);
auto sink_op = get_row_diff_sink_op(std::move(set_diff), needs_all_rows, sink, remote_node);
@@ -1806,25 +1841,25 @@ public:
// RPC API
// Send rows in the _working_row_buf with hash within the given sef_diff
future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node) {
future<> put_row_diff(repair_hash_set set_diff, needs_all_rows_t needs_all_rows, gms::inet_address remote_node, shard_id dst_cpu_id) {
if (!set_diff.empty()) {
if (remote_node == _myip) {
return make_ready_future<>();
}
size_t sz = set_diff.size();
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, sz] (std::list<repair_row> row_diff) {
return get_row_diff(std::move(set_diff), needs_all_rows).then([this, remote_node, dst_cpu_id, sz] (std::list<repair_row> row_diff) {
if (row_diff.size() != sz) {
rlogger.warn("Hash conflict detected, keyspace={}, table={}, range={}, row_diff.size={}, set_diff.size={}. It is recommended to compact the table and rerun repair for the range.",
_schema->ks_name(), _schema->cf_name(), _range, row_diff.size(), sz);
}
return do_with(std::move(row_diff), [this, remote_node] (std::list<repair_row>& row_diff) {
return get_repair_rows_size(row_diff).then([this, remote_node, &row_diff] (size_t row_bytes) mutable {
return do_with(std::move(row_diff), [this, remote_node, dst_cpu_id] (std::list<repair_row>& row_diff) {
return get_repair_rows_size(row_diff).then([this, remote_node, dst_cpu_id, &row_diff] (size_t row_bytes) mutable {
stats().tx_row_nr += row_diff.size();
stats().tx_row_nr_peer[remote_node] += row_diff.size();
stats().tx_row_bytes += row_bytes;
stats().rpc_call_nr++;
return to_repair_rows_on_wire(std::move(row_diff)).then([this, remote_node] (repair_rows_on_wire rows) {
return _messaging.send_repair_put_row_diff(msg_addr(remote_node), _repair_meta_id, std::move(rows));
return to_repair_rows_on_wire(std::move(row_diff)).then([this, remote_node, dst_cpu_id] (repair_rows_on_wire rows) {
return _messaging.send_repair_put_row_diff(msg_addr(remote_node), _repair_meta_id, std::move(rows), dst_cpu_id);
});
});
});
@@ -1885,7 +1920,8 @@ public:
repair_hash_set set_diff,
needs_all_rows_t needs_all_rows,
gms::inet_address remote_node, unsigned node_idx,
const locator::effective_replication_map& erm, bool small_table_optimization) {
const locator::effective_replication_map& erm, bool small_table_optimization,
shard_id dst_cpu_id) {
if (set_diff.empty()) {
co_return;
}
@@ -1925,7 +1961,7 @@ public:
repair_rows_on_wire rows = co_await to_repair_rows_on_wire(std::move(row_diff));
auto [sink, source] = co_await _sink_source_for_put_row_diff.get_sink_source(remote_node, node_idx);
auto [sink, source] = co_await _sink_source_for_put_row_diff.get_sink_source(remote_node, node_idx, dst_cpu_id);
auto source_op = [&] () mutable -> future<> {
co_await put_row_diff_source_op(remote_node, node_idx, source);
};
@@ -1958,6 +1994,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
sharded<repair_service>& repair,
gms::inet_address from,
uint32_t src_cpu_id,
uint32_t dst_cpu_id,
uint32_t repair_meta_id,
rpc::sink<repair_row_on_wire_with_cmd> sink,
rpc::source<repair_hash_with_cmd> source,
@@ -1976,7 +2013,7 @@ static future<stop_iteration> repair_get_row_diff_with_rpc_stream_process_op(
bool needs_all_rows = hash_cmd.cmd == repair_stream_cmd::needs_all_rows;
_metrics.rx_hashes_nr += current_set_diff.size();
auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(current_set_diff)));
return repair.invoke_on(src_cpu_id % smp::count, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] (repair_service& local_repair) {
return repair.invoke_on(dst_cpu_id, [from, repair_meta_id, needs_all_rows, fp = std::move(fp)] (repair_service& local_repair) {
auto rm = local_repair.get_repair_meta(from, repair_meta_id);
rm->set_repair_state_for_local_node(repair_state::get_row_diff_with_rpc_stream_started);
if (fp.get_owner_shard() == this_shard_id()) {
@@ -2015,6 +2052,7 @@ static future<stop_iteration> repair_put_row_diff_with_rpc_stream_process_op(
sharded<repair_service>& repair,
gms::inet_address from,
uint32_t src_cpu_id,
uint32_t dst_cpu_id,
uint32_t repair_meta_id,
rpc::sink<repair_stream_cmd> sink,
rpc::source<repair_row_on_wire_with_cmd> source,
@@ -2029,7 +2067,7 @@ static future<stop_iteration> repair_put_row_diff_with_rpc_stream_process_op(
} else if (row.cmd == repair_stream_cmd::end_of_current_rows) {
rlogger.trace("Got repair_rows_on_wire from peer={}, got end_of_current_rows", from);
auto fp = make_foreign(std::make_unique<repair_rows_on_wire>(std::move(current_rows)));
return repair.invoke_on(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp)] (repair_service& local_repair) mutable {
return repair.invoke_on(dst_cpu_id, [from, repair_meta_id, fp = std::move(fp)] (repair_service& local_repair) mutable {
auto rm = local_repair.get_repair_meta(from, repair_meta_id);
rm->set_repair_state_for_local_node(repair_state::put_row_diff_with_rpc_stream_started);
if (fp.get_owner_shard() == this_shard_id()) {
@@ -2057,6 +2095,7 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
sharded<repair_service>& repair,
gms::inet_address from,
uint32_t src_cpu_id,
uint32_t dst_cpu_id,
uint32_t repair_meta_id,
rpc::sink<repair_hash_with_cmd> sink,
rpc::source<repair_stream_cmd> source,
@@ -2065,7 +2104,7 @@ static future<stop_iteration> repair_get_full_row_hashes_with_rpc_stream_process
repair_stream_cmd status = std::get<0>(status_opt.value());
rlogger.trace("Got register_repair_get_full_row_hashes_with_rpc_stream from peer={}, status={}", from, int(status));
if (status == repair_stream_cmd::get_full_row_hashes) {
return repair.invoke_on(src_cpu_id % smp::count, [from, repair_meta_id] (repair_service& local_repair) {
return repair.invoke_on(dst_cpu_id, [from, repair_meta_id] (repair_service& local_repair) {
auto rm = local_repair.get_repair_meta(from, repair_meta_id);
rm->set_repair_state_for_local_node(repair_state::get_full_row_hashes_started);
return rm->get_full_row_hashes_handler().then([rm] (repair_hash_set hashes) {
@@ -2095,24 +2134,27 @@ static future<> repair_get_row_diff_with_rpc_stream_handler(
sharded<repair_service>& repair,
gms::inet_address from,
uint32_t src_cpu_id,
uint32_t dst_cpu_id,
uint32_t repair_meta_id,
rpc::sink<repair_row_on_wire_with_cmd> sink,
rpc::source<repair_hash_with_cmd> source) {
return do_with(false, repair_hash_set(), [&repair, from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
return repeat([&repair, from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] () mutable {
return source().then([&repair, from, src_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
return do_with(false, repair_hash_set(), [&repair, from, src_cpu_id, dst_cpu_id, repair_meta_id, sink, source] (bool& error, repair_hash_set& current_set_diff) mutable {
return repeat([&repair, from, src_cpu_id, dst_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] () mutable {
return source().then([&repair, from, src_cpu_id, dst_cpu_id, repair_meta_id, sink, source, &error, &current_set_diff] (std::optional<std::tuple<repair_hash_with_cmd>> hash_cmd_opt) mutable {
if (hash_cmd_opt) {
if (error) {
return make_ready_future<stop_iteration>(stop_iteration::no);
}
return repair_get_row_diff_with_rpc_stream_process_op(repair, from,
src_cpu_id,
dst_cpu_id,
repair_meta_id,
sink,
source,
error,
current_set_diff,
std::move(hash_cmd_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
std::move(hash_cmd_opt)).handle_exception([from, repair_meta_id, sink, &error] (std::exception_ptr ep) mutable {
rlogger.warn("repair_get_row_diff_with_rpc_stream_handler: from={} repair_meta_id={} error={}", from, repair_meta_id, ep);
error = true;
return sink(repair_row_on_wire_with_cmd{repair_stream_cmd::error, repair_row_on_wire()}).then([] {
return make_ready_future<stop_iteration>(stop_iteration::no);
@@ -2132,24 +2174,27 @@ static future<> repair_put_row_diff_with_rpc_stream_handler(
sharded<repair_service>& repair,
gms::inet_address from,
uint32_t src_cpu_id,
uint32_t dst_cpu_id,
uint32_t repair_meta_id,
rpc::sink<repair_stream_cmd> sink,
rpc::source<repair_row_on_wire_with_cmd> source) {
return do_with(false, repair_rows_on_wire(), [&repair, from, src_cpu_id, repair_meta_id, sink, source] (bool& error, repair_rows_on_wire& current_rows) mutable {
return repeat([&repair, from, src_cpu_id, repair_meta_id, sink, source, &current_rows, &error] () mutable {
return source().then([&repair, from, src_cpu_id, repair_meta_id, sink, source, &current_rows, &error] (std::optional<std::tuple<repair_row_on_wire_with_cmd>> row_opt) mutable {
return do_with(false, repair_rows_on_wire(), [&repair, from, src_cpu_id, dst_cpu_id, repair_meta_id, sink, source] (bool& error, repair_rows_on_wire& current_rows) mutable {
return repeat([&repair, from, src_cpu_id, dst_cpu_id, repair_meta_id, sink, source, &current_rows, &error] () mutable {
return source().then([&repair, from, src_cpu_id, dst_cpu_id, repair_meta_id, sink, source, &current_rows, &error] (std::optional<std::tuple<repair_row_on_wire_with_cmd>> row_opt) mutable {
if (row_opt) {
if (error) {
return make_ready_future<stop_iteration>(stop_iteration::no);
}
return repair_put_row_diff_with_rpc_stream_process_op(repair, from,
src_cpu_id,
dst_cpu_id,
repair_meta_id,
sink,
source,
error,
current_rows,
std::move(row_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
std::move(row_opt)).handle_exception([from, repair_meta_id, sink, &error] (std::exception_ptr ep) mutable {
rlogger.warn("repair_put_row_diff_with_rpc_stream_handler: from={} repair_meta_id={} error={}", from, repair_meta_id, ep);
error = true;
return sink(repair_stream_cmd::error).then([] {
return make_ready_future<stop_iteration>(stop_iteration::no);
@@ -2169,23 +2214,26 @@ static future<> repair_get_full_row_hashes_with_rpc_stream_handler(
sharded<repair_service>& repair,
gms::inet_address from,
uint32_t src_cpu_id,
uint32_t dst_cpu_id,
uint32_t repair_meta_id,
rpc::sink<repair_hash_with_cmd> sink,
rpc::source<repair_stream_cmd> source) {
return repeat([&repair, from, src_cpu_id, repair_meta_id, sink, source] () mutable {
return do_with(false, [&repair, from, src_cpu_id, repair_meta_id, sink, source] (bool& error) mutable {
return source().then([&repair, from, src_cpu_id, repair_meta_id, sink, source, &error] (std::optional<std::tuple<repair_stream_cmd>> status_opt) mutable {
return repeat([&repair, from, src_cpu_id, dst_cpu_id, repair_meta_id, sink, source] () mutable {
return do_with(false, [&repair, from, src_cpu_id, dst_cpu_id, repair_meta_id, sink, source] (bool& error) mutable {
return source().then([&repair, from, src_cpu_id, dst_cpu_id, repair_meta_id, sink, source, &error] (std::optional<std::tuple<repair_stream_cmd>> status_opt) mutable {
if (status_opt) {
if (error) {
return make_ready_future<stop_iteration>(stop_iteration::no);
}
return repair_get_full_row_hashes_with_rpc_stream_process_op(repair, from,
src_cpu_id,
dst_cpu_id,
repair_meta_id,
sink,
source,
error,
std::move(status_opt)).handle_exception([sink, &error] (std::exception_ptr ep) mutable {
std::move(status_opt)).handle_exception([from, repair_meta_id, sink, &error] (std::exception_ptr ep) mutable {
rlogger.warn("repair_get_full_row_hashes_with_rpc_stream_handler: from={} repair_meta_id={} error={}", from, repair_meta_id, ep);
error = true;
return sink(repair_hash_with_cmd{repair_stream_cmd::error, repair_hash()}).then([] () {
return make_ready_future<stop_iteration>(stop_iteration::no);
@@ -2268,43 +2316,47 @@ future<repair_flush_hints_batchlog_response> repair_service::repair_flush_hints_
future<> repair_service::init_ms_handlers() {
auto& ms = this->_messaging;
ms.register_repair_get_row_diff_with_rpc_stream([this, &ms] (const rpc::client_info& cinfo, uint64_t repair_meta_id, rpc::source<repair_hash_with_cmd> source) {
ms.register_repair_get_row_diff_with_rpc_stream([this, &ms] (const rpc::client_info& cinfo, uint64_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt, rpc::source<repair_hash_with_cmd> source) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
auto sink = ms.make_sink_for_repair_get_row_diff_with_rpc_stream(source);
// Start a new fiber.
(void)repair_get_row_diff_with_rpc_stream_handler(container(), from, src_cpu_id, repair_meta_id, sink, source).handle_exception(
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
(void)repair_get_row_diff_with_rpc_stream_handler(container(), from, src_cpu_id, shard, repair_meta_id, sink, source).handle_exception(
[from, repair_meta_id, sink, source] (std::exception_ptr ep) {
rlogger.warn("Failed to process get_row_diff_with_rpc_stream_handler from={}, repair_meta_id={}: {}", from, repair_meta_id, ep);
});
return make_ready_future<rpc::sink<repair_row_on_wire_with_cmd>>(sink);
});
ms.register_repair_put_row_diff_with_rpc_stream([this, &ms] (const rpc::client_info& cinfo, uint64_t repair_meta_id, rpc::source<repair_row_on_wire_with_cmd> source) {
ms.register_repair_put_row_diff_with_rpc_stream([this, &ms] (const rpc::client_info& cinfo, uint64_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt, rpc::source<repair_row_on_wire_with_cmd> source) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
auto sink = ms.make_sink_for_repair_put_row_diff_with_rpc_stream(source);
// Start a new fiber.
(void)repair_put_row_diff_with_rpc_stream_handler(container(), from, src_cpu_id, repair_meta_id, sink, source).handle_exception(
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
(void)repair_put_row_diff_with_rpc_stream_handler(container(), from, src_cpu_id, shard, repair_meta_id, sink, source).handle_exception(
[from, repair_meta_id, sink, source] (std::exception_ptr ep) {
rlogger.warn("Failed to process put_row_diff_with_rpc_stream_handler from={}, repair_meta_id={}: {}", from, repair_meta_id, ep);
});
return make_ready_future<rpc::sink<repair_stream_cmd>>(sink);
});
ms.register_repair_get_full_row_hashes_with_rpc_stream([this, &ms] (const rpc::client_info& cinfo, uint64_t repair_meta_id, rpc::source<repair_stream_cmd> source) {
ms.register_repair_get_full_row_hashes_with_rpc_stream([this, &ms] (const rpc::client_info& cinfo, uint64_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt, rpc::source<repair_stream_cmd> source) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
auto sink = ms.make_sink_for_repair_get_full_row_hashes_with_rpc_stream(source);
// Start a new fiber.
(void)repair_get_full_row_hashes_with_rpc_stream_handler(container(), from, src_cpu_id, repair_meta_id, sink, source).handle_exception(
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
(void)repair_get_full_row_hashes_with_rpc_stream_handler(container(), from, src_cpu_id, shard, repair_meta_id, sink, source).handle_exception(
[from, repair_meta_id, sink, source] (std::exception_ptr ep) {
rlogger.warn("Failed to process get_full_row_hashes_with_rpc_stream_handler from={}, repair_meta_id={}: {}", from, repair_meta_id, ep);
});
return make_ready_future<rpc::sink<repair_hash_with_cmd>>(sink);
});
ms.register_repair_get_full_row_hashes([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id) {
ms.register_repair_get_full_row_hashes([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return container().invoke_on(src_cpu_id % smp::count, [from, repair_meta_id] (repair_service& local_repair) {
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
return container().invoke_on(shard, [from, repair_meta_id] (repair_service& local_repair) {
auto rm = local_repair.get_repair_meta(from, repair_meta_id);
rm->set_repair_state_for_local_node(repair_state::get_full_row_hashes_started);
return rm->get_full_row_hashes_handler().then([rm] (repair_hash_set hashes) {
@@ -2315,10 +2367,11 @@ future<> repair_service::init_ms_handlers() {
}) ;
});
ms.register_repair_get_combined_row_hash([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
std::optional<repair_sync_boundary> common_sync_boundary) {
std::optional<repair_sync_boundary> common_sync_boundary, rpc::optional<shard_id> dst_cpu_id_opt) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return container().invoke_on(src_cpu_id % smp::count, [from, repair_meta_id,
return container().invoke_on(shard, [from, repair_meta_id,
common_sync_boundary = std::move(common_sync_boundary)] (repair_service& local_repair) mutable {
auto rm = local_repair.get_repair_meta(from, repair_meta_id);
_metrics.tx_hashes_nr++;
@@ -2330,10 +2383,11 @@ future<> repair_service::init_ms_handlers() {
});
});
ms.register_repair_get_sync_boundary([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
std::optional<repair_sync_boundary> skipped_sync_boundary) {
std::optional<repair_sync_boundary> skipped_sync_boundary, rpc::optional<shard_id> dst_cpu_id_opt) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return container().invoke_on(src_cpu_id % smp::count, [from, repair_meta_id,
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
return container().invoke_on(shard, [from, repair_meta_id,
skipped_sync_boundary = std::move(skipped_sync_boundary)] (repair_service& local_repair) mutable {
auto rm = local_repair.get_repair_meta(from, repair_meta_id);
rm->set_repair_state_for_local_node(repair_state::get_sync_boundary_started);
@@ -2344,12 +2398,13 @@ future<> repair_service::init_ms_handlers() {
});
});
ms.register_repair_get_row_diff([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
repair_hash_set set_diff, bool needs_all_rows) {
repair_hash_set set_diff, bool needs_all_rows, rpc::optional<shard_id> dst_cpu_id_opt) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
_metrics.rx_hashes_nr += set_diff.size();
auto fp = make_foreign(std::make_unique<repair_hash_set>(std::move(set_diff)));
return container().invoke_on(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] (repair_service& local_repair) mutable {
return container().invoke_on(shard, [from, repair_meta_id, fp = std::move(fp), needs_all_rows] (repair_service& local_repair) mutable {
auto rm = local_repair.get_repair_meta(from, repair_meta_id);
rm->set_repair_state_for_local_node(repair_state::get_row_diff_started);
if (fp.get_owner_shard() == this_shard_id()) {
@@ -2366,11 +2421,12 @@ future<> repair_service::init_ms_handlers() {
});
});
ms.register_repair_put_row_diff([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
repair_rows_on_wire row_diff) {
repair_rows_on_wire row_diff, rpc::optional<shard_id> dst_cpu_id_opt) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
auto fp = make_foreign(std::make_unique<repair_rows_on_wire>(std::move(row_diff)));
return container().invoke_on(src_cpu_id % smp::count, [from, repair_meta_id, fp = std::move(fp)] (repair_service& local_repair) mutable {
return container().invoke_on(shard, [from, repair_meta_id, fp = std::move(fp)] (repair_service& local_repair) mutable {
auto rm = local_repair.get_repair_meta(from, repair_meta_id);
rm->set_repair_state_for_local_node(repair_state::put_row_diff_started);
if (fp.get_owner_shard() == this_shard_id()) {
@@ -2387,10 +2443,11 @@ future<> repair_service::init_ms_handlers() {
ms.register_repair_row_level_start([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id, sstring ks_name,
sstring cf_name, dht::token_range range, row_level_diff_detect_algorithm algo, uint64_t max_row_buf_size, uint64_t seed,
unsigned remote_shard, unsigned remote_shard_count, unsigned remote_ignore_msb, sstring remote_partitioner_name, table_schema_version schema_version,
rpc::optional<streaming::stream_reason> reason, rpc::optional<gc_clock::time_point> compaction_time) {
rpc::optional<streaming::stream_reason> reason, rpc::optional<gc_clock::time_point> compaction_time, rpc::optional<shard_id> dst_cpu_id_opt) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return container().invoke_on(src_cpu_id % smp::count, [from, src_cpu_id, repair_meta_id, ks_name, cf_name,
return container().invoke_on(shard, [from, src_cpu_id, repair_meta_id, ks_name, cf_name,
range, algo, max_row_buf_size, seed, remote_shard, remote_shard_count, remote_ignore_msb, schema_version, reason, compaction_time, this] (repair_service& local_repair) mutable {
if (!local_repair._sys_dist_ks.local_is_initialized() || !local_repair._view_update_generator.local_is_initialized()) {
return make_exception_future<repair_row_level_start_response>(std::runtime_error(format("Node {} is not fully initialized for repair, try again later",
@@ -2405,26 +2462,29 @@ future<> repair_service::init_ms_handlers() {
});
});
ms.register_repair_row_level_stop([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
sstring ks_name, sstring cf_name, dht::token_range range) {
sstring ks_name, sstring cf_name, dht::token_range range, rpc::optional<shard_id> dst_cpu_id_opt) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return container().invoke_on(src_cpu_id % smp::count, [from, repair_meta_id, ks_name, cf_name, range] (repair_service& local_repair) mutable {
return container().invoke_on(shard, [from, repair_meta_id, ks_name, cf_name, range] (repair_service& local_repair) mutable {
return repair_meta::repair_row_level_stop_handler(local_repair, from, repair_meta_id,
std::move(ks_name), std::move(cf_name), std::move(range));
});
});
ms.register_repair_get_estimated_partitions([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id) {
ms.register_repair_get_estimated_partitions([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id, rpc::optional<shard_id> dst_cpu_id_opt) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return container().invoke_on(src_cpu_id % smp::count, [from, repair_meta_id] (repair_service& local_repair) mutable {
return container().invoke_on(shard, [from, repair_meta_id] (repair_service& local_repair) mutable {
return repair_meta::repair_get_estimated_partitions_handler(local_repair, from, repair_meta_id);
});
});
ms.register_repair_set_estimated_partitions([this] (const rpc::client_info& cinfo, uint32_t repair_meta_id,
uint64_t estimated_partitions) {
uint64_t estimated_partitions, rpc::optional<shard_id> dst_cpu_id_opt) {
auto src_cpu_id = cinfo.retrieve_auxiliary<uint32_t>("src_cpu_id");
auto shard = get_dst_shard_id(src_cpu_id, dst_cpu_id_opt);
auto from = cinfo.retrieve_auxiliary<gms::inet_address>("baddr");
return container().invoke_on(src_cpu_id % smp::count, [from, repair_meta_id, estimated_partitions] (repair_service& local_repair) mutable {
return container().invoke_on(shard, [from, repair_meta_id, estimated_partitions] (repair_service& local_repair) mutable {
return repair_meta::repair_set_estimated_partitions_handler(local_repair, from, repair_meta_id, estimated_partitions);
});
});
@@ -2493,6 +2553,7 @@ class row_level_repair {
table_id _table_id;
dht::token_range _range;
inet_address_vector_replica_set _all_live_peer_nodes;
std::vector<std::optional<shard_id>> _all_live_peer_shards;
bool _small_table_optimization;
replica::column_family& _cf;
@@ -2553,6 +2614,21 @@ public:
, _cf(_shard_task.db.local().find_column_family(_table_id))
, _seed(get_random_seed())
, _start_time(gc_clock::now()) {
repair_neighbors r_neighbors = _shard_task.get_repair_neighbors(_range);
auto& map = r_neighbors.shard_map;
for (auto& n : _all_live_peer_nodes) {
auto it = map.find(n);
if (it != map.end()) {
_all_live_peer_shards.push_back(it->second);
} else {
_all_live_peer_shards.push_back(std::nullopt);
}
}
if (_all_live_peer_shards.size() != _all_live_peer_nodes.size()) {
on_internal_error(rlogger, format("The size of shards and nodes do not match table={} range={} shards={} nodes={}",
_cf_name, _range, _all_live_peer_shards, _all_live_peer_nodes));
}
}
private:
@@ -2585,11 +2661,12 @@ private:
master.stats().round_nr++;
parallel_for_each(master.all_nodes(), [&, this] (repair_node_state& ns) {
const auto& node = ns.node;
auto dst_cpu_id = ns.shard;
// By calling `get_sync_boundary`, the `_last_sync_boundary`
// is moved to the `_current_sync_boundary` or
// `_skipped_sync_boundary` if it is not std::nullopt.
ns.state = repair_state::get_sync_boundary_started;
return master.get_sync_boundary(node, _skipped_sync_boundary).then([&, this] (get_sync_boundary_response res) {
return master.get_sync_boundary(node, _skipped_sync_boundary, dst_cpu_id).then([&, this] (get_sync_boundary_response res) {
ns.state = repair_state::get_sync_boundary_finished;
master.stats().row_from_disk_bytes[node] += res.new_rows_size;
master.stats().row_from_disk_nr[node] += res.new_rows_nr;
@@ -2662,8 +2739,9 @@ private:
// peers are identical, we think rows in the `_working_row_buff`
// are identical, there is no need to transfer each and every
// row hashes to the repair master.
master.all_nodes()[idx].state = repair_state::get_combined_row_hash_started;
return master.get_combined_row_hash(_common_sync_boundary, master.all_nodes()[idx].node).then([&, idx] (get_combined_row_hash_response resp) {
auto& ns = master.all_nodes()[idx];
ns.state = repair_state::get_combined_row_hash_started;
return master.get_combined_row_hash(_common_sync_boundary, ns.node, ns.shard).then([&, idx] (get_combined_row_hash_response resp) {
master.all_nodes()[idx].state = repair_state::get_combined_row_hash_finished;
rlogger.debug("Calling master.get_combined_row_hash for node {}, got combined_hash={}", master.all_nodes()[idx].node, resp);
combined_hashes[idx]= std::move(resp);
@@ -2694,6 +2772,7 @@ private:
for (unsigned node_idx = 0; node_idx < _all_live_peer_nodes.size(); node_idx++) {
auto& ns = master.all_nodes()[node_idx + 1];
auto& node = _all_live_peer_nodes[node_idx];
auto dst_cpu_id = ns.shard;
try {
// Here is an optimization to avoid transferring full rows hashes,
// if remote and local node, has the same combined_hashes.
@@ -2719,12 +2798,12 @@ private:
if (master.use_rpc_stream()) {
rlogger.debug("FastPath: get_row_diff with needs_all_rows_t::yes rpc stream");
ns.state = repair_state::get_row_diff_with_rpc_stream_started;
master.get_row_diff_with_rpc_stream({}, repair_meta::needs_all_rows_t::yes, repair_meta::update_peer_row_hash_sets::yes, node, node_idx);
master.get_row_diff_with_rpc_stream({}, repair_meta::needs_all_rows_t::yes, repair_meta::update_peer_row_hash_sets::yes, node, node_idx, dst_cpu_id);
ns.state = repair_state::get_row_diff_with_rpc_stream_finished;
} else {
rlogger.debug("FastPath: get_row_diff with needs_all_rows_t::yes rpc verb");
ns.state = repair_state::get_row_diff_and_update_peer_row_hash_sets_started;
master.get_row_diff_and_update_peer_row_hash_sets(node, node_idx);
master.get_row_diff_and_update_peer_row_hash_sets(node, node_idx, dst_cpu_id);
ns.state = repair_state::get_row_diff_and_update_peer_row_hash_sets_finished;
}
continue;
@@ -2735,11 +2814,11 @@ private:
// Ask the peer to send the full list hashes in the working row buf.
if (master.use_rpc_stream()) {
ns.state = repair_state::get_full_row_hashes_with_rpc_stream_started;
master.peer_row_hash_sets(node_idx) = master.get_full_row_hashes_with_rpc_stream(node, node_idx).get0();
master.peer_row_hash_sets(node_idx) = master.get_full_row_hashes_with_rpc_stream(node, node_idx, dst_cpu_id).get0();
ns.state = repair_state::get_full_row_hashes_with_rpc_stream_finished;
} else {
ns.state = repair_state::get_full_row_hashes_started;
master.peer_row_hash_sets(node_idx) = master.get_full_row_hashes(node).get0();
master.peer_row_hash_sets(node_idx) = master.get_full_row_hashes(node, dst_cpu_id).get0();
ns.state = repair_state::get_full_row_hashes_finished;
}
rlogger.debug("After master.get_full_row_hashes for node {}, hash_sets={}",
@@ -2761,11 +2840,11 @@ private:
auto needs_all_rows = repair_meta::needs_all_rows_t(set_diff.size() == master.peer_row_hash_sets(node_idx).size());
if (master.use_rpc_stream()) {
ns.state = repair_state::get_row_diff_with_rpc_stream_started;
master.get_row_diff_with_rpc_stream(std::move(set_diff), needs_all_rows, repair_meta::update_peer_row_hash_sets::no, node, node_idx);
master.get_row_diff_with_rpc_stream(std::move(set_diff), needs_all_rows, repair_meta::update_peer_row_hash_sets::no, node, node_idx, dst_cpu_id);
ns.state = repair_state::get_row_diff_with_rpc_stream_finished;
} else {
ns.state = repair_state::get_row_diff_started;
master.get_row_diff(std::move(set_diff), needs_all_rows, node, node_idx);
master.get_row_diff(std::move(set_diff), needs_all_rows, node, node_idx, dst_cpu_id);
ns.state = repair_state::get_row_diff_finished;
}
rlogger.debug("After get_row_diff node {}, hash_sets={}", master.myip(), master.working_row_hashes().get0().size());
@@ -2793,13 +2872,14 @@ private:
}
parallel_for_each(boost::irange(size_t(0), sz), [&, this] (size_t idx) {
auto& ns = master.all_nodes()[idx + 1];
auto dst_cpu_id = ns.shard;
auto needs_all_rows = repair_meta::needs_all_rows_t(master.peer_row_hash_sets(idx).empty());
auto& set_diff = set_diffs[idx];
rlogger.debug("Calling master.put_row_diff to node {}, set_diff={}, needs_all_rows={}", _all_live_peer_nodes[idx], set_diff.size(), needs_all_rows);
auto& node = master.all_nodes()[idx].node;
if (master.use_rpc_stream()) {
ns.state = repair_state::put_row_diff_with_rpc_stream_started;
return master.put_row_diff_with_rpc_stream(std::move(set_diff), needs_all_rows, _all_live_peer_nodes[idx], idx, *get_erm(), _small_table_optimization).then([&ns] {
return master.put_row_diff_with_rpc_stream(std::move(set_diff), needs_all_rows, _all_live_peer_nodes[idx], idx, *get_erm(), _small_table_optimization, dst_cpu_id).then([&ns] {
ns.state = repair_state::put_row_diff_with_rpc_stream_finished;
}).handle_exception([this, &node] (std::exception_ptr ep) {
auto s = _cf.schema();
@@ -2810,7 +2890,7 @@ private:
} else {
ns.state = repair_state::put_row_diff_started;
return master.put_row_diff(std::move(set_diff), needs_all_rows, _all_live_peer_nodes[idx]).then([&ns] {
return master.put_row_diff(std::move(set_diff), needs_all_rows, _all_live_peer_nodes[idx], dst_cpu_id).then([&ns] {
ns.state = repair_state::put_row_diff_finished;
}).handle_exception([this, &node] (std::exception_ptr ep) {
auto s = _cf.schema();
@@ -2876,10 +2956,11 @@ public:
auto repair_meta_id = _shard_task.rs.get_next_repair_meta_id().get0();
auto algorithm = get_common_diff_detect_algorithm(_shard_task.messaging.local(), _all_live_peer_nodes);
auto max_row_buf_size = get_max_row_buf_size(algorithm);
auto& sharder = _cf.get_effective_replication_map()->get_sharder(*(_cf.schema()));
auto master_node_shard_config = shard_config {
this_shard_id(),
_shard_task.sharder.shard_count(),
_shard_task.sharder.sharding_ignore_msb()
sharder.shard_count(),
sharder.sharding_ignore_msb()
};
auto s = _cf.schema();
auto schema_version = s->version();
@@ -2913,6 +2994,7 @@ public:
std::move(master_node_shard_config),
_all_live_peer_nodes,
_all_live_peer_nodes.size(),
_all_live_peer_shards,
this,
compaction_time);
auto auto_stop_master = defer([&master] {
@@ -2925,17 +3007,17 @@ public:
master.myip(), _all_live_peer_nodes, master.repair_meta_id(), _shard_task.get_keyspace(), _cf_name, schema_version, _range, _seed, max_row_buf_size);
std::exception_ptr ex = nullptr;
std::vector<gms::inet_address> nodes_to_stop;
std::vector<repair_node_state> nodes_to_stop;
nodes_to_stop.reserve(master.all_nodes().size());
try {
parallel_for_each(master.all_nodes(), [&, this] (repair_node_state& ns) {
const auto& node = ns.node;
ns.state = repair_state::row_level_start_started;
return master.repair_row_level_start(node, _shard_task.get_keyspace(), _cf_name, _range, schema_version, _shard_task.reason(), compaction_time).then([&] () {
return master.repair_row_level_start(node, _shard_task.get_keyspace(), _cf_name, _range, schema_version, _shard_task.reason(), compaction_time, ns.shard).then([&] () {
ns.state = repair_state::row_level_start_finished;
nodes_to_stop.push_back(node);
nodes_to_stop.push_back(ns);
ns.state = repair_state::get_estimated_partitions_started;
return master.repair_get_estimated_partitions(node).then([this, node, &ns] (uint64_t partitions) {
return master.repair_get_estimated_partitions(node, ns.shard).then([this, node, &ns] (uint64_t partitions) {
ns.state = repair_state::get_estimated_partitions_finished;
rlogger.trace("Get repair_get_estimated_partitions for node={}, estimated_partitions={}", node, partitions);
_estimated_partitions += partitions;
@@ -2947,7 +3029,7 @@ public:
const auto& node = ns.node;
rlogger.trace("Get repair_set_estimated_partitions for node={}, estimated_partitions={}", node, _estimated_partitions);
ns.state = repair_state::set_estimated_partitions_started;
return master.repair_set_estimated_partitions(node, _estimated_partitions).then([&ns] {
return master.repair_set_estimated_partitions(node, _estimated_partitions, ns.shard).then([&ns] {
ns.state = repair_state::set_estimated_partitions_finished;
});
}).get();
@@ -2978,9 +3060,10 @@ public:
ex = std::current_exception();
}
parallel_for_each(nodes_to_stop, [&] (const gms::inet_address& node) {
parallel_for_each(nodes_to_stop, [&] (repair_node_state& ns) {
auto node = ns.node;
master.set_repair_state(repair_state::row_level_stop_started, node);
return master.repair_row_level_stop(node, _shard_task.get_keyspace(), _cf_name, _range).then([node, &master] {
return master.repair_row_level_stop(node, _shard_task.get_keyspace(), _cf_name, _range, ns.shard).then([node, &master] {
master.set_repair_state(repair_state::row_level_stop_finished, node);
});
}).get();
@@ -3067,6 +3150,7 @@ repair_service::repair_service(distributed<gms::gossiper>& gossiper,
netw::messaging_service& ms,
sharded<replica::database>& db,
sharded<service::storage_proxy>& sp,
sharded<service::raft_address_map>& addr_map,
sharded<db::batchlog_manager>& bm,
sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<db::system_keyspace>& sys_ks,
@@ -3078,6 +3162,7 @@ repair_service::repair_service(distributed<gms::gossiper>& gossiper,
, _messaging(ms)
, _db(db)
, _sp(sp)
, _addr_map(addr_map)
, _bm(bm)
, _sys_dist_ks(sys_dist_ks)
, _sys_ks(sys_ks)

View File

@@ -16,6 +16,7 @@
#include "locator/abstract_replication_strategy.hh"
#include <seastar/core/distributed.hh>
#include <seastar/util/bool_class.hh>
#include "service/raft/raft_address_map.hh"
using namespace seastar;
@@ -82,11 +83,14 @@ public:
float repair_finished_percentage();
};
using host2ip_t = std::function<future<gms::inet_address> (locator::host_id)>;
class repair_service : public seastar::peering_sharded_service<repair_service> {
distributed<gms::gossiper>& _gossiper;
netw::messaging_service& _messaging;
sharded<replica::database>& _db;
sharded<service::storage_proxy>& _sp;
sharded<service::raft_address_map>& _addr_map;
sharded<db::batchlog_manager>& _bm;
sharded<db::system_distributed_keyspace>& _sys_dist_ks;
sharded<db::system_keyspace>& _sys_ks;
@@ -113,6 +117,7 @@ public:
netw::messaging_service& ms,
sharded<replica::database>& db,
sharded<service::storage_proxy>& sp,
sharded<service::raft_address_map>& addr_map,
sharded<db::batchlog_manager>& bm,
sharded<db::system_distributed_keyspace>& sys_dist_ks,
sharded<db::system_keyspace>& sys_ks,
@@ -155,6 +160,11 @@ private:
streaming::stream_reason reason,
shared_ptr<node_ops_info> ops_info);
public:
future<> repair_tablets(repair_uniq_id id, sstring keyspace_name, std::vector<sstring> table_names, host2ip_t host2ip, bool primary_replica_only = true);
private:
future<repair_update_system_table_response> repair_update_system_table_handler(
gms::inet_address from,
repair_update_system_table_request req);
@@ -232,6 +242,7 @@ public:
friend class repair::user_requested_repair_task_impl;
friend class repair::data_sync_repair_task_impl;
friend class repair::tablet_repair_task_impl;
};
class repair_info;

View File

@@ -101,6 +101,31 @@ protected:
virtual std::optional<double> expected_children_number() const override;
};
class tablet_repair_task_impl : public repair_task_impl {
private:
sstring _keyspace;
std::vector<sstring> _tables;
std::vector<tablet_repair_task_meta> _metas;
optimized_optional<abort_source::subscription> _abort_subscription;
public:
tablet_repair_task_impl(tasks::task_manager::module_ptr module, repair_uniq_id id, sstring keyspace, std::vector<sstring> tables, streaming::stream_reason reason, std::vector<tablet_repair_task_meta> metas)
: repair_task_impl(module, id.uuid(), id.id, "keyspace", keyspace, "", "", tasks::task_id::create_null_id(), reason)
, _keyspace(std::move(keyspace))
, _tables(std::move(tables))
, _metas(std::move(metas))
{
}
virtual tasks::is_abortable is_abortable() const noexcept override {
return tasks::is_abortable(!_abort_subscription);
}
protected:
future<> run() override;
virtual future<std::optional<double>> expected_total_workload() const override;
virtual std::optional<double> expected_children_number() const override;
};
class shard_repair_task_impl : public repair_task_impl {
public:
repair_service& rs;
@@ -108,7 +133,6 @@ public:
seastar::sharded<netw::messaging_service>& messaging;
service::migration_manager& mm;
gms::gossiper& gossiper;
const dht::sharder& sharder;
locator::effective_replication_map_ptr erm;
dht::token_range_vector ranges;
std::vector<sstring> cfs;

View File

@@ -143,10 +143,10 @@ class ManagerClient():
logger.debug("ManagerClient stopping %s", server_id)
await self.client.put_json(f"/cluster/server/{server_id}/stop")
async def server_stop_gracefully(self, server_id: ServerNum) -> None:
async def server_stop_gracefully(self, server_id: ServerNum, timeout: float = 60) -> None:
"""Stop specified server gracefully"""
logger.debug("ManagerClient stopping gracefully %s", server_id)
await self.client.put_json(f"/cluster/server/{server_id}/stop_gracefully")
await self.client.put_json(f"/cluster/server/{server_id}/stop_gracefully", timeout=timeout)
async def server_start(self, server_id: ServerNum, expected_error: Optional[str] = None,
wait_others: int = 0, wait_interval: float = 45) -> None:

View File

@@ -98,6 +98,13 @@ class RESTClient(metaclass=ABCMeta):
await self._fetch("POST", resource_uri, host = host, port = port, params = params,
json = json, timeout = timeout)
async def post_json(self, resource_uri: str, host: Optional[str] = None,
port: Optional[int] = None, params: Optional[Mapping[str, str]] = None,
json: Optional[Mapping] = None, timeout: Optional[float] = None) -> None:
ret = await self._fetch("POST", resource_uri, response_type = "json", host = host, port = port, params = params,
json = json, timeout = timeout)
return ret
async def put_json(self, resource_uri: str, data: Optional[Mapping] = None, host: Optional[str] = None,
port: Optional[int] = None, params: Optional[dict[str, str]] = None,
response_type: Optional[str] = None, timeout: Optional[float] = None) -> Any:
@@ -274,12 +281,19 @@ class ScyllaRESTAPIClient():
if table is not None:
url += "?cf={table}"
await self.client.post(url, host=node_ip)
async def dump_llvm_profile(self, node_ip : str):
"""Dump llvm profile to disk that can later be used for PGO or coverage reporting.
no-op if the scylla binary is not instrumented."""
url = "/system/dump_llvm_profile"
await self.client.post(url, host=node_ip)
async def repair(self, node_ip: str, keyspace: str, table: str) -> None:
"""Repair the given table and wait for it to complete"""
sequence_number = await self.client.post_json(f"/storage_service/repair_async/{keyspace}", host=node_ip, params={"columnFamilies": table})
status = await self.client.get_json(f"/storage_service/repair_status", host=node_ip, params={"id": str(sequence_number)})
return status
class ScyllaMetrics:
def __init__(self, lines: list[str]):
self.lines: list[str] = lines

View File

@@ -537,7 +537,7 @@ class ScyllaServer:
except ProcessLookupError:
pass
else:
STOP_TIMEOUT_SECONDS = 60
STOP_TIMEOUT_SECONDS = 120
wait_task = self.cmd.wait()
try:
await asyncio.wait_for(wait_task, timeout=STOP_TIMEOUT_SECONDS)

View File

@@ -7,3 +7,4 @@ log_date_format = %H:%M:%S
markers =
slow: tests that take more than 30 seconds to run
repair: tests for repair

View File

@@ -378,3 +378,81 @@ async def test_table_dropped_during_streaming(manager: ManagerClient):
logger.info("Verifying tablet replica")
replica = await get_tablet_replica(manager, servers[0], 'test', 'test2', tablet_token)
assert replica == (UUID(s1_host_id), 0)
@pytest.mark.repair
@pytest.mark.asyncio
async def test_tablet_repair(manager: ManagerClient):
logger.info("Bootstrapping cluster")
servers = [await manager.server_add(), await manager.server_add(), await manager.server_add()]
cql = manager.get_cql()
await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', "
"'replication_factor': 2} AND tablets = {'initial': 32};")
await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);")
logger.info("Populating table")
keys = range(256)
await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys])
logger.info("Repair table")
await manager.api.repair(servers[0].ip_addr, "test", "test")
async def check():
logger.info("Checking table")
rows = await cql.run_async("SELECT * FROM test.test;")
assert len(rows) == len(keys)
for r in rows:
assert r.c == r.pk
await check()
await cql.run_async("DROP KEYSPACE test;")
@pytest.mark.repair
@pytest.mark.asyncio
async def test_tablet_missing_data_repair(manager: ManagerClient):
logger.info("Bootstrapping cluster")
cmdline = [
'--hinted-handoff-enabled', 'false',
]
servers = [await manager.server_add(cmdline=cmdline),
await manager.server_add(cmdline=cmdline),
await manager.server_add(cmdline=cmdline)]
cql = manager.get_cql()
await cql.run_async("CREATE KEYSPACE test WITH replication = {'class': 'NetworkTopologyStrategy', "
"'replication_factor': 3} AND tablets = {'initial': 32};")
await cql.run_async("CREATE TABLE test.test (pk int PRIMARY KEY, c int);")
keys_list = [range(0, 100), range(100, 200), range(200, 300)]
keys = range(0, 300)
for idx in range(0,3):
s = servers[idx].server_id
await manager.server_stop_gracefully(s, timeout=120)
logger.info(f"Stopped server {idx}");
logger.info(f"Insert into server {idx}");
await asyncio.gather(*[cql.run_async(f"INSERT INTO test.test (pk, c) VALUES ({k}, {k});") for k in keys_list[idx]])
await manager.server_start(s)
logger.info(f"Started server {idx}");
await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60)
logger.info("Repair table")
await manager.api.repair(servers[0].ip_addr, "test", "test")
async def check():
logger.info("Checking table")
query = SimpleStatement("SELECT * FROM test.test;", consistency_level=ConsistencyLevel.ONE)
rows = await cql.run_async(query)
assert len(rows) == len(keys)
for r in rows:
assert r.c == r.pk
for idx in range(0,3):
s = servers[idx].server_id
await manager.server_stop_gracefully(s, timeout=120)
await check()
await manager.server_start(s)
await cql.run_async("DROP KEYSPACE test;")