From 72f3b1d5fe42f2ced8fdb606e5df65210ef7bd71 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Thu, 15 Feb 2024 18:45:41 +0300 Subject: [PATCH] topology.tablets_migration: Add cleanup_target transition stage The new stage will be used to revert migration that fails at some stages. The goal is to cleanup the pending replica, which may already received some writes by doing the cleanup RPC to the pending replica, then jumping to "revert_migration" stage introduced earlier. If pending node is dead, the call to cleanup RPC is skipped. Coordinators use old replicas. Signed-off-by: Pavel Emelyanov --- locator/tablets.cc | 5 +++++ locator/tablets.hh | 1 + service/storage_service.cc | 6 +++++- service/tablet_allocator.cc | 2 ++ service/topology_coordinator.cc | 14 ++++++++++++++ 5 files changed, 27 insertions(+), 1 deletion(-) diff --git a/locator/tablets.cc b/locator/tablets.cc index 94bee2628f..d944e27082 100644 --- a/locator/tablets.cc +++ b/locator/tablets.cc @@ -38,6 +38,8 @@ write_replica_set_selector get_selector_for_writes(tablet_transition_stage stage return write_replica_set_selector::next; case tablet_transition_stage::cleanup: return write_replica_set_selector::next; + case tablet_transition_stage::cleanup_target: + return write_replica_set_selector::previous; case tablet_transition_stage::revert_migration: return write_replica_set_selector::previous; case tablet_transition_stage::end_migration: @@ -61,6 +63,8 @@ read_replica_set_selector get_selector_for_reads(tablet_transition_stage stage) return read_replica_set_selector::next; case tablet_transition_stage::cleanup: return read_replica_set_selector::next; + case tablet_transition_stage::cleanup_target: + return read_replica_set_selector::previous; case tablet_transition_stage::revert_migration: return read_replica_set_selector::previous; case tablet_transition_stage::end_migration: @@ -279,6 +283,7 @@ static const std::unordered_map tablet_transit {tablet_transition_stage::streaming, "streaming"}, {tablet_transition_stage::use_new, "use_new"}, {tablet_transition_stage::cleanup, "cleanup"}, + {tablet_transition_stage::cleanup_target, "cleanup_target"}, {tablet_transition_stage::revert_migration, "revert_migration"}, {tablet_transition_stage::end_migration, "end_migration"}, }; diff --git a/locator/tablets.hh b/locator/tablets.hh index f2a6abdce1..85696ba9ab 100644 --- a/locator/tablets.hh +++ b/locator/tablets.hh @@ -157,6 +157,7 @@ enum class tablet_transition_stage { write_both_read_new, use_new, cleanup, + cleanup_target, revert_migration, end_migration, }; diff --git a/service/storage_service.cc b/service/storage_service.cc index a19744d5e1..d772ced8a6 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -5512,8 +5512,12 @@ future<> storage_service::cleanup_tablet(locator::global_tablet_id tablet) { if (leaving_replica.host != tm->get_my_id()) { throw std::runtime_error(fmt::format("Tablet {} has leaving replica different than this one", tablet)); } + } else if (trinfo->stage == locator::tablet_transition_stage::cleanup_target) { + if (trinfo->pending_replica.host != tm->get_my_id()) { + throw std::runtime_error(fmt::format("Tablet {} has pending replica different than this one", tablet)); + } } else { - throw std::runtime_error(fmt::format("Tablet {} stage is not at cleanup", tablet)); + throw std::runtime_error(fmt::format("Tablet {} stage is not at cleanup/cleanup_target", tablet)); } auto shard_opt = tmap.get_shard(tablet.tablet, tm->get_my_id()); diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc index 300384d6a7..0a1d8de37e 100644 --- a/service/tablet_allocator.cc +++ b/service/tablet_allocator.cc @@ -425,6 +425,8 @@ private: return false; case tablet_transition_stage::cleanup: return false; + case tablet_transition_stage::cleanup_target: + return false; case tablet_transition_stage::revert_migration: return false; case tablet_transition_stage::end_migration: diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index a37964b0eb..e655b23265 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -1047,6 +1047,20 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { transition_to(locator::tablet_transition_stage::end_migration); } break; + case locator::tablet_transition_stage::cleanup_target: + if (advance_in_background(gid, tablet_state.cleanup, "cleanup_target", [&] { + locator::tablet_replica dst = trinfo.pending_replica; + if (is_excluded(raft::server_id(dst.host.uuid()))) { + rtlogger.info("Tablet cleanup of {} on {} skipped because node is excluded and doesn't need to revert migration", gid, dst); + return make_ready_future<>(); + } + rtlogger.info("Initiating tablet cleanup of {} on {} to revert migration", gid, dst); + return ser::storage_service_rpc_verbs::send_tablet_cleanup(&_messaging, + netw::msg_addr(id2ip(dst.host)), _as, raft::server_id(dst.host.uuid()), gid); + })) { + transition_to(locator::tablet_transition_stage::revert_migration); + } + break; case locator::tablet_transition_stage::revert_migration: // Need a separate stage and a barrier after cleanup RPC to cut off stale RPCs. // See do_tablet_operation() doc.