tablets: deallocate storage state on end_migration
When a tablet is migrated and cleaned up, deallocate the tablet storage group state on `end_migration` stage, instead of `cleanup` stage: * When the stage is updated from `cleanup` to `end_migration`, the storage group is removed on the leaving replica. * When the table is initialized, if the tablet stage is `end_migration` then we don't allocate a storage group for it. This happens for example if the leaving replica is restarted during tablet migration. If it's initialized in `cleanup` stage then we allocate a storage group, and it will be deallocated when transitioning to `end_migration`. This guarantees that the storage group is always deallocated on the leaving replica by `end_migration`, and that it is always allocated if the tablet wasn't cleaned up fully yet. It is a similar case also for the pending replica when the migration is aborted. We deallocate the state on `revert_migration` which is the stage following `cleanup_target`. Previously the storage group would be allocated when the tablet is initialized on any of the tablet replicas - also on the leaving replica, and when the tablet stage is `cleanup` or `end_migration`, and deallocated during `cleanup`. This fixes the following issue: 1. A migrating tablet enters cleanup stage 2. the tablet is cleaned up successfuly 3. The leaving replica is restarted, and allocates storage group 4. tablet cleanup is not called because it was already cleaned up 4. the storage group remains allocated on the leaving replica after the migration is completed - it's not cleaned up properly. Fixes scylladb/scylladb#23481
This commit is contained in:
@@ -221,6 +221,18 @@ std::optional<tablet_replica> get_leaving_replica(const tablet_info& tinfo, cons
|
||||
return *leaving.begin();
|
||||
}
|
||||
|
||||
bool is_post_cleanup(tablet_replica replica, const tablet_info& tinfo, const tablet_transition_info& trinfo) {
|
||||
if (replica == locator::get_leaving_replica(tinfo, trinfo)) {
|
||||
// we do tablet cleanup on the leaving replica in the `cleanup` stage, after which there is only the `end_migration` stage.
|
||||
return trinfo.stage == locator::tablet_transition_stage::end_migration;
|
||||
}
|
||||
if (replica == trinfo.pending_replica) {
|
||||
// we do tablet cleanup on the pending replica in the `cleanup_target` stage, after which there is only the `revert_migration` stage.
|
||||
return trinfo.stage == locator::tablet_transition_stage::revert_migration;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
tablet_replica_set get_new_replicas(const tablet_info& tinfo, const tablet_migration_info& mig) {
|
||||
return replace_replica(tinfo.replicas, mig.src, mig.dst);
|
||||
}
|
||||
|
||||
@@ -291,6 +291,10 @@ struct tablet_transition_info {
|
||||
// Returns the leaving replica for a given transition.
|
||||
std::optional<tablet_replica> get_leaving_replica(const tablet_info&, const tablet_transition_info&);
|
||||
|
||||
// True if the tablet is transitioning and it's in a stage that follows the stage
|
||||
// where we clean up the tablet on the given replica.
|
||||
bool is_post_cleanup(tablet_replica replica, const tablet_info& tinfo, const tablet_transition_info& trinfo);
|
||||
|
||||
/// Represents intention to move a single tablet replica from src to dst.
|
||||
struct tablet_migration_info {
|
||||
locator::tablet_transition_kind kind;
|
||||
|
||||
@@ -832,12 +832,19 @@ public:
|
||||
auto local_replica = locator::tablet_replica{_my_host_id, this_shard_id()};
|
||||
|
||||
for (auto tid : tmap.tablet_ids()) {
|
||||
auto range = tmap.get_token_range(tid);
|
||||
|
||||
if (tmap.has_replica(tid, local_replica)) {
|
||||
tlogger.debug("Tablet with id {} and range {} present for {}.{}", tid, range, schema()->ks_name(), schema()->cf_name());
|
||||
ret[tid.value()] = allocate_storage_group(tmap, tid, std::move(range));
|
||||
if (!tmap.has_replica(tid, local_replica)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if the tablet was cleaned up already on this replica, don't allocate a storage group for it.
|
||||
auto trinfo = tmap.get_tablet_transition_info(tid);
|
||||
if (trinfo && locator::is_post_cleanup(local_replica, tmap.get_tablet_info(tid), *trinfo)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
auto range = tmap.get_token_range(tid);
|
||||
tlogger.debug("Tablet with id {} and range {} present for {}.{}", tid, range, schema()->ks_name(), schema()->cf_name());
|
||||
ret[tid.value()] = allocate_storage_group(tmap, tid, std::move(range));
|
||||
}
|
||||
_storage_groups = std::move(ret);
|
||||
}
|
||||
@@ -2713,7 +2720,7 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
|
||||
handle_tablet_merge_completion(*old_tablet_map, *new_tablet_map);
|
||||
}
|
||||
|
||||
// Allocate storage group if tablet is migrating in.
|
||||
// Allocate storage group if tablet is migrating in, or deallocate if it's migrating out.
|
||||
auto this_replica = locator::tablet_replica{
|
||||
.host = erm.get_token_metadata().get_my_id(),
|
||||
.shard = this_shard_id()
|
||||
@@ -2729,6 +2736,8 @@ void tablet_storage_group_manager::update_effective_replication_map(const locato
|
||||
auto range = new_tablet_map->get_token_range(tid);
|
||||
_storage_groups[tid.value()] = allocate_storage_group(*new_tablet_map, tid, std::move(range));
|
||||
tablet_migrating_in = true;
|
||||
} else if (_storage_groups.contains(tid.value()) && locator::is_post_cleanup(this_replica, new_tablet_map->get_tablet_info(tid), transition_info)) {
|
||||
remove_storage_group(tid.value());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4129,7 +4138,6 @@ future<> table::cleanup_tablet(database& db, db::system_keyspace& sys_ks, locato
|
||||
co_await stop_compaction_groups(sg);
|
||||
co_await utils::get_local_injector().inject("delay_tablet_compaction_groups_cleanup", std::chrono::seconds(5));
|
||||
co_await cleanup_compaction_groups(db, sys_ks, tid, sg);
|
||||
_sg_manager->remove_storage_group(tid.value());
|
||||
}
|
||||
|
||||
future<> table::cleanup_tablet_without_deallocation(database& db, db::system_keyspace& sys_ks, locator::tablet_id tid) {
|
||||
|
||||
Reference in New Issue
Block a user