scylla/service/topology_coordinator.cc

/*
 * Copyright (C) 2024-present ScyllaDB
 */

/*
 * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0
 */

#include <chrono>
#include <fmt/ranges.h>

#include <seastar/core/abort_source.hh>
#include <seastar/core/coroutine.hh>
#include <seastar/coroutine/as_future.hh>
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include <seastar/core/future.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/with_scheduling_group.hh>
#include <seastar/util/noncopyable_function.hh>

#include <variant>

#include "auth/service.hh"
#include "cdc/generation.hh"
#include "cql3/statements/ks_prop_defs.hh"
#include "db/system_distributed_keyspace.hh"
#include "db/system_keyspace.hh"
#include "dht/boot_strapper.hh"
#include "gms/gossiper.hh"
#include "gms/feature_service.hh"
#include "locator/host_id.hh"
#include "locator/tablets.hh"
#include "locator/token_metadata.hh"
#include "locator/network_topology_strategy.hh"
#include "message/messaging_service.hh"
#include "mutation/async_utils.hh"
#include "raft/raft.hh"
#include "replica/database.hh"
#include "replica/tablet_mutation_builder.hh"
#include "replica/tablets.hh"
#include "db/view/view_builder.hh"
#include "service/qos/service_level_controller.hh"
#include "service/migration_manager.hh"
#include "service/raft/join_node.hh"
#include "service/raft/raft_group0.hh"
#include "service/raft/raft_group0_client.hh"
#include "service/tablet_allocator.hh"
#include "service/tablet_operation.hh"
#include "service/topology_state_machine.hh"
#include "topology_mutation.hh"
#include "utils/assert.hh"
#include "utils/error_injection.hh"
#include "utils/stall_free.hh"
#include "utils/to_string.hh"
#include "service/endpoint_lifecycle_subscriber.hh"

#include "idl/join_node.dist.hh"
#include "idl/storage_service.dist.hh"
#include "replica/exceptions.hh"
#include "service/paxos/prepare_response.hh"
#include "idl/storage_proxy.dist.hh"

#include "service/topology_coordinator.hh"

#include <boost/range/join.hpp>

using token = dht::token;
using inet_address = gms::inet_address;

namespace service {

logging::logger rtlogger("raft_topology");

locator::host_id to_host_id(raft::server_id id) {
    return locator::host_id{id.uuid()};
}

future<> wait_for_gossiper(raft::server_id id, const gms::gossiper& g, abort_source& as) {
    const auto timeout = std::chrono::seconds{30};
    const auto deadline = lowres_clock::now() + timeout;
    while (true) {
        auto hids = g.get_nodes_with_host_id(to_host_id(id));
        if (!hids.empty()) {
            co_return;
        }
        if (lowres_clock::now() > deadline) {
            co_await coroutine::exception(std::make_exception_ptr(
                wait_for_ip_timeout(id, std::chrono::duration_cast<std::chrono::seconds>(timeout).count())));
        }
        static thread_local logger::rate_limit rate_limit{std::chrono::seconds(1)};
        rtlogger.log(log_level::warn, rate_limit, "cannot map {} to ip, retrying.", id);
        co_await sleep_abortable(std::chrono::milliseconds(5), as);
    }
}

namespace {
// Doesn't throw error on absence of values.
sstring get_application_state_gently(const gms::application_state_map& epmap, gms::application_state app_state) {
    const auto it = epmap.find(app_state);
    if (it == epmap.end()) {
        return sstring{};
    }
    // it's versioned_value::value(), not std::optional::value() - it does not throw
    return it->second.value();
}
} // namespace

class topology_coordinator : public endpoint_lifecycle_subscriber {
    sharded<db::system_distributed_keyspace>& _sys_dist_ks;
    gms::gossiper& _gossiper;
    netw::messaging_service& _messaging;
    locator::shared_token_metadata& _shared_tm;
    db::system_keyspace& _sys_ks;
    replica::database& _db;
    service::raft_group0& _group0;
    service::topology_state_machine& _topo_sm;
    abort_source& _as;
    gms::feature_service& _feature_service;

    raft::server& _raft;
    const raft::term_t _term;
    uint64_t _last_cmd_index = 0;

    raft_topology_cmd_handler_type _raft_topology_cmd_handler;

    tablet_allocator& _tablet_allocator;

    // The reason load_stats_ptr is a shared ptr is that load balancer can yield, and we don't want it
    // to suffer lifetime issues when stats refresh fiber overrides the current stats.
    locator::load_stats_ptr _tablet_load_stats;
    // FIXME: make frequency per table in order to reduce work in each iteration.
    //  Bigger tables will take longer to be resized. similar-sized tables can be batched into same iteration.
    static constexpr std::chrono::seconds tablet_load_stats_refresh_interval = std::chrono::seconds(60);

    std::chrono::milliseconds _ring_delay;

    gate::holder _group0_holder;

    using drop_guard_and_retake = bool_class<class retake_guard_tag>;

    // Engaged if an ongoing topology change should be rolled back. The string inside
    // will indicate a reason for the rollback.
    std::optional<sstring> _rollback;

    const locator::token_metadata& get_token_metadata() const noexcept {
        return *_shared_tm.get();
    }

    locator::token_metadata_ptr get_token_metadata_ptr() const noexcept {
        return _shared_tm.get();
    }

    // This is a topology snapshot for a given node. It contains pointers into the topology state machine
    // that may be outdated after guard is released so the structure is meant to be destroyed together
    // with the guard
    struct node_to_work_on {
        group0_guard guard;
        const topology_state_machine::topology_type* topology;
        raft::server_id id;
        const replica_state* rs;
        std::optional<topology_request> request;
        std::optional<request_param> req_param;
    };

    // The topology coordinator takes guard before operation start, but it releases it during various
    // RPC commands that it sends to make it possible to submit new requests to the state machine while
    // the coordinator drives current topology change. It is safe to do so since only the coordinator is
    // ever allowed to change node's state, others may only create requests. To make sure the coordinator did
    // not change while the lock was released, and hence the old coordinator does not work on old state, we check
    // that the raft term is still the same after the lock is re-acquired. Throw term_changed_error if it did.

    struct term_changed_error {};

    future<group0_guard> cleanup_group0_config_if_needed(group0_guard guard) {
        auto& topo = _topo_sm._topology;
        auto rconf = _group0.group0_server().get_configuration();
        if (!rconf.is_joint()) {
            // Find nodes that 'left' but still in the config and remove them
            auto to_remove = std::ranges::to<std::vector<raft::server_id>>(
                    rconf.current
                    | std::views::transform([&] (const raft::config_member& m) { return m.addr.id; })
                    | std::views::filter([&] (const raft::server_id& id) { return topo.left_nodes.contains(id); }));
            if (!to_remove.empty()) {
                // Remove from group 0 nodes that left. They may failed to do so by themselves
                release_guard(std::move(guard));
                try {
                    rtlogger.debug("topology coordinator fiber removing {}"
                                  " from raft since they are in `left` state", to_remove);
                    co_await _group0.group0_server().modify_config({}, to_remove, &_as);
                } catch (const raft::commit_status_unknown&) {
                    rtlogger.warn("topology coordinator fiber got commit_status_unknown status"
                                  " while removing {} from raft", to_remove);
                }
                guard = co_await start_operation();
            }
        }
        co_return std::move(guard);
    }

    struct cancel_requests {
        group0_guard guard;
        std::unordered_set<raft::server_id> dead_nodes;
    };

    struct start_cleanup {
        group0_guard guard;
    };

    // Return dead nodes
    std::unordered_set<locator::host_id> get_dead_nodes() const {
        std::unordered_set<locator::host_id> dead_set;
        for (auto& n : _topo_sm._topology.normal_nodes) {
            bool alive = false;
            try {
                alive = _gossiper.is_alive(locator::host_id(n.first.uuid()));
            } catch (...) {}

            if (!alive) {
                dead_set.insert(locator::host_id(n.first.uuid()));
            }
        }
        return dead_set;
    }

    // Return dead nodes and while at it checking if there are live nodes that either need cleanup
    // or running one already
    std::unordered_set<raft::server_id> get_dead_node(bool& cleanup_running, bool& cleanup_needed) const {
        std::unordered_set<raft::server_id> dead_set;
        cleanup_needed = cleanup_running = false;
        for (auto& n : _topo_sm._topology.normal_nodes) {
            bool alive = false;
            try {
                alive = _gossiper.is_alive(locator::host_id(n.first.uuid()));
            } catch (...) {}

            if (!alive) {
                dead_set.insert(n.first);
            } else {
                cleanup_running |= (n.second.cleanup == cleanup_status::running);
                cleanup_needed |= (n.second.cleanup == cleanup_status::needed);
            }
        }
        return dead_set;
    }

    std::optional<request_param> get_request_param(raft::server_id id) const {
        return _topo_sm._topology.get_request_param(id);
    };

    // Returns:
    // guard - there is nothing to do.
    // cancel_requests - no request can be started so cancel the queue
    // start_cleanup - cleanup needs to be started
    // node_to_work_on - the node the topology coordinator should work on
    std::variant<group0_guard, cancel_requests, start_cleanup, node_to_work_on> get_next_task(group0_guard guard) {
        auto& topo = _topo_sm._topology;

        if (topo.transition_nodes.size() != 0) {
            // If there is a node that is the middle of topology operation continue with it
            return get_node_to_work_on(std::move(guard));
        }

        bool cleanup_running;
        bool cleanup_needed;
        const auto dead_nodes = get_dead_node(cleanup_running, cleanup_needed);

        if (cleanup_running || topo.requests.empty()) {
            // Ether there is no requests or there is a live node that runs cleanup. Wait for it to complete.
            return std::move(guard);
        }

        std::optional<std::pair<raft::server_id, topology_request>> next_req;

        for (auto& req : topo.requests) {
            auto enough_live_nodes = [&] {
                if (req.second == topology_request::rebuild) {
                    // For rebuild only the node itself should be alive to start it
                    // it may still fail if down node has data for the rebuild process
                    return !dead_nodes.contains(req.first);
                }
                auto exclude_nodes = get_excluded_nodes(topo, req.first, req.second);
                for (auto id : dead_nodes) {
                    if (!exclude_nodes.contains(id)) {
                        return false;
                    }
                }
                return true;
            };
            if (enough_live_nodes()) {
                if (!next_req || next_req->second > req.second) {
                    next_req = req;
                }
            }
        }

        if (!next_req) {
            // We did not find a request that has enough live node to proceed
            // Cancel all requests to let admin know that no operation can succeed
            rtlogger.warn("topology coordinator: cancel request queue because no request can proceed. Dead nodes: {}", dead_nodes);
            return cancel_requests{std::move(guard), std::move(dead_nodes)};
        }

        auto [id, req] = *next_req;

        if (cleanup_needed && (req == topology_request::remove || req == topology_request::leave)) {
            // If the highest prio request is removenode or decommission we need to start cleanup if one is needed
            return start_cleanup(std::move(guard));
        }

        return node_to_work_on(std::move(guard), &topo, id, &topo.find(id)->second, req, get_request_param(id));
    };

    node_to_work_on get_node_to_work_on(group0_guard guard) const {
        auto& topo = _topo_sm._topology;

        if (topo.transition_nodes.empty()) {
            on_internal_error(rtlogger, ::format(
                "could not find node to work on"
                " even though the state requires it (state: {})", topo.tstate));
        }

        auto e = &*topo.transition_nodes.begin();
        return node_to_work_on{std::move(guard), &topo, e->first, &e->second, std::nullopt, get_request_param(e->first)};
     };

    future<group0_guard> start_operation() {
        rtlogger.debug("obtaining group 0 guard...");
        auto guard = co_await _group0.client().start_operation(_as);
        rtlogger.debug("guard taken, prev_state_id: {}, new_state_id: {}, coordinator term: {}, current Raft term: {}",
                       guard.observed_group0_state_id(), guard.new_group0_state_id(), _term, _raft.get_current_term());

        if (_term != _raft.get_current_term()) {
            throw term_changed_error{};
        }

        co_return std::move(guard);
    }

    void release_node(std::optional<node_to_work_on> node) {
        // Leaving the scope destroys the object and releases the guard.
    }

    node_to_work_on retake_node(group0_guard guard, raft::server_id id) const {
        auto& topo = _topo_sm._topology;

        auto it = topo.find(id);
        SCYLLA_ASSERT(it);

        std::optional<topology_request> req;
        auto rit = topo.requests.find(id);
        if (rit != topo.requests.end()) {
            req = rit->second;
        }
        std::optional<request_param> req_param;
        auto pit = topo.req_param.find(id);
        if (pit != topo.req_param.end()) {
            req_param = pit->second;
        }
        return node_to_work_on{std::move(guard), &topo, id, &it->second, std::move(req), std::move(req_param)};
    }

    group0_guard take_guard(node_to_work_on&& node) {
        return std::move(node.guard);
    }

    future<> update_topology_state(
            group0_guard guard, std::vector<canonical_mutation>&& updates, const sstring& reason) {
        try {
            rtlogger.info("updating topology state: {}", reason);
            rtlogger.trace("update_topology_state mutations: {}", updates);
            topology_change change{std::move(updates)};
            group0_command g0_cmd = _group0.client().prepare_command(std::move(change), guard, reason);
            co_await _group0.client().add_entry(std::move(g0_cmd), std::move(guard), _as);
        } catch (group0_concurrent_modification&) {
            rtlogger.info("race while changing state: {}. Retrying", reason);
            throw;
        }
    };

    raft::server_id parse_replaced_node(const std::optional<request_param>& req_param) const {
        return service::topology::parse_replaced_node(req_param);
    }

    future<> exec_direct_command_helper(raft::server_id id, uint64_t cmd_index, const raft_topology_cmd& cmd) {
        rtlogger.debug("send {} command with term {} and index {} to {}",
            cmd.cmd, _term, cmd_index, id);
        auto result = _db.get_token_metadata().get_topology().is_me(to_host_id(id)) ?
                    co_await _raft_topology_cmd_handler(_term, cmd_index, cmd) :
                    co_await ser::storage_service_rpc_verbs::send_raft_topology_cmd(
                            &_messaging, to_host_id(id), id, _term, cmd_index, cmd);
        if (result.status == raft_topology_cmd_result::command_status::fail) {
            co_await coroutine::exception(std::make_exception_ptr(
                    std::runtime_error(::format("failed status returned from {}", id))));
        }
    };

    future<node_to_work_on> exec_direct_command(node_to_work_on&& node, const raft_topology_cmd& cmd) {
        auto id = node.id;
        release_node(std::move(node));
        const auto cmd_index = ++_last_cmd_index;
        co_await exec_direct_command_helper(id, cmd_index, cmd);
        co_return retake_node(co_await start_operation(), id);
    };

    future<> exec_global_command_helper(auto nodes, const raft_topology_cmd& cmd) {
        const auto cmd_index = ++_last_cmd_index;
        auto f = co_await coroutine::as_future(
                seastar::parallel_for_each(std::move(nodes), [this, &cmd, cmd_index] (raft::server_id id) {
            return exec_direct_command_helper(id, cmd_index, cmd);
        }));

        if (f.failed()) {
            co_await coroutine::return_exception(std::runtime_error(
                ::format("raft topology: exec_global_command({}) failed with {}",
                    cmd.cmd, f.get_exception())));
        }
    };

    future<group0_guard> exec_global_command(
            group0_guard guard, const raft_topology_cmd& cmd,
            const std::unordered_set<raft::server_id>& exclude_nodes,
            drop_guard_and_retake drop_and_retake = drop_guard_and_retake::yes) {
        rtlogger.info("executing global topology command {}, excluded nodes: {}", cmd.cmd, exclude_nodes);
        auto nodes = boost::range::join(_topo_sm._topology.normal_nodes, _topo_sm._topology.transition_nodes)
            | std::views::filter([&cmd, &exclude_nodes] (const std::pair<const raft::server_id, replica_state>& n) {
                // We must send barrier and barrier_and_drain to the decommissioning node
                // as it might be accepting or coordinating requests.
                bool include_decommissioning_node = n.second.state == node_state::decommissioning
                        && (cmd.cmd == raft_topology_cmd::command::barrier || cmd.cmd == raft_topology_cmd::command::barrier_and_drain);
                return !exclude_nodes.contains(n.first) && (n.second.state == node_state::normal || include_decommissioning_node);
            })
            | std::views::keys;
        if (drop_and_retake) {
            release_guard(std::move(guard));
        }
        co_await exec_global_command_helper(std::move(nodes), cmd);
        if (drop_and_retake) {
            guard = co_await start_operation();
        }
        co_return guard;
    }

    std::unordered_set<raft::server_id> get_excluded_nodes(const topology_state_machine::topology_type& topo,
                raft::server_id id, const std::optional<topology_request>& req) const {
        return topo.get_excluded_nodes(id, req);
    }

    std::unordered_set<raft::server_id> get_excluded_nodes(const node_to_work_on& node) const {
        return node.topology->get_excluded_nodes(node.id, node.request);
    }

    future<node_to_work_on> exec_global_command(node_to_work_on&& node, const raft_topology_cmd& cmd) {
        auto guard = co_await exec_global_command(std::move(node.guard), cmd, get_excluded_nodes(node), drop_guard_and_retake::yes);
        co_return retake_node(std::move(guard), node.id);
    };

    future<group0_guard> remove_from_group0(group0_guard guard, const raft::server_id& id) {
        rtlogger.info("removing node {} from group 0 configuration...", id);
        release_guard(std::move(guard));
        co_await _group0.remove_from_raft_config(id);
        rtlogger.info("node {} removed from group 0 configuration", id);
        co_return co_await start_operation();
    }

    future<> step_down_as_nonvoter() {
        // Become a nonvoter which triggers a leader stepdown.
        co_await _group0.become_nonvoter(_as);
        if (_raft.is_leader()) {
            co_await _raft.wait_for_state_change(&_as);
        }

        // throw term_changed_error so we leave the coordinator loop instead of trying another
        // read_barrier which may fail with an (harmless, but unnecessary and annoying) error
        // telling us we're not in the configuration anymore (we'll get removed by the new
        // coordinator)
        throw term_changed_error{};
    }

    struct bootstrapping_info {
        const std::unordered_set<token>& bootstrap_tokens;
        const replica_state& rs;
    };

    // Returns data for a new CDC generation in the form of mutations for the CDC_GENERATIONS_V3 table
    // and the generation's UUID.
    //
    // If there's a bootstrapping node, its tokens should be included in the new generation.
    // Pass them and a reference to the bootstrapping node's replica_state through `binfo`.
    future<std::pair<utils::UUID, utils::chunked_vector<mutation>>> prepare_new_cdc_generation_data(
            locator::token_metadata_ptr tmptr, const group0_guard& guard, std::optional<bootstrapping_info> binfo,
            noncopyable_function<std::pair<size_t, uint8_t>(locator::host_id)> get_sharding_info_for_host_id) {
        auto get_sharding_info = [&] (dht::token end) -> std::pair<size_t, uint8_t> {
            if (binfo && binfo->bootstrap_tokens.contains(end)) {
                return {binfo->rs.shard_count, binfo->rs.ignore_msb};
            } else {
                auto ep = tmptr->get_endpoint(end);
                if (!ep) {
                    // get_sharding_info is only called for bootstrap tokens
                    // or for tokens present in token_metadata
                    on_internal_error(rtlogger, ::format(
                        "make_new_cdc_generation_data: get_sharding_info:"
                        " can't find endpoint for token {}", end));
                }

                try {
                    return get_sharding_info_for_host_id(*ep);
                } catch (...) {
                    on_internal_error(rtlogger, ::format(
                        "make_new_cdc_generation_data: get_sharding_info:"
                        " can't get sharding info for node {}, owner of token {}."
                        " Reason: {}", *ep, end, std::current_exception()));
                }
            }
        };

        auto gen_uuid = guard.new_group0_state_id();
        auto gen_desc = cdc::make_new_generation_description(
            binfo ? binfo->bootstrap_tokens : std::unordered_set<token>{}, get_sharding_info, tmptr);
        auto gen_table_schema = _db.find_schema(
            db::system_keyspace::NAME, db::system_keyspace::CDC_GENERATIONS_V3);

        const size_t max_command_size = _raft.max_command_size();
        const size_t mutation_size_threshold = max_command_size / 2;
        auto gen_mutations = co_await cdc::get_cdc_generation_mutations_v3(
            gen_table_schema, gen_uuid, gen_desc, mutation_size_threshold, guard.write_timestamp());

        co_return std::pair{gen_uuid, std::move(gen_mutations)};
    }

    // Broadcasts all mutations returned from `prepare_new_cdc_generation_data` except the last one.
    // Each mutation is sent in separate raft command. It takes `group0_guard`, and if the number of mutations
    // is greater than one, the guard is dropped, and a new one is created and returned, otherwise the old one
    // will be returned. Commands are sent in parallel and unguarded (the guard used for sending the last mutation
    // will guarantee that the term hasn't been changed). Returns the generation's UUID, guard and last mutation,
    // which will be sent with additional topology data by the caller.
    //
    // If we send the last mutation in the `write_mutation` command, we would use a total of `n + 1` commands
    // instead of `n-1 + 1` (where `n` is the number of mutations), so it's better to send it in `topology_change`
    // (we need to send it after all `write_mutations`) with some small metadata.
    //
    // With the default commitlog segment size, `mutation_size_threshold` will be 4 MB. In large clusters e.g.
    // 100 nodes, 64 shards per node, 256 vnodes cdc generation data can reach the size of 30 MB, thus
    // there will be no more than 8 commands.
    //
    // In a multi-DC cluster with 100ms latencies between DCs, this operation should take about 200ms since we
    // send the commands concurrently, but even if the commands were replicated sequentially by Raft,
    // it should take no more than 1.6s which is incomparably smaller than bootstrapping operation
    // (bootstrapping is quick if there is no data in the cluster, but usually if one has 100 nodes they
    // have tons of data, so indeed streaming/repair will take much longer (hours/days)).
    future<std::tuple<utils::UUID, group0_guard, canonical_mutation>> prepare_and_broadcast_cdc_generation_data(
            locator::token_metadata_ptr tmptr, group0_guard guard, std::optional<bootstrapping_info> binfo) {

        auto get_sharding_info_for_host_id = [&] (locator::host_id ep) -> std::pair<size_t, uint8_t> {
            auto ptr = _topo_sm._topology.find(raft::server_id{ep.uuid()});
            if (!ptr) {
                throw std::runtime_error(::format(
                        "raft topology: prepare_and_broadcast_cdc_generation_data: get_sharding_info_for_host_id:"
                        " couldn't find node {} in topology", ep));
            }

            auto& rs = ptr->second;
            return {rs.shard_count, rs.ignore_msb};
        };

        co_return co_await prepare_and_broadcast_cdc_generation_data(tmptr, std::move(guard), binfo, get_sharding_info_for_host_id);
    }

    future<std::tuple<utils::UUID, group0_guard, canonical_mutation>> prepare_and_broadcast_cdc_generation_data(
            locator::token_metadata_ptr tmptr, group0_guard guard, std::optional<bootstrapping_info> binfo,
            noncopyable_function<std::pair<size_t, uint8_t>(locator::host_id)> get_sharding_info_for_host_id) {

        auto [gen_uuid, gen_mutations] = co_await prepare_new_cdc_generation_data(tmptr, guard, binfo, std::move(get_sharding_info_for_host_id));

        if (gen_mutations.empty()) {
            on_internal_error(rtlogger, "cdc_generation_data: gen_mutations is empty");
        }

        std::vector<canonical_mutation> updates{gen_mutations.begin(), gen_mutations.end()};

        if (updates.size() > 1) {
            release_guard(std::move(guard));

            co_await parallel_for_each(updates.begin(), std::prev(updates.end()), [this, gen_uuid = gen_uuid] (canonical_mutation& m) {
                auto const reason = format(
                    "insert CDC generation data (UUID: {}), part", gen_uuid);

                rtlogger.trace("do update {} reason {}", m, reason);
                write_mutations change{{std::move(m)}};
                group0_command g0_cmd = _group0.client().prepare_command(std::move(change), reason);
                return _group0.client().add_entry_unguarded(std::move(g0_cmd), &_as);
            });

            guard = co_await start_operation();
        }

        co_return std::tuple{gen_uuid, std::move(guard), std::move(updates.back())};
    }

    // Deletes obsolete CDC generations. These are the published generations that stopped operating
    // sufficiently long ago (see comment below).
    //
    // Appends necessary mutations to `updates` and updates the `reason` string.
    future<> clean_obsolete_cdc_generations(
            const group0_guard& guard,
            std::vector<canonical_mutation>& updates,
            sstring& reason) {
        const auto& committed_gens = _topo_sm._topology.committed_cdc_generations;
        if (committed_gens.empty()) {
            co_return;
        }

        // If some node can still accept a write to a CDC generation, we cannot delete this generation.
        // A request coordinator accepts a write to one of the previous generations (the ones that stopped
        // operating before `now`) if the write's timestamp is higher than `now - 5s` where `now` is
        // provided by the coordinator's local clock. So, we can safely delete a generation if it
        // stopped operating more than 5s ago on all nodes. But since their clocks can be desynchronized,
        // we have to account for that. We assume that the max clock difference is within `ring_delay`.
        // So we delete only generations that stopped operating more than `5s + ring_delay` ago.
        auto ts_upper_bound = db_clock::now() - (cdc::get_generation_leeway() + _ring_delay);
        utils::get_local_injector().inject("clean_obsolete_cdc_generations_change_ts_ub", [&] {
            ts_upper_bound = db_clock::now();
        });

        // Theoretically, some generations might not be published within `5s + ring_delay` after creation.
        // If that happens, we reduce `ts_upper_bound` to ensure they aren't deleted.
        if (!_topo_sm._topology.unpublished_cdc_generations.empty()) {
            auto first_unpublished_ts = _topo_sm._topology.unpublished_cdc_generations.front().ts;
            if (first_unpublished_ts < ts_upper_bound) {
                ts_upper_bound = first_unpublished_ts;
            }
        }

        std::optional<std::vector<cdc::generation_id_v2>::const_iterator> first_nonobsolete_gen_it;
        for (auto it = committed_gens.begin(); it != committed_gens.end() && it->ts <= ts_upper_bound; it++) {
            if (it + 1 == committed_gens.end() || (it + 1)->ts > ts_upper_bound) {
                first_nonobsolete_gen_it = it;
            }
        }
        if (!first_nonobsolete_gen_it || *first_nonobsolete_gen_it == committed_gens.begin()) {
            co_return;
        }

        auto mut_ts = guard.write_timestamp();

        // Insert a tombstone covering all obsolete generations.
        auto s = _db.find_schema(db::system_keyspace::NAME, db::system_keyspace::CDC_GENERATIONS_V3);
        auto id_upper_bound = (*first_nonobsolete_gen_it)->id;
        mutation m(s, partition_key::from_singular(*s, cdc::CDC_GENERATIONS_V3_KEY));
        auto range = query::clustering_range::make_ending_with({
                clustering_key_prefix::from_single_value(*s, timeuuid_type->decompose(id_upper_bound)), false});
        auto bv = bound_view::from_range(range);
        m.partition().apply_delete(*s, range_tombstone{bv.first, bv.second, tombstone{mut_ts, gc_clock::now()}});
        updates.push_back(canonical_mutation(m));

        std::vector<cdc::generation_id_v2> new_committed_gens(*first_nonobsolete_gen_it, committed_gens.end());
        topology_mutation_builder builder(guard.write_timestamp());
        builder.set_committed_cdc_generations(std::move(new_committed_gens));
        updates.push_back(builder.build());

        reason += ::format("deleted data of CDC generations with time UUID lower than {}", id_upper_bound);
    }

    // If there are some unpublished CDC generations, publishes the one with the oldest timestamp
    // to user-facing description tables.
    //
    // Appends necessary mutations to `updates` and updates the `reason` string.
    future<> publish_oldest_cdc_generation(
            const group0_guard& guard,
            std::vector<canonical_mutation>& updates,
            sstring& reason) {
        const auto& unpublished_gens = _topo_sm._topology.unpublished_cdc_generations;
        if (unpublished_gens.empty()) {
            co_return;
        }

        // The generation under index 0 is the oldest because unpublished_cdc_generations are sorted by timestamp.
        auto gen_id = unpublished_gens[0];

        auto gen_data = co_await _sys_ks.read_cdc_generation(gen_id.id);

        co_await _sys_dist_ks.local().create_cdc_desc(
                gen_id.ts, gen_data, { get_token_metadata().count_normal_token_owners() });

        std::vector<cdc::generation_id_v2> new_unpublished_gens(unpublished_gens.begin() + 1, unpublished_gens.end());
        topology_mutation_builder builder(guard.write_timestamp());
        builder.set_unpublished_cdc_generations(std::move(new_unpublished_gens));
        updates.push_back(builder.build());

        reason += ::format("published CDC generation with ID {}, ", gen_id);
    }

    // The background fiber of the topology coordinator that continually publishes committed yet unpublished
    // CDC generations. Every generation is published in a separate group 0 operation.
    //
    // It also continually cleans the obsolete CDC generation data.
    future<> cdc_generation_publisher_fiber() {
        rtlogger.debug("start CDC generation publisher fiber");

        while (!_as.abort_requested()) {
            co_await utils::get_local_injector().inject("cdc_generation_publisher_fiber", [] (auto& handler) -> future<> {
                rtlogger.info("CDC generation publisher fiber sleeps after injection");
                co_await handler.wait_for_message(std::chrono::steady_clock::now() + std::chrono::minutes{5});
                rtlogger.info("CDC generation publisher fiber finishes sleeping after injection");
            }, false);

            bool sleep = false;
            try {
                auto guard = co_await start_operation();
                std::vector<canonical_mutation> updates;
                sstring reason;

                co_await publish_oldest_cdc_generation(guard, updates, reason);

                co_await clean_obsolete_cdc_generations(guard, updates, reason);

                if (!updates.empty()) {
                    co_await update_topology_state(std::move(guard), std::move(updates), std::move(reason));
                } else {
                    release_guard(std::move(guard));
                }

                if (_topo_sm._topology.unpublished_cdc_generations.empty()) {
                    // No CDC generations to publish. Wait until one appears or the topology coordinator aborts.
                    rtlogger.debug("CDC generation publisher fiber has nothing to do. Sleeping.");
                    co_await _topo_sm.event.when([&] () {
                        return !_topo_sm._topology.unpublished_cdc_generations.empty() || _as.abort_requested();
                    });
                    rtlogger.debug("CDC generation publisher fiber wakes up");
                }
            } catch (raft::request_aborted&) {
                rtlogger.debug("CDC generation publisher fiber aborted");
            } catch (seastar::abort_requested_exception) {
                rtlogger.debug("CDC generation publisher fiber aborted");
            } catch (group0_concurrent_modification&) {
            } catch (term_changed_error&) {
                rtlogger.debug("CDC generation publisher fiber notices term change {} -> {}", _term, _raft.get_current_term());
            } catch (...) {
                rtlogger.error("CDC generation publisher fiber got error {}", std::current_exception());
                sleep = true;
            }
            if (sleep) {
                try {
                    co_await seastar::sleep_abortable(std::chrono::seconds(1), _as);
                } catch (...) {
                    rtlogger.debug("CDC generation publisher: sleep failed: {}", std::current_exception());
                }
            }
            co_await coroutine::maybe_yield();
        }
    }

    // If a node crashes after initiating gossip and before joining group0, it becomes orphan and
    // remains in gossip. This background fiber periodically checks for such nodes and purges those
    // entries from gossiper by committing the node as left in raft.
    future<> gossiper_orphan_remover_fiber() {
        rtlogger.debug("start gossiper orphan remover fiber");
        bool do_speedup_fiber = false;
        co_await utils::get_local_injector().inject("fast_orphan_removal_fiber", [&](auto& handler) -> future<> {
            do_speedup_fiber = true;
            // While testing, orphan ip is introduced, and its presence is captured. Wait is added before remover thread so that the presence of orphan ip is
            // confirmed and the difference of gossiper ips before and after this thread application can be easily asserted.
            return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(5));
        });
        while (!_as.abort_requested()) {
            try {
                auto guard = co_await start_operation();
                std::vector<canonical_mutation> updates;
                int32_t timeout = 60;
                co_await utils::get_local_injector().inject("speedup_orphan_removal", [&](auto& handler) -> future<> {
                    // Removes all unjoined nodes. Just for testing purposes.
                    timeout = 0;
                    co_return;
                });
                std::string reason = ::format("Ban the orphan nodes in group0. Orphan nodes HostId/IP:");
                _gossiper.for_each_endpoint_state([&](const gms::inet_address& addr, const gms::endpoint_state& eps) -> void {
                    // Since generation is in seconds unit, converting current time to seconds eases comparison computations.
                    auto current_timestamp =
                            std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now().time_since_epoch()).count();
                    auto generation = eps.get_heart_beat_state().get_generation().value();
                    auto host_id = eps.get_host_id();
                    if (current_timestamp - generation > timeout && !_topo_sm._topology.contains(raft::server_id{host_id.id}) && !_gossiper.is_alive(addr)) {
                        topology_mutation_builder builder(guard.write_timestamp());
                        // This topology mutation moves a node to left state and bans it. Hence, the value of below fields are not useful.
                        // The dummy_value used for few fields indicates the trivialness of this row entry, and is used to detect this special case.
                        static constexpr uint32_t dummy_value = 0;
                        builder.with_node(raft::server_id{host_id.id})
                                .set("datacenter", get_application_state_gently(eps.get_application_state_map(), gms::application_state::DC))
                                .set("rack", get_application_state_gently(eps.get_application_state_map(), gms::application_state::RACK))
                                .set("release_version", get_application_state_gently(eps.get_application_state_map(), gms::application_state::RELEASE_VERSION))
                                .set("num_tokens", dummy_value)
                                .set("tokens_string", sstring{})
                                .set("shard_count", dummy_value)
                                .set("ignore_msb", dummy_value)
                                .set("request_id", dummy_value)
                                .set("cleanup_status", cleanup_status::clean)
                                .set("node_state", node_state::left);
                        reason.append(::format(" {}/{},", host_id, addr));
                        updates.push_back({builder.build()});
                    }
                });
                if (!updates.empty()) {
                    co_await update_topology_state(std::move(guard), std::move(updates), reason);
                }

            } catch (raft::request_aborted&) {
                rtlogger.debug("gossiper orphan remover fiber aborted");
            } catch (seastar::abort_requested_exception) {
                rtlogger.debug("gossiper orphan remover fiber aborted");
            } catch (group0_concurrent_modification&) {
            } catch (term_changed_error&) {
                rtlogger.debug("gossiper orphan remover fiber notices term change {} -> {}", _term, _raft.get_current_term());
            } catch (...) {
                rtlogger.error("gossiper orphan remover fiber got error {}", std::current_exception());
            }
            try {
                co_await seastar::sleep_abortable(do_speedup_fiber ? std::chrono::milliseconds(1) : std::chrono::seconds(10), _as);
            } catch (...) {
                rtlogger.debug("gossiper orphan remover: sleep failed: {}", std::current_exception());
            }
            co_await coroutine::maybe_yield();
        }
    }

    // Precondition: there is no node request and no ongoing topology transition
    // (checked under the guard we're holding).
    future<> handle_global_request(group0_guard guard) {
        switch (_topo_sm._topology.global_request.value()) {
        case global_topology_request::new_cdc_generation: {
            rtlogger.info("new CDC generation requested");

            auto tmptr = get_token_metadata_ptr();
            auto [gen_uuid, guard_, mutation] = co_await prepare_and_broadcast_cdc_generation_data(tmptr, std::move(guard), std::nullopt);
            guard = std::move(guard_);

            topology_mutation_builder builder(guard.write_timestamp());
            // We don't delete the request now, but only after the generation is committed. If we deleted
            // the request now and received another new_cdc_generation request later, but before committing
            // the new generation, the second request would also create a new generation. Deleting requests
            // after the generation is committed prevents this from happening. The second request would have
            // no effect - it would just overwrite the first request.
            builder.set_transition_state(topology::transition_state::commit_cdc_generation)
                   .set_new_cdc_generation_data_uuid(gen_uuid);
            auto reason = ::format(
                "insert CDC generation data (UUID: {})", gen_uuid);
            co_await update_topology_state(std::move(guard), {std::move(mutation), builder.build()}, reason);
        }
        break;
        case global_topology_request::cleanup:
            co_await start_cleanup_on_dirty_nodes(std::move(guard), true);
            break;
        case global_topology_request::keyspace_rf_change: {
            rtlogger.info("keyspace_rf_change requested");
            sstring ks_name = *_topo_sm._topology.new_keyspace_rf_change_ks_name;
            std::unordered_map<sstring, sstring> saved_ks_props = *_topo_sm._topology.new_keyspace_rf_change_data;
            cql3::statements::ks_prop_defs new_ks_props{std::map<sstring, sstring>{saved_ks_props.begin(), saved_ks_props.end()}};

            auto repl_opts = new_ks_props.get_replication_options();
            repl_opts.erase(cql3::statements::ks_prop_defs::REPLICATION_STRATEGY_CLASS_KEY);
            utils::UUID req_uuid = *_topo_sm._topology.global_request_id;
            std::vector<canonical_mutation> updates;
            sstring error;
            if (_db.has_keyspace(ks_name)) {
                auto& ks = _db.find_keyspace(ks_name);
                auto tmptr = get_token_metadata_ptr();
                size_t unimportant_init_tablet_count = 2; // must be a power of 2
                locator::tablet_map new_tablet_map{unimportant_init_tablet_count};

                auto tables_with_mvs = ks.metadata()->tables();
                auto views = ks.metadata()->views();
                tables_with_mvs.insert(tables_with_mvs.end(), views.begin(), views.end());
                for (const auto& table_or_mv : tables_with_mvs) {
                    try {
                        locator::tablet_map old_tablets = tmptr->tablets().get_tablet_map(table_or_mv->id());
                        locator::replication_strategy_params params{repl_opts, old_tablets.tablet_count()};
                        auto new_strategy = locator::abstract_replication_strategy::create_replication_strategy("NetworkTopologyStrategy", params);
                        new_tablet_map = co_await new_strategy->maybe_as_tablet_aware()->reallocate_tablets(table_or_mv, tmptr, old_tablets);
                    } catch (const std::exception& e) {
                        error = e.what();
                        rtlogger.error("Couldn't process global_topology_request::keyspace_rf_change, error: {},"
                                       "desired new ks opts: {}", error, new_ks_props.get_replication_options());
                        updates.clear(); // remove all tablets mutations ...
                        break;           // ... and only create mutations deleting the global req
                    }

                    replica::tablet_mutation_builder tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id());
                    co_await new_tablet_map.for_each_tablet([&](locator::tablet_id tablet_id, const locator::tablet_info& tablet_info) -> future<> {
                        auto last_token = new_tablet_map.get_last_token(tablet_id);
                        updates.emplace_back(co_await make_canonical_mutation_gently(
                                replica::tablet_mutation_builder(guard.write_timestamp(), table_or_mv->id())
                                        .set_new_replicas(last_token, tablet_info.replicas)
                                        .set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old)
                                        .set_transition(last_token, locator::tablet_transition_kind::rebuild)
                                        .build()
                        ));
                        co_await coroutine::maybe_yield();
                    });
                }
            } else {
                error = "Can't ALTER keyspace " + ks_name + ", keyspace doesn't exist";
            }

            updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp())
                                                         .set_transition_state(topology::transition_state::tablet_migration)
                                                         .set_version(_topo_sm._topology.version + 1)
                                                         .del_global_topology_request()
                                                         .del_global_topology_request_id()
                                                         .build()));
            updates.push_back(canonical_mutation(topology_request_tracking_mutation_builder(req_uuid)
                                                         .done(error)
                                                         .build()));
            if (error.empty()) {
                const sstring strategy_name = "NetworkTopologyStrategy";
                auto ks_md = keyspace_metadata::new_keyspace(ks_name, strategy_name, repl_opts,
                                                             new_ks_props.get_initial_tablets(std::nullopt),
                                                             new_ks_props.get_durable_writes(), new_ks_props.get_storage_options());
                auto schema_muts = prepare_keyspace_update_announcement(_db, ks_md, guard.write_timestamp());
                for (auto& m: schema_muts) {
                    updates.emplace_back(m);
                }
            }

            sstring reason = seastar::format("ALTER tablets KEYSPACE called with options: {}", saved_ks_props);
            rtlogger.trace("do update {} reason {}", updates, reason);
            mixed_change change{std::move(updates)};
            group0_command g0_cmd = _group0.client().prepare_command(std::move(change), guard, reason);
            co_await utils::get_local_injector().inject("wait-before-committing-rf-change-event", utils::wait_for_message(30s));
            co_await _group0.client().add_entry(std::move(g0_cmd), std::move(guard), _as);
        }
        break;
        case global_topology_request::truncate_table: {
            rtlogger.info("TRUNCATE TABLE requested");
            std::vector<canonical_mutation> updates;
            updates.push_back(topology_mutation_builder(guard.write_timestamp())
                                .set_transition_state(topology::transition_state::truncate_table)
                                .set_session(session_id(_topo_sm._topology.global_request_id.value()))
                                .build());
            co_await update_topology_state(std::move(guard), std::move(updates), "TRUNCATE TABLE requested");
        }
        break;
        }
    }

    // Preconditions:
    // - There are no topology operations in progress
    // - `features_to_enable` represents a set of features that are currently
    //   marked as supported by all normal nodes and it is not empty
    future<> enable_features(group0_guard guard, std::set<sstring> features_to_enable) {
        if (!_topo_sm._topology.transition_nodes.empty()) {
            on_internal_error(rtlogger,
                    "topology coordinator attempted to enable features even though there is"
                    " a topology operations in progress");
        }

        if (utils::get_local_injector().enter("raft_topology_suppress_enabling_features")) {
            // Prevent enabling features while the injection is enabled.
            // The topology coordinator will detect in the next iteration
            // that there are still some cluster features to enable and will
            // reach this place again. In order not to spin in a loop, sleep
            // for a short while.
            co_await sleep(std::chrono::milliseconds(100));
            co_return;
        }

        // If we are here, then we noticed that all normal nodes support some
        // features that are not enabled yet. Perform a global barrier to make
        // sure that:
        //
        // 1. All normal nodes saw (and persisted) a view of the system.topology
        //    table that is equal to what the topology coordinator sees (or newer,
        //    but in that case updating the topology state will fail),
        // 2. None of the normal nodes is restarting at the moment and trying to
        //    downgrade (this is done by a special check in the barrier handler).
        //
        // It's sufficient to only include normal nodes because:
        //
        // - There are no transitioning nodes due to the precondition,
        // - New and left nodes are not part of group 0.
        //
        // After we get a successful confirmation from each normal node, we have
        // a guarantee that they won't attempt to revoke support for those
        // features. That's because we do not allow nodes to boot without
        // a feature that is supported by all nodes in the cluster, even if
        // the feature is not enabled yet.
        guard = co_await exec_global_command(std::move(guard),
                raft_topology_cmd{raft_topology_cmd::command::barrier},
                {_raft.id()},
                drop_guard_and_retake::no);

        topology_mutation_builder builder(guard.write_timestamp());
        builder.add_enabled_features(features_to_enable);
        auto reason = ::format("enabling features: {}", features_to_enable);
        co_await update_topology_state(std::move(guard), {builder.build()}, reason);

        rtlogger.info("enabled features: {}", features_to_enable);
    }

    future<group0_guard> global_token_metadata_barrier(group0_guard&& guard, std::unordered_set<raft::server_id> exclude_nodes = {}) {
        auto version = _topo_sm._topology.version;
        bool drain_failed = false;
        try {
            guard = co_await exec_global_command(std::move(guard), raft_topology_cmd::command::barrier_and_drain, exclude_nodes, drop_guard_and_retake::yes);
        } catch (...) {
            rtlogger.error("drain rpc failed, proceed to fence old writes: {}", std::current_exception());
            drain_failed = true;
        }
        if (drain_failed) {
            guard = co_await start_operation();
        }
        topology_mutation_builder builder(guard.write_timestamp());
        // Other nodes are guaranteed to be drained on success only up to the version which was current
        // prior to barrier_and_drain RPCs were sent.
        builder.set_fence_version(version);
        auto reason = ::format("advance fence version to {}", version);
        co_await update_topology_state(std::move(guard), {builder.build()}, reason);
        guard = co_await start_operation();
        if (drain_failed) {
            // if drain failed need to wait for fence to be active on all nodes
            co_return co_await exec_global_command(std::move(guard), raft_topology_cmd::command::barrier, exclude_nodes, drop_guard_and_retake::yes);
        } else {
            co_return std::move(guard);
        }
    }

    future<group0_guard> global_tablet_token_metadata_barrier(group0_guard guard) {
        // FIXME: Don't require all nodes to be up, only tablet replicas.
        return global_token_metadata_barrier(std::move(guard), _topo_sm._topology.get_excluded_nodes());
    }

    // Represents a two-state state machine which changes monotonically
    // from "not executed" to "executed successfully". This state
    // machine is transient, lives only on this coordinator.
    // The transition is achieved by execution of an idempotent async
    // operation which is tracked by a future. Even though the async
    // action is idempotent, it is costly, so we want to avoid
    // re-executing it if it was already started by this coordinator,
    // that's why we track it.
    using background_action_holder = std::optional<future<>>;

    // Transient state of tablet migration which lives on this coordinator.
    // It is guaranteed to die when migration is finished.
    // Next migration of the same tablet is guaranteed to use a different instance.
    struct tablet_migration_state {
        background_action_holder streaming;
        background_action_holder cleanup;
        background_action_holder repair;
        std::unordered_map<locator::tablet_transition_stage, background_action_holder> barriers;
        // Record the repair_time returned by the repair_tablet rpc call
        db_clock::time_point repair_time;
    };

    std::unordered_map<locator::global_tablet_id, tablet_migration_state> _tablets;

    // Set to true when any action started on behalf of a background_action_holder
    // for any tablet finishes, or fails and needs to be restarted.
    bool _tablets_ready = false;

    seastar::gate _async_gate;

    bool action_failed(background_action_holder& holder) const {
        return holder && holder->failed();
    }

    // This function drives background_action_holder towards "executed successfully"
    // by starting the action if it is not already running or if the previous instance
    // of the action failed. If the action is already running, it does nothing.
    // Returns true iff background_action_holder reached the "executed successfully" state.
    bool advance_in_background(locator::global_tablet_id gid, background_action_holder& holder, const char* name,
                               std::function<future<>()> action) {
        if (!holder || holder->failed()) {
            if (action_failed(holder)) {
                // Prevent warnings about abandoned failed future. Logged below.
                holder->ignore_ready_future();
            }
            holder = futurize_invoke(action).then_wrapped([this, gid, name] (future<> f) {
                if (f.failed()) {
                    auto ep = f.get_exception();
                    rtlogger.warn("{} for tablet {} failed: {}", name, gid, ep);
                    return seastar::sleep_abortable(std::chrono::seconds(1), _as).then([ep] () mutable {
                        std::rethrow_exception(ep);
                    });
                }
                return f;
            }).finally([this, g = _async_gate.hold(), gid, name] () noexcept {
                rtlogger.debug("{} for tablet {} resolved.", name, gid);
                _tablets_ready = true;
                _topo_sm.event.broadcast();
            });
            return false;
        }

        if (!holder->available()) {
            rtlogger.debug("Tablet {} still doing {}", gid, name);
            return false;
        }

        return true;
    }

    future<> for_each_tablet_transition(std::function<void(const locator::tablet_map&,
                                                           schema_ptr,
                                                           locator::global_tablet_id,
                                                           const locator::tablet_transition_info&)> func) {
        auto tm = get_token_metadata_ptr();
        for (auto&& [table, tmap] : tm->tablets().all_tables()) {
            co_await coroutine::maybe_yield();
            auto s = _db.find_schema(table);
            for (auto&& [tablet, trinfo]: tmap->transitions()) {
                co_await coroutine::maybe_yield();
                auto gid = locator::global_tablet_id {table, tablet};
                func(*tmap, s, gid, trinfo);
            }
        }
    }

    bool is_excluded(raft::server_id server_id) const {
        return _topo_sm._topology.get_excluded_nodes().contains(server_id);
    }

    void generate_migration_update(std::vector<canonical_mutation>& out, const group0_guard& guard, const tablet_migration_info& mig) {
        const auto& tmap = get_token_metadata_ptr()->tablets().get_tablet_map(mig.tablet.table);
        auto last_token = tmap.get_last_token(mig.tablet.tablet);
        if (tmap.get_tablet_transition_info(mig.tablet.tablet)) {
            rtlogger.warn("Tablet already in transition, ignoring migration: {}", mig);
            return;
        }
        auto migration_task_info = mig.kind == locator::tablet_transition_kind::migration ? locator::tablet_task_info::make_migration_request()
            : locator::tablet_task_info::make_intranode_migration_request();
        migration_task_info.sched_nr++;
        migration_task_info.sched_time = db_clock::now();
        out.emplace_back(
            replica::tablet_mutation_builder(guard.write_timestamp(), mig.tablet.table)
                .set_new_replicas(last_token, locator::get_new_replicas(tmap.get_tablet_info(mig.tablet.tablet), mig))
                .set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old)
                .set_transition(last_token, mig.kind)
                .set_migration_task_info(last_token, std::move(migration_task_info), _db.features())
                .build());
    }

    void generate_repair_update(std::vector<canonical_mutation>& out, const group0_guard& guard, const locator::global_tablet_id& gid, db_clock::time_point sched_time) {
        auto& tmap = get_token_metadata_ptr()->tablets().get_tablet_map(gid.table);
        auto last_token = tmap.get_last_token(gid.tablet);
        if (tmap.get_tablet_transition_info(gid.tablet)) {
            rtlogger.warn("Tablet already in transition, ignoring repair: {}", gid);
            return;
        }
        auto& info = tmap.get_tablet_info(gid.tablet);
        auto repair_task_info = info.repair_task_info;
        if (!repair_task_info.is_user_repair_request()) {
            repair_task_info = locator::tablet_task_info::make_auto_repair_request();
        }
        repair_task_info.sched_nr++;
        repair_task_info.sched_time = db_clock::now();
        out.emplace_back(
            replica::tablet_mutation_builder(guard.write_timestamp(), gid.table)
                .set_new_replicas(last_token, tmap.get_tablet_info(gid.tablet).replicas)
                .set_stage(last_token, locator::tablet_transition_stage::repair)
                .set_transition(last_token, locator::tablet_transition_kind::repair)
                .set_repair_task_info(last_token, repair_task_info)
                .set_session(last_token, session_id(utils::UUID_gen::get_time_UUID()))
                .build());
    }

    void generate_resize_update(std::vector<canonical_mutation>& out, const group0_guard& guard, table_id table_id, locator::resize_decision resize_decision) {
            // FIXME: indent.
            auto s = _db.find_schema(table_id);
            const auto& tmap = get_token_metadata_ptr()->tablets().get_tablet_map(table_id);
            // Sequence number is monotonically increasing, globally. Therefore, it can be used to identify a decision.
            resize_decision.sequence_number = tmap.resize_decision().next_sequence_number();
            rtlogger.debug("Generating resize decision for table {} of type {} and sequence number {}",
                           table_id, resize_decision.type_name(), resize_decision.sequence_number);
            out.emplace_back(
                replica::tablet_mutation_builder(guard.write_timestamp(), table_id)
                    .set_resize_decision(std::move(resize_decision), _db.features())
                    .build());
    }

    future<> generate_migration_updates(std::vector<canonical_mutation>& out, const group0_guard& guard, const migration_plan& plan) {
        for (const tablet_migration_info& mig : plan.migrations()) {
            co_await coroutine::maybe_yield();
            generate_migration_update(out, guard, mig);
        }

        auto sched_time = db_clock::now();
        for (const auto& gid : plan.repair_plan().repairs()) {
            co_await coroutine::maybe_yield();
            generate_repair_update(out, guard, gid, sched_time);
        }

        for (auto [table_id, resize_decision] : plan.resize_plan().resize) {
            co_await coroutine::maybe_yield();
            generate_resize_update(out, guard, table_id, resize_decision);
        }
    }

    // When "drain" is true, we migrate tablets only as long as there are nodes to drain
    // and then change the transition state to write_both_read_old. Also, while draining,
    // we ignore pending topology requests which normally interrupt load balancing.
    // When "drain" is false, we do regular load balancing.
    future<> handle_tablet_migration(group0_guard guard, bool drain) {
        // This step acts like a pump which advances state machines of individual tablets,
        // batching barriers and group0 updates.
        // If progress cannot be made, e.g. because all transitions are streaming, we block
        // and wait for notification.

        rtlogger.debug("handle_tablet_migration()");
        std::vector<canonical_mutation> updates;
        bool needs_barrier = false;
        bool has_transitions = false;

        shared_promise barrier;
        auto fail_barrier = seastar::defer([&] {
            if (needs_barrier) {
                barrier.set_exception(seastar::broken_promise());
            }
        });

        _tablets_ready = false;
        co_await for_each_tablet_transition([&] (const locator::tablet_map& tmap,
                                                 schema_ptr s,
                                                 locator::global_tablet_id gid,
                                                 const locator::tablet_transition_info& trinfo) {
            has_transitions = true;
            auto last_token = tmap.get_last_token(gid.tablet);
            auto& tablet_state = _tablets[gid];
            table_id table = s->id();

            auto get_mutation_builder = [&] () {
                return replica::tablet_mutation_builder(guard.write_timestamp(), table);
            };

            auto transition_to = [&] (locator::tablet_transition_stage stage) {
                rtlogger.debug("Will set tablet {} stage to {}", gid, stage);
                updates.emplace_back(get_mutation_builder()
                        .set_stage(last_token, stage)
                        .build());
            };

            auto do_barrier = [&] {
                return advance_in_background(gid, tablet_state.barriers[trinfo.stage], "barrier", [&] {
                    needs_barrier = true;
                    return barrier.get_shared_future();
                });
            };

            auto transition_to_with_barrier = [&] (locator::tablet_transition_stage stage) {
                if (do_barrier()) {
                    transition_to(stage);
                }
            };

            auto check_excluded_replicas = [&] {
                    auto tsi = get_migration_streaming_info(get_token_metadata().get_topology(), tmap.get_tablet_info(gid.tablet), trinfo);
                    for (auto r : tsi.read_from) {
                        if (is_excluded(raft::server_id(r.host.uuid()))) {
                            rtlogger.debug("Aborting streaming of {} because read-from {} is marked as ignored", gid, r);
                            return true;
                        }
                    }
                    for (auto r : tsi.written_to) {
                        if (is_excluded(raft::server_id(r.host.uuid()))) {
                            rtlogger.debug("Aborting streaming of {} because written-to {} is marked as ignored", gid, r);
                            return true;
                        }
                    }
                    return false;
            };

            switch (trinfo.stage) {
                case locator::tablet_transition_stage::allow_write_both_read_old:
                    if (action_failed(tablet_state.barriers[trinfo.stage])) {
                        if (check_excluded_replicas()) {
                            transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
                            break;
                        }
                    }
                    if (do_barrier()) {
                        rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::write_both_read_old);
                        updates.emplace_back(get_mutation_builder()
                            .set_stage(last_token, locator::tablet_transition_stage::write_both_read_old)
                            // Create session a bit earlier to avoid adding barrier
                            // to the streaming stage to create sessions on replicas.
                            .set_session(last_token, session_id(utils::UUID_gen::get_time_UUID()))
                            .build());
                    }
                    break;
                case locator::tablet_transition_stage::write_both_read_old:
                    if (action_failed(tablet_state.barriers[trinfo.stage])) {
                        if (check_excluded_replicas()) {
                            transition_to_with_barrier(locator::tablet_transition_stage::cleanup_target);
                            break;
                        }
                    }
                    transition_to_with_barrier(locator::tablet_transition_stage::streaming);
                    break;
                // The state "streaming" is needed to ensure that stale stream_tablet() RPC doesn't
                // get admitted before global_tablet_token_metadata_barrier() is finished for earlier
                // stage in case of coordinator failover.
                case locator::tablet_transition_stage::streaming: {
                    if (drain) {
                        utils::get_local_injector().inject("stream_tablet_fail_on_drain",
                                        [] { throw std::runtime_error("stream_tablet failed due to error injection"); });
                    }

                    if (action_failed(tablet_state.streaming)) {
                        bool cleanup = utils::get_local_injector().enter("stream_tablet_move_to_cleanup");
                        if (cleanup || check_excluded_replicas()) {
                            if (do_barrier()) {
                                rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::cleanup_target);
                                updates.emplace_back(get_mutation_builder()
                                        .set_stage(last_token, locator::tablet_transition_stage::cleanup_target)
                                        .del_session(last_token)
                                        .build());
                            }
                            break;
                        }
                    }

                    bool wait = utils::get_local_injector().enter("stream_tablet_wait");
                    if (!wait && advance_in_background(gid, tablet_state.streaming, "streaming", [&] {
                        utils::get_local_injector().inject("stream_tablet_move_to_cleanup",
                                        [] { throw std::runtime_error("stream_tablet failed due to error injection"); });

                        if (!trinfo.pending_replica) {
                            rtlogger.info("Skipped tablet streaming ({}) of {} as no pending replica found", trinfo.transition, gid);
                            return make_ready_future<>();
                        }
                        rtlogger.info("Initiating tablet streaming ({}) of {} to {}", trinfo.transition, gid, *trinfo.pending_replica);
                        auto dst = trinfo.pending_replica->host;
                        return ser::storage_service_rpc_verbs::send_tablet_stream_data(&_messaging,
                                   dst, _as, raft::server_id(dst.uuid()), gid);
                    })) {
                        rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::write_both_read_new);
                        updates.emplace_back(get_mutation_builder()
                            .set_stage(last_token, locator::tablet_transition_stage::write_both_read_new)
                            .del_session(last_token)
                            .build());
                    }
                }
                    break;
                case locator::tablet_transition_stage::write_both_read_new: {
                    utils::get_local_injector().inject("crash-in-tablet-write-both-read-new", [] {
                        rtlogger.info("crash-in-tablet-write-both-read-new hit, killing the node");
                        _exit(1);
                    });

                    auto next_stage = locator::tablet_transition_stage::use_new;
                    if (action_failed(tablet_state.barriers[trinfo.stage])) {
                        auto& tinfo = tmap.get_tablet_info(gid.tablet);
                        unsigned excluded_old = 0;
                        for (auto r : tinfo.replicas) {
                            if (is_excluded(raft::server_id(r.host.uuid()))) {
                                excluded_old++;
                            }
                        }
                        unsigned excluded_new = 0;
                        for (auto r : trinfo.next) {
                            if (is_excluded(raft::server_id(r.host.uuid()))) {
                                excluded_new++;
                            }
                        }
                        // Cannot revert if this is intra-node migration.
                        // We will lose data if the migrating node restarted, in which case the leaving replica
                        // doesn't contain writes anymore as sstables are attached only on the pending replica.
                        // Luckily, this we never reach this condition since excluded_new cannot be larger
                        // than excluded_old for intra-node migration.
                        if (excluded_new > excluded_old && trinfo.transition != locator::tablet_transition_kind::intranode_migration) {
                            rtlogger.debug("During {} stage of {} {} new nodes and {} old nodes were excluded", trinfo.stage, gid, excluded_new, excluded_old);
                            next_stage = locator::tablet_transition_stage::cleanup_target;
                        }
                    }
                    transition_to_with_barrier(next_stage);
                }
                    break;
                case locator::tablet_transition_stage::use_new:
                    transition_to_with_barrier(locator::tablet_transition_stage::cleanup);
                    break;
                case locator::tablet_transition_stage::cleanup:
                    if (advance_in_background(gid, tablet_state.cleanup, "cleanup", [&] {
                        auto maybe_dst = locator::get_leaving_replica(tmap.get_tablet_info(gid.tablet), trinfo);
                        if (!maybe_dst) {
                            rtlogger.info("Tablet cleanup of {} skipped because no replicas leaving", gid);
                            return make_ready_future<>();
                        }
                        locator::tablet_replica& dst = *maybe_dst;
                        if (is_excluded(raft::server_id(dst.host.uuid()))) {
                            rtlogger.info("Tablet cleanup of {} on {} skipped because node is excluded", gid, dst);
                            return make_ready_future<>();
                        }
                        rtlogger.info("Initiating tablet cleanup of {} on {}", gid, dst);
                        return ser::storage_service_rpc_verbs::send_tablet_cleanup(&_messaging,
                                                                                   dst.host, _as, raft::server_id(dst.host.uuid()), gid);
                    })) {
                        transition_to(locator::tablet_transition_stage::end_migration);
                    }
                    break;
                case locator::tablet_transition_stage::cleanup_target:
                    if (advance_in_background(gid, tablet_state.cleanup, "cleanup_target", [&] {
                        if (!trinfo.pending_replica) {
                            rtlogger.info("Tablet cleanup of {} skipped because no replicas pending", gid);
                            return make_ready_future<>();
                        }
                        locator::tablet_replica dst = *trinfo.pending_replica;
                        if (is_excluded(raft::server_id(dst.host.uuid()))) {
                            rtlogger.info("Tablet cleanup of {} on {} skipped because node is excluded and doesn't need to revert migration", gid, dst);
                            return make_ready_future<>();
                        }
                        rtlogger.info("Initiating tablet cleanup of {} on {} to revert migration", gid, dst);
                        return ser::storage_service_rpc_verbs::send_tablet_cleanup(&_messaging,
                                                                                   dst.host, _as, raft::server_id(dst.host.uuid()), gid);
                    })) {
                        transition_to(locator::tablet_transition_stage::revert_migration);
                    }
                    break;
                case locator::tablet_transition_stage::revert_migration:
                    // Need a separate stage and a barrier after cleanup RPC to cut off stale RPCs.
                    // See do_tablet_operation() doc.
                    if (do_barrier()) {
                        _tablets.erase(gid);
                        updates.emplace_back(get_mutation_builder()
                                .del_transition(last_token)
                                .del_migration_task_info(last_token, _db.features())
                                .build());
                    }
                    break;
                case locator::tablet_transition_stage::end_migration: {
                    // Need a separate stage and a barrier after cleanup RPC to cut off stale RPCs.
                    // See do_tablet_operation() doc.
                    bool defer_transition = utils::get_local_injector().enter("handle_tablet_migration_end_migration");
                    if (!defer_transition && do_barrier()) {
                        _tablets.erase(gid);
                        updates.emplace_back(get_mutation_builder()
                                .del_transition(last_token)
                                .set_replicas(last_token, trinfo.next)
                                .del_migration_task_info(last_token, _db.features())
                                .build());
                    }
                }
                    break;
                case locator::tablet_transition_stage::repair: {
                    if (action_failed(tablet_state.repair)) {
                        if (do_barrier()) {
                            updates.emplace_back(get_mutation_builder()
                                    .set_stage(last_token, locator::tablet_transition_stage::end_repair)
                                    .del_session(last_token)
                                    .build());
                        }
                        break;
                    }
                    if (advance_in_background(gid, tablet_state.repair, "repair", [&] () -> future<> {
                        auto& tinfo = tmap.get_tablet_info(gid.tablet);
                        bool valid = tinfo.repair_task_info.is_valid();
                        if (!valid) {
                            rtlogger.info("Skipping tablet repair for tablet={} which is cancelled by user", gid);
                            co_return;
                        }
                        auto sched_time = tinfo.repair_task_info.sched_time;
                        auto primary = tmap.get_primary_replica(gid.tablet);
                        auto dst = primary.host;
                        auto tablet = gid;
                        rtlogger.info("Initiating tablet repair host={} tablet={}", dst, gid);
                        auto res = co_await ser::storage_service_rpc_verbs::send_tablet_repair(&_messaging,
                                dst, _as, raft::server_id(dst.uuid()), gid);
                        auto duration = std::chrono::duration<float>(db_clock::now() - sched_time);
                        auto& tablet_state = _tablets[tablet];
                        tablet_state.repair_time = db_clock::from_time_t(gc_clock::to_time_t(res.repair_time));
                        rtlogger.info("Finished tablet repair host={} tablet={} duration={} repair_time={}",
                                dst, tablet, duration, res.repair_time);
                    })) {
                        auto& tinfo = tmap.get_tablet_info(gid.tablet);
                        bool valid = tinfo.repair_task_info.is_valid();
                        rtlogger.debug("Will set tablet {} stage to {}", gid, locator::tablet_transition_stage::end_repair);
                        auto update = get_mutation_builder()
                                        .set_stage(last_token, locator::tablet_transition_stage::end_repair)
                                        .del_repair_task_info(last_token)
                                        .del_session(last_token);
                        if (valid) {
                            auto sched_time = tinfo.repair_task_info.sched_time;
                            auto time = tablet_state.repair_time;
                            rtlogger.debug("Set tablet repair time sched_time={} return_time={} set_time={}",
                                    sched_time, tablet_state.repair_time, time);
                            update.set_repair_time(last_token, time);
                        }
                        updates.emplace_back(update.build());
                    }
                }
                    break;
                case locator::tablet_transition_stage::end_repair: {
                    if (do_barrier()) {
                        _tablets.erase(gid);
                        updates.emplace_back(get_mutation_builder()
                            .del_transition(last_token)
                            .build());
                    }
                }
                    break;
            }
        });

        // In order to keep the cluster saturated, ask the load balancer for more transitions.
        // Unless there is a pending topology change operation.
        bool preempt = false;
        if (!drain) {
            // When draining, this method is invoked with an active node transition, which
            // would normally cause preemption, which we don't want here.
            auto ts = guard.write_timestamp();
            auto [new_preempt, new_guard] = should_preempt_balancing(std::move(guard));
            preempt = new_preempt;
            guard = std::move(new_guard);
            if (ts != guard.write_timestamp()) {
                // We rely on the fact that should_preempt_balancing() does not release the guard
                // so that tablet metadata reading and updates are atomic.
                on_internal_error(rtlogger, "should_preempt_balancing() retook the guard");
            }
        }

        bool has_nodes_to_drain = false;
        if (!preempt) {
            auto plan = co_await _tablet_allocator.balance_tablets(get_token_metadata_ptr(), _tablet_load_stats, get_dead_nodes());
            has_nodes_to_drain = plan.has_nodes_to_drain();
            if (!drain || plan.has_nodes_to_drain()) {
                co_await generate_migration_updates(updates, guard, plan);
            }
        }

        // The updates have to be executed under the same guard which was used to read tablet metadata
        // to ensure that we don't reinsert tablet rows which were concurrently deleted by schema change
        // which happens outside the topology coordinator.
        bool has_updates = !updates.empty();
        if (has_updates) {
            co_await utils::get_local_injector().inject("tablet_transition_updates", utils::wait_for_message(2min));
            updates.emplace_back(
                topology_mutation_builder(guard.write_timestamp())
                    .set_version(_topo_sm._topology.version + 1)
                    .build());
            co_await update_topology_state(std::move(guard), std::move(updates), format("Tablet migration"));
        }

        if (needs_barrier) {
            // If has_updates is true then we have dropped the guard and need to re-obtain it.
            // It's fine to start an independent operation here. The barrier doesn't have to be executed
            // atomically with the read which set needs_barrier, because it's fine if the global barrier
            // works with a more recent set of nodes and it's fine if it propagates a more recent topology.
            if (!guard) {
                guard = co_await start_operation();
            }
            guard = co_await global_tablet_token_metadata_barrier(std::move(guard));
            barrier.set_value();
            fail_barrier.cancel();
        }

        if (has_updates) {
            co_return;
        }

        if (has_transitions) {
            // Streaming may have finished after we checked. To avoid missed notification, we need
            // to check atomically with event.wait()
            if (!_tablets_ready) {
                rtlogger.debug("Going to sleep with active tablet transitions");
                release_guard(std::move(guard));
                co_await await_event();
            }
            co_return;
        }

        if (drain) {
            if (has_nodes_to_drain) {
                // Prevent jumping to write_both_read_old with un-drained tablets.
                // This can happen when all candidate nodes are down.
                rtlogger.warn("Tablet draining stalled: No tablets migrating but there are nodes to drain");
                release_guard(std::move(guard));
                co_await sleep(3s); // Throttle retries
                co_return;
            }
            updates.emplace_back(
                topology_mutation_builder(guard.write_timestamp())
                    .set_transition_state(topology::transition_state::write_both_read_old)
                    .set_session(session_id(guard.new_group0_state_id()))
                    .set_version(_topo_sm._topology.version + 1)
                    .build());
        } else {
            updates.emplace_back(
                topology_mutation_builder(guard.write_timestamp())
                    .del_transition_state()
                    .set_version(_topo_sm._topology.version + 1)
                    .build());
        }
        co_await update_topology_state(std::move(guard), std::move(updates), "Finished tablet migration");
    }

    future<> handle_tablet_resize_finalization(group0_guard g) {
        // Executes a global barrier to guarantee that any process (e.g. repair) holding stale version
        // of token metadata will complete before we update topology.
        auto guard = co_await global_tablet_token_metadata_barrier(std::move(g));

        auto tm = get_token_metadata_ptr();
        auto plan = co_await _tablet_allocator.balance_tablets(tm, _tablet_load_stats, get_dead_nodes());

        std::vector<canonical_mutation> updates;
        updates.reserve(plan.resize_plan().finalize_resize.size() * 2 + 1);

        for (auto& table_id : plan.resize_plan().finalize_resize) {
            auto s = _db.find_schema(table_id);
            auto new_tablet_map = co_await _tablet_allocator.resize_tablets(tm, table_id);
            updates.emplace_back(co_await replica::tablet_map_to_mutation(
                new_tablet_map,
                table_id,
                s->ks_name(),
                s->cf_name(),
                guard.write_timestamp(),
                _db.features()));

            // Clears the resize decision for a table.
            generate_resize_update(updates, guard, table_id, locator::resize_decision{});
        }

        updates.emplace_back(
            topology_mutation_builder(guard.write_timestamp())
                .del_transition_state()
                .set_version(_topo_sm._topology.version + 1)
                .build());
        co_await update_topology_state(std::move(guard), std::move(updates), format("Finished tablet split finalization"));
    }

    future<> handle_truncate_table(group0_guard guard) {
        // Execute a barrier to make sure the nodes we are performing truncate on see the session
        // and are able to create a topology_guard using the frozen_guard we are sending over RPC
        // TODO: Exclude nodes which don't contain replicas of the table we are truncating
        guard = co_await global_tablet_token_metadata_barrier(std::move(guard));

        const utils::UUID& global_request_id = _topo_sm._topology.global_request_id.value();
        std::optional<sstring> error;
        // We should perform TRUNCATE only if the session is still valid. It could be cleared if a previous truncate
        // handler performed the truncate and cleared the session, but crashed before finalizing the request
        if (_topo_sm._topology.session) {
            const auto topology_requests_entry = co_await _sys_ks.get_topology_request_entry(global_request_id, true);
            const table_id& table_id = topology_requests_entry.truncate_table_id;
            lw_shared_ptr<replica::table> table = _db.get_tables_metadata().get_table_if_exists(table_id);

            if (table) {
                const sstring& ks_name = table->schema()->ks_name();
                const sstring& cf_name = table->schema()->cf_name();

                rtlogger.info("Performing TRUNCATE TABLE for {}.{}", ks_name, cf_name);

                // Collect the IDs of the hosts with replicas, but ignore excluded nodes
                std::unordered_set<locator::host_id> replica_hosts;
                const std::unordered_set<raft::server_id> excluded_nodes = _topo_sm._topology.get_excluded_nodes();
                const locator::tablet_map& tmap = get_token_metadata_ptr()->tablets().get_tablet_map(table_id);
                co_await tmap.for_each_tablet([&] (locator::tablet_id tid, const locator::tablet_info& tinfo) {
                    for (const locator::tablet_replica& replica: tinfo.replicas) {
                        if (!excluded_nodes.contains(raft::server_id(replica.host.uuid()))) {
                            replica_hosts.insert(replica.host);
                        }
                    }
                    return make_ready_future<>();
                });

                // Release the guard to avoid blocking group0 for long periods of time while invoking RPCs
                release_guard(std::move(guard));

                co_await utils::get_local_injector().inject("truncate_table_wait", [] (auto& handler) {
                    rtlogger.info("truncate_table_wait: start");
                    return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(2));
                });

                // Check if all the nodes with replicas are alive
                for (const locator::host_id& replica_host: replica_hosts) {
                    if (!_gossiper.is_alive(replica_host)) {
                        throw std::runtime_error(::format("Cannot perform TRUNCATE on table {}.{} because host {} is down", ks_name, cf_name, replica_host));
                    }
                }

                // Send the RPC to all replicas
                const service::frozen_topology_guard frozen_guard { _topo_sm._topology.session };
                co_await coroutine::parallel_for_each(replica_hosts, [&] (const locator::host_id& host_id) -> future<> {
                    co_await ser::storage_proxy_rpc_verbs::send_truncate_with_tablets(&_messaging, host_id, ks_name, cf_name, frozen_guard);
                });
            } else {
                error = ::format("Cannot TRUNCATE table with UUID {} because it does not exist.", table_id);
            }

            // Clear the session and save the error message
            while (true) {
                if (!guard) {
                    guard = co_await start_operation();
                }

                std::vector<canonical_mutation> updates;
                updates.push_back(topology_mutation_builder(guard.write_timestamp())
                                    .del_session()
                                    .build());
                if (error) {
                    updates.push_back(topology_request_tracking_mutation_builder(global_request_id)
                                        .set("error", *error)
                                        .build());
                }

                try {
                    co_await update_topology_state(std::move(guard), std::move(updates), "Clear truncate session");
                    break;
                } catch (group0_concurrent_modification&) {
                }
            }
        }

        utils::get_local_injector().inject("truncate_crash_after_session_clear", [] {
            rtlogger.info("truncate_crash_after_session_clear hit, killing the node");
            _exit(1);
        });

        // Execute a barrier to ensure the TRUNCATE RPC can't run on any nodes after this point
        if (!guard) {
            guard = co_await start_operation();
        }
        guard = co_await global_tablet_token_metadata_barrier(std::move(guard));

        // Finalize the request
        while (true) {
            if (!guard) {
                guard = co_await start_operation();
            }
            std::vector<canonical_mutation> updates;
            updates.push_back(topology_mutation_builder(guard.write_timestamp())
                                .del_transition_state()
                                .del_global_topology_request()
                                .del_global_topology_request_id()
                                .build());
            updates.push_back(topology_request_tracking_mutation_builder(global_request_id)
                                .set("end_time", db_clock::now())
                                .set("done", true)
                                .build());

            try {
                co_await update_topology_state(std::move(guard), std::move(updates), "Truncate has completed");
                break;
            } catch (group0_concurrent_modification&) {
            }
        }
    }

    // This function must not release and reacquire the guard, callers rely
    // on the fact that the block which calls this is atomic.
    // FIXME: Don't take the ownership of the guard to make the above guarantee explicit.
    std::pair<bool, group0_guard> should_preempt_balancing(group0_guard guard) {
        auto work = get_next_task(std::move(guard));
        if (auto* node = std::get_if<node_to_work_on>(&work)) {
            return std::make_pair(true, std::move(node->guard));
        }

        if (auto* cancel = std::get_if<cancel_requests>(&work)) {
            // request queue needs to be canceled, so preempt balancing
            return std::make_pair(true, std::move(cancel->guard));
        }

        if (auto* cleanup = std::get_if<start_cleanup>(&work)) {
            // cleanup has to be started
            return std::make_pair(true, std::move(cleanup->guard));
        }

        guard = std::get<group0_guard>(std::move(work));

        if (_topo_sm._topology.global_request) {
            return std::make_pair(true, std::move(guard));
        }

        if (!_topo_sm._topology.calculate_not_yet_enabled_features().empty()) {
            return std::make_pair(true, std::move(guard));
        }

        return std::make_pair(false, std::move(guard));
    }

    void cleanup_ignored_nodes_on_left(topology_mutation_builder& builder, raft::server_id id) {
        if (_topo_sm._topology.ignored_nodes.contains(id)) {
            auto l = _topo_sm._topology.ignored_nodes;
            l.erase(id);
            builder.set_ignored_nodes(l);
        }
    }

    future<> cancel_all_requests(group0_guard guard, std::unordered_set<raft::server_id> dead_nodes) {
        std::vector<canonical_mutation> muts;
        std::vector<raft::server_id> reject_join;
        if (_topo_sm._topology.requests.empty()) {
            co_return;
        }
        auto ts = guard.write_timestamp();
        for (auto& [id, req] : _topo_sm._topology.requests) {
            topology_mutation_builder builder(ts);
            topology_request_tracking_mutation_builder rtbuilder(_topo_sm._topology.find(id)->second.request_id);
            auto node_builder = builder.with_node(id).del("topology_request");
            auto done_msg = fmt::format("Canceled. Dead nodes: {}", dead_nodes);
            rtbuilder.done(done_msg);
            if (_topo_sm._topology.global_request_id) {
                try {
                    utils::UUID uuid = utils::UUID{*_topo_sm._topology.global_request_id};
                    topology_request_tracking_mutation_builder rt_global_req_builder{uuid};
                    rt_global_req_builder.done(done_msg)
                                         .set("end_time", db_clock::now());
                    muts.emplace_back(rt_global_req_builder.build());
                } catch (...) {
                    rtlogger.warn("failed to cancel topology global request: {}", std::current_exception());
                }
            }
            switch (req) {
                case topology_request::replace:
                [[fallthrough]];
                case topology_request::join: {
                    node_builder.set("node_state", node_state::left);
                    reject_join.emplace_back(id);
                    try {
                        co_await wait_for_gossiper(id, _gossiper, _as);
                    } catch (...) {
                        rtlogger.warn("wait_for_ip failed during cancellation: {}", std::current_exception());
                    }
                }
                break;
                case topology_request::leave:
                [[fallthrough]];
                case topology_request::rebuild:
                [[fallthrough]];
                case topology_request::remove: {
                }
                break;
            }
            muts.emplace_back(builder.build());
            muts.emplace_back(rtbuilder.build());
        }

        co_await update_topology_state(std::move(guard), std::move(muts), "cancel all topology requests");

        for (auto id : reject_join) {
            try {
                co_await respond_to_joining_node(id, join_node_response_params{
                    .response = join_node_response_params::rejected{
                        .reason = "request canceled because some required nodes are dead"
                    },
                });
            } catch (...) {
                rtlogger.warn("attempt to send rejection response to {} failed: {}. "
                                "The node may hang. It's safe to shut it down manually now.",
                                id, std::current_exception());
            }
        }

    }

    // Returns `true` iff there was work to do.
    future<bool> handle_topology_transition(group0_guard guard) {
        auto tstate = _topo_sm._topology.tstate;
        if (!tstate) {
            // When adding a new source of work, make sure to update should_preempt_balancing() as well.

            auto work = get_next_task(std::move(guard));
            if (auto* node = std::get_if<node_to_work_on>(&work)) {
                co_await handle_node_transition(std::move(*node));
                co_return true;
            }

            if (auto* cancel = std::get_if<cancel_requests>(&work)) {
                co_await cancel_all_requests(std::move(cancel->guard), std::move(cancel->dead_nodes));
                co_return true;
            }

            if (auto* cleanup = std::get_if<start_cleanup>(&work)) {
                co_await start_cleanup_on_dirty_nodes(std::move(cleanup->guard), false);
                co_return true;
            }

            guard = std::get<group0_guard>(std::move(work));

            if (_topo_sm._topology.global_request) {
                co_await handle_global_request(std::move(guard));
                co_return true;
            }

            if (auto feats = _topo_sm._topology.calculate_not_yet_enabled_features(); !feats.empty()) {
                co_await enable_features(std::move(guard), std::move(feats));
                co_return true;
            }

            if (auto guard_opt = co_await maybe_migrate_system_tables(std::move(guard)); !guard_opt) {
                // The guard is consumed, it means we did migration
                co_return true;
            } else {
                guard = std::move(*guard_opt);
            }

            // If there is no other work, evaluate load and start tablet migration if there is imbalance.
            if (co_await maybe_start_tablet_migration(std::move(guard))) {
                co_return true;
            }
            co_return false;
        }

        rtlogger.info("entered `{}` transition state", *tstate);
        switch (*tstate) {
            case topology::transition_state::join_group0: {
                    auto node = get_node_to_work_on(std::move(guard));
                    if (node.rs->state == node_state::replacing) {
                        // Make sure all nodes are no longer trying to write to a node being replaced. This is important
                        // if the new node have the same IP, so that old write will not go to the new node by mistake after this point.
                        // It is important to do so before the call to finish_accepting_node() below since after this call the new node becomes
                        // a full member of the cluster and it starts loading an initial snapshot. Since snapshot loading is not atomic any queries
                        // that are done in parallel may see a partial state.
                        try {
                            node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes(node)), node.id);
                        } catch (term_changed_error&) {
                            throw;
                        } catch (group0_concurrent_modification&) {
                            throw;
                        } catch (...) {
                            rtlogger.error("transition_state::join_group0, "
                                            "global_token_metadata_barrier failed, error {}",
                                            std::current_exception());
                            _rollback = fmt::format("global_token_metadata_barrier failed in join_group0 state {}", std::current_exception());
                            break;
                        }
                    }

                bool accepted;
                std::tie(node, accepted) = co_await finish_accepting_node(std::move(node));

                // If responding to the joining node failed, move the node to the left state and
                // stop the topology transition.
                if (!accepted) {
                    topology_mutation_builder builder(node.guard.write_timestamp());
                    topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                    builder.del_transition_state()
                           .with_node(node.id)
                           .set("node_state", node_state::left);
                    rtbuilder.done("join is not accepted");
                    auto reason = ::format("bootstrap: failed to accept {}", node.id);
                    co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, reason);

                    rtlogger.info("node {} moved to left state", node.id);

                    break;
                }

                switch (node.rs->state) {
                    case node_state::bootstrapping: {
                        SCYLLA_ASSERT(!node.rs->ring);
                        auto num_tokens = std::get<join_param>(node.req_param.value()).num_tokens;
                        auto tokens_string = std::get<join_param>(node.req_param.value()).tokens_string;

                        auto guard = take_guard(std::move(node));
                        topology_mutation_builder builder(guard.write_timestamp());

                        // A node has just been accepted. If it is a zero-token node, we can
                        // instantly move its state to normal. Otherwise, we need to assign
                        // random tokens and prepare a new CDC generation.
                        if (num_tokens == 0 && tokens_string.empty()) {
                            topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                            builder.del_transition_state()
                                   .with_node(node.id)
                                   .set("node_state", node_state::normal);
                            rtbuilder.done();
                            auto reason = ::format("bootstrap: joined a zero-token node {}", node.id);
                            co_await update_topology_state(std::move(guard), {builder.build(), rtbuilder.build()}, reason);
                            break;
                        }

                        auto tmptr = get_token_metadata_ptr();
                        std::unordered_set<token> bootstrap_tokens;
                        try {
                            bootstrap_tokens = dht::boot_strapper::get_bootstrap_tokens(tmptr, tokens_string, num_tokens, dht::check_token_endpoint::yes);
                        } catch (...) {
                            _rollback = fmt::format("Failed to assign tokens: {}", std::current_exception());
                        }

                        auto [gen_uuid, guard_, mutation] = co_await prepare_and_broadcast_cdc_generation_data(
                                tmptr, std::move(guard), bootstrapping_info{bootstrap_tokens, *node.rs});

                        // Write the new CDC generation data through raft.
                        builder.set_transition_state(topology::transition_state::commit_cdc_generation)
                               .set_new_cdc_generation_data_uuid(gen_uuid)
                               .with_node(node.id)
                               .set("tokens", bootstrap_tokens);
                        auto reason = ::format(
                            "bootstrap: insert tokens and CDC generation data (UUID: {})", gen_uuid);
                        co_await update_topology_state(std::move(guard_), {std::move(mutation), builder.build()}, reason);

                        co_await utils::get_local_injector().inject("topology_coordinator_pause_after_updating_cdc_generation", utils::wait_for_message(5min));
                    }
                        break;
                    case node_state::replacing: {
                        SCYLLA_ASSERT(!node.rs->ring);
                        auto replaced_id = std::get<replace_param>(node.req_param.value()).replaced_id;
                        auto it = _topo_sm._topology.normal_nodes.find(replaced_id);
                        SCYLLA_ASSERT(it != _topo_sm._topology.normal_nodes.end());
                        SCYLLA_ASSERT(it->second.ring && it->second.state == node_state::normal);

                        topology_mutation_builder builder(node.guard.write_timestamp());

                        // If a zero-token node is replacing another zero-token node,
                        // we can instantly move the new node's state to normal.
                        if (it->second.ring->tokens.empty()) {
                            node = retake_node(co_await remove_from_group0(std::move(node.guard), replaced_id), node.id);

                            topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                            builder.del_transition_state()
                                   .with_node(node.id)
                                   .set("node_state", node_state::normal);

                            // Move the old node to the left state.
                            topology_mutation_builder builder2(node.guard.write_timestamp());
                            cleanup_ignored_nodes_on_left(builder2, replaced_id);
                            builder2.with_node(replaced_id)
                                    .set("node_state", node_state::left);
                            rtbuilder.done();
                            co_await update_topology_state(take_guard(std::move(node)), {builder.build(), builder2.build(), rtbuilder.build()},
                                    fmt::format("replace: replaced node {} with the new zero-token node {}", replaced_id, node.id));
                            break;
                        }

                        builder.set_transition_state(topology::transition_state::tablet_draining)
                               .set_version(_topo_sm._topology.version + 1)
                               .with_node(node.id)
                               .set("tokens", it->second.ring->tokens);
                        co_await update_topology_state(take_guard(std::move(node)), {builder.build()},
                                "replace: transition to tablet_draining and take ownership of the replaced node's tokens");
                    }
                        break;
                    default:
                        on_internal_error(rtlogger,
                                format("topology is in join_group0 state, but the node"
                                       " being worked on ({}) is in unexpected state '{}'; should be"
                                       " either 'bootstrapping' or 'replacing'", node.id, node.rs->state));
                }
            }
                break;
            case topology::transition_state::commit_cdc_generation: {
                // make sure all nodes know about new topology and have the new CDC generation data
                // (we require all nodes to be alive for topo change for now)
                // Note: if there was a replace or removenode going on, we'd need to put the replaced/removed
                // node into `exclude_nodes` parameter in `exec_global_command`, but CDC generations are never
                // introduced during replace/remove.
                try {
                    guard = co_await exec_global_command(std::move(guard), raft_topology_cmd::command::barrier, {_raft.id()});
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (...) {
                    rtlogger.error("transition_state::commit_cdc_generation, "
                                    "raft_topology_cmd::command::barrier failed, error {}", std::current_exception());
                    _rollback = fmt::format("Failed to commit cdc generation: {}", std::current_exception());
                    break;
                }

                // We don't need to add delay to the generation timestamp if this is the first generation.
                bool add_ts_delay = !_topo_sm._topology.committed_cdc_generations.empty();

                // Begin the race.
                // See the large FIXME below.
                auto cdc_gen_ts = cdc::new_generation_timestamp(add_ts_delay, _ring_delay);
                auto cdc_gen_uuid = _topo_sm._topology.new_cdc_generation_data_uuid;
                if (!cdc_gen_uuid) {
                    on_internal_error(rtlogger,
                        "new CDC generation data UUID missing in `commit_cdc_generation` state");
                }

                cdc::generation_id_v2 cdc_gen_id {
                    .ts = cdc_gen_ts,
                    .id = *cdc_gen_uuid,
                };

                if (!_topo_sm._topology.committed_cdc_generations.empty()) {
                    // Sanity check.
                    // This could happen if the topology coordinator's clock is broken.
                    auto last_committed_gen_id = _topo_sm._topology.committed_cdc_generations.back();
                    if (last_committed_gen_id.ts >= cdc_gen_ts) {
                        on_internal_error(rtlogger, ::format(
                            "new CDC generation has smaller timestamp than the previous one."
                            " Old generation ID: {}, new generation ID: {}", last_committed_gen_id, cdc_gen_id));
                    }
                }

                // Tell all nodes to start using the new CDC generation by updating the topology
                // with the generation's ID and timestamp.
                // At the same time move the topology change procedure to the next step.
                //
                // FIXME: as in previous implementation with gossiper and ring_delay, this assumes that all nodes
                // will learn about the new CDC generation before their clocks reach the generation's timestamp.
                // With this group 0 based implementation, it means that the command must be committed,
                // replicated and applied on all nodes before their clocks reach the generation's timestamp
                // (i.e. within 2 * ring_delay = 60 seconds by default if clocks are synchronized). If this
                // doesn't hold some coordinators might use the wrong CDC streams for some time and CDC stream
                // readers will miss some data. It's likely that Raft replication doesn't converge as quickly
                // as gossiping does.
                //
                // We could use a two-phase algorithm instead: first tell all nodes to prepare for using
                // the new generation, then tell all nodes to commit. If some nodes don't manage to prepare
                // in time, we abort the generation switch. If all nodes prepare, we commit. If a node prepares
                // but doesn't receive a commit in time, it stops coordinating CDC-enabled writes until it
                // receives a commit or abort. This solution does not have a safety problem like the one
                // above, but it has an availability problem when nodes get disconnected from group 0 majority
                // in the middle of a CDC generation switch (when they are prepared to switch but not
                // committed) - they won't coordinate CDC-enabled writes until they reconnect to the
                // majority and commit.
                topology_mutation_builder builder(guard.write_timestamp());
                builder.add_new_committed_cdc_generation(cdc_gen_id);
                if (_topo_sm._topology.global_request == global_topology_request::new_cdc_generation) {
                    builder.del_global_topology_request();
                    builder.del_transition_state();
                } else {
                    builder.set_transition_state(topology::transition_state::write_both_read_old);
                    builder.set_session(session_id(guard.new_group0_state_id()));
                    builder.set_version(_topo_sm._topology.version + 1);
                }
                auto str = ::format("committed new CDC generation, ID: {}", cdc_gen_id);
                co_await update_topology_state(std::move(guard), {builder.build()}, std::move(str));
            }
                break;
            case topology::transition_state::tablet_draining:
                try {
                    co_await handle_tablet_migration(std::move(guard), true);
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (...) {
                    rtlogger.error("tablets draining failed with {}. Aborting the topology operation", std::current_exception());
                    _rollback = fmt::format("Failed to drain tablets: {}", std::current_exception());
                }
                break;
            case topology::transition_state::write_both_read_old: {
                auto node = get_node_to_work_on(std::move(guard));

                // make sure all nodes know about new topology (we require all nodes to be alive for topo change for now)
                try {
                    node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes(node)), node.id);
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (...) {
                    rtlogger.error("transition_state::write_both_read_old, "
                                    "global_token_metadata_barrier failed, error {}",
                                    std::current_exception());
                    _rollback = fmt::format("global_token_metadata_barrier failed in write_both_read_old state {}", std::current_exception());
                    break;
                }

                if (_group0.is_member(node.id, true)) {
                    // If we remove a node, we make it a non-voter early to improve availability in some situations.
                    // There is no downside to it because the removed node is already considered dead by us.
                    //
                    // FIXME: removenode may be aborted and the already dead node can be resurrected. We should consider
                    // restoring its voter state on the recovery path.
                    if (node.rs->state == node_state::removing) {
                        co_await _group0.set_voter_status(node.id, can_vote::no, _as);
                    }

                    // If we decommission a node when the number of nodes is even, we make it a non-voter early.
                    // All majorities containing this node will remain majorities when we make this node a non-voter
                    // and remove it from the set because the required size of a majority decreases.
                    //
                    // FIXME: when a node restarts and notices it's a non-voter, it will become a voter again. If the
                    // node restarts during a decommission, and we want the decommission to continue (e.g. because it's
                    // at a finishing non-abortable step), we must ensure that the node doesn't become a voter.
                    if (node.rs->state == node_state::decommissioning
                            && raft::configuration::voter_count(_group0.group0_server().get_configuration().current) % 2 == 0) {
                        if (node.id == _raft.id()) {
                            rtlogger.info("coordinator is decommissioning and becomes a non-voter; "
                                         "giving up leadership");
                            co_await step_down_as_nonvoter();
                        } else {
                            co_await _group0.set_voter_status(node.id, can_vote::no, _as);
                        }
                    }
                }
                if (node.rs->state == node_state::replacing) {
                    // We make a replaced node a non-voter early, just like a removed node.
                    auto replaced_node_id = parse_replaced_node(node.req_param);
                    if (_group0.is_member(replaced_node_id, true)) {
                        co_await _group0.set_voter_status(replaced_node_id, can_vote::no, _as);
                    }
                }
                utils::get_local_injector().inject("crash_coordinator_before_stream", [] { abort(); });
                raft_topology_cmd cmd{raft_topology_cmd::command::stream_ranges};
                auto state = node.rs->state;
                try {
                    // No need to stream data when the node is zero-token.
                    if (!node.rs->ring->tokens.empty()) {
                        if (node.rs->state == node_state::removing) {
                            // tell all token owners to stream data of the removed node to new range owners
                            auto exclude_nodes = get_excluded_nodes(node);
                            auto normal_zero_token_nodes = _topo_sm._topology.get_normal_zero_token_nodes();
                            std::move(normal_zero_token_nodes.begin(), normal_zero_token_nodes.end(),
                                    std::inserter(exclude_nodes, exclude_nodes.begin()));
                            node = retake_node(
                                    co_await exec_global_command(take_guard(std::move(node)), cmd, exclude_nodes), node.id);
                        } else {
                            // Tell joining/leaving/replacing node to stream its ranges
                            node = co_await exec_direct_command(std::move(node), cmd);
                        }
                    }
                } catch (term_changed_error&) {
                    throw;
                } catch (...) {
                    rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
                                    " (node state is {}): {}", state, std::current_exception());
                    _rollback = fmt::format("Failed stream ranges: {}", std::current_exception());
                    break;
                }
                // Streaming completed. We can now move tokens state to topology::transition_state::write_both_read_new
                topology_mutation_builder builder(node.guard.write_timestamp());
                builder
                    .set_transition_state(topology::transition_state::write_both_read_new)
                    .del_session()
                    .set_version(_topo_sm._topology.version + 1);
                auto str = ::format("{}: streaming completed for node {}", node.rs->state, node.id);
                co_await update_topology_state(take_guard(std::move(node)), {builder.build()}, std::move(str));
            }
                break;
            case topology::transition_state::write_both_read_new: {
                while (utils::get_local_injector().enter("topology_coordinator_pause_after_streaming")) {
                    co_await sleep_abortable(std::chrono::milliseconds(10), _as);
                }
                auto node = get_node_to_work_on(std::move(guard));
                bool barrier_failed = false;
                // In this state writes goes to old and new replicas but reads start to be done from new replicas
                // Before we stop writing to old replicas we need to wait for all previous reads to complete
                try {
                    node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes(node)), node.id);
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (...) {
                    rtlogger.error("transition_state::write_both_read_new, "
                                    "global_token_metadata_barrier failed, error {}",
                                    std::current_exception());
                    barrier_failed = true;
                }
                if (barrier_failed) {
                    // If barrier above failed it means there may be unfenced reads from old replicas.
                    // Lets wait for the ring delay for those writes to complete or fence to propagate
                    // before continuing.
                    // FIXME: nodes that cannot be reached need to be isolated either automatically or
                    // by an administrator
                    co_await sleep_abortable(_ring_delay, _as);
                    node = retake_node(co_await start_operation(), node.id);
                }
                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                rtbuilder.done();
                switch(node.rs->state) {
                case node_state::bootstrapping: {
                    co_await utils::get_local_injector().inject("delay_node_bootstrap", [](auto& handler) {
                        rtlogger.info("delay_node_bootstrap: waiting for message");
                        return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(5));
                    });
                    std::vector<canonical_mutation> muts;
                    // Since after bootstrapping a new node some nodes lost some ranges they need to cleanup
                    muts = mark_nodes_as_cleanup_needed(node, false);
                    topology_mutation_builder builder(node.guard.write_timestamp());
                    builder.del_transition_state()
                           .with_node(node.id)
                           .set("node_state", node_state::normal);
                    muts.emplace_back(builder.build());
                    muts.emplace_back(rtbuilder.build());
                    co_await update_topology_state(take_guard(std::move(node)), std::move(muts),
                                                   "bootstrap: read fence completed");
                    }
                    break;
                case node_state::removing: {
                    co_await utils::get_local_injector().inject("delay_node_removal", [](auto& handler) {
                        rtlogger.info("delay_node_removal: waiting for message");
                        return handler.wait_for_message(db::timeout_clock::now() + std::chrono::minutes(5));
                    });
                    node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);
                }
                    [[fallthrough]];
                case node_state::decommissioning: {
                    topology_mutation_builder builder(node.guard.write_timestamp());
                    node_state next_state;
                    std::vector<canonical_mutation> muts;
                    muts.reserve(2);
                    if (node.rs->state == node_state::decommissioning) {
                        next_state = node.rs->state;
                        builder.set_transition_state(topology::transition_state::left_token_ring);
                    } else {
                        next_state = node_state::left;
                        builder.del_transition_state();
                        cleanup_ignored_nodes_on_left(builder, node.id);
                        muts.push_back(rtbuilder.build());
                        co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(node.id.uuid()), muts);
                    }
                    builder.set_version(_topo_sm._topology.version + 1)
                           .with_node(node.id)
                           .del("tokens")
                           .set("node_state", next_state);
                    auto str = ::format("{}: read fence completed", node.rs->state);
                    muts.push_back(builder.build());
                    co_await update_topology_state(take_guard(std::move(node)), std::move(muts), std::move(str));
                }
                    break;
                case node_state::replacing: {
                    auto replaced_node_id = parse_replaced_node(node.req_param);
                    node = retake_node(co_await remove_from_group0(std::move(node.guard), replaced_node_id), node.id);

                    std::vector<canonical_mutation> muts;

                    topology_mutation_builder builder1(node.guard.write_timestamp());
                    // Move new node to 'normal'
                    builder1.del_transition_state()
                            .set_version(_topo_sm._topology.version + 1)
                            .with_node(node.id)
                            .set("node_state", node_state::normal);
                    muts.push_back(builder1.build());

                    // Move old node to 'left'
                    topology_mutation_builder builder2(node.guard.write_timestamp());
                    cleanup_ignored_nodes_on_left(builder2, replaced_node_id);
                    builder2.with_node(replaced_node_id)
                            .del("tokens")
                            .set("node_state", node_state::left);
                    muts.push_back(builder2.build());

                    muts.push_back(rtbuilder.build());
                    co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(replaced_node_id.uuid()), muts);
                    co_await update_topology_state(take_guard(std::move(node)), std::move(muts),
                                                  "replace: read fence completed");
                    }
                    break;
                default:
                    on_fatal_internal_error(rtlogger, ::format(
                            "Ring state on node {} is write_both_read_new while the node is in state {}",
                            node.id, node.rs->state));
                }
                // Reads are fenced. We can now remove topology::transition_state and move node state to normal
            }
                break;
            case topology::transition_state::tablet_migration:
                co_await handle_tablet_migration(std::move(guard), false);
                break;
            case topology::transition_state::tablet_resize_finalization:
                co_await handle_tablet_resize_finalization(std::move(guard));
                break;
            case topology::transition_state::lock:
                release_guard(std::move(guard));
                co_await await_event();
                break;
            case topology::transition_state::left_token_ring: {
                auto node = get_node_to_work_on(std::move(guard));

                if (node.id == _raft.id()) {
                    // Someone else needs to coordinate the rest of the decommission process,
                    // because the decommissioning node is going to shut down in the middle of this state.
                    rtlogger.info("coordinator is decommissioning; giving up leadership");
                    co_await step_down_as_nonvoter();

                    // Note: if we restart after this point and become a voter
                    // and then a coordinator again, it's fine - we'll just repeat this step.
                    // (If we're in `left` state when we try to restart we won't
                    // be able to become a voter - we'll be banned from the cluster.)
                }

                bool barrier_failed = false;
                // Wait until other nodes observe the new token ring and stop sending writes to this node.
                try {
                    node = retake_node(co_await global_token_metadata_barrier(std::move(node.guard), get_excluded_nodes(node)), node.id);
                } catch (term_changed_error&) {
                    throw;
                } catch (group0_concurrent_modification&) {
                    throw;
                } catch (...) {
                    rtlogger.error("transition_state::left_token_ring, "
                                    "raft_topology_cmd::command::barrier failed, error {}",
                                    std::current_exception());
                    barrier_failed = true;
                }

                if (barrier_failed) {
                    // If barrier above failed it means there may be unfinished writes to a decommissioned node.
                    // Lets wait for the ring delay for those writes to complete and new topology to propagate
                    // before continuing.
                    co_await sleep_abortable(_ring_delay, _as);
                    node = retake_node(co_await start_operation(), node.id);
                }

                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);

                rtbuilder.done();

                co_await update_topology_state(take_guard(std::move(node)), {rtbuilder.build()}, "report request completion in left_token_ring state");

                // Tell the node to shut down.
                // This is done to improve user experience when there are no failures.
                // In the next state (`node_state::left`), the node will be banned by the rest of the cluster,
                // so there's no guarantee that it would learn about entering that state even if it was still
                // a member of group0, hence we use a separate direct RPC in this state to shut it down.
                //
                // There is the possibility that the node will never get the message
                // and decommission will hang on that node.
                // This is fine for the rest of the cluster - we will still remove, ban the node and continue.
                auto node_id = node.id;
                bool shutdown_failed = false;
                try {
                    node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::barrier);
                } catch (...) {
                    rtlogger.warn("failed to tell node {} to shut down - it may hang."
                                 " It's safe to shut it down manually now. (Exception: {})",
                                 node.id, std::current_exception());
                    shutdown_failed = true;
                }
                if (shutdown_failed) {
                    node = retake_node(co_await start_operation(), node_id);
                }

                // Remove the node from group0 here - in general, it won't be able to leave on its own
                // because we'll ban it as soon as we tell it to shut down.
                node = retake_node(co_await remove_from_group0(std::move(node.guard), node.id), node.id);

                std::vector<canonical_mutation> muts;

                topology_mutation_builder builder(node.guard.write_timestamp());
                cleanup_ignored_nodes_on_left(builder, node.id);
                builder.del_transition_state()
                       .with_node(node.id)
                       .set("node_state", node_state::left);
                muts.push_back(builder.build());
                co_await db::view::view_builder::generate_mutations_on_node_left(_db, _sys_ks, node.guard.write_timestamp(), locator::host_id(node.id.uuid()), muts);
                auto str = node.rs->state == node_state::decommissioning
                        ? ::format("finished decommissioning node {}", node.id)
                        : ::format("finished rollback of {} after {} failure", node.id, node.rs->state);
                co_await update_topology_state(take_guard(std::move(node)), std::move(muts), std::move(str));
            }
                break;
            case topology::transition_state::rollback_to_normal: {
                auto node = get_node_to_work_on(std::move(guard));

                // The barrier waits for all double writes started during the operation to complete. It allowed to fail
                // since we will fence the requests later.
                bool barrier_failed = false;
                auto state = node.rs->state;
                try {
                    node.guard = co_await exec_global_command(std::move(node.guard),raft_topology_cmd::command::barrier_and_drain, get_excluded_nodes(node), drop_guard_and_retake::yes);
                } catch (term_changed_error&) {
                    throw;
                } catch(...) {
                    rtlogger.warn("failed to run barrier_and_drain during rollback of {} after {} failure: {}",
                            node.id, state, std::current_exception());
                    barrier_failed = true;
                }

                if (barrier_failed) {
                    node.guard = co_await start_operation();
                }

                node = retake_node(std::move(node.guard), node.id);

                topology_mutation_builder builder(node.guard.write_timestamp());
                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                builder.set_fence_version(_topo_sm._topology.version) // fence requests in case the drain above failed
                       .set_transition_state(topology::transition_state::tablet_migration) // in case tablet drain failed we need to complete tablet transitions
                       .with_node(node.id)
                       .set("node_state", node_state::normal);
                rtbuilder.done();

                auto str = fmt::format("complete rollback of {} to state normal after {} failure", node.id, node.rs->state);

                rtlogger.info("{}", str);
                co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, str);
            }
                break;
            case topology::transition_state::truncate_table:
                co_await handle_truncate_table(std::move(guard));
                break;
        }
        co_return true;
    };

    // Called when there is no ongoing topology transition.
    // Used to start new topology transitions using node requests or perform node operations
    // that don't change the topology (like rebuild).
    future<> handle_node_transition(node_to_work_on&& node) {
        rtlogger.info("coordinator fiber found a node to work on id={} state={}", node.id, node.rs->state);

        switch (node.rs->state) {
            case node_state::none: {
                if (_topo_sm._topology.normal_nodes.empty()) {
                    rtlogger.info("skipping join node handshake for the first node in the cluster");
                } else {
                    auto validation_result = validate_joining_node(node);

                    if (utils::get_local_injector().enter("handle_node_transition_drop_expiring")) {
                        _gossiper.get_mutable_address_map().force_drop_expiring_entries();
                    }

                    // When the validation succeeded, it's important that all nodes in the
                    // cluster are aware of the IP address of the new node before we proceed to
                    // the topology::transition_state::join_group0 state, since in this state
                    // node IPs are already used to populate pending nodes in erm.
                    // This applies both to new and replacing nodes.
                    // If the wait_for_ip is unsuccessful, we should inform the new
                    // node about this failure.
                    // If the validation doesn't pass, we only need to call wait_for_ip on the current node,
                    // so that we can communicate the failure of the join request directly to
                    // the joining node.

                    {
                        std::exception_ptr wait_for_ip_error;
                        try {
                            if (holds_alternative<join_node_response_params::rejected>(validation_result)) {
                                release_guard(std::move(node.guard));
                                co_await wait_for_gossiper(node.id, _gossiper, _as);
                                node.guard = co_await start_operation();
                            } else {
                                auto exclude_nodes = get_excluded_nodes(node);
                                exclude_nodes.insert(node.id);
                                node.guard = co_await exec_global_command(std::move(node.guard),
                                    raft_topology_cmd::command::wait_for_ip,
                                    exclude_nodes);
                            }
                        } catch (term_changed_error&) {
                            throw;
                        } catch(...) {
                            wait_for_ip_error = std::current_exception();
                            rtlogger.warn("raft_topology_cmd::command::wait_for_ip failed, error {}",
                                wait_for_ip_error);
                        }
                        if (wait_for_ip_error) {
                            node.guard = co_await start_operation();
                        }
                        node = retake_node(std::move(node.guard), node.id);

                        if (wait_for_ip_error && holds_alternative<join_node_response_params::accepted>(validation_result)) {
                            validation_result = join_node_response_params::rejected {
                                .reason = ::format("wait_for_ip failed, error {}", wait_for_ip_error)
                            };
                        }
                    }

                    if (std::holds_alternative<join_node_response_params::rejected>(validation_result)) {
                        // Transition to left
                        topology_mutation_builder builder(node.guard.write_timestamp());
                        topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                        builder.with_node(node.id)
                               .del("topology_request")
                               .set("node_state", node_state::left);
                        rtbuilder.done("Join is rejected during validation");
                        auto reason = ::format("bootstrap: node rejected");

                        co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, reason);

                        rtlogger.info("rejected node moved to left state {}", node.id);

                        try {
                            co_await respond_to_joining_node(node.id, join_node_response_params{
                                .response = std::move(validation_result),
                            });
                        } catch (const std::runtime_error& e) {
                            rtlogger.warn("attempt to send rejection response to {} failed. "
                                         "The node may hang. It's safe to shut it down manually now. Error: {}",
                                         node.id, e.what());
                        }

                        break;
                    }
                }
            }
            [[fallthrough]];
            case node_state::normal: {
                // if the state is none there have to be either 'join' or 'replace' request
                // if the state is normal there have to be either 'leave', 'remove' or 'rebuild' request
                topology_mutation_builder builder(node.guard.write_timestamp());
                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                rtbuilder.set("start_time", db_clock::now());
                switch (node.request.value()) {
                    case topology_request::join: {
                        SCYLLA_ASSERT(!node.rs->ring);
                        // Write chosen tokens through raft.
                        builder.set_transition_state(topology::transition_state::join_group0)
                               .with_node(node.id)
                               .set("node_state", node_state::bootstrapping)
                               .del("topology_request");
                        auto reason = ::format("bootstrap: accept node");
                        co_await update_topology_state(std::move(node.guard), {builder.build(), rtbuilder.build()}, reason);
                        break;
                        }
                    case topology_request::leave:
                        SCYLLA_ASSERT(node.rs->ring);
                        // start decommission and put tokens of decommissioning nodes into write_both_read_old state
                        // meaning that reads will go to the replica being decommissioned
                        // but writes will go to new owner as well
                        builder.set_transition_state(topology::transition_state::tablet_draining)
                               .set_version(_topo_sm._topology.version + 1)
                               .with_node(node.id)
                               .set("node_state", node_state::decommissioning)
                               .del("topology_request");
                        co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()},
                                                       "start decommission");
                        break;
                    case topology_request::remove: {
                        SCYLLA_ASSERT(node.rs->ring);

                        builder.set_transition_state(topology::transition_state::tablet_draining)
                               .set_version(_topo_sm._topology.version + 1)
                               .with_node(node.id)
                               .set("node_state", node_state::removing)
                               .del("topology_request");
                        co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()},
                                                       "start removenode");
                        break;
                        }
                    case topology_request::replace: {
                        SCYLLA_ASSERT(!node.rs->ring);

                        builder.set_transition_state(topology::transition_state::join_group0)
                               .with_node(node.id)
                               .set("node_state", node_state::replacing)
                               .del("topology_request");
                        co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()},
                                                       "replace: accept node");
                        break;
                        }
                    case topology_request::rebuild: {
                        topology_mutation_builder builder(node.guard.write_timestamp());
                        builder.with_node(node.id)
                               .set("node_state", node_state::rebuilding)
                               .del("topology_request");
                        co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()},
                                                       "start rebuilding");
                        break;
                    }
                }
                break;
            }
            case node_state::rebuilding: {
                bool retake = false;
                auto id = node.id;
                topology_mutation_builder builder(node.guard.write_timestamp());
                topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
                try {
                    node = co_await exec_direct_command(std::move(node), raft_topology_cmd::command::stream_ranges);
                    rtbuilder.done();
                } catch (term_changed_error&) {
                    throw;
                } catch (...) {
                    rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
                                    " (node state is rebuilding): {}", std::current_exception());
                    rtbuilder.done("streaming failed");
                    retake = true;
                }
                if (retake) {
                    node = retake_node(co_await start_operation(), id);
                }
                builder.del_session().with_node(node.id)
                       .set("node_state", node_state::normal)
                       .del("rebuild_option");
                co_await update_topology_state(take_guard(std::move(node)), {builder.build(), rtbuilder.build()}, "rebuilding completed");
            }
                break;
            case node_state::bootstrapping:
            case node_state::decommissioning:
            case node_state::removing:
            case node_state::replacing:
                // Should not get here
                on_fatal_internal_error(rtlogger, ::format(
                    "Found node {} in state {} but there is no ongoing topology transition",
                    node.id, node.rs->state));
            case node_state::left:
                // Should not get here
                on_fatal_internal_error(rtlogger, ::format(
                        "Topology coordinator is called for node {} in state 'left'", node.id));
                break;
        }
    };

    std::variant<join_node_response_params::accepted, join_node_response_params::rejected>
    validate_joining_node(const node_to_work_on& node) {
        if (*node.request == topology_request::replace) {
            auto replaced_id = std::get<replace_param>(node.req_param.value()).replaced_id;
            if (!_topo_sm._topology.normal_nodes.contains(replaced_id)) {
                return join_node_response_params::rejected {
                    .reason = ::format("Cannot replace node {} because it is not in the 'normal' state", replaced_id),
                };
            }
        }

        std::vector<sstring> unsupported_features;
        const auto& supported_features = node.rs->supported_features;
        std::ranges::set_difference(node.topology->enabled_features, supported_features, std::back_inserter(unsupported_features));
        if (!unsupported_features.empty()) {
            rtlogger.warn("node {} does not understand some features: {}", node.id, unsupported_features);
            return join_node_response_params::rejected{
                .reason = seastar::format("Feature check failed. The node does not support some features that are enabled by the cluster: {}",
                        unsupported_features),
            };
        }

        return join_node_response_params::accepted {};
    }

    // Tries to finish accepting the joining node by updating the cluster
    // configuration and sending the acceptance response.
    //
    // Returns the retaken node and information on whether responding to the
    // join request succeeded.
    future<std::tuple<node_to_work_on, bool>> finish_accepting_node(node_to_work_on&& node) {
        if (_topo_sm._topology.normal_nodes.empty()) {
            // This is the first node, it joins without the handshake.
            co_return std::tuple{std::move(node), true};
        }

        auto id = node.id;

        SCYLLA_ASSERT(!_topo_sm._topology.transition_nodes.empty());

        release_node(std::move(node));

        if (!_raft.get_configuration().contains(id)) {
            co_await _raft.modify_config({raft::config_member({id, {}}, {})}, {}, &_as);
        }

        auto responded = false;
        try {
            co_await respond_to_joining_node(id, join_node_response_params{
                .response = join_node_response_params::accepted{},
            });
            responded = true;
        } catch (const std::runtime_error& e) {
            rtlogger.warn("attempt to send acceptance response to {} failed. "
                         "The node may hang. It's safe to shut it down manually now. Error: {}",
                         node.id, e.what());
        }

        co_return std::tuple{retake_node(co_await start_operation(), id), responded};
    }

    future<> respond_to_joining_node(raft::server_id id, join_node_response_params&& params) {
        co_await ser::join_node_rpc_verbs::send_join_node_response(
            &_messaging, to_host_id(id), id,
            std::move(params)
        );
    }

    std::vector<canonical_mutation> mark_nodes_as_cleanup_needed(node_to_work_on& node, bool rollback) {
        auto& topo = _topo_sm._topology;
        std::vector<canonical_mutation> muts;
        muts.reserve(topo.normal_nodes.size());
        std::unordered_set<locator::host_id> dirty_nodes;

        for (auto& [_, erm] : _db.get_non_local_strategy_keyspaces_erms()) {
            const std::unordered_set<locator::host_id>& nodes = rollback ? erm->get_all_pending_nodes() : erm->get_dirty_endpoints();
            dirty_nodes.insert(nodes.begin(), nodes.end());
        }

        for (auto& n : dirty_nodes) {
            auto id = raft::server_id(n.uuid());
            // mark all nodes (except self) as cleanup needed
            if (node.id != id) {
                topology_mutation_builder builder(node.guard.write_timestamp());
                builder.with_node(id).set("cleanup_status", cleanup_status::needed);
                muts.emplace_back(builder.build());
                rtlogger.debug("mark node {} as needed for cleanup", id);
            }
        }
        return muts;
    }

    future<> start_cleanup_on_dirty_nodes(group0_guard guard, bool global_request) {
        auto& topo = _topo_sm._topology;
        std::vector<canonical_mutation> muts;
        muts.reserve(topo.normal_nodes.size() + size_t(global_request));

        if (global_request) {
            topology_mutation_builder builder(guard.write_timestamp());
            builder.del_global_topology_request();
            muts.emplace_back(builder.build());
        }
        for (auto& [id, rs] : topo.normal_nodes) {
            if (rs.cleanup == cleanup_status::needed) {
                topology_mutation_builder builder(guard.write_timestamp());
                builder.with_node(id).set("cleanup_status", cleanup_status::running);
                muts.emplace_back(builder.build());
                rtlogger.debug("mark node {} as cleanup running", id);
            }
        }
        if (!muts.empty()) {
          co_await update_topology_state(std::move(guard), std::move(muts), "Starting cleanup");
        }
    }

    // Returns the guard if no work done. Otherwise, performs a table migration and consumes the guard.
    future<std::optional<group0_guard>> maybe_migrate_system_tables(group0_guard guard);

    // Returns true if the state machine was transitioned into tablet migration path.
    future<bool> maybe_start_tablet_migration(group0_guard);

    // Returns true if the state machine was transitioned into tablet resize finalization path.
    future<bool> maybe_start_tablet_resize_finalization(group0_guard, const table_resize_plan& plan);

    future<locator::load_stats> refresh_tablet_load_stats();
    future<> start_tablet_load_stats_refresher();

    // Precondition: the state machine upgrade state is not at upgrade_state::done.
    future<> do_upgrade_step(group0_guard);
    future<> build_coordinator_state(group0_guard);

    future<> await_event() {
        _as.check();
        co_await _topo_sm.event.when();
    }

    future<> fence_previous_coordinator();
    future<> rollback_current_topology_op(group0_guard&& guard);

    // Returns true if the coordinator should sleep after the exception.
    bool handle_topology_coordinator_error(std::exception_ptr eptr) noexcept;

public:
    topology_coordinator(
            sharded<db::system_distributed_keyspace>& sys_dist_ks, gms::gossiper& gossiper,
            netw::messaging_service& messaging, locator::shared_token_metadata& shared_tm,
            db::system_keyspace& sys_ks, replica::database& db, service::raft_group0& group0,
            service::topology_state_machine& topo_sm, abort_source& as, raft::server& raft_server,
            raft_topology_cmd_handler_type raft_topology_cmd_handler,
            tablet_allocator& tablet_allocator,
            std::chrono::milliseconds ring_delay,
            gms::feature_service& feature_service)
        : _sys_dist_ks(sys_dist_ks), _gossiper(gossiper), _messaging(messaging)
        , _shared_tm(shared_tm), _sys_ks(sys_ks), _db(db)
        , _group0(group0), _topo_sm(topo_sm), _as(as)
        , _feature_service(feature_service)
        , _raft(raft_server), _term(raft_server.get_current_term())
        , _raft_topology_cmd_handler(std::move(raft_topology_cmd_handler))
        , _tablet_allocator(tablet_allocator)
        , _ring_delay(ring_delay)
        , _group0_holder(_group0.hold_group0_gate())
    {}

    // Returns true if the upgrade was done, returns false if upgrade was interrupted.
    future<bool> maybe_run_upgrade();
    future<> run();
    future<> stop();

    virtual void on_join_cluster(const gms::inet_address& endpoint) {}
    virtual void on_leave_cluster(const gms::inet_address& endpoint, const locator::host_id& hid) {};
    virtual void on_up(const gms::inet_address& endpoint) { _topo_sm.event.broadcast(); };
    virtual void on_down(const gms::inet_address& endpoint) { _topo_sm.event.broadcast(); };
};

future<std::optional<group0_guard>> topology_coordinator::maybe_migrate_system_tables(group0_guard guard) {
    // Check if we can upgrade the view_build_status table to v2, being managed by group0.
    // First we upgrade to an intermediate version v1_5 where we write to both tables, then
    // we upgrade to v2.
    const auto view_builder_version = co_await _sys_ks.get_view_builder_version();
    if (view_builder_version == db::system_keyspace::view_builder_version_t::v1 && _feature_service.view_build_status_on_group0) {
        rtlogger.info("Migrating view_builder to v1_5");
        auto tmptr = get_token_metadata_ptr();
        co_await db::view::view_builder::migrate_to_v1_5(tmptr, _sys_ks, _sys_ks.query_processor(), _group0.client(), _as, std::move(guard));
        co_return std::nullopt;
    }

    if (view_builder_version == db::system_keyspace::view_builder_version_t::v1_5) {
        if (!get_dead_nodes().empty()) {
            rtlogger.debug("Not all nodes are alive. Skipping system table migration until there are any dead nodes.");
            co_return std::move(guard);
        }

        rtlogger.info("Migrating view_builder to v2");
        // do a barrier to ensure all nodes applied the migration to v1_5 before we continue to v2
        guard = co_await exec_global_command(std::move(guard), raft_topology_cmd::command::barrier, {_raft.id()});

        auto tmptr = get_token_metadata_ptr();
        co_await db::view::view_builder::migrate_to_v2(tmptr, _sys_ks, _sys_ks.query_processor(), _group0.client(), _as, std::move(guard));
        co_return std::nullopt;
    }

    co_return std::move(guard);
}

future<bool> topology_coordinator::maybe_start_tablet_migration(group0_guard guard) {
    rtlogger.debug("Evaluating tablet balance");

    if (utils::get_local_injector().enter("tablet_load_stats_refresh_before_rebalancing")) {
        _tablet_load_stats = make_lw_shared<const locator::load_stats>(co_await refresh_tablet_load_stats());
    }

    auto tm = get_token_metadata_ptr();
    auto plan = co_await _tablet_allocator.balance_tablets(tm, _tablet_load_stats, get_dead_nodes());
    if (plan.empty()) {
        rtlogger.debug("Tablets are balanced");
        co_return false;
    }

    std::vector<canonical_mutation> updates;

    co_await generate_migration_updates(updates, guard, plan);

    // We only want to consider transitioning into tablet resize finalization path, if there's no other work
    // to be done (e.g. start migration or/and emit split decision).
    if (updates.empty()) {
        co_return co_await maybe_start_tablet_resize_finalization(std::move(guard), plan.resize_plan());
    }

    updates.emplace_back(
        topology_mutation_builder(guard.write_timestamp())
            .set_transition_state(topology::transition_state::tablet_migration)
            .set_version(_topo_sm._topology.version + 1)
            .build());

    co_await update_topology_state(std::move(guard), std::move(updates), "Starting tablet migration");
    co_return true;
}

future<bool> topology_coordinator::maybe_start_tablet_resize_finalization(group0_guard guard, const table_resize_plan& plan) {
    if (plan.finalize_resize.empty()) {
        co_return false;
    }
    if (utils::get_local_injector().enter("tablet_split_finalization_postpone")) {
        co_return false;
    }

    std::vector<canonical_mutation> updates;

    updates.emplace_back(
        topology_mutation_builder(guard.write_timestamp())
            .set_transition_state(topology::transition_state::tablet_resize_finalization)
            .set_version(_topo_sm._topology.version + 1)
            .build());

    co_await update_topology_state(std::move(guard), std::move(updates), "Started tablet resize finalization");
    co_return true;
}

future<locator::load_stats> topology_coordinator::refresh_tablet_load_stats() {
    auto tm = get_token_metadata_ptr();

    locator::load_stats stats;
    static constexpr std::chrono::seconds wait_for_live_nodes_timeout{30};

    std::unordered_map<table_id, size_t> total_replicas;

    for (auto& [dc, nodes] : tm->get_datacenter_token_owners_nodes()) {
        locator::load_stats dc_stats;
        rtlogger.info("raft topology: Refreshing table load stats for DC {} that has {} token owners", dc, nodes.size());
        co_await coroutine::parallel_for_each(nodes, [&] (const auto& node) -> future<> {
            auto dst = node.get().host_id();

            _as.check();

            // FIXME: if a node is down, completion status cannot be relied upon, but the average tablet size
            //  could still be inferred from the replicas available.

            auto timeout = netw::messaging_service::clock_type::now() + wait_for_live_nodes_timeout;

            abort_source as;
            auto request_abort = [&as] () mutable noexcept {
                as.request_abort();
            };
            auto t = timer<lowres_clock>(request_abort);
            t.arm(timeout);
            auto sub = _as.subscribe(request_abort);

            auto node_stats = co_await ser::storage_service_rpc_verbs::send_table_load_stats(&_messaging,
                                                                                             dst,
                                                                                             as,
                                                                                             raft::server_id(dst.uuid()));

            dc_stats += node_stats;
        });

        for (auto& [table_id, table_stats] : dc_stats.tables) {
            co_await coroutine::maybe_yield();

            auto& t = _db.find_column_family(table_id);
            auto& rs = t.get_effective_replication_map()->get_replication_strategy();
            if (!rs.uses_tablets()) {
                continue;
            }
            const auto* nts_ptr = dynamic_cast<const locator::network_topology_strategy*>(&rs);
            if (!nts_ptr) {
                on_internal_error(rtlogger, "Cannot convert replication_strategy that uses tablets into network_topology_strategy");
            }

            auto rf_for_this_dc = nts_ptr->get_replication_factor(dc);
            if (rf_for_this_dc <= 0) {
                continue;
            }
            total_replicas[table_id] += rf_for_this_dc;
            rtlogger.debug("raft topology: Refreshed table load stats for DC {}, table={}, RF={}, size_in_bytes={}, split_ready_seq_number={}",
                          dc, table_id, rf_for_this_dc, table_stats.size_in_bytes, table_stats.split_ready_seq_number);
        }

        stats += dc_stats;
    }

    for (auto& [table_id, table_load_stats] : stats.tables) {
        auto table_total_replicas = total_replicas.at(table_id);
        if (table_total_replicas == 0) {
            continue;
        }
        // Takes into account the RF of each DC, so we can compute the average total size
        // for a single table replica. This allows the load balancer to compute, in turn,
        // the average tablet size by dividing total size by tablet count.
        table_load_stats.size_in_bytes /= table_total_replicas;
    }
    rtlogger.debug("raft topology: Refreshed table load stats for all DC(s).");

    co_return std::move(stats);
}

future<> topology_coordinator::start_tablet_load_stats_refresher() {
    auto can_proceed = [this] { return !_async_gate.is_closed() && !_as.abort_requested(); };
    while (can_proceed()) {
        bool sleep = true;
        try {
            _tablet_load_stats = make_lw_shared<const locator::load_stats>(co_await refresh_tablet_load_stats());
            _topo_sm.event.broadcast(); // wake up load balancer.
        } catch (raft::request_aborted&) {
            rtlogger.debug("raft topology: Tablet load stats refresher aborted");
            sleep = false;
        } catch (seastar::abort_requested_exception) {
            rtlogger.debug("raft topology: Tablet load stats refresher aborted");
            sleep = false;
        } catch (...) {
            rtlogger.warn("Found error while refreshing load stats for tablets: {}, retrying...", std::current_exception());
        }
        auto refresh_interval = utils::get_local_injector().is_enabled("short_tablet_stats_refresh_interval") ?
                std::chrono::seconds(1) : tablet_load_stats_refresh_interval;
        if (sleep && can_proceed()) {
            try {
                co_await seastar::sleep_abortable(refresh_interval, _as);
            } catch (...) {
                rtlogger.debug("raft topology: Tablet load stats refresher: sleep failed: {}", std::current_exception());
            }
        }
    }
}

future<> topology_coordinator::do_upgrade_step(group0_guard guard) {
    switch (_topo_sm._topology.upgrade_state) {
    case topology::upgrade_state_type::not_upgraded:
        on_internal_error(rtlogger, std::make_exception_ptr(std::runtime_error(
                "topology_coordinator was started even though upgrade to raft topology was not requested yet")));

    case topology::upgrade_state_type::build_coordinator_state:
        co_await build_coordinator_state(std::move(guard));
        co_return;

    case topology::upgrade_state_type::done:
        on_internal_error(rtlogger, std::make_exception_ptr(std::runtime_error(
                "topology_coordinator::do_upgrade_step called after upgrade was completed")));
    }
}

future<> topology_coordinator::build_coordinator_state(group0_guard guard) {
    // Wait until all nodes reach use_post_raft_procedures
    rtlogger.info("waiting for all nodes to finish upgrade to raft schema");
    release_guard(std::move(guard));
    co_await _group0.wait_for_all_nodes_to_finish_upgrade(_as);

    auto tmptr = get_token_metadata_ptr();

    auto sl_version = co_await _sys_ks.get_service_levels_version();
    if (!sl_version || *sl_version < 2) {
        rtlogger.info("migrating service levels data");
        co_await qos::service_level_controller::migrate_to_v2(tmptr->get_normal_token_owners().size(), _sys_ks, _sys_ks.query_processor(), _group0.client(), _as);
    }

    auto auth_version = co_await _sys_ks.get_auth_version();
    if (auth_version < db::system_keyspace::auth_version_t::v2) {
        rtlogger.info("migrating system_auth keyspace data");
        co_await auth::migrate_to_auth_v2(_sys_ks, _group0.client(),
                [this] (abort_source&) { return start_operation();}, _as);
    }

    rtlogger.info("building initial raft topology state and CDC generation");
    guard = co_await start_operation();

    auto get_application_state = [&] (locator::host_id host_id, const gms::application_state_map& epmap, gms::application_state app_state) -> sstring {
        const auto it = epmap.find(app_state);
        if (it == epmap.end()) {
            throw std::runtime_error(format("failed to build initial raft topology state from gossip for node {}: application state {} is missing in gossip",
                    host_id, app_state));
        }
        // it's versioned_value::value(), not std::optional::value() - it does not throw
        return it->second.value();
    };

    // Create a new CDC generation
    auto get_sharding_info_for_host_id = [&] (locator::host_id host_id) -> std::pair<size_t, uint8_t> {
        const auto eptr = _gossiper.get_endpoint_state_ptr(host_id);
        if (!eptr) {
            throw std::runtime_error(format("no gossiper endpoint state for node {}", host_id));
        }
        const auto& epmap = eptr->get_application_state_map();
        const auto shard_count = std::stoi(get_application_state(host_id, epmap, gms::application_state::SHARD_COUNT));
        const auto ignore_msb = std::stoi(get_application_state(host_id, epmap, gms::application_state::IGNORE_MSB_BITS));
        return std::make_pair<size_t, uint8_t>(shard_count, ignore_msb);
    };
    auto [cdc_gen_uuid, guard_, mutation] = co_await prepare_and_broadcast_cdc_generation_data(tmptr, std::move(guard), std::nullopt, get_sharding_info_for_host_id);
    guard = std::move(guard_);

    topology_mutation_builder builder(guard.write_timestamp());

    std::set<sstring> enabled_features;

    // Build per-node state
    for (const auto& node: tmptr->get_topology().get_nodes()) {
        if (!node.get().is_member()) {
            continue;
        }

        const auto& host_id = node.get().host_id();
        const auto eptr = _gossiper.get_endpoint_state_ptr(host_id);
        if (!eptr) {
            throw std::runtime_error(format("failed to build initial raft topology state from gossip for node {} as gossip contains no data for it", host_id));
        }

        const auto& epmap = eptr->get_application_state_map();

        const auto datacenter = get_application_state(host_id, epmap, gms::application_state::DC);
        const auto rack = get_application_state(host_id, epmap, gms::application_state::RACK);
        const auto tokens_v = tmptr->get_tokens(host_id);
        const std::unordered_set<dht::token> tokens(tokens_v.begin(), tokens_v.end());
        const auto release_version = get_application_state(host_id, epmap, gms::application_state::RELEASE_VERSION);
        const auto num_tokens = tokens.size();
        const auto shard_count = get_application_state(host_id, epmap, gms::application_state::SHARD_COUNT);
        const auto ignore_msb = get_application_state(host_id, epmap, gms::application_state::IGNORE_MSB_BITS);
        const auto supported_features_s = get_application_state(host_id, epmap, gms::application_state::SUPPORTED_FEATURES);
        const auto supported_features = gms::feature_service::to_feature_set(supported_features_s);

        if (enabled_features.empty()) {
            enabled_features = supported_features;
        } else {
            std::erase_if(enabled_features, [&] (const sstring& f) { return !supported_features.contains(f); });
        }

        builder.with_node(raft::server_id(host_id.uuid()))
                .set("datacenter", datacenter)
                .set("rack", rack)
                .set("tokens", tokens)
                .set("node_state", node_state::normal)
                .set("release_version", release_version)
                .set("num_tokens", (uint32_t)num_tokens)
                .set("tokens_string", "")
                .set("shard_count", (uint32_t)std::stoi(shard_count))
                .set("ignore_msb", (uint32_t)std::stoi(ignore_msb))
                .set("cleanup_status", cleanup_status::clean)
                .set("request_id", utils::UUID())
                .set("supported_features", supported_features);

        rtlogger.debug("node {} will contain the following parameters: "
                "datacenter={}, rack={}, tokens={}, shard_count={}, ignore_msb={}, supported_features={}",
                host_id, datacenter, rack, tokens, shard_count, ignore_msb, supported_features);
    }

    // Build the static columns
    const bool add_ts_delay = true; // This is not the first generation, so add delay
    auto cdc_gen_ts = cdc::new_generation_timestamp(add_ts_delay, _ring_delay);

    const cdc::generation_id_v2 cdc_gen_id {
        .ts = cdc_gen_ts,
        .id = cdc_gen_uuid,
    };

    builder.set_version(topology::initial_version)
            .set_fence_version(topology::initial_version)
            .add_new_committed_cdc_generation(cdc_gen_id)
            .add_enabled_features(std::move(enabled_features));

    // Commit
    builder.set_upgrade_state(topology::upgrade_state_type::done);
    auto reason = "upgrade: build the initial state";
    co_await update_topology_state(std::move(guard), {std::move(mutation), builder.build()}, reason);
}

future<> topology_coordinator::fence_previous_coordinator() {
    // Write empty change to make sure that a guard taken by any previous coordinator cannot
    // be used to do a successful write any more. Otherwise the following can theoretically happen
    // while a coordinator tries to execute RPC R and move to state S.
    // 1. Leader A executes topology RPC R
    // 2. Leader A takes guard G
    // 3. Leader A calls update_topology_state(S)
    // 4. Leadership moves to B (while update_topology_state is still running)
    // 5. B executed topology RPC R again
    // 6. while the RPC is running leadership moves to A again
    // 7. A completes update_topology_state(S)
    // Topology state machine moves to state S while RPC R is still running.
    // If RPC is idempotent that should not be a problem since second one executed by B will do nothing,
    // but better to be safe and cut off previous write attempt
    while (!_as.abort_requested()) {
        try {
            auto guard = co_await start_operation();
            topology_mutation_builder builder(guard.write_timestamp());
            co_await update_topology_state(std::move(guard), {builder.build()}, fmt::format("Starting new topology coordinator {}", _group0.group0_server().id()));
            break;
        } catch (group0_concurrent_modification&) {
            // If we failed to write because of concurrent modification lets retry
            continue;
        } catch (raft::request_aborted&) {
            // Abort was requested. Break the loop
            rtlogger.debug("request to fence previous coordinator was aborted");
            break;
        } catch (...) {
            rtlogger.error("failed to fence previous coordinator {}", std::current_exception());
        }
        try {
            co_await seastar::sleep_abortable(std::chrono::seconds(1), _as);
        } catch (abort_requested_exception&) {
            // Abort was requested. Break the loop
            break;
        } catch (...) {
            rtlogger.debug("sleep failed while fencing previous coordinator: {}", std::current_exception());
        }
    }
}

future<> topology_coordinator::rollback_current_topology_op(group0_guard&& guard) {
    rtlogger.info("start rolling back topology change");
    // Look for a node which operation should be aborted
    // (there should be one since we are in the rollback)
    node_to_work_on node = get_node_to_work_on(std::move(guard));
    topology::transition_state transition_state;

    switch (node.rs->state) {
        case node_state::bootstrapping:
            [[fallthrough]];
        case node_state::replacing:
            // To rollback bootstrap of replace just move the topology to left_token_ring. The node we tried to
            // add will be removed from the group0 by the transition handler. It will also be notified to shutdown.
            transition_state = topology::transition_state::left_token_ring;
            break;
        case node_state::removing: {
            auto id = node.id;
            // The node was removed already. We need to add it back. Lets do it as non voter.
            // If it ever boots again it will make itself a voter.
            release_node(std::move(node));
            co_await _group0.group0_server().modify_config({raft::config_member{{id, {}}, false}}, {}, &_as);
            node = retake_node(co_await start_operation(), id);
        }
            [[fallthrough]];
        case node_state::decommissioning:
            // To rollback decommission or remove just move the topology to rollback_to_normal.
            transition_state = topology::transition_state::rollback_to_normal;
            break;
        default:
            on_internal_error(rtlogger, fmt::format("tried to rollback in unsupported state {}", node.rs->state));
    }

    topology_mutation_builder builder(node.guard.write_timestamp());
    topology_request_tracking_mutation_builder rtbuilder(node.rs->request_id);
    builder.set_transition_state(transition_state)
           .set_version(_topo_sm._topology.version + 1);
    rtbuilder.set("error", fmt::format("Rolled back: {}", *_rollback));

    std::vector<canonical_mutation> muts;
    // We are in the process of aborting remove or decommission which may have streamed some
    // ranges to other nodes. Cleanup is needed.
    muts = mark_nodes_as_cleanup_needed(node, true);
    muts.emplace_back(builder.build());
    muts.emplace_back(rtbuilder.build());

    std::string str = fmt::format("rollback {} after {} failure, moving transition state to {} and setting cleanup flag",
            node.id, node.rs->state, transition_state);
    rtlogger.info("{}", str);
    co_await update_topology_state(std::move(node.guard), std::move(muts), str);
}

bool topology_coordinator::handle_topology_coordinator_error(std::exception_ptr eptr) noexcept {
    try {
        std::rethrow_exception(std::move(eptr));
    } catch (raft::request_aborted&) {
        rtlogger.debug("topology change coordinator fiber aborted");
    } catch (seastar::abort_requested_exception&) {
        rtlogger.debug("topology change coordinator fiber aborted");
    } catch (raft::commit_status_unknown&) {
        rtlogger.warn("topology change coordinator fiber got commit_status_unknown");
    } catch (group0_concurrent_modification&) {
        rtlogger.info("topology change coordinator fiber got group0_concurrent_modification");
    } catch (topology_coordinator::term_changed_error&) {
        // Term changed. We may no longer be a leader
        rtlogger.debug("topology change coordinator fiber notices term change {} -> {}", _term, _raft.get_current_term());
    } catch (...) {
        rtlogger.error("topology change coordinator fiber got error {}", std::current_exception());
        return true;
    }
    return false;
}

future<bool> topology_coordinator::maybe_run_upgrade() {
    if (_topo_sm._topology.upgrade_state == topology::upgrade_state_type::done) {
        // Upgrade was already done, nothing to do
        co_return true;
    }

    rtlogger.info("topology coordinator fiber is upgrading the cluster to raft topology mode");

    auto abort = _as.subscribe([this] () noexcept {
        _topo_sm.event.broadcast();
    });

    while (!_as.abort_requested() && _topo_sm._topology.upgrade_state != topology::upgrade_state_type::done) {
        bool sleep = false;
        try {
            auto guard = co_await start_operation();
            co_await do_upgrade_step(std::move(guard));
        } catch (...) {
            sleep = handle_topology_coordinator_error(std::current_exception());
        }
        if (sleep) {
            try {
                co_await seastar::sleep_abortable(std::chrono::seconds(1), _as);
            } catch (...) {
                rtlogger.debug("sleep failed: {}", std::current_exception());
            }
        }
        co_await coroutine::maybe_yield();
    }

    co_return !_as.abort_requested();
}

future<> topology_coordinator::run() {
    auto abort = _as.subscribe([this] () noexcept {
        _topo_sm.event.broadcast();
    });

    co_await fence_previous_coordinator();
    auto cdc_generation_publisher = cdc_generation_publisher_fiber();
    auto tablet_load_stats_refresher = start_tablet_load_stats_refresher();
    auto gossiper_orphan_remover = gossiper_orphan_remover_fiber();

    while (!_as.abort_requested()) {
        bool sleep = false;
        try {
            co_await utils::get_local_injector().inject("topology_coordinator_pause_before_processing_backlog", utils::wait_for_message(5min));
            auto guard = co_await cleanup_group0_config_if_needed(co_await start_operation());

            if (_rollback) {
                co_await rollback_current_topology_op(std::move(guard));
                _rollback = std::nullopt;
                continue;
            }

            bool had_work = co_await handle_topology_transition(std::move(guard));

            if (!had_work) {
                // Nothing to work on. Wait for topology change event.
                rtlogger.debug("topology coordinator fiber has nothing to do. Sleeping.");
                co_await await_event();
                rtlogger.debug("topology coordinator fiber got an event");
            }
            co_await utils::get_local_injector().inject("wait-after-topology-coordinator-gets-event", utils::wait_for_message(30s));
        } catch (...) {
            sleep = handle_topology_coordinator_error(std::current_exception());
        }
        if (sleep) {
            try {
                co_await seastar::sleep_abortable(std::chrono::seconds(1), _as);
            } catch (...) {
                rtlogger.debug("sleep failed: {}", std::current_exception());
            }
        }
        co_await coroutine::maybe_yield();
    }

    co_await _async_gate.close();
    co_await std::move(tablet_load_stats_refresher);
    co_await std::move(cdc_generation_publisher);
    co_await std::move(gossiper_orphan_remover);
}

future<> topology_coordinator::stop() {
    // if topology_coordinator::run() is aborted either because we are not a
    // leader anymore, or we are shutting down as a leader, we have to handle
    // futures in _tablets in case any of them failed, before these failures
    // are checked by future's destructor.
    co_await coroutine::parallel_for_each(_tablets, [] (auto& tablet) -> future<> {
        auto& [gid, tablet_state] = tablet;
        rtlogger.debug("Checking tablet migration state for {}", gid);
        // we should have at most only a single active barrier for each tablet,
        // but let's check all of them because we never reset these holders
        // once they are added as barriers
        for (auto& [stage, barrier]: tablet_state.barriers) {
            SCYLLA_ASSERT(barrier.has_value());
            try {
                co_await std::move(*barrier);
            } catch (...) {
                rtlogger.warn("Tablet '{}' migration failed at stage {}: {}",
                              gid, tablet_transition_stage_to_string(stage), std::current_exception());
            }
        }

        if (tablet_state.streaming) {
            try {
                co_await std::move(*tablet_state.streaming);
            } catch (...) {
                rtlogger.warn("Tablet '{}' migration failed when streaming: {}",
                              gid, std::current_exception());

            }
        }
        if (tablet_state.cleanup) {
            try {
                co_await std::move(*tablet_state.cleanup);
            } catch (...) {
                rtlogger.warn("Tablet '{}' migration failed when cleanup: {}",
                              gid, std::current_exception());
            }
        }
    });
}

future<> run_topology_coordinator(
        seastar::sharded<db::system_distributed_keyspace>& sys_dist_ks, gms::gossiper& gossiper,
        netw::messaging_service& messaging, locator::shared_token_metadata& shared_tm,
        db::system_keyspace& sys_ks, replica::database& db, service::raft_group0& group0,
        service::topology_state_machine& topo_sm, seastar::abort_source& as, raft::server& raft,
        raft_topology_cmd_handler_type raft_topology_cmd_handler,
        tablet_allocator& tablet_allocator,
        std::chrono::milliseconds ring_delay,
        endpoint_lifecycle_notifier& lifecycle_notifier,
        gms::feature_service& feature_service) {

    topology_coordinator coordinator{
            sys_dist_ks, gossiper, messaging, shared_tm,
            sys_ks, db, group0, topo_sm, as, raft,
            std::move(raft_topology_cmd_handler),
            tablet_allocator,
            ring_delay,
            feature_service};

    std::exception_ptr ex;
    lifecycle_notifier.register_subscriber(&coordinator);
    try {
        rtlogger.info("start topology coordinator fiber");
        const bool upgrade_done = co_await coordinator.maybe_run_upgrade();
        if (upgrade_done) {
            co_await with_scheduling_group(group0.get_scheduling_group(), [&] {
                return coordinator.run();
            });
        }
    } catch (...) {
        ex = std::current_exception();
    }
    if (ex) {
        try {
            if (raft.is_leader()) {
                rtlogger.warn("unhandled exception in topology_coordinator::run: {}; stepping down as a leader", ex);
                const auto stepdown_timeout_ticks = std::chrono::seconds(5) / raft_tick_interval;
                co_await raft.stepdown(raft::logical_clock::duration(stepdown_timeout_ticks));
            }
        } catch (...) {
            rtlogger.error("failed to step down before aborting: {}", std::current_exception());
        }
        on_fatal_internal_error(rtlogger, format("unhandled exception in topology_coordinator::run: {}", ex));
    }
    co_await lifecycle_notifier.unregister_subscriber(&coordinator);
    co_await coordinator.stop();
}

} // namespace service