diff --git a/docs/dev/system_keyspace.md b/docs/dev/system_keyspace.md index 9f105a8bf1..cf8c69ac2d 100644 --- a/docs/dev/system_keyspace.md +++ b/docs/dev/system_keyspace.md @@ -202,6 +202,7 @@ CREATE TABLE system.tablets ( new_replicas frozen>>>, replicas frozen>>>, stage text, + transition text, table_name text static, tablet_count int static, PRIMARY KEY ((keyspace_name, table_id), last_token) @@ -225,9 +226,13 @@ Only tables which use tablet-based replication strategy have an entry here. Each tablet is represented by a single row. `replicas` holds the set of shard-replicas of the tablet. It's a list of tuples where the first element is `host_id` of the replica and the second element is the `shard_id` of the replica. -During tablet migration, the columns `new_replicas` and `stage` are set to represent the transition. The +During tablet migration, the columns `new_replicas`, `stage` and `transition` are set to represent the transition. The `new_replicas` column holds what will be put in `replicas` after transition is done. +The `transition` column can have the following values: + * `migration` - One tablet replica is moving from one shard to another. + * `rebuild` - New tablet replica is created from the remaining replicas. + # Virtual tables in the system keyspace Virtual tables behave just like a regular table from the user's point of view. diff --git a/docs/dev/topology-over-raft.md b/docs/dev/topology-over-raft.md index 113adb331b..75cbc9c497 100644 --- a/docs/dev/topology-over-raft.md +++ b/docs/dev/topology-over-raft.md @@ -104,19 +104,28 @@ that there are no tablet transitions in the system. Tablets are migrated in parallel and independently. There is a variant of tablet migration track called tablet draining track, which is invoked -as a step of certain topology operations (e.g. decommission). Its goal is to readjust tablet replicas +as a step of certain topology operations (e.g. decommission, removenode, replace). 
Its goal is to readjust tablet replicas so that a given topology change can proceed. For example, when decommissioning a node, we need to migrate tablet replicas away from the node being decommissioned. Tablet draining happens before making changes to vnode-based replication. -# Tablet migration +# Tablet transitions -Each tablet has its own migration state machine stored in group0 which is part of the tablet state. It involves +Tablets can undergo a process called "transition", which performs some maintenance action on the tablet and is +globally driven by the topology change coordinator and serialized per-tablet. A transition can be one of: + + * migration - a tablet replica is moved from one shard to another (possibly on a different node) + + * rebuild - a new tablet replica is rebuilt from the existing ones, possibly dropping the old replica afterwards (on node removal or replace) + +Each tablet has its own state machine for tracking the state of its transition, stored in group0 as part of the tablet state. It involves these properties of a tablet: - - replicas: the old replicas of a table - - new_replicas: the new replicas of a tablet + - replicas: the old replicas of a tablet (also set when not in transition) + - new_replicas: the new replicas of a tablet, which will become current after the transition - stage: determines which replicas should be used by requests on the coordinator side, and which action should be taken by the state machine executor. + - transition: the kind of tablet transition (migration, rebuild, etc.). Affects the behavior of stages and actions + performed in those stages. Currently, the tablet state machine is driven forward by the tablet migration track of the topology state machine. 
diff --git a/locator/tablets.cc b/locator/tablets.cc index 91d7add407..21109ac815 100644 --- a/locator/tablets.cc +++ b/locator/tablets.cc @@ -66,10 +66,12 @@ read_replica_set_selector get_selector_for_reads(tablet_transition_stage stage) } tablet_transition_info::tablet_transition_info(tablet_transition_stage stage, + tablet_transition_kind transition, tablet_replica_set next, tablet_replica pending_replica, service::session_id session_id) : stage(stage) + , transition(transition) , next(std::move(next)) , pending_replica(std::move(pending_replica)) , session_id(session_id) @@ -262,6 +264,31 @@ tablet_transition_stage tablet_transition_stage_from_string(const sstring& name) return tablet_transition_stage_from_name.at(name); } +// The names are persisted in system tables so should not be changed. +static const std::unordered_map tablet_transition_kind_to_name = { + {tablet_transition_kind::migration, "migration"}, +}; + +static const std::unordered_map tablet_transition_kind_from_name = std::invoke([] { + std::unordered_map result; + for (auto&& [v, s] : tablet_transition_kind_to_name) { + result.emplace(s, v); + } + return result; +}); + +sstring tablet_transition_kind_to_string(tablet_transition_kind kind) { + auto i = tablet_transition_kind_to_name.find(kind); + if (i == tablet_transition_kind_to_name.end()) { + on_internal_error(tablet_logger, format("Invalid tablet transition kind: {}", static_cast(kind))); + } + return i->second; +} + +tablet_transition_kind tablet_transition_kind_from_string(const sstring& name) { + return tablet_transition_kind_from_name.at(name); +} + std::ostream& operator<<(std::ostream& out, tablet_id id) { return out << size_t(id); } @@ -582,3 +609,8 @@ auto fmt::formatter::format(const locator::tab -> decltype(ctx.out()) { return fmt::format_to(ctx.out(), "{}", locator::tablet_transition_stage_to_string(stage)); } + +auto fmt::formatter::format(const locator::tablet_transition_kind& kind, fmt::format_context& ctx) const + -> 
decltype(ctx.out()) { + return fmt::format_to(ctx.out(), "{}", locator::tablet_transition_kind_to_string(kind)); +} diff --git a/locator/tablets.hh b/locator/tablets.hh index 238b992df0..8dde6a189b 100644 --- a/locator/tablets.hh +++ b/locator/tablets.hh @@ -158,8 +158,17 @@ enum class tablet_transition_stage { end_migration, }; +enum class tablet_transition_kind { + // Tablet replica is migrating from one shard to another. + // The new replica is (tablet_transition_info::next - tablet_info::replicas). + // The leaving replica is (tablet_info::replicas - tablet_transition_info::next). + migration, +}; + sstring tablet_transition_stage_to_string(tablet_transition_stage); tablet_transition_stage tablet_transition_stage_from_string(const sstring&); +sstring tablet_transition_kind_to_string(tablet_transition_kind); +tablet_transition_kind tablet_transition_kind_from_string(const sstring&); enum class write_replica_set_selector { previous, both, next @@ -173,13 +182,17 @@ enum class read_replica_set_selector { /// Describes transition of a single tablet. 
struct tablet_transition_info { tablet_transition_stage stage; + tablet_transition_kind transition; tablet_replica_set next; tablet_replica pending_replica; // Optimization (next - tablet_info::replicas) service::session_id session_id; write_replica_set_selector writes; read_replica_set_selector reads; - tablet_transition_info(tablet_transition_stage stage, tablet_replica_set next, tablet_replica pending_replica, + tablet_transition_info(tablet_transition_stage stage, + tablet_transition_kind kind, + tablet_replica_set next, + tablet_replica pending_replica, service::session_id session_id = {}); bool operator==(const tablet_transition_info&) const = default; @@ -381,6 +394,11 @@ struct fmt::formatter : fmt::formatter decltype(ctx.out()); }; +template <> +struct fmt::formatter : fmt::formatter { + auto format(const locator::tablet_transition_kind&, fmt::format_context& ctx) const -> decltype(ctx.out()); +}; + template <> struct fmt::formatter : fmt::formatter { auto format(const locator::global_tablet_id&, fmt::format_context& ctx) const -> decltype(ctx.out()); diff --git a/replica/tablet_mutation_builder.hh b/replica/tablet_mutation_builder.hh index 12bdf25a87..3d0e01710a 100644 --- a/replica/tablet_mutation_builder.hh +++ b/replica/tablet_mutation_builder.hh @@ -35,6 +35,7 @@ public: tablet_mutation_builder& set_new_replicas(dht::token last_token, locator::tablet_replica_set replicas); tablet_mutation_builder& set_replicas(dht::token last_token, locator::tablet_replica_set replicas); tablet_mutation_builder& set_stage(dht::token last_token, locator::tablet_transition_stage stage); + tablet_mutation_builder& set_transition(dht::token last_token, locator::tablet_transition_kind); tablet_mutation_builder& set_session(dht::token last_token, service::session_id); tablet_mutation_builder& del_session(dht::token last_token); tablet_mutation_builder& del_transition(dht::token last_token); diff --git a/replica/tablets.cc b/replica/tablets.cc index a6d6872716..74d88e33ed 
100644 --- a/replica/tablets.cc +++ b/replica/tablets.cc @@ -48,6 +48,7 @@ schema_ptr make_tablets_schema() { .with_column("replicas", replica_set_type) .with_column("new_replicas", replica_set_type) .with_column("stage", utf8_type) + .with_column("transition", utf8_type) .with_column("session", uuid_type) .with_version(db::system_keyspace::generate_schema_version(id)) .build(); @@ -87,6 +88,7 @@ tablet_map_to_mutation(const tablet_map& tablets, table_id id, const sstring& ke m.set_clustered_cell(ck, "replicas", make_list_value(replica_set_type, replicas_to_data_value(tablet.replicas)), ts); if (auto tr_info = tablets.get_tablet_transition_info(tid)) { m.set_clustered_cell(ck, "stage", tablet_transition_stage_to_string(tr_info->stage), ts); + m.set_clustered_cell(ck, "transition", tablet_transition_kind_to_string(tr_info->transition), ts); m.set_clustered_cell(ck, "new_replicas", make_list_value(replica_set_type, replicas_to_data_value(tr_info->next)), ts); if (tr_info->session_id) { m.set_clustered_cell(ck, "session", data_value(tr_info->session_id.uuid()), ts); @@ -116,6 +118,12 @@ tablet_mutation_builder::set_stage(dht::token last_token, locator::tablet_transi return *this; } +tablet_mutation_builder& +tablet_mutation_builder::set_transition(dht::token last_token, locator::tablet_transition_kind kind) { + _m.set_clustered_cell(get_ck(last_token), "transition", data_value(tablet_transition_kind_to_string(kind)), _ts); + return *this; +} + tablet_mutation_builder& tablet_mutation_builder::set_session(dht::token last_token, service::session_id session_id) { _m.set_clustered_cell(get_ck(last_token), "session", data_value(session_id.uuid()), _ts); @@ -134,6 +142,8 @@ tablet_mutation_builder::del_transition(dht::token last_token) { auto ck = get_ck(last_token); auto stage_col = _s->get_column_definition("stage"); _m.set_clustered_cell(ck, *stage_col, atomic_cell::make_dead(_ts, gc_clock::now())); + auto transition_col = _s->get_column_definition("transition"); + 
_m.set_clustered_cell(ck, *transition_col, atomic_cell::make_dead(_ts, gc_clock::now())); auto new_replicas_col = _s->get_column_definition("new_replicas"); _m.set_clustered_cell(ck, *new_replicas_col, atomic_cell::make_dead(_ts, gc_clock::now())); auto session_col = _s->get_column_definition("session"); @@ -212,6 +222,7 @@ future read_tablet_metadata(cql3::query_processor& qp) { if (row.has("stage")) { auto stage = tablet_transition_stage_from_string(row.get_as("stage")); + auto transition = tablet_transition_kind_from_string(row.get_as("transition")); std::unordered_set pending(new_tablet_replicas.begin(), new_tablet_replicas.end()); for (auto&& r : tablet_replicas) { @@ -229,7 +240,7 @@ future read_tablet_metadata(cql3::query_processor& qp) { if (row.has("session")) { session_id = service::session_id(row.get_as("session")); } - current->map.set_tablet_transition_info(current->tid, tablet_transition_info{stage, + current->map.set_tablet_transition_info(current->tid, tablet_transition_info{stage, transition, std::move(new_tablet_replicas), *pending.begin(), session_id}); } diff --git a/service/storage_service.cc b/service/storage_service.cc index c4241d27f1..5d0b77810f 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -2046,6 +2046,7 @@ class topology_coordinator : public endpoint_lifecycle_subscriber { replica::tablet_mutation_builder(guard.write_timestamp(), mig.tablet.table) .set_new_replicas(last_token, replace_replica(tmap.get_tablet_info(mig.tablet.tablet).replicas, mig.src, mig.dst)) .set_stage(last_token, locator::tablet_transition_stage::allow_write_both_read_old) + .set_transition(last_token, mig.kind) .build()); } @@ -7605,6 +7606,7 @@ future<> storage_service::move_tablet(table_id table, dht::token token, locator: updates.push_back(canonical_mutation(replica::tablet_mutation_builder(guard.write_timestamp(), table) .set_new_replicas(last_token, locator::replace_replica(tinfo.replicas, src, dst)) .set_stage(last_token, 
locator::tablet_transition_stage::allow_write_both_read_old) + .set_transition(last_token, locator::tablet_transition_kind::migration) .build())); updates.push_back(canonical_mutation(topology_mutation_builder(guard.write_timestamp()) .set_transition_state(topology::transition_state::tablet_migration) diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc index 9a8a755bd9..6a7bea0a90 100644 --- a/service/tablet_allocator.cc +++ b/service/tablet_allocator.cc @@ -721,7 +721,7 @@ public: auto& target_load_sketch = co_await target_info.get_load_sketch(_tm); auto dst = global_shard_id {target, target_load_sketch.next_shard(target)}; - auto mig = tablet_migration_info {source_tablet, src, dst}; + auto mig = tablet_migration_info {tablet_transition_kind::migration, source_tablet, src, dst}; if (target_info.shards[dst.shard].streaming_write_load < max_write_streaming_load && src_node_info.shards[src_shard].streaming_read_load < max_read_streaming_load) { diff --git a/service/tablet_allocator.hh b/service/tablet_allocator.hh index 064b01a0b4..6d8782a2ff 100644 --- a/service/tablet_allocator.hh +++ b/service/tablet_allocator.hh @@ -15,6 +15,7 @@ namespace service { /// Represents intention to move a single tablet replica from src to dst. 
struct tablet_migration_info { + locator::tablet_transition_kind kind; locator::global_tablet_id tablet; locator::tablet_replica src; locator::tablet_replica dst; diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index 6bf399f801..59bcc6bdd2 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -140,6 +140,7 @@ SEASTAR_TEST_CASE(test_tablet_metadata_persistence) { tmap.set_tablet_transition_info(tb, tablet_transition_info{ tablet_transition_stage::allow_write_both_read_old, + tablet_transition_kind::migration, tablet_replica_set { tablet_replica {h3, 3}, tablet_replica {h1, 7}, @@ -150,6 +151,7 @@ SEASTAR_TEST_CASE(test_tablet_metadata_persistence) { tb = *tmap.next_tablet(tb); tmap.set_tablet_transition_info(tb, tablet_transition_info{ tablet_transition_stage::use_new, + tablet_transition_kind::migration, tablet_replica_set { tablet_replica {h1, 4}, tablet_replica {h2, 2}, @@ -262,6 +264,7 @@ SEASTAR_TEST_CASE(test_get_shard) { }); tmap.set_tablet_transition_info(tid, tablet_transition_info { tablet_transition_stage::allow_write_both_read_old, + tablet_transition_kind::migration, tablet_replica_set { tablet_replica {h1, 0}, tablet_replica {h2, 3}, @@ -326,6 +329,7 @@ SEASTAR_TEST_CASE(test_mutation_builder) { tablet_replica {h2, 3}, }); b.set_stage(last_token, tablet_transition_stage::write_both_read_new); + b.set_transition(last_token, tablet_transition_kind::migration); e.local_db().apply({freeze(b.build())}, db::no_timeout).get(); } @@ -347,6 +351,7 @@ SEASTAR_TEST_CASE(test_mutation_builder) { }); expected_tmap.set_tablet_transition_info(tid1, tablet_transition_info { tablet_transition_stage::write_both_read_new, + tablet_transition_kind::migration, tablet_replica_set { tablet_replica {h1, 2}, tablet_replica {h2, 3}, @@ -362,6 +367,7 @@ SEASTAR_TEST_CASE(test_mutation_builder) { tablet_mutation_builder b(ts++, table1); auto last_token = tm.get_tablet_map(table1).get_last_token(tid1); b.set_stage(last_token, 
tablet_transition_stage::use_new); + b.set_transition(last_token, tablet_transition_kind::migration); e.local_db().apply({freeze(b.build())}, db::no_timeout).get(); } @@ -383,6 +389,7 @@ SEASTAR_TEST_CASE(test_mutation_builder) { }); expected_tmap.set_tablet_transition_info(tid1, tablet_transition_info { tablet_transition_stage::use_new, + tablet_transition_kind::migration, tablet_replica_set { tablet_replica {h1, 2}, tablet_replica {h2, 3}, @@ -471,6 +478,7 @@ SEASTAR_TEST_CASE(test_sharder) { }); tmap.set_tablet_transition_info(tid, tablet_transition_info { tablet_transition_stage::use_new, + tablet_transition_kind::migration, tablet_replica_set { tablet_replica {h1, 1}, tablet_replica {h2, 3}, @@ -608,6 +616,7 @@ static tablet_transition_info migration_to_transition_info(const tablet_migration_info& mig, const tablet_info& ti) { return tablet_transition_info { tablet_transition_stage::allow_write_both_read_old, + mig.kind, replace_replica(ti.replicas, mig.src, mig.dst), mig.dst }; @@ -1142,6 +1151,7 @@ SEASTAR_THREAD_TEST_CASE(test_load_balancing_works_with_in_progress_transitions) } tmap.set_tablet_transition_info(tmap.first_tablet(), tablet_transition_info { tablet_transition_stage::allow_write_both_read_old, + tablet_transition_kind::migration, tablet_replica_set { tablet_replica {host3, 0}, tablet_replica {host2, 0},