diff --git a/configure.py b/configure.py index 75da9a447c..59f61ed2f5 100755 --- a/configure.py +++ b/configure.py @@ -931,6 +931,7 @@ scylla_core = (['message/messaging_service.cc', 'query.cc', 'query-result-set.cc', 'locator/abstract_replication_strategy.cc', + 'locator/tablets.cc', 'locator/azure_snitch.cc', 'locator/simple_strategy.cc', 'locator/local_strategy.cc', diff --git a/dht/token.cc b/dht/token.cc index 3f1d5e960e..ba868b2347 100644 --- a/dht/token.cc +++ b/dht/token.cc @@ -282,4 +282,14 @@ compaction_group_of(unsigned most_significant_bits, const token& t) { __builtin_unreachable(); } +token last_token_of_compaction_group(unsigned most_significant_bits, size_t group) { + uint64_t n; + if (group == ((1ul << most_significant_bits) - 1)) { + n = std::numeric_limits::max(); + } else { + n = ((uint64_t(group) + 1) << (64 - most_significant_bits)) - 1; + } + return bias(n); +} + } // namespace dht diff --git a/dht/token.hh b/dht/token.hh index 1701139cef..c000c2865f 100644 --- a/dht/token.hh +++ b/dht/token.hh @@ -238,6 +238,7 @@ token first_token() { uint64_t unbias(const token& t); token bias(uint64_t n); size_t compaction_group_of(unsigned most_significant_bits, const token& t); +token last_token_of_compaction_group(unsigned most_significant_bits, size_t group); } // namespace dht diff --git a/locator/tablets.cc b/locator/tablets.cc new file mode 100644 index 0000000000..baf0ba2851 --- /dev/null +++ b/locator/tablets.cc @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2023-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#include "locator/tablet_replication_strategy.hh" +#include "locator/tablets.hh" +#include "types/types.hh" +#include "types/tuple.hh" +#include "types/set.hh" +#include "utils/hash.hh" +#include "db/system_keyspace.hh" +#include "cql3/query_processor.hh" +#include "cql3/untyped_result_set.hh" +#include "replica/database.hh" +#include "utils/stall_free.hh" + +#include +#include + +namespace locator { + +seastar::logger tablet_logger("tablets"); + +const tablet_map& tablet_metadata::get_tablet_map(table_id id) const { + try { + return _tablets.at(id); + } catch (const std::out_of_range&) { + throw std::runtime_error(format("Tablet map not found for table {}", id)); + } +} + +tablet_map& tablet_metadata::get_tablet_map(table_id id) { + return const_cast( + const_cast(this)->get_tablet_map(id)); +} + +void tablet_metadata::set_tablet_map(table_id id, tablet_map map) { + _tablets.insert_or_assign(id, std::move(map)); +} + +future<> tablet_metadata::clear_gently() { + for (auto&& [id, map] : _tablets) { + co_await map.clear_gently(); + } + co_return; +} + +tablet_map::tablet_map(size_t tablet_count) + : _log2_tablets(log2ceil(tablet_count)) { + if (tablet_count != 1ul << _log2_tablets) { + on_internal_error(tablet_logger, format("Tablet count not a power of 2: {}", tablet_count)); + } + _tablets.resize(tablet_count); +} + +void tablet_map::check_tablet_id(tablet_id id) const { + if (size_t(id) >= tablet_count()) { + throw std::logic_error(format("Invalid tablet id: {} >= {}", id, tablet_count())); + } +} + +const tablet_info& tablet_map::get_tablet_info(tablet_id id) const { + check_tablet_id(id); + return _tablets[size_t(id)]; +} + +tablet_id tablet_map::get_tablet_id(token t) const { + return tablet_id(dht::compaction_group_of(_log2_tablets, t)); +} + +dht::token tablet_map::get_last_token(tablet_id id) const { + check_tablet_id(id); + return dht::last_token_of_compaction_group(_log2_tablets, size_t(id)); +} + +dht::token tablet_map::get_first_token(tablet_id id) const { + if (id == first_tablet()) { + return dht::first_token(); + } else { + return dht::next_token(get_last_token(tablet_id(size_t(id) - 1))); + } +} + +dht::token_range tablet_map::get_token_range(tablet_id id) const { + if (id == first_tablet()) { + return dht::token_range::make({dht::minimum_token(), false}, {get_last_token(id), true}); + } else { + return dht::token_range::make({get_last_token(tablet_id(size_t(id) - 1)), false}, {get_last_token(id), true}); + } +} + +void tablet_map::set_tablet(tablet_id id, tablet_info info) { + check_tablet_id(id); + _tablets[size_t(id)] = std::move(info); +} + +void tablet_map::set_tablet_transition_info(tablet_id id, tablet_transition_info info) { + check_tablet_id(id); + _transitions.insert_or_assign(id, std::move(info)); +} + +future<> tablet_map::clear_gently() { + return utils::clear_gently(_tablets); +} + +const tablet_transition_info* tablet_map::get_tablet_transition_info(tablet_id id) const { + auto i = _transitions.find(id); + if (i == _transitions.end()) { + return nullptr; + } + return &i->second; +} + +} diff --git a/locator/tablets.hh b/locator/tablets.hh new file mode 100644 index 0000000000..019909fbf6 --- /dev/null +++ b/locator/tablets.hh @@ -0,0 +1,227 @@ +/* + * Copyright (C) 2023-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +#pragma once + +#include "dht/token.hh" +#include "utils/small_vector.hh" +#include "locator/host_id.hh" +#include "dht/i_partitioner_fwd.hh" +#include "schema/schema_fwd.hh" +#include "utils/chunked_vector.hh" +#include "utils/hash.hh" + +#include +#include +#include + +#include + +namespace locator { + +extern seastar::logger tablet_logger; + +using token = dht::token; + +// Identifies tablet within the scope of a single tablet_map, +// which has a scope of (table_id, token metadata version). +// Different tablets of different tables can have the same tablet_id. +// Different tablets in subsequent token metadata version can have the same tablet_id. +// When splitting a tablet, one of the new tablets (in the new token metadata version) +// will have the same tablet_id as the old one. +enum class tablet_id : size_t; + +struct tablet_replica { + host_id host; + shard_id shard; + + bool operator==(const tablet_replica&) const = default; +}; + +std::ostream& operator<<(std::ostream&, const tablet_replica&); + +using tablet_replica_set = utils::small_vector; + +/// Stores information about a single tablet. +struct tablet_info { + tablet_replica_set replicas; + + std::optional get_shard(host_id host) const { + for (auto&& r : replicas) { + if (r.host == host) { + return r.shard; + } + } + return std::nullopt; + } + + bool operator==(const tablet_info&) const = default; +}; + +/// Used for storing tablet state transition during topology changes. +/// Describes transition of a single tablet. +struct tablet_transition_info { + tablet_replica_set next; + tablet_replica pending_replica; // Optimization (next - tablet_info::replicas) + + bool operator==(const tablet_transition_info&) const = default; +}; + +/// Stores information about tablets of a single table. +/// +/// The map contains a constant number of tablets, tablet_count(). +/// Each tablet has an associated tablet_info, and an optional tablet_transition_info. +/// Any given token is owned by exactly one tablet in this map. +/// +/// A tablet map describes the whole ring, it cannot contain a partial mapping. +/// This means that the following sequence is always valid: +/// +/// tablet_map& tmap = ...; +/// dht::token t = ...; +/// tablet_id id = tmap.get_tablet_id(t); +/// tablet_info& info = tmap.get_tablet_info(id); +/// +/// A tablet_id obtained from an instance of tablet_map is valid for that instance only. +class tablet_map { +public: + using tablet_container = utils::chunked_vector; +private: + // The implementation assumes that _tablets.size() is a power of 2: + // + // _tablets.size() == 1 << _log2_tablets + // + tablet_container _tablets; + size_t _log2_tablets; // log_2(_tablets.size()) + std::unordered_map _transitions; +public: + /// Constructs a tablet map. + /// + /// \param tablet_count The desired tablets to allocate. Must be a power of two. + explicit tablet_map(size_t tablet_count); + + /// Returns tablet_id of a tablet which owns a given token. + tablet_id get_tablet_id(token) const; + + /// Returns tablet_info associated with a given tablet. + /// The given id must belong to this instance. + const tablet_info& get_tablet_info(tablet_id) const; + + /// Returns a pointer to tablet_transition_info associated with a given tablet. + /// If there is no transition for a given tablet, returns nullptr. + /// \throws std::logic_error If the given id does not belong to this instance. + const tablet_transition_info* get_tablet_transition_info(tablet_id) const; + + /// Returns the largest token owned by a given tablet. + /// \throws std::logic_error If the given id does not belong to this instance. + dht::token get_last_token(tablet_id id) const; + + /// Returns the smallest token owned by a given tablet. + /// \throws std::logic_error If the given id does not belong to this instance. + dht::token get_first_token(tablet_id id) const; + + /// Returns token_range which contains all tokens owned by a given tablet and only such tokens. + /// \throws std::logic_error If the given id does not belong to this instance. + dht::token_range get_token_range(tablet_id id) const; + + /// Returns the id of the first tablet. + tablet_id first_tablet() const { + return tablet_id(0); + } + + /// Returns the id of the last tablet. + tablet_id last_tablet() const { + return tablet_id(tablet_count() - 1); + } + + /// Returns the id of a tablet which follows a given tablet in the ring, + /// or disengaged optional if the given tablet is the last one. + std::optional next_tablet(tablet_id t) const { + if (t == last_tablet()) { + return std::nullopt; + } + return tablet_id(size_t(t) + 1); + } + + const tablet_container& tablets() const { + return _tablets; + } + + /// Returns an iterable range over tablet_id:s which includes all tablets in token ring order. + auto tablet_ids() const { + return boost::irange(0, tablet_count()) | boost::adaptors::transformed([] (size_t i) { + return tablet_id(i); + }); + } + + size_t tablet_count() const { + return _tablets.size(); + } + + /// Returns tablet_info associated with the tablet which owns a given token. + const tablet_info& get_tablet_info(token t) const { + return get_tablet_info(get_tablet_id(t)); + } + + bool operator==(const tablet_map&) const = default; +public: + void set_tablet(tablet_id, tablet_info); + void set_tablet_transition_info(tablet_id, tablet_transition_info); + + // Destroys gently. + // The tablet map is not usable after this call and should be destroyed. + future<> clear_gently(); +private: + void check_tablet_id(tablet_id) const; +}; + +/// Holds information about all tablets in the cluster. +/// +/// When this instance is obtained via token_metadata_ptr, it is immutable +/// (represents a snapshot) and references obtained through this are guaranteed +/// to remain valid as long as the containing token_metadata_ptr is held. +/// +/// Copy constructor can be invoked across shards. +class tablet_metadata { +public: + // FIXME: Make cheap to copy. + // We want both immutability and cheap updates, so we should use + // hierarchical data structure with shared pointers and copy-on-write. + // Currently we have immutability but updates require full copy. + // + // Also, currently the copy constructor is invoked across shards, which precludes + // using shared pointers. We should change that and use a foreign_ptr<> to + // hold immutable tablet_metadata which lives on shard 0 only. + // See storage_service::replicate_to_all_cores(). + using table_to_tablet_map = std::unordered_map; +private: + table_to_tablet_map _tablets; +public: + const tablet_map& get_tablet_map(table_id id) const; + const table_to_tablet_map& all_tables() const { return _tablets; } +public: + void set_tablet_map(table_id, tablet_map); + tablet_map& get_tablet_map(table_id id); + future<> clear_gently(); +public: + bool operator==(const tablet_metadata&) const = default; +}; + +} + +namespace std { + +template<> +struct hash { + size_t operator()(const locator::tablet_replica& r) const { + return utils::hash_combine( + std::hash()(r.host), + std::hash()(r.shard)); + } +}; + +} diff --git a/locator/token_metadata.cc b/locator/token_metadata.cc index 27f59ad7a5..2e6c824954 100644 --- a/locator/token_metadata.cc +++ b/locator/token_metadata.cc @@ -10,6 +10,7 @@ #include #include "locator/snitch_base.hh" #include "locator/abstract_replication_strategy.hh" +#include "locator/tablets.hh" #include "log.hh" #include "partition_range_compat.hh" #include @@ -62,6 +63,8 @@ private: std::vector _sorted_tokens; + tablet_metadata _tablets; + topology _topology; long _ring_version = 0; @@ -72,6 +75,13 @@ private: void sort_tokens(); + const tablet_metadata& tablets() const { return _tablets; } + + void set_tablets(tablet_metadata&& tablets) { + _tablets = std::move(tablets); + invalidate_cached_rings(); + } + struct shallow_copy {}; token_metadata_impl(shallow_copy, const token_metadata_impl& o) noexcept : _topology(topology::config{}) @@ -368,6 +378,7 @@ future token_metadata_impl::clone_only_token_map(bool clone ret._sorted_tokens = _sorted_tokens; co_await coroutine::maybe_yield(); } + ret._tablets = _tablets; co_return ret; } @@ -380,6 +391,7 @@ future<> token_metadata_impl::clear_gently() noexcept { co_await utils::clear_gently(_pending_ranges_interval_map); co_await utils::clear_gently(_sorted_tokens); co_await _topology.clear_gently(); + co_await _tablets.clear_gently(); co_return; } @@ -396,6 +408,14 @@ void token_metadata_impl::sort_tokens() { _sorted_tokens = std::move(sorted); } +const tablet_metadata& token_metadata::tablets() const { + return _impl->tablets(); +} + +void token_metadata::set_tablets(tablet_metadata tm) { + _impl->set_tablets(std::move(tm)); +} + const std::vector& token_metadata_impl::sorted_tokens() const { return _sorted_tokens; } diff --git a/locator/token_metadata.hh b/locator/token_metadata.hh index 8e83675cb9..0abef31f7f 100644 --- a/locator/token_metadata.hh +++ b/locator/token_metadata.hh @@ -41,6 +41,7 @@ class abstract_replication_strategy; using token = dht::token; class token_metadata; +class tablet_metadata; struct host_id_or_endpoint { host_id id; @@ -106,6 +107,8 @@ public: token_metadata& operator=(token_metadata&&) noexcept; ~token_metadata(); const std::vector& sorted_tokens() const; + const tablet_metadata& tablets() const; + void set_tablets(tablet_metadata); // Update token->endpoint mappings for a given \c endpoint. // \c tokens are all the tokens that are now owned by \c endpoint. //