diff --git a/configure.py b/configure.py index 45978d8955..fcafc83d67 100755 --- a/configure.py +++ b/configure.py @@ -1012,6 +1012,7 @@ scylla_core = (['message/messaging_service.cc', 'db/view/view_update_generator.cc', 'db/virtual_table.cc', 'db/virtual_tables.cc', + 'db/tablet_options.cc', 'index/secondary_index_manager.cc', 'index/secondary_index.cc', 'utils/UUID_gen.cc', diff --git a/cql3/statements/cf_prop_defs.cc b/cql3/statements/cf_prop_defs.cc index 46c0440222..2a6490e8f2 100644 --- a/cql3/statements/cf_prop_defs.cc +++ b/cql3/statements/cf_prop_defs.cc @@ -9,6 +9,7 @@ */ #include "cql3/statements/cf_prop_defs.hh" +#include "cql3/statements/request_validations.hh" #include "data_dictionary/data_dictionary.hh" #include "db/extensions.hh" #include "db/tags/extension.hh" @@ -20,6 +21,7 @@ #include "tombstone_gc.hh" #include "db/per_partition_rate_limit_extension.hh" #include "db/per_partition_rate_limit_options.hh" +#include "db/tablet_options.hh" #include "utils/bloom_calculations.hh" #include @@ -52,6 +54,8 @@ const sstring cf_prop_defs::COMPACTION_STRATEGY_CLASS_KEY = "class"; const sstring cf_prop_defs::COMPACTION_ENABLED_KEY = "enabled"; +const sstring cf_prop_defs::KW_TABLETS = "tablets"; + schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions& exts) const { schema::extensions_map er; for (auto& p : exts.schema_extensions()) { @@ -68,6 +72,14 @@ schema::extensions_map cf_prop_defs::make_schema_extensions(const db::extensions return er; } +data_dictionary::keyspace cf_prop_defs::find_keyspace(const data_dictionary::database db, std::string_view ks_name) { + try { + return db.find_keyspace(ks_name); + } catch (const data_dictionary::no_such_keyspace& e) { + throw request_validations::invalid_request("{}", e.what()); + } +} + void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name, const schema::extensions_map& schema_extensions) const { // Skip validation if the comapction strategy class is already set as it means we've already // prepared (and redoing it would set strategyClass back to null, which we don't want) @@ -75,13 +87,15 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name, return; } + const auto& ks = find_keyspace(db, ks_name); + static std::set keywords({ KW_COMMENT, KW_GCGRACESECONDS, KW_CACHING, KW_DEFAULT_TIME_TO_LIVE, KW_MIN_INDEX_INTERVAL, KW_MAX_INDEX_INTERVAL, KW_SPECULATIVE_RETRY, KW_BF_FP_CHANCE, KW_MEMTABLE_FLUSH_PERIOD, KW_COMPACTION, KW_COMPRESSION, KW_CRC_CHECK_CHANCE, KW_ID, KW_PAXOSGRACESECONDS, - KW_SYNCHRONOUS_UPDATES + KW_SYNCHRONOUS_UPDATES, KW_TABLETS, }); static std::set obsolete_keywords({ sstring("index_interval"), @@ -162,6 +176,16 @@ void cf_prop_defs::validate(const data_dictionary::database db, sstring ks_name, } speculative_retry::from_sstring(get_string(KW_SPECULATIVE_RETRY, speculative_retry(speculative_retry::type::NONE, 0).to_sstring())); + + if (auto tablet_options_map = get_tablet_options()) { + if (!ks.uses_tablets()) { + throw exceptions::configuration_exception("tablet options cannot be used when tablets are disabled for the keyspace"); + } + if (!db.features().tablet_options) { + throw exceptions::configuration_exception("tablet options cannot be used until all nodes in the cluster enable this feature"); + } + db::tablet_options::validate(*tablet_options_map); + } } std::map cf_prop_defs::get_compaction_type_options() const { @@ -252,6 +276,13 @@ const db::per_partition_rate_limit_options* cf_prop_defs::get_per_partition_rate return &ext->get_options(); } +std::optional cf_prop_defs::get_tablet_options() const { + if (auto tablet_options = get_map(KW_TABLETS)) { + return tablet_options.value(); + } + return std::nullopt; +} + void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const { if (has_property(KW_COMMENT)) { builder.set_comment(get_string(KW_COMMENT, "")); @@ -351,6 +382,10 @@ void cf_prop_defs::apply_to_builder(schema_builder& builder, schema::extensions_ builder.add_extension(db::tags_extension::NAME, ::make_shared(tags_map)); } + + if (auto tablet_options_opt = get_map(KW_TABLETS)) { + builder.set_tablet_options(std::move(*tablet_options_opt)); + } } void cf_prop_defs::validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const diff --git a/cql3/statements/cf_prop_defs.hh b/cql3/statements/cf_prop_defs.hh index b07ce04f77..2f5ae6526a 100644 --- a/cql3/statements/cf_prop_defs.hh +++ b/cql3/statements/cf_prop_defs.hh @@ -18,12 +18,14 @@ namespace data_dictionary { class database; +class keyspace; } class tombstone_gc_options; namespace db { class extensions; +class tablet_options; } namespace cdc { class options; @@ -60,6 +62,8 @@ public: static const sstring COMPACTION_STRATEGY_CLASS_KEY; static const sstring COMPACTION_ENABLED_KEY; + static const sstring KW_TABLETS; + // FIXME: In origin the following consts are in CFMetaData. static constexpr int32_t DEFAULT_DEFAULT_TIME_TO_LIVE = 0; static constexpr int32_t DEFAULT_MIN_INDEX_INTERVAL = 128; @@ -70,6 +74,7 @@ public: private: mutable std::optional _compaction_strategy_class; + static data_dictionary::keyspace find_keyspace(const data_dictionary::database db, std::string_view ks_name); public: std::optional get_compaction_strategy_class() const; @@ -103,6 +108,7 @@ public: int32_t get_paxos_grace_seconds() const; std::optional get_id() const; bool get_synchronous_updates_flag() const; + std::optional get_tablet_options() const; void apply_to_builder(schema_builder& builder, schema::extensions_map schema_extensions, const data_dictionary::database& db, sstring ks_name) const; void validate_minimum_int(const sstring& field, int32_t minimum_value, int32_t default_value) const; diff --git a/data_dictionary/data_dictionary.cc b/data_dictionary/data_dictionary.cc index 547b3e06a5..a6adb553c0 100644 --- a/data_dictionary/data_dictionary.cc +++ b/data_dictionary/data_dictionary.cc @@ -55,6 +55,11 @@ keyspace::is_internal() const { return _ops->is_internal(*this); } +bool +keyspace::uses_tablets() const { + return metadata()->uses_tablets(); +} + const locator::abstract_replication_strategy& keyspace::get_replication_strategy() const { return _ops->get_replication_strategy(*this); diff --git a/data_dictionary/data_dictionary.hh b/data_dictionary/data_dictionary.hh index e4539dc6e3..9951f7920c 100644 --- a/data_dictionary/data_dictionary.hh +++ b/data_dictionary/data_dictionary.hh @@ -87,6 +87,7 @@ private: keyspace(const impl* ops, const void* keyspace); public: bool is_internal() const; + bool uses_tablets() const; lw_shared_ptr metadata() const; const user_types_metadata& user_types() const; const locator::abstract_replication_strategy& get_replication_strategy() const; diff --git a/data_dictionary/keyspace_metadata.hh b/data_dictionary/keyspace_metadata.hh index 9c75a10f2a..2f82e8064b 100644 --- a/data_dictionary/keyspace_metadata.hh +++ b/data_dictionary/keyspace_metadata.hh @@ -65,6 +65,9 @@ public: std::optional initial_tablets() const { return _initial_tablets; } + bool uses_tablets() const noexcept { + return _initial_tablets.has_value(); + } const std::unordered_map& cf_meta_data() const { return _cf_meta_data; } diff --git a/db/CMakeLists.txt b/db/CMakeLists.txt index ddf2e8aa41..bac41a919e 100644 --- a/db/CMakeLists.txt +++ b/db/CMakeLists.txt @@ -38,7 +38,8 @@ target_sources(db snapshot/backup_task.cc rate_limiter.cc per_partition_rate_limit_options.cc - row_cache.cc) + row_cache.cc, + tablet_options.cc) target_include_directories(db PUBLIC ${CMAKE_SOURCE_DIR}) diff --git a/db/schema_tables.cc b/db/schema_tables.cc index 644fd30709..e6cecb8a0c 100644 --- a/db/schema_tables.cc +++ b/db/schema_tables.cc @@ -336,6 +336,11 @@ schema_ptr scylla_tables(schema_features features) { // In this case, for non-system tables, `version` is null and `schema::version()` will be a hash. sb.with_column("committed_by_group0", boolean_type); } + + // It is safe to add the `tablets` column unconditionally, + // since it is written to only after the cluster feature is enabled. + sb.with_column("tablets", map_type_impl::get_instance(utf8_type, utf8_type, false)); + sb.with_hash_version(); s = sb.build(); } @@ -1733,6 +1738,19 @@ mutation make_scylla_tables_mutation(schema_ptr table, api::timestamp_type times auto& cdef = *scylla_tables()->get_column_definition("partitioner"); m.set_clustered_cell(ckey, cdef, atomic_cell::make_dead(timestamp, gc_clock::now())); } + // A table will have engaged tablet options + // only after they were set by CREATE TABLE or ALTER TABLE, + // Meaning the cluster feature is enabled, so it is safe to write + // to this columns. + if (table->has_tablet_options()) { + auto& map = table->raw_tablet_options(); + auto& cdef = *scylla_tables()->get_column_definition("tablets"); + if (map.empty()) { + m.set_clustered_cell(ckey, cdef, atomic_cell::make_dead(timestamp, gc_clock::now())); + } else { + m.set_clustered_cell(ckey, cdef, make_map_mutation(map, cdef, timestamp)); + } + } // In-memory tables are deprecated since scylla-2024.1.0 // FIXME: delete the column when there's no live version supporting it anymore. // Writing it here breaks upgrade rollback to versions that do not support the in_memory schema_feature @@ -2154,6 +2172,19 @@ static void prepare_builder_from_table_row(const schema_ctxt& ctxt, schema_build } } +static void prepare_builder_from_scylla_tables_row(const schema_ctxt& ctxt, schema_builder& builder, const query::result_set_row& table_row) { + auto in_mem = table_row.get("in_memory"); + auto in_mem_enabled = in_mem.value_or(false); + if (in_mem_enabled) { + slogger.warn("Support for in_memory tables has been deprecated."); + } + builder.set_in_memory(in_mem_enabled); + if (auto opt_map = get_map(table_row, "tablets")) { + auto tablet_options = db::tablet_options(*opt_map); + builder.set_tablet_options(tablet_options.to_map()); + } +} + schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations sm, std::optional version) { slogger.trace("create_table_from_mutations: version={}, {}", version, sm); @@ -2208,13 +2239,7 @@ schema_ptr create_table_from_mutations(const schema_ctxt& ctxt, schema_mutations if (sm.scylla_tables()) { table_rs = query::result_set(*sm.scylla_tables()); if (!table_rs.empty()) { - query::result_set_row table_row = table_rs.row(0); - auto in_mem = table_row.get("in_memory"); - auto in_mem_enabled = in_mem.value_or(false); - if (in_mem_enabled) { - slogger.warn("Support for in_memory tables has been deprecated."); - } - builder.set_in_memory(in_mem_enabled); + prepare_builder_from_scylla_tables_row(ctxt, builder, table_rs.row(0)); } } v3_columns columns(std::move(column_defs), is_dense, is_compound); @@ -2445,6 +2470,13 @@ view_ptr create_view_from_mutations(const schema_ctxt& ctxt, schema_mutations sm schema_builder builder{ks_name, cf_name, id}; prepare_builder_from_table_row(ctxt, builder, row); + if (sm.scylla_tables()) { + table_rs = query::result_set(*sm.scylla_tables()); + if (!table_rs.empty()) { + prepare_builder_from_scylla_tables_row(ctxt, builder, table_rs.row(0)); + } + } + auto computed_columns = get_computed_columns(sm); auto column_defs = create_columns_from_column_rows(ctxt, query::result_set(sm.columns_mutation()), ks_name, cf_name, false, column_view_virtual::no, computed_columns); for (auto&& cdef : column_defs) { diff --git a/db/tablet_options.cc b/db/tablet_options.cc new file mode 100644 index 0000000000..fc21731e55 --- /dev/null +++ b/db/tablet_options.cc @@ -0,0 +1,96 @@ +/* + * Copyright 2025-present ScyllaDB + */ +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +#include + +#include "exceptions/exceptions.hh" +#include "db/tablet_options.hh" +#include "utils/log.hh" + +extern logging::logger dblog; + +namespace db { + +tablet_options::tablet_options(const map_type& map) { + for (auto& [key, value_str] : map) { + switch (tablet_options::from_string(key)) { + case tablet_option_type::min_tablet_count: + if (auto value = std::atol(value_str.c_str())) { + min_tablet_count.emplace(value); + } + break; + case tablet_option_type::min_per_shard_tablet_count: + if (auto value = std::atof(value_str.c_str())) { + min_per_shard_tablet_count.emplace(value); + } + break; + case tablet_option_type::expected_data_size_in_gb: + if (auto value = std::atol(value_str.c_str())) { + expected_data_size_in_gb.emplace(value); + } + break; + } + } +} + +sstring tablet_options::to_string(tablet_option_type hint) { + switch (hint) { + case tablet_option_type::min_tablet_count: return "min_tablet_count"; + case tablet_option_type::min_per_shard_tablet_count: return "min_per_shard_tablet_count"; + case tablet_option_type::expected_data_size_in_gb: return "expected_data_size_in_gb"; + } +} + +tablet_option_type tablet_options::from_string(sstring hint_desc) { + if (hint_desc == "min_tablet_count") { + return tablet_option_type::min_tablet_count; + } else if (hint_desc == "min_per_shard_tablet_count") { + return tablet_option_type::min_per_shard_tablet_count; + } else if (hint_desc == "expected_data_size_in_gb") { + return tablet_option_type::expected_data_size_in_gb; + } else { + throw exceptions::syntax_exception(fmt::format("Unknown tablet hint '{}'", hint_desc)); + } +} + +std::map tablet_options::to_map() const { + std::map res; + if (min_tablet_count) { + res[to_string(tablet_option_type::min_tablet_count)] = fmt::to_string(*min_tablet_count); + } + if (min_per_shard_tablet_count) { + res[to_string(tablet_option_type::min_per_shard_tablet_count)] = fmt::to_string(*min_per_shard_tablet_count); + } + if (expected_data_size_in_gb) { + res[to_string(tablet_option_type::expected_data_size_in_gb)] = fmt::to_string(*expected_data_size_in_gb); + } + return res; +} + +void tablet_options::validate(const map_type& map) { + for (auto& [key, value_str] : map) { + switch (tablet_options::from_string(key)) { + case tablet_option_type::min_tablet_count: + if (auto value = std::atol(value_str.c_str()); value < 0) { + throw exceptions::configuration_exception(format("Invalid value '{}' for min_tablet_count", value)); + } + break; + case tablet_option_type::min_per_shard_tablet_count: + if (auto value = std::atof(value_str.c_str()); value < 0) { + throw exceptions::configuration_exception(format("Invalid value '{}' for min_per_shard_tablet_count", value)); + } + break; + case tablet_option_type::expected_data_size_in_gb: + if (auto value = std::atol(value_str.c_str()); value < 0) { + throw exceptions::configuration_exception(format("Invalid value '{}' for expected_data_size_in_gb", value)); + } + break; + } + } +} + +} // namespace db diff --git a/db/tablet_options.hh b/db/tablet_options.hh new file mode 100644 index 0000000000..709e76a74d --- /dev/null +++ b/db/tablet_options.hh @@ -0,0 +1,46 @@ +/* + * Copyright 2025-present ScyllaDB + */ +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +#pragma once + +#include + +#include + +using namespace seastar; + +namespace db { + +// Per-table tablet options +enum class tablet_option_type { + min_tablet_count, + min_per_shard_tablet_count, + expected_data_size_in_gb, +}; + +struct tablet_options { + using map_type = std::map; + + std::optional min_tablet_count; + std::optional min_per_shard_tablet_count; + std::optional expected_data_size_in_gb; + + tablet_options() = default; + explicit tablet_options(const map_type& map); + + operator bool() const noexcept { + return min_tablet_count || min_per_shard_tablet_count || expected_data_size_in_gb; + } + + map_type to_map() const; + + static sstring to_string(tablet_option_type hint); + static tablet_option_type from_string(sstring hint_desc); + static void validate(const map_type& map); +}; + +} // namespace db diff --git a/docs/cql/ddl.rst b/docs/cql/ddl.rst index e010e6bc1b..277e4212ac 100644 --- a/docs/cql/ddl.rst +++ b/docs/cql/ddl.rst @@ -344,9 +344,17 @@ Creating a new table uses the ``CREATE TABLE`` statement: : | CLUSTERING ORDER BY '(' `clustering_order` ')' [ AND `table_options` ] : | scylla_encryption_options: '=' '{'[`cipher_algorithm` : ]','[`secret_key_strength` : ]','[`key_provider`: ]'}' : | caching '=' ' {'caching_options'}' + : | tablets '=' '{' `tablet_options` '}' : | `options` clustering_order: `column_name` (ASC | DESC) ( ',' `column_name` (ASC | DESC) )* + + tablet_options: `tablet_option` [',' `tablet_option`] + : | + + tablet_option: 'expected_data_size_in_gb' ':' + : | 'min_per_shard_tablet_count' ':' + : | 'min_tablet_count' ':' For instance:: @@ -713,6 +721,10 @@ A table supports the following options: - map - see below - :ref:`CDC Options ` + * - ``tablet_options`` + - map + - see below + - :ref:`Per-table tablet options ` .. _speculative-retry-options: @@ -898,6 +910,65 @@ The following modes are available: * - ``immediate`` - Tombstone GC is immediately performed. There is no wait time or repair requirement. This mode is useful for a table that uses the TWCS compaction strategy with no user deletes. After data is expired after TTL, ScyllaDB can perform compaction to drop the expired data immediately. +.. _cql-per-table-tablet-options: + +Per-table tablet options +######################## + +By default, ScyllaDB will allocate the initial number of tablets automatically. +Then on, tables may be automatically split if their average size is greater than twice +the ``target_tablet_size_in_bytes``, or merged if the average tablet size is less than +half the ``target_tablet_size_in_bytes``. + +Other considerations, like the total number of tablet replicas per-shard, may also affect the tablet count. +Since each tablet replica has a constant memory overhead, ScyllaDB may limit the number of tablets to prevent +shards from running out-of-memory, in the presence of many tables. + +The following per-table ``tablets`` options can be used to tune the tablet allocation logic for the table +if its data size, or performance requirements are known in advance. + +=============================== =============== =================================================================================== + Option Default Description +=============================== =============== =================================================================================== + ``expected_data_size_in_gb`` 0 This option provides a hint for the anticipated table size, before replication. + ScyllaDB will generate a tablets topology that matches that expectation (see details below). + It can be set when the table is created to allocate more tablets for it, + as if it already occupies that size. This will prevent unnecessary tablet splits + and tablet migrations during data ingestion. + It can also be changed later in the table life cycle to induce tablet splits or merges + to fit the new expected size. + The minimum tablet count is calculated by dividing the expected data + size by the ``target_tablet_size_in_bytes`` config option. + ``min_per_shard_tablet_count`` 0 Used for ensuring that the table workload is well balanced in the whole cluster in a + topology-independent way. A higher number of tablet replicas per shard may help balance + the table workload more evenly across shards and across nodes in the cluster. + For example, setting this to 10 means that shard overcommit is limited to 10%, regardless + of cluster size. + Note that ``min_per_shard_tablet_count`` supports floating point values and can be set to + a value less than 1. This is useful for clusters with large number of shards where the + average number of tablet replicas owned by each shard is less than 1. + ``min_tablet_count`` 0 Determines the minimum number of tablets to allocate for the table. + The hint is based on the deprecated keyspace ``initial`` tablets option. + Note that the actual number of tablet replicas that are owned by each shard is a + function of the tablet count, the replication factor in the datacenter, and the number + of nodes and shards in the datacenter. It is recommended to use higher-level options + such as ``expected_data_size_in_gb`` or ``min_per_shard_tablet_count`` instead. +=============================== =============== =================================================================================== + +When allocating tablets for a new table, ScyllaDB uses the maximum of the ``initial`` tablets configured for the keyspace +and the minimum tablet count calculated from the table's ``tablets`` options, if any. +If multiple tablet options are provided, ScyllaDB uses the maximum tablet count derived by each option individually. +If the keyspace ``initial`` tablets is set to zero and no ``tablets`` options are provided, +ScyllaDB automatically calculates the number of tablets so that each shard would own at least one tablet replica, +scaled up by the ``tablets_initial_scale_factor`` configuration option. + +Unlike the ``initial`` tablet count configured for the keyspace, ScyllaDB will not merge tablets when their +average size drops below half the ``target_tablet_size_in_bytes`` if that would cause the table's tablet count +to go below the minimum tablet count, or the per-shard tablet-count as per the above options. +This is useful for tables that go through rapid growth and shrink cycles. +If the table is shrunk for the long term and there are no special performance needs for the tablet, it is recommended +to drop the tablet options or to adjust them respectively, to fit the new requirements. + Other considerations: ##################### diff --git a/docs/dev/describe_schema.md b/docs/dev/describe_schema.md index 1d5e13b183..92ef9e84bd 100644 --- a/docs/dev/describe_schema.md +++ b/docs/dev/describe_schema.md @@ -147,6 +147,7 @@ CREATE TABLE ks.t_scylla_cdc_log ( AND comment = 'CDC log for ks.t' AND compaction = {'class': 'TimeWindowCompactionStrategy', 'compaction_window_size': '60', 'compaction_window_unit': 'MINUTES', 'expired_sstable_check_frequency_seconds': '1800'} AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'} + AND tablets = {'expected_data_size_in_gb': '250', 'min_per_shard_tablet_count': '0.8', 'min_tablet_count': '1'} AND crc_check_chance = 1 AND default_time_to_live = 0 AND gc_grace_seconds = 0 diff --git a/schema/schema.cc b/schema/schema.cc index 68fb2ea66f..7a6a6d1fe3 100644 --- a/schema/schema.cc +++ b/schema/schema.cc @@ -9,6 +9,7 @@ #include #include #include "cql3/description.hh" +#include "db/tablet_options.hh" #include "db/view/view.hh" #include "timestamp.hh" #include "utils/assert.hh" @@ -588,6 +589,7 @@ bool operator==(const schema& x, const schema& y) && x._raw._indices_by_name == y._raw._indices_by_name && x._raw._is_counter == y._raw._is_counter && x._raw._in_memory == y._raw._in_memory + && x._raw._tablet_options == y._raw._tablet_options ; } @@ -676,6 +678,8 @@ table_schema_version schema::calculate_digest(const schema::raw_schema& r) { feed_hash(h, ext->options_to_string()); } + feed_hash(h, r._tablet_options); + return table_schema_version(utils::UUID_gen::get_name_UUID(h.finalize())); } @@ -844,6 +848,17 @@ auto fmt::formatter::format(const schema& s, fmt::format_context& ctx) c out = fmt::format_to(out, ",minIndexInterval={}", s._raw._min_index_interval); out = fmt::format_to(out, ",maxIndexInterval={}", s._raw._max_index_interval); out = fmt::format_to(out, ",speculativeRetry={}", s._raw._speculative_retry.to_sstring()); + out = fmt::format_to(out, ",tablets={{"); + if (s._raw._tablet_options) { + n = 0; + for (auto& [k, v] : *s._raw._tablet_options) { + if (n++) { + out = fmt::format_to(out, ", "); + } + out = fmt::format_to(out, "{}={}", k, v); + } + } + out = fmt::format_to(out, "}}"); out = fmt::format_to(out, ",triggers=[]"); out = fmt::format_to(out, ",isDense={}", s._raw._is_dense); out = fmt::format_to(out, ",in_memory={}", s._raw._in_memory); @@ -1137,7 +1152,13 @@ std::ostream& schema::schema_properties(const schema_describe_helper& helper, st os << "\n AND memtable_flush_period_in_ms = " << memtable_flush_period(); os << "\n AND min_index_interval = " << min_index_interval(); os << "\n AND speculative_retry = '" << speculative_retry().to_sstring() << "'"; - + + if (has_tablet_options()) { + os << "\n AND tablets = {"; + map_as_cql_param(os, tablet_options().to_map()); + os << "}"; + } + for (auto& [type, ext] : extensions()) { os << "\n AND " << type << " = " << ext->options_to_string(); } @@ -1622,6 +1643,22 @@ const ::tombstone_gc_options& schema::tombstone_gc_options() const { return default_tombstone_gc_options; } +bool schema::has_tablet_options() const noexcept { + return _raw._tablet_options.has_value(); +} + +db::tablet_options schema::tablet_options() const { + return db::tablet_options(raw_tablet_options()); +} + +const db::tablet_options::map_type& schema::raw_tablet_options() const noexcept { + if (!_raw._tablet_options) { + static db::tablet_options::map_type no_options; + return no_options; + } + return *_raw._tablet_options; +} + schema_builder& schema_builder::with_cdc_options(const cdc::options& opts) { add_extension(cdc::cdc_extension::NAME, ::make_shared(opts)); return *this; @@ -1642,6 +1679,11 @@ schema_builder& schema_builder::set_paxos_grace_seconds(int32_t seconds) { return *this; } +schema_builder& schema_builder::set_tablet_options(std::map&& hints) { + _raw._tablet_options = std::move(hints); + return *this; +} + gc_clock::duration schema::paxos_grace_seconds() const { return std::chrono::duration_cast( std::chrono::seconds( diff --git a/schema/schema.hh b/schema/schema.hh index 8ad53689ed..4c86069c46 100644 --- a/schema/schema.hh +++ b/schema/schema.hh @@ -30,6 +30,7 @@ #include "timestamp.hh" #include "tombstone_gc_options.hh" #include "db/per_partition_rate_limit_options.hh" +#include "db/tablet_options.hh" #include "schema_fwd.hh" namespace dht { @@ -567,6 +568,7 @@ private: // schema digest. It is also not set locally on a schema tables. std::reference_wrapper _sharder; bool _in_memory = false; + std::optional> _tablet_options; std::optional _view_info; }; raw_schema _raw; @@ -742,6 +744,12 @@ public: return _raw._caching_options; } + // Returns true iff the _tablet_options are initialized. + // They may still be empty, e.g. after ALTER TABLE. + bool has_tablet_options() const noexcept; + db::tablet_options tablet_options() const; + const db::tablet_options::map_type& raw_tablet_options() const noexcept; + static void set_default_partitioner(const sstring& class_name, unsigned ignore_msb = 0); const dht::i_partitioner& get_partitioner() const; diff --git a/schema/schema_builder.hh b/schema/schema_builder.hh index 22fb39cbd8..5b1c59798f 100644 --- a/schema/schema_builder.hh +++ b/schema/schema_builder.hh @@ -227,6 +227,8 @@ public: return *this; } + schema_builder& set_tablet_options(std::map&& hints); + class default_names { public: default_names(const schema_builder&); @@ -283,7 +285,7 @@ public: schema_builder& with_cdc_options(const cdc::options&); schema_builder& with_tombstone_gc_options(const tombstone_gc_options& opts); schema_builder& with_per_partition_rate_limit_options(const db::per_partition_rate_limit_options&); - + default_names get_default_names() const { return default_names(_raw); } diff --git a/test/boost/schema_change_test.cc b/test/boost/schema_change_test.cc index 3b8e08a8ed..e5f57b3757 100644 --- a/test/boost/schema_change_test.cc +++ b/test/boost/schema_change_test.cc @@ -1158,7 +1158,7 @@ SEASTAR_TEST_CASE(test_system_schema_version_is_stable) { // If you changed the schema of system.batchlog then this is expected to fail. // Just replace expected version with the new version. - BOOST_REQUIRE_EQUAL(s->version(), table_schema_version(utils::UUID("3febbbce-8841-304a-abb9-170078ac173d"))); + BOOST_REQUIRE_EQUAL(s->version(), table_schema_version(utils::UUID("1f504ac7-350f-37aa-8a9e-105b1325d8e3"))); }); } diff --git a/test/cqlpy/test_describe.py b/test/cqlpy/test_describe.py index 17ab26feb8..b94b98de86 100644 --- a/test/cqlpy/test_describe.py +++ b/test/cqlpy/test_describe.py @@ -234,6 +234,31 @@ def test_desc_table(cql, test_keyspace, random_seed, has_tablets): finally: cql.execute(f"DROP TABLE {new_tbl}") +# This test compares the content of `system_schema.tables` and `system_schema.columns` tables +# when providing tablet options to CREATE TABLE. +def test_desc_table_with_tablet_options(cql, test_keyspace, random_seed, has_tablets): + if has_tablets: # issue #18180 + global counter_table_chance + counter_table_chance = 0 + tablet_options = { + 'min_tablet_count': '100', + 'min_per_shard_tablet_count': '0.8', # Verify that a floating point value works for this hint + 'expected_data_size_in_gb': '50', + } + with new_random_table(cql, test_keyspace, tablet_options=tablet_options) as tbl: + desc = cql.execute(f"DESC TABLE {tbl}") + desc_create_stmt = desc.one().create_statement + + try: + new_tbl = f"{test_keyspace}.{unique_name()}" + new_create_stmt = desc_create_stmt.replace(tbl, new_tbl) + cql.execute(new_create_stmt) + new_desc_stmt = cql.execute(f"DESC TABLE {new_tbl}") + new_desc_create_stmt = new_desc_stmt.one().create_statement + assert new_desc_create_stmt == new_create_stmt + finally: + cql.execute(f"DROP TABLE {new_tbl}") + # Test that `DESC TABLE {tbl}` contains appropriate create statement for table # This test compares the content of `system_schema.scylla_tables` tables, thus the test # is `scylla_only`. @@ -1280,7 +1305,7 @@ def new_random_keyspace(cql): # UDTs that can be used to create the table. The function uses `new_test_table` # from util.py, so it can be used in a "with", as: # with new_random_table(cql, test_keyspace) as table: -def new_random_table(cql, keyspace, udts=[]): +def new_random_table(cql, keyspace, udts=[], tablet_options={}): pk_n = random.randrange(1, max_pk) ck_n = random.randrange(max_ck) regular_n = random.randrange(1, max_regular) @@ -1348,6 +1373,8 @@ def new_random_table(cql, keyspace, udts=[]): # Extra properties which ScyllaDB supports but Cassandra doesn't extras["paxos_grace_seconds"] = random.randrange(1000, 100000) extras["tombstone_gc"] = f"{{'mode': 'timeout', 'propagation_delay_in_seconds': '{random.randrange(100, 100000)}'}}" + if tablet_options: + extras["tablets"] = str(tablet_options) extra_options = [f"{k} = {v}" for (k, v) in extras.items()] extra_str = " AND ".join(extra_options)