From db4283b54244e44de40635647296c13d9cb9acc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Chojnowski?= Date: Thu, 18 Sep 2025 14:59:36 +0200 Subject: [PATCH] sstables: introduce `ms` sstable format version Introduce `ms` -- a new sstable format version which is a hybrid of Cassandra's `me` and `da`. It is based on `me`, but with the index components (Summary.db and Index.db) replaced with the index components of `da` (Partitions.db and Rows.db). As of this patch, the version is not yet chosen anywhere for writing sstables. It is only introduced. We will add it to unit tests in a later commit, and expose it to users in a yet later commit. --- api/api-doc/storage_service.json | 2 +- .../sstable/_common/sstable_what_is.rst | 3 +++ .../sstable3/sstables-3-data-file-format.rst | 8 ++++++ docs/dev/sstables-directory-structure.md | 26 ++++++++++++++++--- scylla-gdb.py | 3 ++- sstables/sstable_version.cc | 26 ++++++++++++++----- sstables/sstable_version_m.hh | 9 ++++++- sstables/sstables.cc | 8 +++++- sstables/types.hh | 3 +++ sstables/version.hh | 10 ++++++- tools/scylla-sstable.cc | 2 +- 11 files changed, 84 insertions(+), 16 deletions(-) diff --git a/api/api-doc/storage_service.json b/api/api-doc/storage_service.json index d3bdf7d952..c8ebb44796 100644 --- a/api/api-doc/storage_service.json +++ b/api/api-doc/storage_service.json @@ -3430,7 +3430,7 @@ "version":{ "type":"string", "enum":[ - "ka", "la", "mc", "md", "me" + "ka", "la", "mc", "md", "me", "ms" ], "description":"SSTable version" }, diff --git a/docs/architecture/sstable/_common/sstable_what_is.rst b/docs/architecture/sstable/_common/sstable_what_is.rst index 805eba6e70..f4fa464952 100644 --- a/docs/architecture/sstable/_common/sstable_what_is.rst +++ b/docs/architecture/sstable/_common/sstable_what_is.rst @@ -14,6 +14,9 @@ SSTable Version Support * - SSTable Version - ScyllaDB Enterprise Version - ScyllaDB Open Source Version + * - 3.x ('ms') + - 2025.4 and above + - None * - 3.x ('me') - 2022.2 and
above - 5.1 and above diff --git a/docs/architecture/sstable/sstable3/sstables-3-data-file-format.rst b/docs/architecture/sstable/sstable3/sstables-3-data-file-format.rst index 76cfdd3b17..a6e288ac71 100644 --- a/docs/architecture/sstable/sstable3/sstables-3-data-file-format.rst +++ b/docs/architecture/sstable/sstable3/sstables-3-data-file-format.rst @@ -35,6 +35,14 @@ Note that the file on-disk format applies to all "m*" SSTable format versions (" See :doc:`SSTables 3.0 Statistics File Format ` for more details. +Since Scylla 2025.4, there is a "ms" format in Scylla which is a hybrid of "me" and the "da" format introduced in Cassandra 5.0. +Most components in "ms" are exactly the same as in "me", but the index components (Index.db and Summary.db) are replaced +with the trie-based index format (components Partitions.db and Rows.db) used in "da". + +Partitions.db and Rows.db are not described on this page. +See https://github.com/apache/cassandra/blob/70bcaec54d492658cb331b70869215ad64feb63d/src/java/org/apache/cassandra/io/sstable/format/bti/BtiFormat.md +for a description. + Overview ........ diff --git a/docs/dev/sstables-directory-structure.md b/docs/dev/sstables-directory-structure.md index eb4d841729..0b46d603bb 100644 --- a/docs/dev/sstables-directory-structure.md +++ b/docs/dev/sstables-directory-structure.md @@ -61,7 +61,7 @@ sub-directories as documented below. SSTables are comprised of multiple component files. The component file names are self-identifying and denote the component type, as well as per-sstable-format metadata. -Here are the different component types and their naming convention: +Here are the different component types: * Data (`Data.db`) The SSTable data file, containing a part of the actual data stored in the database. 
@@ -107,13 +107,29 @@ Here are the different component types and their naming convention: * Scylla (`Scylla.db`) A file holding scylla-specific metadata about the SSTable, such as sharding information, extended features support, and sstabe-run identifier. + +* Partition Key Index (`Partitions.db`) + Trie-based index of partition keys with pointers to their positions in the data file, or to an intra-partition index in Rows.db. + + +* Clustering Key Index (`Rows.db`) + Trie-based index of clustering keys within partitions. Used in conjunction with `Partitions.db` + as a replacement for `Index.db` and `Summary.db` in the newest sstable formats + (`da` in Cassandra, `ms` in Scylla). + + +* Temporary partition key hashes (`TemporaryHashes.db.tmp`) + A temporary file used for storing intermediate ingredients for the bloom filter. + This file appears only during write, and is deleted before the sstable is sealed. + ### SSTable Format Version SSTable's on-disk format has changed over time. -Three versions are currently supported by Scylla: `ka`, `la`, and `mc`. +The versions currently supported by Scylla are: `ka`, `la`, `mc`, `md`, `me`, `ms`. Cassandra's convention is that the first letter determines the major format version, in ascending order, and the second letter - the minor version, starting from `a` onward. +(`ms` is a Scylla-specific extension of `me`, so it breaks away from the `mc`-`me` series). The SSTable file names identify the SSTable format version. In addition, they provide the SSTable generation number and other metadata. @@ -129,8 +145,10 @@ and it is version specific, as follows: where: * `` is the SSTable generation - a unique positive number identifying the SSTable. -* `` is an archaic attribute that identifies the SSTable sub-format. - (Only `big` sub-format is supported by Scylla (and Cassandra) at this time.) +* `` is an attribute that identifies the SSTable sub-format. + (Only `big` sub-format is supported by Scylla at this time. 
+ Cassandra 5.0 introduced `bti` (which stands for `BIG, trie-indexed`). + Version `ms` is a hybrid between `big` and `bti`). * `` is the file's component type, as described above. ### Table Sub-directories diff --git a/scylla-gdb.py b/scylla-gdb.py index c45ff3df63..2badfc5d66 100755 --- a/scylla-gdb.py +++ b/scylla-gdb.py @@ -4546,7 +4546,8 @@ class scylla_sstables(gdb.Command): 'la': new_format, 'mc': new_format, 'md': new_format, - 'me': new_format + 'me': new_format, + 'ms': new_format, } format_to_str = ['big'] schema = schema_ptr(sst['_schema']) diff --git a/sstables/sstable_version.cc b/sstables/sstable_version.cc index be8e61f675..81ebde6ccf 100644 --- a/sstables/sstable_version.cc +++ b/sstables/sstable_version.cc @@ -37,20 +37,15 @@ const sstring sstable_version_constants::TEMPORARY_TOC_SUFFIX = "TOC.txt.tmp"; sstable_version_constants::component_map_t sstable_version_constants::create_component_map() { return { - { component_type::Index, "Index.db"}, { component_type::CompressionInfo, "CompressionInfo.db" }, { component_type::Data, "Data.db" }, { component_type::TOC, TOC_SUFFIX }, - { component_type::Summary, "Summary.db" }, { component_type::CRC, "CRC.db" }, { component_type::Filter, "Filter.db" }, { component_type::Statistics, "Statistics.db" }, { component_type::Scylla, "Scylla.db" }, { component_type::TemporaryTOC, TEMPORARY_TOC_SUFFIX }, - { component_type::TemporaryStatistics, "Statistics.db.tmp" }, - { component_type::Rows, "Rows.db" }, - { component_type::Partitions, "Partitions.db" }, - { component_type::TemporaryHashes, "TemporaryHashes.db.tmp" }, + { component_type::TemporaryStatistics, "Statistics.db.tmp" } }; } @@ -64,6 +59,8 @@ sstable_version_constants::get_component_map(sstable_version_types version) { case sstable_version_types::md: case sstable_version_types::me: return sstable_version_constants_m::_component_map; + case sstable_version_types::ms: + return sstable_version_constants_ms::_component_map; } // Should never reach this. 
// Compiler should complain if the switch above does no cover all sstable_version_types values. @@ -72,6 +69,8 @@ sstable_version_constants::get_component_map(sstable_version_types version) { const sstable_version_constants::component_map_t sstable_version_constants_k_l::create_component_map() { auto result = sstable_version_constants::create_component_map(); + result.emplace(component_type::Index, "Index.db"); + result.emplace(component_type::Summary, "Summary.db"); result.emplace(component_type::Digest, "Digest.sha1"); return result; } @@ -81,6 +80,8 @@ const sstable_version_constants::component_map_t sstable_version_constants_k_l:: const sstable_version_constants::component_map_t sstable_version_constants_m::create_component_map() { auto result = sstable_version_constants::create_component_map(); + result.emplace(component_type::Index, "Index.db"); + result.emplace(component_type::Summary, "Summary.db"); result.emplace(component_type::Digest, "Digest.crc32"); return result; } @@ -88,4 +89,17 @@ const sstable_version_constants::component_map_t sstable_version_constants_m::cr const sstable_version_constants::component_map_t sstable_version_constants_m::_component_map = sstable_version_constants_m::create_component_map(); +const sstable_version_constants::component_map_t sstable_version_constants_ms::create_component_map() { + auto result = sstable_version_constants_m::create_component_map(); + // Note: for `ms`, we inherit all components from `me`. + // This means that we allow `ms` to have Index.db and Summary.db components. 
+ result.emplace(component_type::Rows, "Rows.db"); + result.emplace(component_type::Partitions, "Partitions.db"); + result.emplace(component_type::TemporaryHashes, "TemporaryHashes.db.tmp"); + return result; } + +const sstable_version_constants::component_map_t sstable_version_constants_ms::_component_map = + sstable_version_constants_ms::create_component_map(); + +} \ No newline at end of file diff --git a/sstables/sstable_version_m.hh b/sstables/sstable_version_m.hh index 848237b5ba..d291a25470 100644 --- a/sstables/sstable_version_m.hh +++ b/sstables/sstable_version_m.hh @@ -14,10 +14,17 @@ namespace sstables { class sstable_version_constants_m final : public sstable_version_constants { - static const sstable_version_constants::component_map_t create_component_map(); public: + static const sstable_version_constants::component_map_t create_component_map(); sstable_version_constants_m() = delete; static const sstable_version_constants::component_map_t _component_map; }; +class sstable_version_constants_ms final : public sstable_version_constants { +public: + static const sstable_version_constants::component_map_t create_component_map(); + sstable_version_constants_ms() = delete; + static const sstable_version_constants::component_map_t _component_map; +}; + } diff --git a/sstables/sstables.cc b/sstables/sstables.cc index 9013c2b928..9b60e48f0a 100644 --- a/sstables/sstables.cc +++ b/sstables/sstables.cc @@ -197,6 +197,7 @@ const std::unordered_map> format_string = { @@ -2447,6 +2448,7 @@ sstring sstable::component_basename(const sstring& ks, const sstring& cf, versio case sstable::version_types::mc: case sstable::version_types::md: case sstable::version_types::me: + case sstable::version_types::ms: return v + "-" + g + "-" + f + "-" + component; } on_internal_error(sstlog, seastar::format("invalid version {} for sstable: table={}.{}, generation={}, format={}, component={}", @@ -2586,7 +2588,7 @@ static std::tuple make_entry_descriptor(cons // la-42-big-Data.db // 
ka-42-big-Data.db // me-3g8w_00qf_4pbog2i7h2c7am0uoe-big-Data.db - static boost::regex la_mx("(la|m[cde])-([^-]+)-(\\w+)-(.*)"); + static boost::regex la_mx("(la|m[cdes])-([^-]+)-(\\w+)-(.*)"); static boost::regex ka("(\\w+)-(\\w+)-ka-(\\d+)-(.*)"); // Use non-greedy match so that a snapshot tag that ressembles a name- wouldn't match @@ -2670,6 +2672,10 @@ sstable_format_types format_from_string(std::string_view s) { } } +bool has_summary_and_index(sstable_version_types v) { + return v != sstable_version_types::ms; +} + component_type sstable::component_from_sstring(version_types v, const sstring &s) { try { return reverse_map(s, sstable_version_constants::get_component_map(v)); diff --git a/sstables/types.hh b/sstables/types.hh index 4fccf27c11..5040781032 100644 --- a/sstables/types.hh +++ b/sstables/types.hh @@ -286,6 +286,7 @@ struct compaction_metadata : public metadata_base { case sstable_version_types::mc: case sstable_version_types::md: case sstable_version_types::me: + case sstable_version_types::ms: return f( cardinality ); @@ -331,6 +332,7 @@ struct stats_metadata : public metadata_base { template auto describe_type(sstable_version_types v, Describer f) { switch (v) { + case sstable_version_types::ms: case sstable_version_types::me: return f( estimated_partition_size, @@ -429,6 +431,7 @@ struct serialization_header : public metadata_base { case sstable_version_types::mc: case sstable_version_types::md: case sstable_version_types::me: + case sstable_version_types::ms: return f( min_timestamp_base, min_local_deletion_time_base, diff --git a/sstables/version.hh b/sstables/version.hh index e58edd6de1..006d70d7ff 100644 --- a/sstables/version.hh +++ b/sstables/version.hh @@ -14,7 +14,7 @@ namespace sstables { -enum class sstable_version_types { ka, la, mc, md, me }; +enum class sstable_version_types { ka, la, mc, md, me, ms }; enum class sstable_format_types { big }; constexpr std::array all_sstable_versions = { @@ -23,12 +23,18 @@ constexpr std::array 
all_sstable_versions = { sstable_version_types::mc, sstable_version_types::md, sstable_version_types::me, + // FIXME: Uncomment after tests are prepared for the new + // version. This will happen in the same series. + // sstable_version_types::ms, }; constexpr std::array writable_sstable_versions = { sstable_version_types::mc, sstable_version_types::md, sstable_version_types::me, + // FIXME: Uncomment after tests are prepared for the new + // version. This will happen in the same series. + // sstable_version_types::ms, }; constexpr sstable_version_types oldest_writable_sstable_format = sstable_version_types::mc; @@ -40,6 +46,8 @@ inline auto get_highest_sstable_version() { sstable_version_types version_from_string(std::string_view s); sstable_format_types format_from_string(std::string_view s); +bool has_summary_and_index(sstable_version_types v); + extern const std::unordered_map> version_string; extern const std::unordered_map> format_string; diff --git a/tools/scylla-sstable.cc b/tools/scylla-sstable.cc index 194324c58c..19b765d3c5 100644 --- a/tools/scylla-sstable.cc +++ b/tools/scylla-sstable.cc @@ -2372,7 +2372,7 @@ directly expressed in CQL. Editing schema options (the part after WITH) is safe. The sstable version can be selected manually with the --sstable-version option, by default the latest supported version is used. Valid options are sstable -versions which are supported for writing: mc, md and me. +versions which are supported for writing: mc, md, me, ms. Mapping of input sstables to output sstables is printed to stdout.