From 694f8a4ec6a8f0e5873042c0450565407b0891fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Wed, 10 Feb 2021 17:26:42 +0200 Subject: [PATCH 1/3] mutation_fragment_stream_validating_filter: make validation levels more fine-grained Currently key order validation for the mutation fragment stream validating filter is all or nothing. Either no keys (partition or clustering) are validated or all of them. As we suspect that clustering key order validation would add a significant overhead, this discourages turning key validation on, which means we miss out on partition key monotonicity validation which has a much more moderate cost. This patch makes this configurable in a more fine-grained fashion, providing separate levels for partition and clustering key monotonicity validation. As the choice for the default validation level is not as clear-cut as before, the default value for the validation level is removed in the validating filter's constructor. --- flat_mutation_reader.cc | 28 ++++++++++++++++++++------- mutation_fragment_stream_validator.hh | 10 ++++++++-- sstables/sstables.hh | 3 ++- sstables/sstables_manager.cc | 4 +++- sstables/writer_impl.hh | 2 +- test/boost/sstable_datafile_test.cc | 2 +- 6 files changed, 36 insertions(+), 13 deletions(-) diff --git a/flat_mutation_reader.cc b/flat_mutation_reader.cc index 4d655fc153..507f3f92fd 100644 --- a/flat_mutation_reader.cc +++ b/flat_mutation_reader.cc @@ -1036,7 +1036,7 @@ namespace { } bool mutation_fragment_stream_validating_filter::operator()(const dht::decorated_key& dk) { - if (_compare_keys) { + if (_validation_level >= mutation_fragment_stream_validation_level::partition_key) { if (!_validator(dk)) { on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected partition key: previous {}, current {}", static_cast(this), _name, _validator.previous_partition_key(), dk)); @@ -1045,13 +1045,27 @@ bool mutation_fragment_stream_validating_filter::operator()(const dht::decorated return true; } -mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_filter(sstring_view name, const schema& s, bool compare_keys) +mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_filter(sstring_view name, const schema& s, + mutation_fragment_stream_validation_level level) : _validator(s) , _name(format("{} ({}.{} {})", name, s.ks_name(), s.cf_name(), s.id())) - , _compare_keys(compare_keys) + , _validation_level(level) { - fmr_logger.debug("[validator {} for {}] Will validate {} monotonicity.", static_cast(this), _name, - compare_keys ? "keys" : "only partition regions"); + if (fmr_logger.level() <= log_level::debug) { + std::string_view what; + switch (_validation_level) { + case mutation_fragment_stream_validation_level::partition_region: + what = "partition region"; + break; + case mutation_fragment_stream_validation_level::partition_key: + what = "partition region and partition key"; + break; + case mutation_fragment_stream_validation_level::clustering_key: + what = "partition region, partition key and clustering key"; + break; + } + fmr_logger.debug("[validator {} for {}] Will validate {} monotonicity.", static_cast(this), _name, what); + } } bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment::kind kind, position_in_partition_view pos) { @@ -1059,14 +1073,14 @@ bool mutation_fragment_stream_validating_filter::operator()(mutation_fragment::k fmr_logger.debug("[validator {}] {}:{}", static_cast(this), kind, pos); - if (_compare_keys) { + if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) { valid = _validator(kind, pos); } else { valid = _validator(kind); } if (__builtin_expect(!valid, false)) { - if (_compare_keys) { + if (_validation_level >= mutation_fragment_stream_validation_level::clustering_key) { on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected mutation fragment: previous {}:{}, current {}:{}", static_cast(this), _name, _validator.previous_mutation_fragment_kind(), _validator.previous_position(), kind, pos)); } else { diff --git a/mutation_fragment_stream_validator.hh b/mutation_fragment_stream_validator.hh index 6c823e2706..5abd2ca7a4 100644 --- a/mutation_fragment_stream_validator.hh +++ b/mutation_fragment_stream_validator.hh @@ -23,6 +23,12 @@ #include "mutation_fragment.hh" +enum class mutation_fragment_stream_validation_level { + partition_region, // fragment kind + partition_key, + clustering_key, +}; + /// Low level fragment stream validator. /// /// Tracks and validates the monotonicity of the passed in fragment kinds, @@ -110,7 +116,7 @@ struct invalid_mutation_fragment_stream : public std::runtime_error { class mutation_fragment_stream_validating_filter { mutation_fragment_stream_validator _validator; sstring _name; - bool _compare_keys; + mutation_fragment_stream_validation_level _validation_level; public: /// Constructor. @@ -118,7 +124,7 @@ public: /// \arg name is used in log messages to identify the validator, the /// schema identity is added automatically /// \arg compare_keys enable validating clustering key monotonicity - mutation_fragment_stream_validating_filter(sstring_view name, const schema& s, bool compare_keys = false); + mutation_fragment_stream_validating_filter(sstring_view name, const schema& s, mutation_fragment_stream_validation_level level); bool operator()(const dht::decorated_key& dk); bool operator()(mutation_fragment::kind kind, position_in_partition_view pos); diff --git a/sstables/sstables.hh b/sstables/sstables.hh index 7d036d247f..772de9040c 100644 --- a/sstables/sstables.hh +++ b/sstables/sstables.hh @@ -62,6 +62,7 @@ #include "sstables/shareable_components.hh" #include "sstables/open_info.hh" #include "query-request.hh" +#include "mutation_fragment_stream_validator.hh" #include #include @@ -113,7 +114,7 @@ struct sstable_writer_config { uint64_t max_sstable_size = std::numeric_limits::max(); bool backup = false; bool leave_unsealed = false; - bool validate_keys; + mutation_fragment_stream_validation_level validation_level; std::optional replay_position; std::optional sstable_level; write_monitor* monitor = &default_write_monitor(); diff --git a/sstables/sstables_manager.cc b/sstables/sstables_manager.cc index 64cdda1608..a4b9a121bb 100644 --- a/sstables/sstables_manager.cc +++ b/sstables/sstables_manager.cc @@ -56,7 +56,9 @@ sstable_writer_config sstables_manager::configure_writer(sstring origin) const { sstable_writer_config cfg; cfg.promoted_index_block_size = _db_config.column_index_size_in_kb() * 1024; - cfg.validate_keys = _db_config.enable_sstable_key_validation(); + cfg.validation_level = _db_config.enable_sstable_key_validation() + ? mutation_fragment_stream_validation_level::clustering_key + : mutation_fragment_stream_validation_level::partition_region; cfg.summary_byte_cost = summary_byte_cost(_db_config.sstable_summary_ratio()); cfg.correctly_serialize_non_compound_range_tombstones = true; diff --git a/sstables/writer_impl.hh b/sstables/writer_impl.hh index eba576b57a..76068b9ad4 100644 --- a/sstables/writer_impl.hh +++ b/sstables/writer_impl.hh @@ -46,7 +46,7 @@ struct sstable_writer::writer_impl { , _pc(pc) , _cfg(cfg) , _collector(_schema, sst.get_filename()) - , _validator(format("sstable writer {}", _sst.get_filename()), _schema, _cfg.validate_keys) + , _validator(format("sstable writer {}", _sst.get_filename()), _schema, _cfg.validation_level) {} virtual void consume_new_partition(const dht::decorated_key& dk) = 0; diff --git a/test/boost/sstable_datafile_test.cc b/test/boost/sstable_datafile_test.cc index 6921532235..45c9b4e307 100644 --- a/test/boost/sstable_datafile_test.cc +++ b/test/boost/sstable_datafile_test.cc @@ -5308,7 +5308,7 @@ SEASTAR_TEST_CASE(sstable_scrub_test) { auto local_keys = make_local_keys(3, schema); auto config = env.manager().configure_writer(); - config.validate_keys = false; // this test violates key order on purpose + config.validation_level = mutation_fragment_stream_validation_level::partition_region; // this test violates key order on purpose auto writer = sst->get_writer(*schema, local_keys.size(), config, encoding_stats{}); auto make_static_row = [&, schema, ts] { From 727bc0f5d4555e56abddcbaf1c2c267cbd170767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Fri, 12 Feb 2021 19:51:59 +0200 Subject: [PATCH 2/3] mutation_fragment_stream_validator: add token validation level In some cases the full-blown partition key validation and especially the associated key copy per partition might be deemed too costly. As a next best thing this patch adds a token only validation, which should cover 99% (number pulled out of my sleeve) of the cases. Let's hope no one gets unlucky. --- flat_mutation_reader.cc | 32 ++++++++++++++++++++++----- mutation_fragment_stream_validator.hh | 29 +++++++++++++++++++++--- 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/flat_mutation_reader.cc b/flat_mutation_reader.cc index 507f3f92fd..f8c4fa04be 100644 --- a/flat_mutation_reader.cc +++ b/flat_mutation_reader.cc @@ -969,6 +969,14 @@ bool mutation_fragment_stream_validator::operator()(const dht::decorated_key& dk return false; } +bool mutation_fragment_stream_validator::operator()(dht::token t) { + if (_prev_partition_key.token() <= t) { + _prev_partition_key._token = t; + return true; + } + return false; +} + bool mutation_fragment_stream_validator::operator()(mutation_fragment::kind kind, position_in_partition_view pos) { if (_prev_kind == mutation_fragment::kind::partition_end) { const bool valid = (kind == mutation_fragment::kind::partition_start); @@ -1036,13 +1044,22 @@ namespace { } bool mutation_fragment_stream_validating_filter::operator()(const dht::decorated_key& dk) { - if (_validation_level >= mutation_fragment_stream_validation_level::partition_key) { - if (!_validator(dk)) { - on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected partition key: previous {}, current {}", - static_cast(this), _name, _validator.previous_partition_key(), dk)); - } + if (_validation_level < mutation_fragment_stream_validation_level::token) { + return true; + } + if (_validation_level == mutation_fragment_stream_validation_level::token) { + if (_validator(dk.token())) { + return true; + } + on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected token: previous {}, current {}", + static_cast(this), _name, _validator.previous_token(), dk.token())); + } else { + if (_validator(dk)) { + return true; + } + on_validation_error(fmr_logger, format("[validator {} for {}] Unexpected partition key: previous {}, current {}", + static_cast(this), _name, _validator.previous_partition_key(), dk)); } - return true; } mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_filter(sstring_view name, const schema& s, @@ -1057,6 +1074,9 @@ mutation_fragment_stream_validating_filter::mutation_fragment_stream_validating_ case mutation_fragment_stream_validation_level::partition_region: what = "partition region"; break; + case mutation_fragment_stream_validation_level::token: + what = "partition region and token"; + break; case mutation_fragment_stream_validation_level::partition_key: what = "partition region and partition key"; break; diff --git a/mutation_fragment_stream_validator.hh b/mutation_fragment_stream_validator.hh index 5abd2ca7a4..b6f877fb2b 100644 --- a/mutation_fragment_stream_validator.hh +++ b/mutation_fragment_stream_validator.hh @@ -25,6 +25,7 @@ enum class mutation_fragment_stream_validation_level { partition_region, // fragment kind + token, partition_key, clustering_key, }; @@ -32,8 +33,9 @@ enum class mutation_fragment_stream_validation_level { /// Low level fragment stream validator. /// /// Tracks and validates the monotonicity of the passed in fragment kinds, -/// position in partition and partition keys. Any subset of these can be -/// used, but what is used have to be consistent across the entire stream. +/// position in partition, token or partition keys. Any subset of these +/// can be used, but what is used have to be consistent across the entire +/// stream. class mutation_fragment_stream_validator { const schema& _schema; mutation_fragment::kind _prev_kind; @@ -73,11 +75,23 @@ public: /// \returns true if the mutation fragment kind is valid. bool operator()(const mutation_fragment& mf); + /// Validates the monotonicity of the token. + /// + /// Does not check fragment level monotonicity. + /// Advances the previous token, but only if the validation passes. + /// Cannot be used in parallel with the `dht::decorated_key` + /// overload. + /// + /// \returns true if the token is valid. + bool operator()(dht::token t); + /// Validates the monotonicity of the partition. /// /// Does not check fragment level monotonicity. /// Advances the previous partition-key, but only if the validation passes. - // + /// Cannot be used in parallel with the `dht::token` + /// overload. + /// /// \returns true if the partition key is valid. bool operator()(const dht::decorated_key& dk); @@ -98,6 +112,15 @@ public: return _prev_pos; } /// The previous valid partition key. + /// + /// Only valid if `operator()(const dht::decorated_key&)` or + /// `operator()(dht::token)` was used. + dht::token previous_token() const { + return _prev_partition_key.token(); + } + /// The previous valid partition key. + /// + /// Only valid if `operator()(const dht::decorated_key&)` was used. const dht::decorated_key& previous_partition_key() const { return _prev_partition_key; } From f0b284dab8eb1de042e81aba185c226afdbba4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Botond=20D=C3=A9nes?= Date: Thu, 11 Feb 2021 10:42:16 +0200 Subject: [PATCH 3/3] sstables: enable token monotonicity validation by default Partition key order validation in data written to sstables can be very disruptive. All our components in the storage layers assume that partitions are in order, which means that reading out-of-order partitions triggers undefined behaviour. Computer scientists often joke that undefined behaviour can erase your hard drive and in this case the damage done by undefined behaviour caused by out-of-order partitions is very close to that. The corruption is known to mutate causing crashes, corrupting more data and even loose data. For this reason it is imperative that out-of-order partitions cannot get into sstables. This patch enables token monotonicity validation unconditionally in the sstable writer. As partition key monotonicity checks involve a key copy per partition, which might have an impact on the performance, we do the next best thing instead and enable only token monotonicity validation. --- sstables/sstables_manager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sstables/sstables_manager.cc b/sstables/sstables_manager.cc index a4b9a121bb..3ca6e0d07e 100644 --- a/sstables/sstables_manager.cc +++ b/sstables/sstables_manager.cc @@ -58,7 +58,7 @@ sstable_writer_config sstables_manager::configure_writer(sstring origin) const { cfg.promoted_index_block_size = _db_config.column_index_size_in_kb() * 1024; cfg.validation_level = _db_config.enable_sstable_key_validation() ? mutation_fragment_stream_validation_level::clustering_key - : mutation_fragment_stream_validation_level::partition_region; + : mutation_fragment_stream_validation_level::token; cfg.summary_byte_cost = summary_byte_cost(_db_config.sstable_summary_ratio()); cfg.correctly_serialize_non_compound_range_tombstones = true;