From 7b329f7102318f0999b58cd454e43f2e1326cc19 Mon Sep 17 00:00:00 2001 From: Piotr Sarna Date: Wed, 9 Sep 2020 10:34:30 +0200 Subject: [PATCH] digest: add null values to row digest With the new hashing routine, null values are taken into account when computing row digest. Previous behavior had a regression which stopped computing the hash after the first null value is encountered, but the original behavior was also prone to errors - e.g. row [1, NULL, 2] was not distinguishable from [1, 2, NULL], because their hashes were identical. This hashing is not yet active - it will only be used after the next commit introduces a proper cluster feature for it. --- digest_algorithm.hh | 3 ++- digester.hh | 5 ++++- mutation_partition.cc | 36 +++++++++++++++++++++++++++++++++++- mutation_partition.hh | 1 + service/storage_proxy.cc | 2 +- xx_hasher.hh | 7 +++++++ 6 files changed, 50 insertions(+), 4 deletions(-) diff --git a/digest_algorithm.hh b/digest_algorithm.hh index bc3da28209..1a9f76b05a 100644 --- a/digest_algorithm.hh +++ b/digest_algorithm.hh @@ -28,7 +28,8 @@ namespace query { enum class digest_algorithm : uint8_t { none = 0, // digest not required MD5 = 1, - xxHash = 2,// default algorithm + legacy_xxHash_without_null_digest = 2, + xxHash = 3, // default algorithm }; } diff --git a/digester.hh b/digester.hh index cf3e288b31..560d84e9cb 100644 --- a/digester.hh +++ b/digester.hh @@ -36,7 +36,7 @@ struct noop_hasher { }; class digester final { - std::variant _impl; + std::variant _impl; public: explicit digester(digest_algorithm algo) { @@ -47,6 +47,9 @@ public: case digest_algorithm::xxHash: _impl = xx_hasher(); break; + case digest_algorithm::legacy_xxHash_without_null_digest: + _impl = legacy_xx_hasher_without_null_digest(); + break; case digest_algorithm ::none: _impl = noop_hasher(); break; diff --git a/mutation_partition.cc b/mutation_partition.cc index aecc40c0be..f052fbe7d2 100644 --- a/mutation_partition.cc +++ b/mutation_partition.cc @@ -733,7 +733,8 @@ void appending_hash::operator()(Hasher& h, const row& cells, const schema& for (auto id : columns) { const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id); if (!cell_and_hash) { - return; + feed_hash(h, appending_hash::null_hash_value); + continue; } auto&& def = s.column_at(kind, id); if (def.is_atomic()) { @@ -766,6 +767,39 @@ void appending_hash::operator()(Hasher& h, const row& cells, const schema& } } } +// Instantiation for mutation_test.cc +template void appending_hash::operator()(xx_hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const; + +template<> +void appending_hash::operator()(legacy_xx_hasher_without_null_digest& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const { + for (auto id : columns) { + const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id); + if (!cell_and_hash) { + return; + } + auto&& def = s.column_at(kind, id); + if (def.is_atomic()) { + max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp()); + if (cell_and_hash->hash) { + feed_hash(h, *cell_and_hash->hash); + } else { + query::default_hasher cellh; + feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def); + feed_hash(h, cellh.finalize_uint64()); + } + } else { + auto cm = cell_and_hash->cell.as_collection_mutation(); + max_ts.update(cm.last_update(*def.type)); + if (cell_and_hash->hash) { + feed_hash(h, *cell_and_hash->hash); + } else { + query::default_hasher cellh; + feed_hash(cellh, cm, def); + feed_hash(h, cellh.finalize_uint64()); + } + } + } +} cell_hash_opt row::cell_hash_for(column_id id) const { if (_type == storage_type::vector) { diff --git a/mutation_partition.hh b/mutation_partition.hh index 8a627e9536..d56288500e 100644 --- a/mutation_partition.hh +++ b/mutation_partition.hh @@ -659,6 +659,7 @@ struct max_timestamp { template<> struct appending_hash { + static constexpr int null_hash_value = 0xbeefcafe; template void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const; }; diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index e293d18ef2..068085e22c 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -125,7 +125,7 @@ using fbu = utils::fb_utilities; static inline query::digest_algorithm digest_algorithm(service::storage_proxy& proxy) { return proxy.features().cluster_supports_xxhash_digest_algorithm() - ? query::digest_algorithm::xxHash + ? query::digest_algorithm::legacy_xxHash_without_null_digest : query::digest_algorithm::MD5; } diff --git a/xx_hasher.hh b/xx_hasher.hh index 292ee78be0..adfaea90b9 100644 --- a/xx_hasher.hh +++ b/xx_hasher.hh @@ -67,3 +67,10 @@ private: serialize_int64(out, finalize_uint64()); } }; + +// Used to specialize templates in order to fix a bug +// in handling null values: #4567 +class legacy_xx_hasher_without_null_digest : public xx_hasher { +public: + explicit legacy_xx_hasher_without_null_digest(uint64_t seed = 0) noexcept : xx_hasher(seed) {} +};