digest: add null values to row digest

With the new hashing routine, null values are taken into account
when computing row digest. Previous behavior had a regression
which stopped computing the hash after the first null value
is encountered, but the original behavior was also prone
to errors - e.g. row [1, NULL, 2] was not distinguishable
from [1, 2, NULL], because their hashes were identical.
This hashing is not yet active - it will only be used after
the next commit introduces a proper cluster feature for it.
This commit is contained in:
Piotr Sarna
2020-09-09 10:34:30 +02:00
parent 5ffd929eaa
commit 7b329f7102
6 changed files with 50 additions and 4 deletions

View File

@@ -28,7 +28,8 @@ namespace query {
enum class digest_algorithm : uint8_t {
none = 0, // digest not required
MD5 = 1,
xxHash = 2,// default algorithm
legacy_xxHash_without_null_digest = 2,
xxHash = 3, // default algorithm
};
}

View File

@@ -36,7 +36,7 @@ struct noop_hasher {
};
class digester final {
std::variant<noop_hasher, md5_hasher, xx_hasher> _impl;
std::variant<noop_hasher, md5_hasher, xx_hasher, legacy_xx_hasher_without_null_digest> _impl;
public:
explicit digester(digest_algorithm algo) {
@@ -47,6 +47,9 @@ public:
case digest_algorithm::xxHash:
_impl = xx_hasher();
break;
case digest_algorithm::legacy_xxHash_without_null_digest:
_impl = legacy_xx_hasher_without_null_digest();
break;
case digest_algorithm ::none:
_impl = noop_hasher();
break;

View File

@@ -733,7 +733,8 @@ void appending_hash<row>::operator()(Hasher& h, const row& cells, const schema&
for (auto id : columns) {
const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
if (!cell_and_hash) {
return;
feed_hash(h, appending_hash<row>::null_hash_value);
continue;
}
auto&& def = s.column_at(kind, id);
if (def.is_atomic()) {
@@ -766,6 +767,39 @@ void appending_hash<row>::operator()(Hasher& h, const row& cells, const schema&
}
}
}
// Instantiation for mutation_test.cc
template void appending_hash<row>::operator()<xx_hasher>(xx_hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
template<>
void appending_hash<row>::operator()<legacy_xx_hasher_without_null_digest>(legacy_xx_hasher_without_null_digest& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const {
for (auto id : columns) {
const cell_and_hash* cell_and_hash = cells.find_cell_and_hash(id);
if (!cell_and_hash) {
return;
}
auto&& def = s.column_at(kind, id);
if (def.is_atomic()) {
max_ts.update(cell_and_hash->cell.as_atomic_cell(def).timestamp());
if (cell_and_hash->hash) {
feed_hash(h, *cell_and_hash->hash);
} else {
query::default_hasher cellh;
feed_hash(cellh, cell_and_hash->cell.as_atomic_cell(def), def);
feed_hash(h, cellh.finalize_uint64());
}
} else {
auto cm = cell_and_hash->cell.as_collection_mutation();
max_ts.update(cm.last_update(*def.type));
if (cell_and_hash->hash) {
feed_hash(h, *cell_and_hash->hash);
} else {
query::default_hasher cellh;
feed_hash(cellh, cm, def);
feed_hash(h, cellh.finalize_uint64());
}
}
}
}
cell_hash_opt row::cell_hash_for(column_id id) const {
if (_type == storage_type::vector) {

View File

@@ -659,6 +659,7 @@ struct max_timestamp {
template<>
struct appending_hash<row> {
static constexpr int null_hash_value = 0xbeefcafe;
template<typename Hasher>
void operator()(Hasher& h, const row& cells, const schema& s, column_kind kind, const query::column_id_vector& columns, max_timestamp& max_ts) const;
};

View File

@@ -125,7 +125,7 @@ using fbu = utils::fb_utilities;
static inline
query::digest_algorithm digest_algorithm(service::storage_proxy& proxy) {
return proxy.features().cluster_supports_xxhash_digest_algorithm()
? query::digest_algorithm::xxHash
? query::digest_algorithm::legacy_xxHash_without_null_digest
: query::digest_algorithm::MD5;
}

View File

@@ -67,3 +67,10 @@ private:
serialize_int64(out, finalize_uint64());
}
};
// Used to specialize templates in order to fix a bug
// in handling null values: #4567
class legacy_xx_hasher_without_null_digest : public xx_hasher {
public:
explicit legacy_xx_hasher_without_null_digest(uint64_t seed = 0) noexcept : xx_hasher(seed) {}
};