compare_atomic_cell_for_merge: compare value last for live cells
Currently, when two cells have the same write timestamp and both are alive or expiring, we compare their value first, before checking if either of them is expiring and if both are expiring, comparing their expiration time and ttl value to determine which of them will expire later or was written later. This was changed in CASSANDRA-14592 for consistency with the preference for dead cells over live cells, as expiring cells will become tombstones at a future time and then they'd win over live cells with the same timestamp, hence they should win also before expiration. In addition, comparing the cell value before expiration can lead to unintuitive corner cases where rewriting a cell using the same timestamp but different TTL may cause scylla to return the cell with null value if it expired in the meanwhile. Also, when multiple columns are written using two upserts using the same write timestamp but with different expiration, selecting cells by their value may return a mixed result where each cell is selected individually from either upsert, by picking the cells with the largest values for each column, while using the expiration time to break tie will lead to a more consistent results where a set of cell from only one of the upserts will be selected. Fixes scylladb/scylladb#14182 Signed-off-by: Benny Halevy <bhalevy@scylladb.com>
This commit is contained in:
@@ -79,10 +79,6 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
|
||||
return left.is_live() ? std::strong_ordering::less : std::strong_ordering::greater;
|
||||
}
|
||||
if (left.is_live()) {
|
||||
auto c = compare_unsigned(left.value(), right.value()) <=> 0;
|
||||
if (c != 0) {
|
||||
return c;
|
||||
}
|
||||
if (left.is_live_and_has_ttl() != right.is_live_and_has_ttl()) {
|
||||
// prefer expiring cells.
|
||||
return left.is_live_and_has_ttl() ? std::strong_ordering::greater : std::strong_ordering::less;
|
||||
@@ -90,12 +86,13 @@ compare_atomic_cell_for_merge(atomic_cell_view left, atomic_cell_view right) {
|
||||
if (left.is_live_and_has_ttl()) {
|
||||
if (left.expiry() != right.expiry()) {
|
||||
return left.expiry() <=> right.expiry();
|
||||
} else {
|
||||
} else if (right.ttl() != left.ttl()) {
|
||||
// prefer the cell that was written later,
|
||||
// so it survives longer after it expires, until purged.
|
||||
return right.ttl() <=> left.ttl();
|
||||
}
|
||||
}
|
||||
return compare_unsigned(left.value(), right.value());
|
||||
} else {
|
||||
// Both are deleted
|
||||
|
||||
|
||||
@@ -710,6 +710,11 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value")),
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value"), expiry_1, ttl_1));
|
||||
|
||||
testlog.debug("Non-expiring live cells are ordered before expiring cells, regardless of their value");
|
||||
assert_order(
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value2")),
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value1"), expiry_1, ttl_1));
|
||||
|
||||
testlog.debug("Dead cells with same expiry are equal");
|
||||
assert_equal(
|
||||
atomic_cell::make_dead(1, expiry_1),
|
||||
@@ -743,16 +748,21 @@ SEASTAR_TEST_CASE(test_cell_ordering) {
|
||||
atomic_cell::make_live(*bytes_type, 0, bytes("value2")),
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value1")));
|
||||
|
||||
testlog.debug("...then by value");
|
||||
assert_order(
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value1"), expiry_2, ttl_2),
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value2"), expiry_1, ttl_1));
|
||||
|
||||
testlog.debug("...then by expiry");
|
||||
assert_order(
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_1, ttl_1),
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_2, ttl_1));
|
||||
|
||||
testlog.debug("...then by ttl (in reverse)");
|
||||
assert_order(
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_1, ttl_2),
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes(), expiry_1, ttl_1));
|
||||
|
||||
testlog.debug("...then by value");
|
||||
assert_order(
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value1"), expiry_1, ttl_1),
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value2"), expiry_1, ttl_1));
|
||||
|
||||
testlog.debug("Dead wins");
|
||||
assert_order(
|
||||
atomic_cell::make_live(*bytes_type, 1, bytes("value")),
|
||||
|
||||
Reference in New Issue
Block a user