row_cache: Fix violation of the "oldest version are evicted first" when evicting last dummy
Consider the following MVCC state of a partition: v2: ==== <7> [entry2] ==== <9> ===== <last dummy> v1: ================================ <last dummy> [entry1] Where === means a continuous range and --- means a discontinuous range. After two LRU items are evicted (entry1 and entry2), we will end up with: v2: ---------------------- <9> ===== <last dummy> v1: ================================ <last dummy> [entry1] This will cause readers to incorrectly think there are no rows before entry <9>, because the range is continuous in v1, and continuity of a snapshot is a union of continuous intervals in all versions. The cursor will see the interval before <9> as continuous and the reader will produce no rows. This is only temporary, because current MVCC merging rules are such that the flag on the latest entry wins, so we'll end up with this once v1 is no longer needed: v2: ---------------------- <9> ===== <last dummy> ...and the reader will go to sstables to fetch the evicted rows before entry <9>, as expected. The bug is in rows_entry::on_evicted(), which treats the last dummy entry in a special way, and doesn't evict it, and doesn't clear the continuity by omission. The situation is not easy to trigger because it requires certain eviction pattern concurrent with multiple reads of the same partition in different versions, so across memtable flushes. Closes #12452
This commit is contained in:
committed by
Botond Dénes
parent
1bb1855757
commit
f97268d8f2
@@ -1228,6 +1228,10 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept {
|
||||
// so don't remove it, just unlink from the LRU.
|
||||
// That dummy is linked in the LRU, because there may be partitions
|
||||
// with no regular rows, and we need to track them.
|
||||
|
||||
// We still need to break continuity in order to preserve the "older versions are evicted first"
|
||||
// invariant.
|
||||
it->set_continuous(false);
|
||||
} else {
|
||||
// When evicting a dummy with both sides continuous we don't need to break continuity.
|
||||
//
|
||||
|
||||
@@ -3952,6 +3952,92 @@ SEASTAR_TEST_CASE(test_scans_erase_dummies) {
|
||||
}
|
||||
|
||||
|
||||
// Tests the following scenario:
|
||||
//
|
||||
// Initial state:
|
||||
//
|
||||
// v2: ==== <7> [entry2] ==== <9> === <13> ==== <last dummy>
|
||||
// v1: ======================================== <last dummy> [entry1]
|
||||
//
|
||||
// After two eviction events which evict entry1 and entry2, we should end up with:
|
||||
//
|
||||
// v2: ---------------------- <9> === <13> ==== <last dummy>
|
||||
// v1: ---------------------------------------- <last dummy>
|
||||
//
|
||||
// last dummy entries are treated in a special way in rows_entry::on_evicted(), and there
|
||||
// was a bug which didn't clear the continuity on last dummy when it was selected for eviction.
|
||||
// As a result, the view was this:
|
||||
//
|
||||
// v2: ---------------------- <9> === <13> ==== <last dummy>
|
||||
// v1: ======================================== <last dummy>
|
||||
//
|
||||
// This would violate the "older versions are evicted first" rule, which implies
|
||||
// that when entry2 is evicted in v2, the range in which entry2 falls into in all older versions
|
||||
// must be discontinuous. This won't hold if we don't clear continuity on last dummy in v1.
|
||||
// As a result, the range into which entry2 falls into from the perspective of v2 snapshot
|
||||
// would appear as continuous and <7> would be missing from the read result, because
|
||||
// continuity of a snapshot is a union of continuous ranges in all versions.
|
||||
//
|
||||
// Reproduces https://github.com/scylladb/scylladb/issues/12451
|
||||
SEASTAR_TEST_CASE(test_version_merging_with_range_tombstones_over_rowless_version) {
|
||||
return seastar::async([] {
|
||||
simple_schema s;
|
||||
tests::reader_concurrency_semaphore_wrapper semaphore;
|
||||
|
||||
auto pkey = s.make_pkey("pk");
|
||||
auto pr = dht::partition_range::make_singular(pkey);
|
||||
|
||||
memtable_snapshot_source underlying(s.schema());
|
||||
|
||||
mutation m1(s.schema(), pkey);
|
||||
m1.partition().apply(s.new_tombstone());
|
||||
underlying.apply(m1);
|
||||
|
||||
cache_tracker tracker;
|
||||
row_cache cache(s.schema(), snapshot_source([&] { return underlying(); }), tracker);
|
||||
|
||||
// Populate cache
|
||||
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
|
||||
.produces(m1);
|
||||
|
||||
mutation m2(s.schema(), pkey);
|
||||
s.delete_range(m2, s.make_ckey_range(7, 13));
|
||||
s.add_row(m2, s.make_ckey(7), "v");
|
||||
s.delete_range(m2, s.make_ckey_range(9, 17));
|
||||
s.add_row(m2, s.make_ckey(9), "v");
|
||||
s.add_row(m2, s.make_ckey(17), "v");
|
||||
|
||||
{
|
||||
auto rd1 = cache.make_reader(s.schema(), semaphore.make_permit(), pr);
|
||||
auto close_rd1 = deferred_close(rd1);
|
||||
rd1.set_max_buffer_size(1); // To hold the snapshot
|
||||
rd1.fill_buffer().get();
|
||||
|
||||
apply(cache, underlying, m2);
|
||||
|
||||
evict_one_row(tracker); // hits last dummy in oldest version.
|
||||
|
||||
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
|
||||
.produces(m1 + m2);
|
||||
|
||||
evict_one_row(tracker); // hits entry in the latest version, row (v1) or rtc (v2)
|
||||
|
||||
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
|
||||
.produces(m1 + m2);
|
||||
|
||||
evict_one_row(tracker); // hits entry in the latest version, row (both v1 and v2)
|
||||
|
||||
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
|
||||
.produces(m1 + m2);
|
||||
}
|
||||
|
||||
tracker.cleaner().drain().get();
|
||||
|
||||
assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr))
|
||||
.produces(m1 + m2);
|
||||
});
|
||||
}
|
||||
|
||||
SEASTAR_TEST_CASE(row_cache_is_populated_using_compacting_sstable_reader) {
|
||||
return do_with_cql_env_thread([](cql_test_env& env) {
|
||||
replica::database& db = env.local_db();
|
||||
|
||||
Reference in New Issue
Block a user