diff --git a/row_cache.cc b/row_cache.cc index ebb0251994..47dc15009c 100644 --- a/row_cache.cc +++ b/row_cache.cc @@ -1228,6 +1228,10 @@ void rows_entry::on_evicted(cache_tracker& tracker) noexcept { // so don't remove it, just unlink from the LRU. // That dummy is linked in the LRU, because there may be partitions // with no regular rows, and we need to track them. + + // We still need to break continuity in order to preserve the "older versions are evicted first" + // invariant. + it->set_continuous(false); } else { // When evicting a dummy with both sides continuous we don't need to break continuity. // diff --git a/test/boost/row_cache_test.cc b/test/boost/row_cache_test.cc index a2f29b1510..deafb8e39d 100644 --- a/test/boost/row_cache_test.cc +++ b/test/boost/row_cache_test.cc @@ -3952,6 +3952,92 @@ SEASTAR_TEST_CASE(test_scans_erase_dummies) { } +// Tests the following scenario: +// +// Initial state: +// +// v2: ==== <7> [entry2] ==== <9> === <13> ==== +// v1: ======================================== [entry1] +// +// After two eviction events which evict entry1 and entry2, we should end up with: +// +// v2: ---------------------- <9> === <13> ==== +// v1: ---------------------------------------- +// +// last dummy entries are treated in a special way in rows_entry::on_evicted(), and there +// was a bug which didn't clear the continuity on last dummy when it was selected for eviction. +// As a result, the view was this: +// +// v2: ---------------------- <9> === <13> ==== +// v1: ======================================== +// +// This would violate the "older versions are evicted first" rule, which implies +// that when entry2 is evicted in v2, the range in which entry2 falls into in all older versions +// must be discontinuous. This won't hold if we don't clear continuity on last dummy in v1. +// As a result, the range into which entry2 falls into from the perspective of v2 snapshot +// would appear as continuous and <7> would be missing from the read result, because +// continuity of a snapshot is a union of continuous ranges in all versions. +// +// Reproduces https://github.com/scylladb/scylladb/issues/12451 +SEASTAR_TEST_CASE(test_version_merging_with_range_tombstones_over_rowless_version) { + return seastar::async([] { + simple_schema s; + tests::reader_concurrency_semaphore_wrapper semaphore; + + auto pkey = s.make_pkey("pk"); + auto pr = dht::partition_range::make_singular(pkey); + + memtable_snapshot_source underlying(s.schema()); + + mutation m1(s.schema(), pkey); + m1.partition().apply(s.new_tombstone()); + underlying.apply(m1); + + cache_tracker tracker; + row_cache cache(s.schema(), snapshot_source([&] { return underlying(); }), tracker); + + // Populate cache + assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr)) + .produces(m1); + + mutation m2(s.schema(), pkey); + s.delete_range(m2, s.make_ckey_range(7, 13)); + s.add_row(m2, s.make_ckey(7), "v"); + s.delete_range(m2, s.make_ckey_range(9, 17)); + s.add_row(m2, s.make_ckey(9), "v"); + s.add_row(m2, s.make_ckey(17), "v"); + + { + auto rd1 = cache.make_reader(s.schema(), semaphore.make_permit(), pr); + auto close_rd1 = deferred_close(rd1); + rd1.set_max_buffer_size(1); // To hold the snapshot + rd1.fill_buffer().get(); + + apply(cache, underlying, m2); + + evict_one_row(tracker); // hits last dummy in oldest version. + + assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr)) + .produces(m1 + m2); + + evict_one_row(tracker); // hits entry in the latest version, row (v1) or rtc (v2) + + assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr)) + .produces(m1 + m2); + + evict_one_row(tracker); // hits entry in the latest version, row (both v1 and v2) + + assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr)) + .produces(m1 + m2); + } + + tracker.cleaner().drain().get(); + + assert_that(cache.make_reader(s.schema(), semaphore.make_permit(), pr)) + .produces(m1 + m2); + }); +} + SEASTAR_TEST_CASE(row_cache_is_populated_using_compacting_sstable_reader) { return do_with_cql_env_thread([](cql_test_env& env) { replica::database& db = env.local_db();