Merge 'cache, mvcc: Preempt cache update when applying range tombstone from memtable' from Tomasz Grabiec
Range tombstones are represented as entry attributes, each of which applies to
the interval between entries. So if a range tombstone covers many
rows, to apply it we have to update all covered entries. In some
workloads that could be many entries, even the whole cache. Before
the patch, we did this update without preemption, which can cause
reactor stalls in such workloads.
This scenario is already covered by mvcc_tests,
e.g. test_apply_to_incomplete_respects_continuity. And I verified that
the new preemption point is hit in the test.
perf-row-cache-update results show no significant stalls anymore (max
2 ms scheduling delay, instead of the previous 1.5 s):
Generated 1124195 rows
Memtable fill took 4179.457520 [ms], {count: 8295, 99%: 0.654949 [ms], max: 32.817176 [ms]}
Draining...
took 0.000616 [ms]
cache: 2506/2948 [MB], memtable: 781/1024 [MB], alloc/comp: 1051/662 [MB] (amp: 0.630)
update: 2874.157471 [ms], preemption: {count: 26650, 99%: 1.131752 [ms], max: 2.068762 [ms]}, cache: 3027/3973 [MB], alloc/comp: 3951/2424 [MB] (amp: 0.614), pr/me/dr 1124195/0/0
Fixes #23479
Fixes #2578
Closes scylladb/scylladb#27469
* github.com:scylladb/scylladb:
cache, mvcc: Preempt cache update when applying range tombstone from memtable
partition_snapshot_row_cursor: Clarify non-obvious semantic difference of range_tombstone()
perf-row-cache-update: Add scenario with large tombstone covering many rows
This commit is contained in:
@@ -542,6 +542,7 @@ public:
|
||||
// Returns the range tombstone for the key range adjacent to the cursor's position from the side of smaller keys.
|
||||
// Excludes the range for the row itself. That information is returned by range_tombstone_for_row().
|
||||
// It's possible that range_tombstone() is empty and range_tombstone_for_row() is not empty.
|
||||
// Note that this is different from the meaning of rows_entry::range_tombstone(), which includes the row itself.
|
||||
tombstone range_tombstone() const { return _range_tombstone; }
|
||||
|
||||
// Can be called when cursor is pointing at a row.
|
||||
|
||||
@@ -575,10 +575,15 @@ utils::coroutine partition_entry::apply_to_incomplete(const schema& s,
|
||||
}
|
||||
res.row.set_range_tombstone(cur.range_tombstone_for_row() + src_cur.range_tombstone());
|
||||
|
||||
if (need_preempt()) {
|
||||
lb = position_in_partition(cur.position());
|
||||
++tracker.get_stats().rows_covered_by_range_tombstones_from_memtable;
|
||||
return stop_iteration::no;
|
||||
}
|
||||
|
||||
// FIXME: Compact the row
|
||||
++tracker.get_stats().rows_covered_by_range_tombstones_from_memtable;
|
||||
cur.next();
|
||||
// FIXME: preempt
|
||||
}
|
||||
}
|
||||
{
|
||||
|
||||
@@ -30,7 +30,7 @@ static const int cell_size = 128;
|
||||
static bool cancelled = false;
|
||||
|
||||
template<typename MutationGenerator>
|
||||
void run_test(const sstring& name, schema_ptr s, MutationGenerator&& gen) {
|
||||
void run_test(const sstring& name, schema_ptr s, MutationGenerator&& gen, std::function<mutation()> before_flush = {}) {
|
||||
tests::reader_concurrency_semaphore_wrapper semaphore;
|
||||
cache_tracker tracker;
|
||||
row_cache cache(s, make_empty_snapshot_source(), tracker, is_continuous::yes);
|
||||
@@ -58,6 +58,10 @@ void run_test(const sstring& name, schema_ptr s, MutationGenerator&& gen) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (before_flush) {
|
||||
mutation m = before_flush();
|
||||
mt->apply(m);
|
||||
}
|
||||
});
|
||||
memtable_slm.stop();
|
||||
std::cout << format("Memtable fill took {:.6f} [ms], {}", fill_d.count() * 1000, memtable_slm) << std::endl;
|
||||
@@ -181,6 +185,43 @@ static void test_partition_with_lots_of_small_rows() {
|
||||
});
|
||||
}
|
||||
|
||||
// Perf scenario: a single partition (fixed pk) filled with small rows, each
// carrying three cell_size-byte cells under consecutive int32 clustering keys.
// A before_flush hook additionally produces a range tombstone covering every
// key generated before the previous invocation of the hook.
static void test_partition_with_lots_of_small_rows_covered_by_tombstone() {
|
||||
auto s = schema_builder("ks", "cf")
|
||||
.with_column("pk", uuid_type, column_kind::partition_key)
|
||||
.with_column("ck", int32_type, column_kind::clustering_key)
|
||||
.with_column("v1", bytes_type, column_kind::regular_column)
|
||||
.with_column("v2", bytes_type, column_kind::regular_column)
|
||||
.with_column("v3", bytes_type, column_kind::regular_column)
|
||||
.build();
|
||||
|
||||
// All generated mutations, including the tombstone, target this one partition key.
auto pk = dht::decorate_key(*s, partition_key::from_single_value(*s,
|
||||
serialized(utils::UUID_gen::get_time_UUID())));
|
||||
// Next clustering key value to generate; incremented once per generated row.
int ck_idx = 0;
|
||||
// Value of ck_idx recorded by the previous before_flush call; used as the
// exclusive upper bound of the next range tombstone.
int flush_ck_idx = 0;
|
||||
|
||||
run_test("Large partition, lots of small rows covered by single tombstone", s, [&] {
|
||||
// Generator: each call yields a mutation with one new row (key ck_idx),
// whose three cells share the same value and timestamp.
mutation m(s, pk);
|
||||
auto val = data_value(bytes(bytes::initialized_later(), cell_size));
|
||||
auto ck = clustering_key::from_single_value(*s, serialized(ck_idx++));
|
||||
auto ts = api::new_timestamp();
|
||||
m.set_clustered_cell(ck, "v1", val, ts);
|
||||
m.set_clustered_cell(ck, "v2", val, ts);
|
||||
m.set_clustered_cell(ck, "v3", val, ts);
|
||||
return m;
|
||||
}, [&] { // before_flush
|
||||
// Delete key range [-inf, flush_ck_idx)
// On the first call flush_ck_idx is still 0, so the range ends before key 0
// and covers none of the generated rows (keys start at 0).
|
||||
// Reports the number of rows generated since the previous before_flush call.
std::cout << "Generated " << (ck_idx - flush_ck_idx) << " rows\n";
|
||||
auto m = mutation(s, pk);
|
||||
auto ck = clustering_key::from_single_value(*s, serialized(flush_ck_idx));
|
||||
m.partition().apply_row_tombstone(*s, range_tombstone(
|
||||
position_in_partition_view::before_all_clustered_rows(),
|
||||
position_in_partition_view::before_key(ck),
|
||||
tombstone(api::new_timestamp(), gc_clock::now())));
|
||||
// Remember how far generation got, so that the next tombstone covers every
// row generated up to this point.
flush_ck_idx = ck_idx;
|
||||
return m;
|
||||
});
|
||||
}
|
||||
|
||||
static void test_partition_with_few_small_rows() {
|
||||
auto s = schema_builder("ks", "cf")
|
||||
.with_column("pk", uuid_type, column_kind::partition_key)
|
||||
@@ -275,6 +316,7 @@ int scylla_row_cache_update_main(int argc, char** argv) {
|
||||
cancelled = true;
|
||||
});
|
||||
logalloc::prime_segment_pool(memory::stats().total_memory(), memory::min_free_memory()).get();
|
||||
test_partition_with_lots_of_small_rows_covered_by_tombstone();
|
||||
test_small_partitions();
|
||||
test_partition_with_few_small_rows();
|
||||
test_partition_with_lots_of_small_rows();
|
||||
|
||||
Reference in New Issue
Block a user