From 7d5e22b43b6488fb466c28a37c220603c6c5cbbf Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 17 Oct 2023 16:25:28 +0300 Subject: [PATCH] replica: memtable: don't forget memtable memory allocation statistics A memtable object contains two logalloc::allocating_section members that track memory allocation requirements during reads and writes. Because these are local to the memtable, each time we seal a memtable and create a new one, these statistics are forgotten. As a result we may have to re-learn the typical size of reads and writes, incurring a small performance penalty. The solution is to move the allocating_section object to the memtable_list container. The workload is the same across all memtables of the same table, so we don't lose discrimination here. The performance penalty may be increased later if log changes to memory reserve thresholds including a backtrace, so this reduces the odds of incurring such a penalty. Closes scylladb/scylladb#15737 --- replica/database.cc | 4 +++- replica/database.hh | 2 ++ replica/memtable.cc | 12 ++++++++++-- replica/memtable.hh | 9 ++++++--- test/boost/memtable_test.cc | 20 +++++++++++++------- 5 files changed, 34 insertions(+), 13 deletions(-) diff --git a/replica/database.cc b/replica/database.cc index 1a62eaad5a..0a8b154618 100644 --- a/replica/database.cc +++ b/replica/database.cc @@ -1956,7 +1956,9 @@ future<> memtable_list::flush() { } lw_shared_ptr memtable_list::new_memtable() { - return make_lw_shared(_current_schema(), *_dirty_memory_manager, _table_stats, this, _compaction_scheduling_group); + return make_lw_shared(_current_schema(), *_dirty_memory_manager, + _read_section, _allocating_section, + _table_stats, this, _compaction_scheduling_group); } // Synchronously swaps the active memtable with a new, empty one, diff --git a/replica/database.hh b/replica/database.hh index 0a92728de0..26d794b4dc 100644 --- a/replica/database.hh +++ b/replica/database.hh @@ -169,6 +169,8 @@ private: seal_immediate_fn_type _seal_immediate_fn; std::function _current_schema; replica::dirty_memory_manager* _dirty_memory_manager; + logalloc::allocating_section _read_section; + logalloc::allocating_section _allocating_section; std::optional> _flush_coalescing; seastar::scheduling_group _compaction_scheduling_group; replica::table_stats& _table_stats; diff --git a/replica/memtable.cc b/replica/memtable.cc index a9246af37c..0d0fe5ab41 100644 --- a/replica/memtable.cc +++ b/replica/memtable.cc @@ -114,7 +114,10 @@ void memtable::memtable_encoding_stats_collector::update(const ::schema& s, cons } } -memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, replica::table_stats& table_stats, +memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, + logalloc::allocating_section& read_section, + logalloc::allocating_section& allocating_section, + replica::table_stats& table_stats, memtable_list* memtable_list, seastar::scheduling_group compaction_scheduling_group) : dirty_memory_manager_logalloc::size_tracked_region() , _dirty_mgr(dmm) @@ -122,6 +125,8 @@ memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, replica::table_ [this] (size_t freed) { remove_flushed_memory(freed); }) , _memtable_list(memtable_list) , _schema(std::move(schema)) + , _read_section(read_section) + , _allocating_section(allocating_section) , partitions(dht::raw_token_less_comparator{}) , _table_stats(table_stats) { logalloc::region::listen(&dmm.region_group()); @@ -129,9 +134,12 @@ memtable::memtable(schema_ptr schema, dirty_memory_manager& dmm, replica::table_ static thread_local dirty_memory_manager mgr_for_tests; static thread_local replica::table_stats stats_for_tests; +static thread_local logalloc::allocating_section memtable_read_section_for_tests; +static thread_local logalloc::allocating_section memtable_allocating_section_for_tests; memtable::memtable(schema_ptr schema) - : memtable(std::move(schema), mgr_for_tests, stats_for_tests) + : memtable(std::move(schema), mgr_for_tests, + memtable_read_section_for_tests, memtable_allocating_section_for_tests, stats_for_tests) { } memtable::~memtable() { diff --git a/replica/memtable.hh b/replica/memtable.hh index a2ed39a3b9..e82f022c51 100644 --- a/replica/memtable.hh +++ b/replica/memtable.hh @@ -111,8 +111,8 @@ private: mutation_cleaner _cleaner; memtable_list *_memtable_list; schema_ptr _schema; - logalloc::allocating_section _read_section; - logalloc::allocating_section _allocating_section; + logalloc::allocating_section& _read_section; + logalloc::allocating_section& _allocating_section; partitions_type partitions; size_t nr_partitions = 0; db::replay_position _replay_position; @@ -171,7 +171,10 @@ private: void clear() noexcept; uint64_t dirty_size() const; public: - explicit memtable(schema_ptr schema, dirty_memory_manager&, replica::table_stats& table_stats, memtable_list *memtable_list = nullptr, + explicit memtable(schema_ptr schema, dirty_memory_manager&, + logalloc::allocating_section& read_section, + logalloc::allocating_section& allocating_section, + replica::table_stats& table_stats, memtable_list *memtable_list = nullptr, seastar::scheduling_group compaction_scheduling_group = seastar::current_scheduling_group()); // Used for testing that want to control the flush process. explicit memtable(schema_ptr schema); diff --git a/test/boost/memtable_test.cc b/test/boost/memtable_test.cc index dfb6448ded..e0d2cfa4d5 100644 --- a/test/boost/memtable_test.cc +++ b/test/boost/memtable_test.cc @@ -169,9 +169,9 @@ SEASTAR_TEST_CASE(test_memtable_flush_reader) { return seastar::async([] { tests::reader_concurrency_semaphore_wrapper semaphore; - auto make_memtable = [] (replica::dirty_memory_manager& mgr, replica::table_stats& tbl_stats, std::vector muts) { + auto make_memtable = [] (replica::dirty_memory_manager& mgr, logalloc::allocating_section& read_section, logalloc::allocating_section& allocating_section, replica::table_stats& tbl_stats, std::vector muts) { assert(!muts.empty()); - auto mt = make_lw_shared(muts.front().schema(), mgr, tbl_stats); + auto mt = make_lw_shared(muts.front().schema(), mgr, read_section, allocating_section, tbl_stats); for (auto& m : muts) { mt->apply(m); } @@ -181,6 +181,8 @@ SEASTAR_TEST_CASE(test_memtable_flush_reader) { auto test_random_streams = [&] (random_mutation_generator&& gen) { for (auto i = 0; i < 4; i++) { replica::table_stats tbl_stats; + logalloc::allocating_section read_section; + logalloc::allocating_section allocating_section; replica::dirty_memory_manager mgr; const auto muts = gen(4); const auto now = gc_clock::now(); @@ -190,7 +192,7 @@ SEASTAR_TEST_CASE(test_memtable_flush_reader) { } testlog.info("Simple read"); - auto mt = make_memtable(mgr, tbl_stats, muts); + auto mt = make_memtable(mgr, read_section, allocating_section, tbl_stats, muts); assert_that(mt->make_flush_reader(gen.schema(), semaphore.make_permit())) .produces_compacted(compacted_muts[0], now) @@ -200,7 +202,7 @@ SEASTAR_TEST_CASE(test_memtable_flush_reader) { .produces_end_of_stream(); testlog.info("Read with next_partition() calls between partition"); - mt = make_memtable(mgr, tbl_stats, muts); + mt = make_memtable(mgr, read_section, allocating_section, tbl_stats, muts); assert_that(mt->make_flush_reader(gen.schema(), semaphore.make_permit())) .next_partition() .produces_compacted(compacted_muts[0], now) @@ -214,7 +216,7 @@ SEASTAR_TEST_CASE(test_memtable_flush_reader) { .produces_end_of_stream(); testlog.info("Read with next_partition() calls inside partitions"); - mt = make_memtable(mgr, tbl_stats, muts); + mt = make_memtable(mgr, read_section, allocating_section, tbl_stats, muts); assert_that(mt->make_flush_reader(gen.schema(), semaphore.make_permit())) .produces_compacted(compacted_muts[0], now) .produces_partition_start(muts[1].decorated_key(), muts[1].partition().partition_tombstone()) @@ -293,9 +295,11 @@ SEASTAR_TEST_CASE(test_unspooled_dirty_accounting_on_flush) { tests::reader_concurrency_semaphore_wrapper semaphore; replica::dirty_memory_manager mgr; + logalloc::allocating_section read_section; + logalloc::allocating_section allocating_section; replica::table_stats tbl_stats; - auto mt = make_lw_shared(s, mgr, tbl_stats); + auto mt = make_lw_shared(s, mgr, read_section, allocating_section, tbl_stats); std::vector ring = make_ring(s, 3); std::vector current_ring; @@ -428,9 +432,11 @@ SEASTAR_TEST_CASE(test_segment_migration_during_flush) { tests::reader_concurrency_semaphore_wrapper semaphore; replica::table_stats tbl_stats; + logalloc::allocating_section read_section; + logalloc::allocating_section allocating_section; replica::dirty_memory_manager mgr; - auto mt = make_lw_shared(s, mgr, tbl_stats); + auto mt = make_lw_shared(s, mgr, read_section, allocating_section, tbl_stats); const int rows_per_partition = 300; const int partitions = 3;