sstable_loader: Discard SSTable bloom filter on load-and-stream

Load-and-stream reads the entire content from SSTables, therefore it can afford to discard the bloom filter that might otherwise consume a significant amount of memory. Bloom filters are only needed by compaction and other replica::table operations that might want to check the presence of keys in the SSTable files, like single-partition reads. It's not uncommon to see a Data:Filter ratio of less than 100:1, meaning that for ~300G of data, filters will take ~3G. In addition to saving memory footprint, it also reduces operation time, as load-and-stream no longer has to read, parse and build the filters from disk into memory. Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
2023-04-12 12:02:22 -03:00
parent 17261369ea
commit fe6df3d270
3 changed files with 10 additions and 4 deletions
--- a/replica/distributed_loader.cc
+++ b/replica/distributed_loader.cc
@@ -512,8 +512,8 @@ distributed_loader::process_upload_dir(distributed<replica::database>& db, distr
 }

 future<std::tuple<table_id, std::vector<std::vector<sstables::shared_sstable>>>>
-distributed_loader::get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf) {
-    return seastar::async([&db, ks = std::move(ks), cf = std::move(cf)] {
+distributed_loader::get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf, sstables::sstable_open_config cfg) {
+    return seastar::async([&db, ks = std::move(ks), cf = std::move(cf), cfg] {
        global_column_family_ptr global_table(db, ks, cf);
        sharded<sstables::sstable_directory> directory;
        auto table_id = global_table->schema()->id();
@@ -536,6 +536,7 @@ distributed_loader::get_sstables_from_upload_dir(distributed<replica::database>&
            .enable_dangerous_direct_import_of_cassandra_counters = db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters(),
            .allow_loading_materialized_view = false,
            .sort_sstables_according_to_owner = false,
+            .sstable_open_config = cfg,
        };
        process_sstable_dir(directory, flags).get();
        directory.invoke_on_all([&sstables_on_shards] (sstables::sstable_directory& d) mutable {
--- a/replica/distributed_loader.hh
+++ b/replica/distributed_loader.hh
@@ -99,7 +99,7 @@ public:
    // Each entry contains a vector of sstables for this shard.
    // The table UUID is returned too.
    static future<std::tuple<table_id, std::vector<std::vector<sstables::shared_sstable>>>>
-            get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf);
+            get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf, sstables::sstable_open_config cfg);
    static future<> process_upload_dir(distributed<replica::database>& db, distributed<db::system_distributed_keyspace>& sys_dist_ks,
            distributed<db::view::view_update_generator>& view_update_generator, sstring ks_name, sstring cf_name);
 };
--- a/sstables_loader.cc
+++ b/sstables_loader.cc
@@ -262,7 +262,12 @@ future<> sstables_loader::load_new_sstables(sstring ks_name, sstring cf_name,
        if (load_and_stream) {
            ::table_id table_id;
            std::vector<std::vector<sstables::shared_sstable>> sstables_on_shards;
-            std::tie(table_id, sstables_on_shards) = co_await replica::distributed_loader::get_sstables_from_upload_dir(_db, ks_name, cf_name);
+            // Load-and-stream reads the entire content from SSTables, therefore it can afford to discard the bloom filter
+            // that might otherwise consume a significant amount of memory.
+            sstables::sstable_open_config cfg {
+                .load_bloom_filter = false,
+            };
+            std::tie(table_id, sstables_on_shards) = co_await replica::distributed_loader::get_sstables_from_upload_dir(_db, ks_name, cf_name, cfg);
            co_await container().invoke_on_all([&sstables_on_shards, ks_name, cf_name, table_id, primary_replica_only] (sstables_loader& loader) mutable -> future<> {
                co_await loader.load_and_stream(ks_name, cf_name, table_id, std::move(sstables_on_shards[this_shard_id()]), primary_replica_only);
            });