sstable_loader: Discard SSTable bloom filter on load-and-stream

Load-and-stream reads the entire content from SSTables, therefore it can
afford to discard the bloom filter that might otherwise consume a significant
amount of memory. Bloom filters are only needed by compaction and other
replica::table operations that might want to check the presence of keys
in the SSTable files, like single-partition reads.

It's not uncommon to see a Data:Filter ratio of less than 100:1, meaning
that for ~300G of data, filters will take ~3G or more.

In addition to reducing the memory footprint, it also reduces operation time,
as load-and-stream no longer has to read, parse and build the filters
from disk into memory.

Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This commit is contained in:
Raphael S. Carvalho
2023-04-12 12:02:22 -03:00
parent 17261369ea
commit fe6df3d270
3 changed files with 10 additions and 4 deletions

View File

@@ -512,8 +512,8 @@ distributed_loader::process_upload_dir(distributed<replica::database>& db, distr
}
future<std::tuple<table_id, std::vector<std::vector<sstables::shared_sstable>>>>
distributed_loader::get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf) {
return seastar::async([&db, ks = std::move(ks), cf = std::move(cf)] {
distributed_loader::get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf, sstables::sstable_open_config cfg) {
return seastar::async([&db, ks = std::move(ks), cf = std::move(cf), cfg] {
global_column_family_ptr global_table(db, ks, cf);
sharded<sstables::sstable_directory> directory;
auto table_id = global_table->schema()->id();
@@ -536,6 +536,7 @@ distributed_loader::get_sstables_from_upload_dir(distributed<replica::database>&
.enable_dangerous_direct_import_of_cassandra_counters = db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters(),
.allow_loading_materialized_view = false,
.sort_sstables_according_to_owner = false,
.sstable_open_config = cfg,
};
process_sstable_dir(directory, flags).get();
directory.invoke_on_all([&sstables_on_shards] (sstables::sstable_directory& d) mutable {

View File

@@ -99,7 +99,7 @@ public:
// Each entry contains a vector of sstables for this shard.
// The table UUID is returned too.
static future<std::tuple<table_id, std::vector<std::vector<sstables::shared_sstable>>>>
get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf);
get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf, sstables::sstable_open_config cfg);
static future<> process_upload_dir(distributed<replica::database>& db, distributed<db::system_distributed_keyspace>& sys_dist_ks,
distributed<db::view::view_update_generator>& view_update_generator, sstring ks_name, sstring cf_name);
};

View File

@@ -262,7 +262,12 @@ future<> sstables_loader::load_new_sstables(sstring ks_name, sstring cf_name,
if (load_and_stream) {
::table_id table_id;
std::vector<std::vector<sstables::shared_sstable>> sstables_on_shards;
std::tie(table_id, sstables_on_shards) = co_await replica::distributed_loader::get_sstables_from_upload_dir(_db, ks_name, cf_name);
// Load-and-stream reads the entire content from SSTables, therefore it can afford to discard the bloom filter
// that might otherwise consume a significant amount of memory.
sstables::sstable_open_config cfg {
.load_bloom_filter = false,
};
std::tie(table_id, sstables_on_shards) = co_await replica::distributed_loader::get_sstables_from_upload_dir(_db, ks_name, cf_name, cfg);
co_await container().invoke_on_all([&sstables_on_shards, ks_name, cf_name, table_id, primary_replica_only] (sstables_loader& loader) mutable -> future<> {
co_await loader.load_and_stream(ks_name, cf_name, table_id, std::move(sstables_on_shards[this_shard_id()]), primary_replica_only);
});