sstable_loader: Discard SSTable bloom filter on load-and-stream
Load-and-stream reads the entire content from SSTables, therefore it can afford to discard the bloom filter that might otherwise consume a significant amount of memory. Bloom filters are only needed by compaction and other replica::table operations that might want to check the presence of keys in the SSTable files, like single-partition reads. It's not uncommon to see a Data:Filter ratio of less than 100:1, meaning that for ~300G of data, filters will take ~3G. In addition to reducing the memory footprint, it also reduces operation time, as load-and-stream no longer has to read, parse and build the filters from disk into memory. Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com>
This commit is contained in:
@@ -512,8 +512,8 @@ distributed_loader::process_upload_dir(distributed<replica::database>& db, distr
|
||||
}
|
||||
|
||||
future<std::tuple<table_id, std::vector<std::vector<sstables::shared_sstable>>>>
|
||||
distributed_loader::get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf) {
|
||||
return seastar::async([&db, ks = std::move(ks), cf = std::move(cf)] {
|
||||
distributed_loader::get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf, sstables::sstable_open_config cfg) {
|
||||
return seastar::async([&db, ks = std::move(ks), cf = std::move(cf), cfg] {
|
||||
global_column_family_ptr global_table(db, ks, cf);
|
||||
sharded<sstables::sstable_directory> directory;
|
||||
auto table_id = global_table->schema()->id();
|
||||
@@ -536,6 +536,7 @@ distributed_loader::get_sstables_from_upload_dir(distributed<replica::database>&
|
||||
.enable_dangerous_direct_import_of_cassandra_counters = db.local().get_config().enable_dangerous_direct_import_of_cassandra_counters(),
|
||||
.allow_loading_materialized_view = false,
|
||||
.sort_sstables_according_to_owner = false,
|
||||
.sstable_open_config = cfg,
|
||||
};
|
||||
process_sstable_dir(directory, flags).get();
|
||||
directory.invoke_on_all([&sstables_on_shards] (sstables::sstable_directory& d) mutable {
|
||||
|
||||
@@ -99,7 +99,7 @@ public:
|
||||
// Each entry contains a vector of sstables for this shard.
|
||||
// The table UUID is returned too.
|
||||
static future<std::tuple<table_id, std::vector<std::vector<sstables::shared_sstable>>>>
|
||||
get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf);
|
||||
get_sstables_from_upload_dir(distributed<replica::database>& db, sstring ks, sstring cf, sstables::sstable_open_config cfg);
|
||||
static future<> process_upload_dir(distributed<replica::database>& db, distributed<db::system_distributed_keyspace>& sys_dist_ks,
|
||||
distributed<db::view::view_update_generator>& view_update_generator, sstring ks_name, sstring cf_name);
|
||||
};
|
||||
|
||||
@@ -262,7 +262,12 @@ future<> sstables_loader::load_new_sstables(sstring ks_name, sstring cf_name,
|
||||
if (load_and_stream) {
|
||||
::table_id table_id;
|
||||
std::vector<std::vector<sstables::shared_sstable>> sstables_on_shards;
|
||||
std::tie(table_id, sstables_on_shards) = co_await replica::distributed_loader::get_sstables_from_upload_dir(_db, ks_name, cf_name);
|
||||
// Load-and-stream reads the entire content from SSTables, therefore it can afford to discard the bloom filter
|
||||
// that might otherwise consume a significant amount of memory.
|
||||
sstables::sstable_open_config cfg {
|
||||
.load_bloom_filter = false,
|
||||
};
|
||||
std::tie(table_id, sstables_on_shards) = co_await replica::distributed_loader::get_sstables_from_upload_dir(_db, ks_name, cf_name, cfg);
|
||||
co_await container().invoke_on_all([&sstables_on_shards, ks_name, cf_name, table_id, primary_replica_only] (sstables_loader& loader) mutable -> future<> {
|
||||
co_await loader.load_and_stream(ks_name, cf_name, table_id, std::move(sstables_on_shards[this_shard_id()]), primary_replica_only);
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user