Merge 'Prevent invalidation of iterators over database::_column_families' from Aleksandra Martyniuk

Maps related to column families in database are extracted
to a tables_metadata class. Access to them is possible only
through methods. All methods which may preempt hold the rwlock
in the relevant mode, so that the iterators can't become invalid.

Fixes: #13290

Closes #13349

* github.com:scylladb/scylladb:
  replica: make tables_metadata's attributes private
  replica: add methods to get a filtered copy of tables map
  replica: add methods to check if given table exists
  replica: add methods to get table or table id
  replica: api: return table_id instead of const table_id&
  replica: iterate safely over tables related maps
  replica: pass tables_metadata to phased_barrier_top_10_counts
  replica: add methods to safely add and remove table
  replica: wrap column families related maps into tables_metadata
  replica: futurize database::add_column_family and database::remove
This commit is contained in:
Botond Dénes
2023-07-31 15:31:59 +03:00
21 changed files with 271 additions and 173 deletions

View File

@@ -43,7 +43,7 @@ std::tuple<sstring, sstring> parse_fully_qualified_cf_name(sstring name) {
return std::make_tuple(name.substr(0, pos), name.substr(end));
}
const table_id& get_uuid(const sstring& ks, const sstring& cf, const replica::database& db) {
table_id get_uuid(const sstring& ks, const sstring& cf, const replica::database& db) {
try {
return db.find_uuid(ks, cf);
} catch (replica::no_such_column_family& e) {
@@ -51,7 +51,7 @@ const table_id& get_uuid(const sstring& ks, const sstring& cf, const replica::da
}
}
const table_id& get_uuid(const sstring& name, const replica::database& db) {
table_id get_uuid(const sstring& name, const replica::database& db) {
auto [ks, cf] = parse_fully_qualified_cf_name(name);
return get_uuid(ks, cf, db);
}
@@ -135,9 +135,9 @@ static future<json::json_return_type> get_cf_histogram(http_context& ctx, const
static future<json::json_return_type> get_cf_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram replica::column_family_stats::*f) {
std::function<utils::ihistogram(const replica::database&)> fun = [f] (const replica::database& db) {
utils::ihistogram res;
for (auto i : db.get_column_families()) {
res += (i.second->get_stats().*f).hist;
}
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) mutable {
res += (table->get_stats().*f).hist;
});
return res;
};
return ctx.db.map(fun).then([](const std::vector<utils::ihistogram> &res) {
@@ -162,9 +162,9 @@ static future<json::json_return_type> get_cf_rate_and_histogram(http_context& c
static future<json::json_return_type> get_cf_rate_and_histogram(http_context& ctx, utils::timed_rate_moving_average_summary_and_histogram replica::column_family_stats::*f) {
std::function<utils::rate_moving_average_and_histogram(const replica::database&)> fun = [f] (const replica::database& db) {
utils::rate_moving_average_and_histogram res;
for (auto i : db.get_column_families()) {
res += (i.second->get_stats().*f).rate();
}
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
res += (table->get_stats().*f).rate();
});
return res;
};
return ctx.db.map(fun).then([](const std::vector<utils::rate_moving_average_and_histogram> &res) {
@@ -306,21 +306,21 @@ ratio_holder filter_recent_false_positive_as_ratio_holder(const sstables::shared
void set_column_family(http_context& ctx, routes& r, sharded<db::system_keyspace>& sys_ks) {
cf::get_column_family_name.set(r, [&ctx] (const_req req){
std::vector<sstring> res;
for (auto i: ctx.db.local().get_column_families_mapping()) {
res.push_back(i.first.first + ":" + i.first.second);
}
ctx.db.local().get_tables_metadata().for_each_table_id([&] (const std::pair<sstring, sstring>& kscf, table_id) {
res.push_back(kscf.first + ":" + kscf.second);
});
return res;
});
cf::get_column_family.set(r, [&ctx] (std::unique_ptr<http::request> req){
std::list<cf::column_family_info> res;
for (auto i: ctx.db.local().get_column_families_mapping()) {
ctx.db.local().get_tables_metadata().for_each_table_id([&] (const std::pair<sstring, sstring>& kscf, table_id) {
cf::column_family_info info;
info.ks = i.first.first;
info.cf = i.first.second;
info.ks = kscf.first;
info.cf = kscf.second;
info.type = "ColumnFamilies";
res.push_back(info);
}
});
return make_ready_future<json::json_return_type>(json::stream_range_as_array(std::move(res), std::identity()));
});

View File

@@ -23,7 +23,7 @@ namespace api {
void set_column_family(http_context& ctx, httpd::routes& r, sharded<db::system_keyspace>& sys_ks);
void unset_column_family(http_context& ctx, httpd::routes& r);
const table_id& get_uuid(const sstring& name, const replica::database& db);
table_id get_uuid(const sstring& name, const replica::database& db);
future<> foreach_column_family(http_context& ctx, const sstring& name, std::function<void(replica::column_family&)> f);
@@ -68,9 +68,10 @@ struct map_reduce_column_families_locally {
std::function<std::unique_ptr<std::any>(std::unique_ptr<std::any>, std::unique_ptr<std::any>)> reducer;
future<std::unique_ptr<std::any>> operator()(replica::database& db) const {
auto res = seastar::make_lw_shared<std::unique_ptr<std::any>>(std::make_unique<std::any>(init));
return do_for_each(db.get_column_families(), [res, this](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) {
*res = reducer(std::move(*res), mapper(*i.second.get()));
}).then([res] {
return db.get_tables_metadata().for_each_table_gently([res, this] (table_id, seastar::lw_shared_ptr<replica::table> table) {
*res = reducer(std::move(*res), mapper(*table.get()));
return make_ready_future();
}).then([res] () {
return std::move(*res);
});
}

View File

@@ -68,8 +68,8 @@ void set_compaction_manager(http_context& ctx, routes& r) {
cm::get_pending_tasks_by_table.set(r, [&ctx] (std::unique_ptr<http::request> req) {
return ctx.db.map_reduce0([](replica::database& db) {
return do_with(std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>(), [&db](std::unordered_map<std::pair<sstring, sstring>, uint64_t, utils::tuple_hash>& tasks) {
return do_for_each(db.get_column_families(), [&tasks](const std::pair<table_id, seastar::lw_shared_ptr<replica::table>>& i) -> future<> {
replica::table& cf = *i.second.get();
return db.get_tables_metadata().for_each_table_gently([&tasks] (table_id, lw_shared_ptr<replica::table> table) {
replica::table& cf = *table.get();
tasks[std::make_pair(cf.schema()->ks_name(), cf.schema()->cf_name())] = cf.estimate_pending_compactions();
return make_ready_future<>();
}).then([&tasks] {

View File

@@ -980,10 +980,9 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
ks.set_incremental_backups(value);
}
for (auto& pair: db.get_column_families()) {
auto cf_ptr = pair.second;
cf_ptr->set_incremental_backups(value);
}
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
table->set_incremental_backups(value);
});
}).then([] {
return make_ready_future<json::json_return_type>(json_void());
});
@@ -1258,7 +1257,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
auto& ext = db.get_config().extensions();
for (auto& t : db.get_column_families() | boost::adaptors::map_values) {
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
auto& schema = t->schema();
if ((ks.empty() || ks == schema->ks_name()) && (cf.empty() || cf == schema->cf_name())) {
// at most Nsstables long
@@ -1339,7 +1338,7 @@ void set_storage_service(http_context& ctx, routes& r, sharded<service::storage_
}
res.emplace_back(std::move(tst));
}
}
});
std::sort(res.begin(), res.end(), [](const ss::table_sstables& t1, const ss::table_sstables& t2) {
return t1.keyspace() < t2.keyspace() || (t1.keyspace() == t2.keyspace() && t1.table() < t2.table());
});

View File

@@ -641,21 +641,21 @@ future<> generation_service::maybe_rewrite_streams_descriptions() {
// For each CDC log table get the TTL setting (from CDC options) and the table's creation time
std::vector<time_and_ttl> times_and_ttls;
for (auto& [_, cf] : _db.get_column_families()) {
auto& s = *cf->schema();
_db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> t) {
auto& s = *t->schema();
auto base = cdc::get_base_table(_db, s.ks_name(), s.cf_name());
if (!base) {
// Not a CDC log table.
continue;
return;
}
auto& cdc_opts = base->cdc_options();
if (!cdc_opts.enabled()) {
// This table is named like a CDC log table but it's not one.
continue;
return;
}
times_and_ttls.push_back(time_and_ttl{as_timepoint(s.id().uuid()), cdc_opts.ttl()});
}
});
if (times_and_ttls.empty()) {
// There's no point in rewriting old generations' streams (they don't contain any data).

View File

@@ -126,8 +126,7 @@ future<> db::commitlog_replayer::impl::init() {
}
}, [this](replica::database& db) {
return do_with(shard_rpm_map{}, [this, &db](shard_rpm_map& map) {
return parallel_for_each(db.get_column_families(), [this, &map](auto& cfp) {
auto uuid = cfp.first;
return db.get_tables_metadata().parallel_for_each_table([this, &map] (table_id uuid, lw_shared_ptr<replica::table>) {
// We do this on each cpu, for each CF, which technically is a little wasteful, but the values are
// cached, this is only startup, and it makes the code easier.
// Get all truncation records for the CF and initialize max rps if
@@ -156,13 +155,13 @@ future<> db::commitlog_replayer::impl::init() {
// existing sstables-per-shard.
// So, go through all CF:s and check, if a shard mapping does not
// have data for it, assume we must set global pos to zero.
for (auto&p : _db.local().get_column_families()) {
_db.local().get_tables_metadata().for_each_table([&] (table_id id, lw_shared_ptr<replica::table>) {
for (auto&p1 : _rpm) { // for each shard
if (!p1.second.contains(p.first)) {
if (!p1.second.contains(id)) {
_min_pos[p1.first] = replay_position();
}
}
}
});
for (auto&p : _min_pos) {
rlogger.debug("minimum position for shard {}: {}", p.first, p.second);
}

View File

@@ -265,8 +265,8 @@ void view_update_generator::setup_metrics() {
}
void view_update_generator::discover_staging_sstables() {
for (auto& x : _db.get_column_families()) {
auto t = x.second->shared_from_this();
_db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
auto t = table->shared_from_this();
for (auto sstables = t->get_sstables(); sstables::shared_sstable sst : *sstables) {
if (sst->requires_view_building()) {
_progress_tracker->on_sstable_registration(sst);
@@ -276,7 +276,7 @@ void view_update_generator::discover_staging_sstables() {
_registration_sem.consume(1);
}
}
}
});
}
}

View File

@@ -283,13 +283,13 @@ public:
const auto snapshots_by_tables = co_await _db.map_reduce(snapshot_reducer(), [ks_name_ = ks_data.name] (replica::database& db) mutable -> future<snapshots_by_tables_map> {
auto ks_name = std::move(ks_name_);
snapshots_by_tables_map snapshots_by_tables;
for (auto& [_, table] : db.get_column_families()) {
co_await db.get_tables_metadata().for_each_table_gently(coroutine::lambda([&] (table_id, lw_shared_ptr<replica::table> table) -> future<> {
if (table->schema()->ks_name() != ks_name) {
continue;
co_return;
}
const auto unordered_snapshots = co_await table->get_snapshot_details();
snapshots_by_tables.emplace(table->schema()->cf_name(), std::map<sstring, replica::table::snapshot_details>(unordered_snapshots.begin(), unordered_snapshots.end()));
}
}));
co_return snapshots_by_tables;
});
@@ -433,9 +433,9 @@ private:
};
co_return co_await _db.map_reduce(shard_reducer(reduce), [map, reduce] (replica::database& db) {
T val = {};
for (auto& [_, table] : db.get_column_families()) {
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
val = reduce(val, map(*table));
}
});
return val;
});
}
@@ -560,13 +560,13 @@ public:
res.total = occupancy.total_space();
res.free = occupancy.free_space();
res.entries = db.row_cache_tracker().partitions();
for (const auto& [_, t] : db.get_column_families()) {
db.get_tables_metadata().for_each_table([&] (table_id id, lw_shared_ptr<replica::table> t) {
auto& cache_stats = t->get_row_cache().stats();
res.hits += cache_stats.hits.count();
res.misses += cache_stats.misses.count();
res.hits_moving_average += cache_stats.hits.rate();
res.requests_moving_average += (cache_stats.hits.rate() + cache_stats.misses.rate());
}
});
return res;
}, stats{}, stats::reduce).then([] (stats s) {
return std::vector<std::pair<sstring, sstring>>{

17
main.cc
View File

@@ -1357,8 +1357,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
// Needs to happen before replaying the schema commitlog, which interprets
// replay position in the truncation record.
// Needs to happen before system_keyspace::setup(), which reads truncation records.
for (auto&& e : db.local().get_column_families()) {
auto table_ptr = e.second;
db.local().get_tables_metadata().for_each_table([] (table_id, lw_shared_ptr<replica::table> table_ptr) {
if (table_ptr->schema()->ks_name() == db::schema_tables::NAME) {
if (table_ptr->get_truncation_record() != db_clock::time_point::min()) {
// replay_position stored in the truncation record may belong to
@@ -1371,7 +1370,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
table_ptr->schema()->ks_name(), table_ptr->schema()->cf_name()));
}
}
}
});
auto sch_cl = db.local().schema_commitlog();
if (sch_cl != nullptr) {
@@ -1416,10 +1415,10 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
}
db.invoke_on_all([] (replica::database& db) {
for (auto& x : db.get_column_families()) {
replica::table& t = *(x.second);
db.get_tables_metadata().for_each_table([] (table_id, lw_shared_ptr<replica::table> table) {
replica::table& t = *table;
t.enable_auto_compaction();
}
});
}).get();
// If the same sstable is shared by several shards, it cannot be
@@ -1434,10 +1433,10 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
// streaming
db.invoke_on_all([] (replica::database& db) {
for (auto& x : db.get_column_families()) {
replica::column_family& cf = *(x.second);
db.get_tables_metadata().for_each_table([] (table_id, lw_shared_ptr<replica::table> table) {
replica::column_family& cf = *table;
cf.trigger_compaction();
}
});
}).get();
api::set_server_gossip(ctx, gossiper).get();
api::set_server_snitch(ctx, snitch).get();

View File

@@ -127,19 +127,20 @@ std::ostream& operator<<(std::ostream& out, row_level_diff_detect_algorithm algo
}
static size_t get_nr_tables(const replica::database& db, const sstring& keyspace) {
auto& m = db.get_column_families_mapping();
return std::count_if(m.begin(), m.end(), [&keyspace] (auto& e) {
return e.first.first == keyspace;
size_t tables = 0;
db.get_tables_metadata().for_each_table_id([&keyspace, &tables] (const std::pair<sstring, sstring>& kscf, table_id) {
tables += kscf.first == keyspace;
});
return tables;
}
static std::vector<sstring> list_column_families(const replica::database& db, const sstring& keyspace) {
std::vector<sstring> ret;
for (auto &&e : db.get_column_families_mapping()) {
if (e.first.first == keyspace) {
ret.push_back(e.first.second);
}
db.get_tables_metadata().for_each_table_id([&] (const std::pair<sstring, sstring>& kscf, table_id) {
if (kscf.first == keyspace) {
ret.push_back(kscf.second);
}
});
return ret;
}

View File

@@ -3084,13 +3084,10 @@ future<> repair_service::cleanup_history(tasks::task_id repair_id) {
}
future<> repair_service::load_history() {
auto tables = get_db().local().get_column_families();
for (const auto& x : tables) {
auto& table_uuid = x.first;
auto& table = x.second;
co_await get_db().local().get_tables_metadata().for_each_table_gently(coroutine::lambda([&] (table_id table_uuid, lw_shared_ptr<replica::table> table) -> future<> {
auto shard = unsigned(table_uuid.uuid().get_most_significant_bits()) % smp::count;
if (shard != this_shard_id()) {
continue;
co_return;
}
rlogger.info("Loading repair history for keyspace={}, table={}, table_uuid={}",
table->schema()->ks_name(), table->schema()->cf_name(), table_uuid);
@@ -3111,8 +3108,7 @@ future<> repair_service::load_history() {
entry.ks, entry.cf, range, repair_time);
}
});
}
co_return;
}));
}
repair_meta_ptr repair_service::get_repair_meta(gms::inet_address from, uint32_t repair_meta_id) {

View File

@@ -66,11 +66,11 @@ public:
}
virtual std::vector<data_dictionary::table> get_tables(data_dictionary::database db) const override {
std::vector<data_dictionary::table> ret;
auto&& tables = unwrap(db).get_column_families();
ret.reserve(tables.size());
for (auto&& [uuid, cf] : tables) {
ret.push_back(wrap(*cf));
}
auto& tmd = unwrap(db).get_tables_metadata();
ret.reserve(tmd.size());
tmd.for_each_table([&] (table_id, const lw_shared_ptr<table> table) {
ret.push_back(wrap(*table));
});
return ret;
}
virtual std::optional<data_dictionary::table> try_find_table(data_dictionary::database db, std::string_view ks, std::string_view table) const override {

View File

@@ -144,7 +144,7 @@ public:
};
const boost::container::static_vector<std::pair<size_t, boost::container::static_vector<table*, 16>>, 10>
phased_barrier_top_10_counts(const std::unordered_map<table_id, lw_shared_ptr<column_family>>& tables, std::function<size_t(table&)> op_count_getter) {
phased_barrier_top_10_counts(const database::tables_metadata& tables_metadata, std::function<size_t(table&)> op_count_getter) {
using table_list = boost::container::static_vector<table*, 16>;
using count_and_tables = std::pair<size_t, table_list>;
const auto less = [] (const count_and_tables& a, const count_and_tables& b) {
@@ -154,20 +154,20 @@ phased_barrier_top_10_counts(const std::unordered_map<table_id, lw_shared_ptr<co
boost::container::static_vector<count_and_tables, 10> res;
count_and_tables* min_element = nullptr;
for (const auto& [tid, table] : tables) {
tables_metadata.for_each_table([&] (table_id tid, lw_shared_ptr<table> table) {
const auto count = op_count_getter(*table);
if (!count) {
continue;
return;
}
if (res.size() < res.capacity()) {
auto& elem = res.emplace_back(count, table_list({table.get()}));
if (!min_element || min_element->first > count) {
min_element = &elem;
}
continue;
return;
}
if (min_element->first > count) {
continue;
return;
}
auto it = boost::find_if(res, [count] (const count_and_tables& x) {
@@ -175,13 +175,13 @@ phased_barrier_top_10_counts(const std::unordered_map<table_id, lw_shared_ptr<co
});
if (it != res.end()) {
it->second.push_back(table.get());
continue;
return;
}
// If we are here, min_element->first < count
*min_element = {count, table_list({table.get()})};
min_element = &*boost::min_element(res, less);
}
});
boost::sort(res, less);
@@ -272,7 +272,7 @@ void database::setup_scylla_memory_diagnostics_producer() {
for (const auto& [name, op_count_getter] : phased_barriers) {
writeln(" {} (top 10):\n", name);
auto total = 0;
for (const auto& [count, table_list] : phased_barrier_top_10_counts(_column_families, op_count_getter)) {
for (const auto& [count, table_list] : phased_barrier_top_10_counts(_tables_metadata, op_count_getter)) {
total += count;
writeln(" {}", count);
if (table_list.empty()) {
@@ -869,13 +869,13 @@ database::init_commitlog() {
return db::commitlog::create_commitlog(db::commitlog::config::from_db_config(_cfg, _dbcfg.commitlog_scheduling_group, _dbcfg.available_memory)).then([this](db::commitlog&& log) {
_commitlog = std::make_unique<db::commitlog>(std::move(log));
_commitlog->add_flush_handler([this](db::cf_id_type id, db::replay_position pos) {
if (!_column_families.contains(id)) {
if (!_tables_metadata.contains(id)) {
// the CF has been removed.
_commitlog->discard_completed_segments(id);
return;
}
// Initiate a background flush. Waited upon in `stop()`.
(void)_column_families[id]->flush(pos);
(void)_tables_metadata.get_table(id).flush(pos);
}).release(); // we have longer life time than CL. Ignore reg anchor
});
}
@@ -965,13 +965,13 @@ void database::maybe_init_schema_commitlog() {
_schema_commitlog = std::make_unique<db::commitlog>(db::commitlog::create_commitlog(std::move(c)).get0());
_schema_commitlog->add_flush_handler([this] (db::cf_id_type id, db::replay_position pos) {
if (!_column_families.contains(id)) {
if (!_tables_metadata.contains(id)) {
// the CF has been removed.
_schema_commitlog->discard_completed_segments(id);
return;
}
// Initiate a background flush. Waited upon in `stop()`.
(void)_column_families[id]->flush(pos);
(void)_tables_metadata.get_table(id).flush(pos);
}).release();
}
@@ -996,10 +996,10 @@ future<> database::create_local_system_table(
cfg.memtable_scheduling_group = default_scheduling_group();
cfg.memtable_to_cache_scheduling_group = default_scheduling_group();
}
add_column_family(ks, table, std::move(cfg));
co_await add_column_family(ks, table, std::move(cfg));
}
void database::add_column_family(keyspace& ks, schema_ptr schema, column_family::config cfg) {
future<> database::add_column_family(keyspace& ks, schema_ptr schema, column_family::config cfg) {
schema = local_schema_registry().learn(schema);
schema->registry_entry()->mark_synced();
auto&& rs = ks.get_replication_strategy();
@@ -1023,18 +1023,17 @@ void database::add_column_family(keyspace& ks, schema_ptr schema, column_family:
cf->set_durable_writes(ks.metadata()->durable_writes());
auto uuid = schema->id();
if (_column_families.contains(uuid)) {
if (_tables_metadata.contains(uuid)) {
throw std::invalid_argument("UUID " + uuid.to_sstring() + " already mapped");
}
auto kscf = std::make_pair(schema->ks_name(), schema->cf_name());
if (_ks_cf_to_uuid.contains(kscf)) {
if (_tables_metadata.contains(kscf)) {
throw std::invalid_argument("Column family " + schema->cf_name() + " exists");
}
ks.add_or_update_column_family(schema);
cf->start();
schema->registry_entry()->set_table(cf->weak_from_this());
_column_families.emplace(uuid, std::move(cf));
_ks_cf_to_uuid.emplace(std::move(kscf), uuid);
co_await _tables_metadata.add_table(schema);
if (schema->is_view()) {
find_column_family(schema->view_info()->base_id()).add_or_update_view(view_ptr(schema));
}
@@ -1042,10 +1041,10 @@ void database::add_column_family(keyspace& ks, schema_ptr schema, column_family:
future<> database::add_column_family_and_make_directory(schema_ptr schema) {
auto& ks = find_keyspace(schema->ks_name());
add_column_family(ks, schema, ks.make_column_family_config(*schema, *this));
co_await add_column_family(ks, schema, ks.make_column_family_config(*schema, *this));
auto& cf = find_column_family(schema);
cf.get_index_manager().reload();
return cf.init_storage();
co_await cf.init_storage();
}
bool database::update_column_family(schema_ptr new_schema) {
@@ -1066,13 +1065,12 @@ bool database::update_column_family(schema_ptr new_schema) {
return columns_changed;
}
void database::remove(table& cf) noexcept {
future<> database::remove(table& cf) noexcept {
auto s = cf.schema();
auto& ks = find_keyspace(s->ks_name());
cf.deregister_metrics();
_column_families.erase(s->id());
co_await _tables_metadata.remove_table(s);
ks.metadata()->remove_column_family(s);
_ks_cf_to_uuid.erase(std::make_pair(s->ks_name(), s->cf_name()));
if (s->is_view()) {
try {
find_column_family(s->view_info()->base_id()).remove_view(view_ptr(s));
@@ -1084,7 +1082,7 @@ void database::remove(table& cf) noexcept {
future<> database::detach_column_family(table& cf) {
auto uuid = cf.schema()->id();
remove(cf);
co_await remove(cf);
cf.clear_views();
co_await cf.await_pending_ops();
for (auto* sem : {&_read_concurrency_sem, &_streaming_concurrency_sem, &_compaction_concurrency_sem, &_system_read_concurrency_sem}) {
@@ -1152,15 +1150,15 @@ future<> database::drop_table_on_all_shards(sharded<database>& sharded_db, shard
co_await table_shards->destroy_storage();
}
const table_id& database::find_uuid(std::string_view ks, std::string_view cf) const {
table_id database::find_uuid(std::string_view ks, std::string_view cf) const {
try {
return _ks_cf_to_uuid.at(std::make_pair(ks, cf));
return _tables_metadata.get_table_id(std::make_pair(ks, cf));
} catch (std::out_of_range&) {
throw no_such_column_family(ks, cf);
}
}
const table_id& database::find_uuid(const schema_ptr& schema) const {
table_id database::find_uuid(const schema_ptr& schema) const {
return find_uuid(schema->ks_name(), schema->cf_name());
}
@@ -1250,11 +1248,9 @@ std::unordered_map<sstring, locator::vnode_effective_replication_map_ptr> databa
std::vector<lw_shared_ptr<column_family>> database::get_non_system_column_families() const {
return boost::copy_range<std::vector<lw_shared_ptr<column_family>>>(
get_column_families()
| boost::adaptors::map_values
| boost::adaptors::filtered([](const lw_shared_ptr<column_family>& cf) {
return !is_system_keyspace(cf->schema()->ks_name());
}));
get_tables_metadata().filter([] (auto uuid_and_cf) {
return !is_system_keyspace(uuid_and_cf.second->schema()->ks_name());
}) | boost::adaptors::map_values);
}
column_family& database::find_column_family(std::string_view ks_name, std::string_view cf_name) {
@@ -1277,7 +1273,7 @@ const column_family& database::find_column_family(std::string_view ks_name, std:
column_family& database::find_column_family(const table_id& uuid) {
try {
return *_column_families.at(uuid);
return _tables_metadata.get_table(uuid);
} catch (...) {
throw no_such_column_family(uuid);
}
@@ -1285,14 +1281,14 @@ column_family& database::find_column_family(const table_id& uuid) {
const column_family& database::find_column_family(const table_id& uuid) const {
try {
return *_column_families.at(uuid);
return _tables_metadata.get_table(uuid);
} catch (...) {
throw no_such_column_family(uuid);
}
}
bool database::column_family_exists(const table_id& uuid) const {
return _column_families.contains(uuid);
return _tables_metadata.contains(uuid);
}
future<>
@@ -1419,7 +1415,7 @@ schema_ptr database::find_schema(const table_id& uuid) const {
}
bool database::has_schema(std::string_view ks_name, std::string_view cf_name) const {
return _ks_cf_to_uuid.contains(std::make_pair(ks_name, cf_name));
return _tables_metadata.contains(std::make_pair(ks_name, cf_name));
}
std::vector<view_ptr> database::get_views() const {
@@ -1464,11 +1460,10 @@ future<> database::create_keyspace_on_all_shards(sharded<database>& sharded_db,
future<>
database::drop_caches() const {
std::unordered_map<table_id, lw_shared_ptr<column_family>> tables = get_column_families();
std::unordered_map<table_id, lw_shared_ptr<column_family>> tables = get_tables_metadata().get_column_families_copy();
for (auto&& e : tables) {
table& t = *e.second;
co_await t.get_row_cache().invalidate(row_cache::external_updater([] {}));
auto sstables = t.get_sstables();
for (sstables::shared_sstable sst : *sstables) {
co_await sst->drop_caches();
@@ -1825,10 +1820,10 @@ std::ostream& operator<<(std::ostream& out, const column_family& cf) {
std::ostream& operator<<(std::ostream& out, const database& db) {
out << "{\n";
for (auto&& e : db._column_families) {
auto&& cf = *e.second;
out << "(" << e.first.to_sstring() << ", " << cf.schema()->cf_name() << ", " << cf.schema()->ks_name() << "): " << cf << "\n";
}
db._tables_metadata.for_each_table([&] (table_id id, const lw_shared_ptr<table> tp) {
auto&& cf = *tp;
out << "(" << id.to_sstring() << ", " << cf.schema()->cf_name() << ", " << cf.schema()->ks_name() << "): " << cf << "\n";
});
out << "}";
return out;
}
@@ -2333,13 +2328,13 @@ schema_ptr database::find_indexed_table(const sstring& ks_name, const sstring& i
future<> database::close_tables(table_kind kind_to_close) {
auto b = defer([this] { _stop_barrier.abort(); });
co_await coroutine::parallel_for_each(_column_families, [this, kind_to_close](auto& val_pair) -> future<> {
auto& s = val_pair.second->schema();
co_await _tables_metadata.parallel_for_each_table(coroutine::lambda([this, kind_to_close] (table_id, lw_shared_ptr<table> table) -> future<> {
auto& s = table->schema();
table_kind k = is_system_table(*s) || _cfg.extensions().is_extension_internal_keyspace(s->ks_name()) ? table_kind::system : table_kind::user;
if (k == kind_to_close) {
co_await val_pair.second->stop();
co_await table->stop();
}
});
}));
co_await _stop_barrier.arrive_and_wait();
b.cancel();
}
@@ -2422,8 +2417,8 @@ future<> database::stop() {
}
future<> database::flush_all_memtables() {
return parallel_for_each(_column_families, [] (auto& cfp) {
return cfp.second->flush();
return _tables_metadata.parallel_for_each_table([] (table_id, lw_shared_ptr<table> table) {
return table->flush();
});
}
@@ -2811,8 +2806,8 @@ future<> database::clear_snapshot(sstring tag, std::vector<sstring> keyspace_nam
// and has no remaining snapshots
if (!has_snapshots) {
auto [cf_name, cf_uuid] = extract_cf_name_and_uuid(table_ent->name);
const auto& it = _ks_cf_to_uuid.find(std::make_pair(ks_name, cf_name));
auto dropped = (it == _ks_cf_to_uuid.cend()) || (cf_uuid != it->second);
auto id_opt = _tables_metadata.get_table_id_if_exists(std::make_pair(ks_name, cf_name));
auto dropped = !id_opt || (cf_uuid != id_opt);
if (dropped) {
dblog.info("Removing dropped table dir {}", table_dir);
sstables::remove_table_directory_if_has_no_snapshots(table_dir).get();
@@ -2825,7 +2820,7 @@ future<> database::clear_snapshot(sstring tag, std::vector<sstring> keyspace_nam
}
future<> database::flush_non_system_column_families() {
auto non_system_cfs = get_column_families() | boost::adaptors::filtered([this] (auto& uuid_and_cf) {
auto non_system_cfs = get_tables_metadata().filter([this] (auto uuid_and_cf) {
auto cf = uuid_and_cf.second;
auto& ks = cf->schema()->ks_name();
return !is_system_keyspace(ks) && !_cfg.extensions().is_extension_internal_keyspace(ks);
@@ -2847,7 +2842,7 @@ future<> database::flush_non_system_column_families() {
}
future<> database::flush_system_column_families() {
auto system_cfs = get_column_families() | boost::adaptors::filtered([this] (auto& uuid_and_cf) {
auto system_cfs = get_tables_metadata().filter([this] (auto uuid_and_cf) {
auto cf = uuid_and_cf.second;
auto& ks = cf->schema()->ks_name();
return is_system_keyspace(ks) || _cfg.extensions().is_extension_internal_keyspace(ks);
@@ -2880,6 +2875,94 @@ future<> database::drain() {
b.cancel();
}
// Returns how many entries the id -> table map currently holds.
size_t database::tables_metadata::size() const noexcept {
    const auto registered_tables = _column_families.size();
    return registered_tables;
}
// Registers the table attached to `schema` in both lookup maps:
// id -> table and (keyspace name, table name) -> id.
// The write lock is acquired first and held until the function returns.
//
// If anything throws while the maps are being updated, the entries created by
// this call are removed again and the exception is rethrown. Entries that were
// already present before the call are left untouched, so a failed attempt
// cannot unregister a table that an earlier call added (emplace() does not
// insert when the key already exists, hence an unconditional erase on the
// error path would remove somebody else's entry).
future<> database::tables_metadata::add_table(schema_ptr schema) {
    auto holder = co_await _cf_lock.hold_write_lock();
    auto id = schema->id();
    auto kscf = std::make_pair(schema->ks_name(), schema->cf_name());
    // Remember the prior state so the rollback only undoes our own insertions.
    const bool id_was_present = _column_families.contains(id);
    const bool kscf_was_present = _ks_cf_to_uuid.contains(kscf);
    try {
        _column_families.emplace(id, schema->table().shared_from_this());
        _ks_cf_to_uuid.emplace(kscf, id);
    } catch (...) {
        if (!kscf_was_present) {
            _ks_cf_to_uuid.erase(kscf);
        }
        if (!id_was_present) {
            _column_families.erase(id);
        }
        throw;
    }
}
// Unregisters the table described by `schema` from both lookup maps while
// holding the write lock.
//
// The function is noexcept: any exception raised while taking the lock or
// erasing the entries is reported through on_fatal_internal_error() instead of
// being propagated. The message names this function so the log points at the
// right place.
future<> database::tables_metadata::remove_table(schema_ptr schema) noexcept {
    try {
        auto holder = co_await _cf_lock.hold_write_lock();
        _column_families.erase(schema->id());
        _ks_cf_to_uuid.erase(std::make_pair(schema->ks_name(), schema->cf_name()));
    } catch (...) {
        on_fatal_internal_error(dblog, format("tables_metadata::remove_table: {}", std::current_exception()));
    }
}
// Returns a reference to the table registered under `id`.
// The lookup goes through the map's checked accessor at(), so an unknown id
// results in an exception rather than a null dereference.
table& database::tables_metadata::get_table(table_id id) const {
    const auto& table_ptr = _column_families.at(id);
    return *table_ptr;
}
// Returns the id stored for the given (keyspace name, table name) pair.
// Uses the map's checked accessor at(); a missing pair is reported by
// whatever at() throws.
table_id database::tables_metadata::get_table_id(const std::pair<std::string_view, std::string_view>& kscf) const {
    const table_id found = _ks_cf_to_uuid.at(kscf);
    return found;
}
// Non-throwing lookup by id: returns the shared pointer stored for `id`,
// or a null pointer when no such entry exists.
lw_shared_ptr<table> database::tables_metadata::get_table_if_exists(table_id id) const {
    const auto pos = _column_families.find(id);
    if (pos == _column_families.end()) {
        return nullptr;
    }
    return pos->second;
}
// Non-throwing lookup by (keyspace name, table name): returns the stored id,
// or the value produced by table_id::create_null_id() when the pair is absent.
table_id database::tables_metadata::get_table_id_if_exists(const std::pair<std::string_view, std::string_view>& kscf) const {
    const auto pos = _ks_cf_to_uuid.find(kscf);
    if (pos == _ks_cf_to_uuid.end()) {
        return table_id::create_null_id();
    }
    return pos->second;
}
// Tells whether a table is registered under `id`.
bool database::tables_metadata::contains(table_id id) const {
    const bool is_registered = _column_families.find(id) != _column_families.end();
    return is_registered;
}
// Tells whether the (keyspace name, table name) pair is registered in _ks_cf_to_uuid.
bool database::tables_metadata::contains(std::pair<std::string_view, std::string_view> kscf) const {
    return _ks_cf_to_uuid.find(kscf) != _ks_cf_to_uuid.end();
}
// Synchronously invokes `f(id, table)` for every entry of _column_families.
// No lock is taken here; the loop contains no suspension point.
void database::tables_metadata::for_each_table(std::function<void(table_id, lw_shared_ptr<table>)> f) const {
    for (auto& entry : _column_families) {
        f(entry.first, entry.second);
    }
}
// Synchronously invokes `f(kscf, id)` for every entry of _ks_cf_to_uuid, where
// `kscf` is the (keyspace name, table name) key and `id` the mapped table id.
void database::tables_metadata::for_each_table_id(std::function<void(const ks_cf_t&, table_id)> f) const {
    for (auto& entry : _ks_cf_to_uuid) {
        f(entry.first, entry.second);
    }
}
// Sequentially awaits `f(id, table)` for every entry of _column_families.
// The read lock is acquired up front and kept until the coroutine completes,
// so add_table/remove_table (which need the write lock) cannot modify the map
// while the loop is suspended inside `f`.
future<> database::tables_metadata::for_each_table_gently(std::function<future<>(table_id, lw_shared_ptr<table>)> f) {
    auto read_guard = co_await _cf_lock.hold_read_lock();
    for (auto& entry : _column_families) {
        co_await f(entry.first, entry.second);
    }
}
// Runs `f(id, table)` over all entries of _column_families through
// coroutine::parallel_for_each and waits for the result.
// The read lock is held for the whole duration, keeping writers
// (add_table/remove_table) out while the map is being traversed.
future<> database::tables_metadata::parallel_for_each_table(std::function<future<>(table_id, lw_shared_ptr<table>)> f) {
    auto read_guard = co_await _cf_lock.hold_read_lock();
    co_await coroutine::parallel_for_each(_column_families, [callback = std::move(f)] (auto& entry) {
        return callback(entry.first, entry.second);
    });
}
// Returns a by-value copy of the id -> table map, so the caller's container
// is independent of later changes to _column_families.
const std::unordered_map<table_id, lw_shared_ptr<table>> database::tables_metadata::get_column_families_copy() const {
    std::unordered_map<table_id, lw_shared_ptr<table>> snapshot(_column_families);
    return snapshot;
}
data_dictionary::database
database::as_data_dictionary() const {
static constinit data_dictionary_impl _impl;

View File

@@ -1304,6 +1304,34 @@ public:
}
};
// Key identifying a table by name: a pair of strings, filled elsewhere in this
// change with (keyspace name, table name).
using ks_cf_t = std::pair<sstring, sstring>;
// Map from a ks_cf_t name pair to the table's id, hashed with utils::tuple_hash
// and compared with string_pair_eq.
using ks_cf_to_uuid_t =
flat_hash_map<ks_cf_t, table_id, utils::tuple_hash, string_pair_eq>;
// Owns the two table lookup maps of the database (id -> table and
// (keyspace, table name) -> id) and exposes them only through methods.
// Methods that can suspend (those returning future<>) take _cf_lock in the
// appropriate mode so that iteration over the maps cannot observe a
// concurrent add/remove.
class tables_metadata {
// Reader/writer lock guarding the two maps across suspension points.
rwlock _cf_lock;
// Table id -> table object.
std::unordered_map<table_id, lw_shared_ptr<column_family>> _column_families;
// (keyspace name, table name) -> table id.
ks_cf_to_uuid_t _ks_cf_to_uuid;
public:
// Number of registered tables.
size_t size() const noexcept;
// Registers / unregisters the table described by `schema` in both maps.
future<> add_table(schema_ptr schema);
future<> remove_table(schema_ptr schema) noexcept;
// Lookups that use at() on the underlying map.
table& get_table(table_id id) const;
table_id get_table_id(const std::pair<std::string_view, std::string_view>& kscf) const;
// Lookups that return a null pointer / null id when the entry is missing.
lw_shared_ptr<table> get_table_if_exists(table_id id) const;
table_id get_table_id_if_exists(const std::pair<std::string_view, std::string_view>& kscf) const;
// Existence checks by id and by (keyspace name, table name).
bool contains(table_id id) const;
bool contains(std::pair<std::string_view, std::string_view> kscf) const;
// Synchronous iteration over each map.
void for_each_table(std::function<void(table_id, lw_shared_ptr<table>)> f) const;
void for_each_table_id(std::function<void(const ks_cf_t&, table_id)> f) const;
// Asynchronous iteration over the id -> table map: one entry at a time, or
// through coroutine::parallel_for_each.
future<> for_each_table_gently(std::function<future<>(table_id, lw_shared_ptr<table>)> f);
future<> parallel_for_each_table(std::function<future<>(table_id, lw_shared_ptr<table>)> f);
// Returns a copy of the id -> table map.
const std::unordered_map<table_id, lw_shared_ptr<table>> get_column_families_copy() const;
// Returns _column_families piped through boost::adaptors::filtered with `f`
// as the predicate. No lock is taken here.
// NOTE(review): the returned range presumably refers to the member map rather
// than copying it, so it should be consumed before the map can change -- confirm.
const auto filter(std::function<bool(std::pair<table_id, lw_shared_ptr<table>>)> f) const {
return _column_families | boost::adaptors::filtered(std::move(f));
}
};
private:
replica::cf_stats _cf_stats;
static constexpr size_t max_count_concurrent_reads{100};
@@ -1370,10 +1398,7 @@ private:
db::per_partition_rate_limit::info> _apply_stage;
flat_hash_map<sstring, keyspace> _keyspaces;
std::unordered_map<table_id, lw_shared_ptr<column_family>> _column_families;
using ks_cf_to_uuid_t =
flat_hash_map<std::pair<sstring, sstring>, table_id, utils::tuple_hash, string_pair_eq>;
ks_cf_to_uuid_t _ks_cf_to_uuid;
tables_metadata _tables_metadata;
std::unique_ptr<db::commitlog> _commitlog;
std::unique_ptr<db::commitlog> _schema_commitlog;
utils::updateable_value_source<table_schema_version> _version;
@@ -1453,7 +1478,7 @@ private:
Future update_write_metrics(Future&& f);
void update_write_metrics_for_timed_out_write();
future<> create_keyspace(const lw_shared_ptr<keyspace_metadata>&, locator::effective_replication_map_factory& erm_factory, system_keyspace system);
void remove(table&) noexcept;
future<> remove(table&) noexcept;
void drop_keyspace(const sstring& name);
future<> update_keyspace(const keyspace_metadata& tmp_ksm);
static future<> modify_keyspace_on_all_shards(sharded<database>& sharded_db, std::function<future<>(replica::database&)> func, std::function<future<>(replica::database&)> notifier);
@@ -1540,8 +1565,8 @@ public:
future<> add_column_family_and_make_directory(schema_ptr schema);
/* throws no_such_column_family if missing */
const table_id& find_uuid(std::string_view ks, std::string_view cf) const;
const table_id& find_uuid(const schema_ptr&) const;
table_id find_uuid(std::string_view ks, std::string_view cf) const;
table_id find_uuid(const schema_ptr&) const;
/**
* Creates a keyspace for a given metadata if it still doesn't exist.
@@ -1650,23 +1675,18 @@ public:
return _keyspaces;
}
const std::unordered_map<table_id, lw_shared_ptr<column_family>>& get_column_families() const {
return _column_families;
const tables_metadata& get_tables_metadata() const {
return _tables_metadata;
}
std::unordered_map<table_id, lw_shared_ptr<column_family>>& get_column_families() {
return _column_families;
tables_metadata& get_tables_metadata() {
return _tables_metadata;
}
std::vector<lw_shared_ptr<column_family>> get_non_system_column_families() const;
std::vector<view_ptr> get_views() const;
const ks_cf_to_uuid_t&
get_column_families_mapping() const {
return _ks_cf_to_uuid;
}
const db::config& get_config() const {
return _cfg;
}
@@ -1707,7 +1727,7 @@ public:
public:
bool update_column_family(schema_ptr s);
private:
void add_column_family(keyspace& ks, schema_ptr schema, column_family::config cfg);
future<> add_column_family(keyspace& ks, schema_ptr schema, column_family::config cfg);
future<> detach_column_family(table& cf);
struct table_truncate_state;

View File

@@ -472,11 +472,11 @@ future<> distributed_loader::populate_keyspace(distributed<replica::database>& d
dblog.info("Populating Keyspace {}", ks_name);
auto& ks = i->second;
auto& column_families = db.local().get_column_families();
auto& tables_metadata = db.local().get_tables_metadata();
co_await coroutine::parallel_for_each(ks.metadata()->cf_meta_data() | boost::adaptors::map_values, [&] (schema_ptr s) -> future<> {
auto uuid = s->id();
lw_shared_ptr<replica::column_family> cf = column_families[uuid];
lw_shared_ptr<replica::column_family> cf = tables_metadata.get_table(uuid).shared_from_this();
// System tables (from system and system_schema keyspaces) are loaded in two phases.
// The populate_keyspace function can be called in the second phase for tables that

View File

@@ -1209,7 +1209,7 @@ def find_dbs():
def for_each_table(db=None):
if not db:
db = find_db()
cfs = db['_column_families']
cfs = db['_tables_metadata']['_column_families']
for (key, value) in unordered_map(cfs):
yield value['_p'].reinterpret_cast(lookup_type(['replica::table', 'column_family'])[1].pointer()).dereference() # it's a lw_shared_ptr
@@ -1511,7 +1511,7 @@ class scylla_tables(gdb.Command):
for shard in shards:
db = find_db(shard)
cfs = db['_column_families']
cfs = db['_tables_metadata']['_column_families']
for (key, value) in unordered_map(cfs):
value = seastar_lw_shared_ptr(value).get().dereference()
schema = schema_ptr(value['_schema'])
@@ -1533,7 +1533,7 @@ class scylla_table(gdb.Command):
def _find_table(self, ks, cf):
db = find_db()
cfs = db['_column_families']
cfs = db['_tables_metadata']['_column_families']
for (key, value) in unordered_map(cfs):
value = seastar_lw_shared_ptr(value).get().dereference()
schema = schema_ptr(value['_schema'])
@@ -1900,7 +1900,7 @@ class seastar_lw_shared_ptr():
def all_tables(db):
"""Returns pointers to table objects which exist on current shard"""
for (key, value) in unordered_map(db['_column_families']):
for (key, value) in unordered_map(db['_tables_metadata']['_column_families']):
yield seastar_lw_shared_ptr(value).get()

View File

@@ -78,9 +78,9 @@ void load_broadcaster::start_broadcasting() {
llogger.debug("Disseminating load info ...");
_done = _db.map_reduce0([](replica::database& db) {
int64_t res = 0;
for (auto i : db.get_column_families()) {
res += i.second->get_stats().live_disk_space_used;
}
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> table) {
res += table->get_stats().live_disk_space_used;
});
return res;
}, int64_t(0), std::plus<int64_t>()).then([this] (int64_t size) {
return _gossiper.add_local_application_state(gms::application_state::LOAD,
@@ -137,7 +137,7 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
};
auto cf_to_cache_hit_stats = [non_system_filter] (replica::database& db) {
return boost::copy_range<std::unordered_map<table_id, stat>>(db.get_column_families() | boost::adaptors::filtered(non_system_filter) |
return boost::copy_range<std::unordered_map<table_id, stat>>(db.get_tables_metadata().filter(non_system_filter) |
boost::adaptors::transformed([] (const std::pair<table_id, lw_shared_ptr<replica::column_family>>& cf) {
auto& stats = cf.second->get_row_cache().stats();
return std::make_pair(cf.first, stat{float(stats.reads_with_no_misses.rate().rates[0]), float(stats.reads_with_misses.rate().rates[0])});
@@ -159,11 +159,11 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
// set calculated rates on all shards
return _db.invoke_on_all([this, cpuid = this_shard_id()] (replica::database& db) {
return do_for_each(_rates, [this, cpuid, &db] (auto&& r) mutable {
auto it = db.get_column_families().find(r.first);
if (it == db.get_column_families().end()) { // a table may be added before map/reduce completes and this code runs
auto cf_opt = db.get_tables_metadata().get_table_if_exists(r.first);
if (!cf_opt) { // a table may be added before map/reduce completes and this code runs
return;
}
auto& cf = *it;
auto& cf = cf_opt;
stat& s = r.second;
float rate = 0;
if (s.h) {
@@ -171,10 +171,10 @@ future<lowres_clock::duration> cache_hitrate_calculator::recalculate_hitrates()
}
if (this_shard_id() == cpuid) {
// calculate max difference between old rate and new one for all cfs
_diff = std::max(_diff, std::abs(float(cf.second->get_global_cache_hit_rate()) - rate));
_gstate += format("{}.{}:{:0.6f};", cf.second->schema()->ks_name(), cf.second->schema()->cf_name(), rate);
_diff = std::max(_diff, std::abs(float(cf->get_global_cache_hit_rate()) - rate));
_gstate += format("{}.{}:{:0.6f};", cf->schema()->ks_name(), cf->schema()->cf_name(), rate);
}
cf.second->set_global_cache_hit_rate(cache_temperature(rate));
cf->set_global_cache_hit_rate(cache_temperature(rate));
});
});
}).then([this] {

View File

@@ -3316,16 +3316,16 @@ future<> storage_service::replicate_to_all_cores(mutable_token_metadata_ptr tmpt
co_await container().invoke_on_all([&] (storage_service& ss) {
auto& db = ss._db.local();
auto tmptr = pending_token_metadata_ptr[this_shard_id()];
for (auto&& [id, cf] : db.get_column_families()) { // Safe because we iterate without preemption
auto rs = db.find_keyspace(cf->schema()->keypace_name()).get_replication_strategy_ptr();
db.get_tables_metadata().for_each_table([&] (table_id id, lw_shared_ptr<replica::table> table) {
auto rs = db.find_keyspace(table->schema()->keypace_name()).get_replication_strategy_ptr();
locator::effective_replication_map_ptr erm;
if (auto pt_rs = rs->maybe_as_per_table()) {
erm = pt_rs->make_replication_map(id, tmptr);
} else {
erm = pending_effective_replication_maps[this_shard_id()][cf->schema()->keypace_name()];
erm = pending_effective_replication_maps[this_shard_id()][table->schema()->keypace_name()];
}
pending_table_erms[this_shard_id()].emplace(id, std::move(erm));
}
});
});
} catch (...) {
ex = std::current_exception();

View File

@@ -464,15 +464,15 @@ std::vector<replica::column_family*> stream_session::get_column_family_stores(co
std::vector<replica::column_family*> stores;
auto& db = manager().db();
if (column_families.empty()) {
for (auto& x : db.get_column_families()) {
replica::column_family& cf = *(x.second);
db.get_tables_metadata().for_each_table([&] (table_id, lw_shared_ptr<replica::table> tp) {
replica::column_family& cf = *tp;
auto cf_name = cf.schema()->cf_name();
auto ks_name = cf.schema()->ks_name();
if (ks_name == keyspace) {
sslog.debug("Find ks={} cf={}", ks_name, cf_name);
stores.push_back(&cf);
}
}
});
} else {
// TODO: We can move this to database class and use shared_ptr<column_family> instead
for (auto& cf_name : column_families) {

View File

@@ -116,8 +116,8 @@ SEASTAR_THREAD_TEST_CASE(test_large_data) {
// and the old sstable is deleted.
flush(e);
e.db().invoke_on_all([] (replica::database& dbi) {
return parallel_for_each(dbi.get_column_families(), [&dbi] (auto& table) {
return dbi.get_compaction_manager().perform_major_compaction((table.second)->as_table_state());
return dbi.get_tables_metadata().parallel_for_each_table([&dbi] (table_id, lw_shared_ptr<replica::table> t) {
return dbi.get_compaction_manager().perform_major_compaction(t->as_table_state());
});
}).get();

View File

@@ -860,10 +860,10 @@ public:
replica::distributed_loader::init_non_system_keyspaces(db, proxy, sys_ks).get();
db.invoke_on_all([] (replica::database& db) {
for (auto& x : db.get_column_families()) {
replica::table& t = *(x.second);
db.get_tables_metadata().for_each_table([] (table_id, lw_shared_ptr<replica::table> table) {
replica::table& t = *table;
t.enable_auto_compaction();
}
});
}).get();
if (raft_gr.local().is_enabled()) {