compaction_manager: Fix reactor stalls during periodic submissions
Every hour, the compaction manager submits all registered table_states for a regular compaction attempt, all without yielding. This can cause a reactor stall if there are thousands of table states, because the compaction strategy heuristics run on behalf of each one, and processing all buckets and picking the best one is not cheap. The problem can be magnified with compaction groups, as each group is represented by a table state. It may show up in the dashboard as periodic stalls, one every hour, misleading the investigator into believing that the problem is caused by a scheduled (cron-like) job. This is fixed by piggybacking on the compaction reevaluation loop, which can yield between submission attempts if needed. Fixes #12390. Signed-off-by: Raphael S. Carvalho <raphaelsc@scylladb.com> Closes #12391
This commit is contained in:
committed by
Avi Kivity
parent
8797958dfc
commit
67ebd70e6e
@@ -15,6 +15,7 @@
|
||||
#include <seastar/core/coroutine.hh>
|
||||
#include <seastar/coroutine/switch_to.hh>
|
||||
#include <seastar/coroutine/parallel_for_each.hh>
|
||||
#include <seastar/coroutine/maybe_yield.hh>
|
||||
#include "sstables/exceptions.hh"
|
||||
#include "sstables/sstable_directory.hh"
|
||||
#include "locator/abstract_replication_strategy.hh"
|
||||
@@ -744,8 +745,9 @@ void compaction_manager::enable() {
|
||||
std::function<void()> compaction_manager::compaction_submission_callback() {
|
||||
return [this] () mutable {
|
||||
for (auto& e: _compaction_state) {
|
||||
submit(*e.first);
|
||||
postpone_compaction_for_table(e.first);
|
||||
}
|
||||
reevaluate_postponed_compactions();
|
||||
};
|
||||
}
|
||||
|
||||
@@ -756,15 +758,24 @@ future<> compaction_manager::postponed_compactions_reevaluation() {
|
||||
_postponed.clear();
|
||||
co_return;
|
||||
}
|
||||
auto postponed = std::move(_postponed);
|
||||
// A table_state being reevaluated can re-insert itself into the postponed list, which is the reason
|
||||
// for moving the list to be processed into a local.
|
||||
auto postponed = std::exchange(_postponed, {});
|
||||
try {
|
||||
for (auto& t : postponed) {
|
||||
for (auto it = postponed.begin(); it != postponed.end();) {
|
||||
compaction::table_state* t = *it;
|
||||
it = postponed.erase(it);
|
||||
// skip reevaluation of a table_state that became invalid after its removal
|
||||
if (!_compaction_state.contains(t)) {
|
||||
continue;
|
||||
}
|
||||
auto s = t->schema();
|
||||
cmlog.debug("resubmitting postponed compaction for table {}.{} [{}]", s->ks_name(), s->cf_name(), fmt::ptr(t));
|
||||
submit(*t);
|
||||
co_await coroutine::maybe_yield();
|
||||
}
|
||||
} catch (...) {
|
||||
_postponed = std::move(postponed);
|
||||
_postponed.insert(postponed.begin(), postponed.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user