qos: don't populate effective service level cache until auth is migrated to raft

Right now, service levels are migrated in one group0 command and auth
is migrated in the next one. This has a bad effect on the group0 state
reload logic - modifying service levels in group0 causes the effective
service levels cache to be recalculated, and to do so we need to fetch
information about all roles. If the reload happens after SL upgrade and
before auth upgrade, the query for roles will be directed to the legacy
auth tables in system_auth - and the query, being a potentially remote
query, has a timeout. If the query times out, it will throw
an exception which will break the group0 apply fiber and the node will
need to be restarted to bring it back to work.

In order to solve this issue, make sure that the service level module
does not start populating and using the service level cache until both
service levels and auth are migrated to raft. This is achieved by adding
the check both to the cache population logic and the effective service
level getter - they now look at service level's accessor new method,
`can_use_effective_service_level_cache` which takes a look at the auth
version.

Fixes: scylladb/scylladb#24963
This commit is contained in:
Piotr Dulikowski
2025-07-25 14:12:11 +02:00
parent d5e023bbad
commit 2bb800c004
7 changed files with 28 additions and 1 deletions

View File

@@ -100,6 +100,10 @@ bool raft_service_level_distributed_data_accessor::is_v2() const {
return true;
}
bool raft_service_level_distributed_data_accessor::can_use_effective_service_level_cache() const {
return !auth::legacy_mode(_qp);
}
::shared_ptr<service_level_controller::service_level_distributed_data_accessor> raft_service_level_distributed_data_accessor::upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const {
return nullptr;
}

View File

@@ -39,6 +39,7 @@ public:
virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const override;
virtual bool is_v2() const override;
virtual bool can_use_effective_service_level_cache() const override;
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const override;
};

View File

@@ -309,6 +309,17 @@ future<> service_level_controller::update_effective_service_levels_cache() {
// might be not initialized yet.
co_return;
}
if (!_sl_data_accessor || !_sl_data_accessor->can_use_effective_service_level_cache()) {
// Don't populate the effective service level cache until auth is migrated to raft.
// Otherwise, executing the code that follows would read roles data
// from system_auth tables; that would be bad because reading from
// those tables is prone to timeouts, and `update_effective_service_levels_cache`
// is called from the group0 context - a timeout like that would render
// group0 non-functional on the node until restart.
//
// See scylladb/scylladb#24963 for more details.
co_return;
}
auto units = co_await get_units(_global_controller_db->notifications_serializer, 1);
auto& role_manager = _auth_service.local().underlying_role_manager();
@@ -385,7 +396,7 @@ void service_level_controller::stop_legacy_update_from_distributed_data() {
}
future<std::optional<service_level_options>> service_level_controller::find_effective_service_level(const sstring& role_name) {
if (_sl_data_accessor->is_v2()) {
if (_sl_data_accessor->can_use_effective_service_level_cache()) {
auto effective_sl_it = _effective_service_levels_db.find(role_name);
co_return effective_sl_it != _effective_service_levels_db.end()
? std::optional<service_level_options>(effective_sl_it->second)

View File

@@ -114,6 +114,9 @@ public:
virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const = 0;
virtual bool is_v2() const = 0;
// Returns whether effective service level cache can be populated and used.
// This is equivalent to checking whether auth + raft have been migrated to raft.
virtual bool can_use_effective_service_level_cache() const = 0;
// Returns v2(raft) data accessor. If data accessor is already a raft one, returns nullptr.
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const = 0;
};

View File

@@ -37,6 +37,10 @@ bool standard_service_level_distributed_data_accessor::is_v2() const {
return false;
}
bool standard_service_level_distributed_data_accessor::can_use_effective_service_level_cache() const {
return false;
}
::shared_ptr<service_level_controller::service_level_distributed_data_accessor> standard_service_level_distributed_data_accessor::upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const {
return ::static_pointer_cast<service_level_controller::service_level_distributed_data_accessor>(
::make_shared<raft_service_level_distributed_data_accessor>(qp, group0_client));

View File

@@ -31,6 +31,7 @@ public:
virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const override { return make_ready_future(); }
virtual bool is_v2() const override;
virtual bool can_use_effective_service_level_cache() const override;
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const override;
};
}

View File

@@ -166,6 +166,9 @@ SEASTAR_THREAD_TEST_CASE(too_many_service_levels) {
virtual bool is_v2() const override {
return true;
}
virtual bool can_use_effective_service_level_cache() const override {
return true;
}
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const override {
return make_shared<data_accessor>();
}