qos: don't populate effective service level cache until auth is migrated to raft
Right now, service levels are migrated in one group0 command and auth is migrated in the next one. This has a bad effect on the group0 state reload logic - modifying service levels in group0 causes the effective service levels cache to be recalculated, and to do so we need to fetch information about all roles. If the reload happens after SL upgrade and before auth upgrade, the query for roles will be directed to the legacy auth tables in system_auth - and the query, being a potentially remote query, has a timeout. If the query times out, it will throw an exception which will break the group0 apply fiber and the node will need to be restarted to bring it back to work. In order to solve this issue, make sure that the service level module does not start populating and using the service level cache until both service levels and auth are migrated to raft. This is achieved by adding the check both to the cache population logic and the effective service level getter - they now look at service level's accessor new method, `can_use_effective_service_level_cache` which takes a look at the auth version. Fixes: scylladb/scylladb#24963
This commit is contained in:
@@ -100,6 +100,10 @@ bool raft_service_level_distributed_data_accessor::is_v2() const {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool raft_service_level_distributed_data_accessor::can_use_effective_service_level_cache() const {
|
||||
return !auth::legacy_mode(_qp);
|
||||
}
|
||||
|
||||
::shared_ptr<service_level_controller::service_level_distributed_data_accessor> raft_service_level_distributed_data_accessor::upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@@ -39,6 +39,7 @@ public:
|
||||
virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const override;
|
||||
|
||||
virtual bool is_v2() const override;
|
||||
virtual bool can_use_effective_service_level_cache() const override;
|
||||
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const override;
|
||||
};
|
||||
|
||||
|
||||
@@ -309,6 +309,17 @@ future<> service_level_controller::update_effective_service_levels_cache() {
|
||||
// might be not initialized yet.
|
||||
co_return;
|
||||
}
|
||||
if (!_sl_data_accessor || !_sl_data_accessor->can_use_effective_service_level_cache()) {
|
||||
// Don't populate the effective service level cache until auth is migrated to raft.
|
||||
// Otherwise, executing the code that follows would read roles data
|
||||
// from system_auth tables; that would be bad because reading from
|
||||
// those tables is prone to timeouts, and `update_effective_service_levels_cache`
|
||||
// is called from the group0 context - a timeout like that would render
|
||||
// group0 non-functional on the node until restart.
|
||||
//
|
||||
// See scylladb/scylladb#24963 for more details.
|
||||
co_return;
|
||||
}
|
||||
auto units = co_await get_units(_global_controller_db->notifications_serializer, 1);
|
||||
|
||||
auto& role_manager = _auth_service.local().underlying_role_manager();
|
||||
@@ -385,7 +396,7 @@ void service_level_controller::stop_legacy_update_from_distributed_data() {
|
||||
}
|
||||
|
||||
future<std::optional<service_level_options>> service_level_controller::find_effective_service_level(const sstring& role_name) {
|
||||
if (_sl_data_accessor->is_v2()) {
|
||||
if (_sl_data_accessor->can_use_effective_service_level_cache()) {
|
||||
auto effective_sl_it = _effective_service_levels_db.find(role_name);
|
||||
co_return effective_sl_it != _effective_service_levels_db.end()
|
||||
? std::optional<service_level_options>(effective_sl_it->second)
|
||||
|
||||
@@ -114,6 +114,9 @@ public:
|
||||
virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const = 0;
|
||||
|
||||
virtual bool is_v2() const = 0;
|
||||
// Returns whether effective service level cache can be populated and used.
|
||||
// This is equivalent to checking whether auth + raft have been migrated to raft.
|
||||
virtual bool can_use_effective_service_level_cache() const = 0;
|
||||
// Returns v2(raft) data accessor. If data accessor is already a raft one, returns nullptr.
|
||||
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const = 0;
|
||||
};
|
||||
|
||||
@@ -37,6 +37,10 @@ bool standard_service_level_distributed_data_accessor::is_v2() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool standard_service_level_distributed_data_accessor::can_use_effective_service_level_cache() const {
|
||||
return false;
|
||||
}
|
||||
|
||||
::shared_ptr<service_level_controller::service_level_distributed_data_accessor> standard_service_level_distributed_data_accessor::upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const {
|
||||
return ::static_pointer_cast<service_level_controller::service_level_distributed_data_accessor>(
|
||||
::make_shared<raft_service_level_distributed_data_accessor>(qp, group0_client));
|
||||
|
||||
@@ -31,6 +31,7 @@ public:
|
||||
virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const override { return make_ready_future(); }
|
||||
|
||||
virtual bool is_v2() const override;
|
||||
virtual bool can_use_effective_service_level_cache() const override;
|
||||
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const override;
|
||||
};
|
||||
}
|
||||
|
||||
@@ -166,6 +166,9 @@ SEASTAR_THREAD_TEST_CASE(too_many_service_levels) {
|
||||
virtual bool is_v2() const override {
|
||||
return true;
|
||||
}
|
||||
virtual bool can_use_effective_service_level_cache() const override {
|
||||
return true;
|
||||
}
|
||||
virtual ::shared_ptr<service_level_distributed_data_accessor> upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const override {
|
||||
return make_shared<data_accessor>();
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user