diff --git a/CMakeLists.txt b/CMakeLists.txt index d0f29c1fe0..1a19edda5e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,7 @@ else() set(Seastar_EXCLUDE_APPS_FROM_ALL ON CACHE BOOL "" FORCE) set(Seastar_EXCLUDE_TESTS_FROM_ALL ON CACHE BOOL "" FORCE) set(Seastar_IO_URING ON CACHE BOOL "" FORCE) - set(Seastar_SCHEDULING_GROUPS_COUNT 16 CACHE STRING "" FORCE) + set(Seastar_SCHEDULING_GROUPS_COUNT 19 CACHE STRING "" FORCE) set(Seastar_UNUSED_RESULT_ERROR ON CACHE BOOL "" FORCE) add_subdirectory(seastar) target_compile_definitions (seastar @@ -199,6 +199,7 @@ target_sources(scylla-main tombstone_gc_options.cc tombstone_gc.cc reader_concurrency_semaphore.cc + reader_concurrency_semaphore_group.cc row_cache.cc schema_mutations.cc serializer.cc diff --git a/alternator/server.cc b/alternator/server.cc index 1f3fd4b7ea..f20f52786c 100644 --- a/alternator/server.cc +++ b/alternator/server.cc @@ -456,9 +456,16 @@ future server::handle_api_request(std::unique_ptr tracing::trace_state_ptr trace_state = maybe_trace_query(client_state, username, op, content); tracing::trace(trace_state, "{}", op); - rjson::value json_request = co_await _json_parser.parse(std::move(content)); - co_return co_await callback_it->second(_executor, client_state, trace_state, - make_service_permit(std::move(units)), std::move(json_request), std::move(req)); + + auto user = client_state.user(); + auto f = [this, content = std::move(content), &callback = callback_it->second, + client_state = std::move(client_state), trace_state = std::move(trace_state), + units = std::move(units), req = std::move(req)] () mutable -> future { + rjson::value json_request = co_await _json_parser.parse(std::move(content)); + co_return co_await callback(_executor, client_state, trace_state, + make_service_permit(std::move(units)), std::move(json_request), std::move(req)); + }; + co_return co_await _sl_controller.with_user_service_level(user, std::ref(f)); } void server::set_routes(routes& r) { diff --git 
a/api/CMakeLists.txt b/api/CMakeLists.txt index e1e1a1c9ba..2a6a3fab6f 100644 --- a/api/CMakeLists.txt +++ b/api/CMakeLists.txt @@ -42,6 +42,7 @@ set(swagger_files api-doc/messaging_service.json api-doc/metrics.json api-doc/raft.json + api-doc/service_levels.json api-doc/storage_proxy.json api-doc/storage_service.json api-doc/stream_manager.json @@ -82,6 +83,7 @@ target_sources(api lsa.cc messaging_service.cc raft.cc + service_levels.cc storage_proxy.cc storage_service.cc stream_manager.cc diff --git a/api/api-doc/service_levels.json b/api/api-doc/service_levels.json new file mode 100644 index 0000000000..58c1cf96bd --- /dev/null +++ b/api/api-doc/service_levels.json @@ -0,0 +1,56 @@ +{ + "apiVersion":"0.0.1", + "swaggerVersion":"1.2", + "basePath":"{{Protocol}}://{{Host}}", + "resourcePath":"/service_levels", + "produces":[ + "application/json" + ], + "apis":[ + { + "path":"/service_levels/switch_tenants", + "operations":[ + { + "method":"POST", + "summary":"Switch tenants on all opened connections if needed", + "type":"void", + "nickname":"do_switch_tenants", + "produces":[ + "application/json" + ], + "parameters":[] + } + ] + }, + { + "path":"/service_levels/count_connections", + "operations":[ + { + "method":"GET", + "summary":"Count opened CQL connections per scheduling group per user", + "type":"connections_count_map", + "nickname":"count_connections", + "produces":[ + "application/json" + ], + "parameters":[] + } + ] + } + ], + "models":{}, + "components": { + "schemas": { + "connections_count_map": { + "type": "object", + "additionalProperties": { + "type": "object", + "additionalProperties": { + "type": "integer" + } + } + } + } + } + +} \ No newline at end of file diff --git a/api/api.cc b/api/api.cc index f3ba941f07..ff5c06432d 100644 --- a/api/api.cc +++ b/api/api.cc @@ -36,6 +36,7 @@ #include "tasks.hh" #include "raft.hh" #include "gms/gossip_address_map.hh" +#include "service_levels.hh" logging::logger apilog("api"); @@ -358,6 +359,12 @@ future<> 
unset_server_cql_server_test(http_context& ctx) { #endif +future<> set_server_service_levels(http_context &ctx, cql_transport::controller& ctl, sharded& qp) { + return register_api(ctx, "service_levels", "The service levels API", [&ctl, &qp] (http_context& ctx, routes& r) { + set_service_levels(ctx, r, ctl, qp); + }); +} + future<> set_server_tasks_compaction_module(http_context& ctx, sharded& ss, sharded& snap_ctl) { auto rb = std::make_shared < api_registry_builder > (ctx.api_doc); diff --git a/api/api_init.hh b/api/api_init.hh index 236600c797..435f637c0c 100644 --- a/api/api_init.hh +++ b/api/api_init.hh @@ -73,6 +73,10 @@ namespace tasks { class task_manager; } +namespace cql3 { +class query_processor; +} + namespace api { struct http_context { @@ -141,6 +145,7 @@ future<> set_format_selector(http_context& ctx, db::sstables_format_selector& se future<> unset_format_selector(http_context& ctx); future<> set_server_cql_server_test(http_context& ctx, cql_transport::controller& ctl); future<> unset_server_cql_server_test(http_context& ctx); +future<> set_server_service_levels(http_context& ctx, cql_transport::controller& ctl, sharded& qp); future<> set_server_commitlog(http_context& ctx, sharded&); future<> unset_server_commitlog(http_context& ctx); diff --git a/api/cql_server_test.cc b/api/cql_server_test.cc index 77e0301d1b..d50394ca85 100644 --- a/api/cql_server_test.cc +++ b/api/cql_server_test.cc @@ -26,21 +26,24 @@ struct connection_sl_params : public json::json_base { json::json_element _role_name; json::json_element _workload_type; json::json_element _timeout; + json::json_element _scheduling_group; - connection_sl_params(const sstring& role_name, const sstring& workload_type, const sstring& timeout) { + connection_sl_params(const sstring& role_name, const sstring& workload_type, const sstring& timeout, const sstring& scheduling_group) { _role_name = role_name; _workload_type = workload_type; _timeout = timeout; + _scheduling_group = scheduling_group; 
register_params(); } connection_sl_params(const connection_sl_params& params) - : connection_sl_params(params._role_name(), params._workload_type(), params._timeout()) {} + : connection_sl_params(params._role_name(), params._workload_type(), params._timeout(), params._scheduling_group()) {} void register_params() { add(&_role_name, "role_name"); add(&_workload_type, "workload_type"); add(&_timeout, "timeout"); + add(&_scheduling_group, "scheduling_group"); } }; @@ -54,7 +57,8 @@ void set_cql_server_test(http_context& ctx, seastar::httpd::routes& r, cql_trans return connection_sl_params( std::move(params.role_name), sstring(qos::service_level_options::to_string(params.workload_type)), - to_string(cql_duration(months_counter{0}, days_counter{0}, nanoseconds_counter{nanos}))); + to_string(cql_duration(months_counter{0}, days_counter{0}, nanoseconds_counter{nanos})), + std::move(params.scheduling_group_name)); }); co_return result; }); diff --git a/api/service_levels.cc b/api/service_levels.cc new file mode 100644 index 0000000000..753a6e3198 --- /dev/null +++ b/api/service_levels.cc @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2023-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +#include "service_levels.hh" +#include "api/api-doc/service_levels.json.hh" +#include "cql3/query_processor.hh" +#include "cql3/untyped_result_set.hh" +#include "db/consistency_level_type.hh" +#include "seastar/json/json_elements.hh" +#include "transport/controller.hh" +#include + + +namespace api { + +namespace sl = httpd::service_levels_json; +using namespace json; +using namespace seastar::httpd; + + +void set_service_levels(http_context& ctx, routes& r, cql_transport::controller& ctl, sharded& qp) { + sl::do_switch_tenants.set(r, [&ctl] (std::unique_ptr req) -> future { + co_await ctl.update_connections_scheduling_group(); + co_return json_void(); + }); + + sl::count_connections.set(r, [&qp] (std::unique_ptr req) -> future { + auto 
connections = co_await qp.local().execute_internal( + "SELECT username, scheduling_group FROM system.clients WHERE client_type='cql' ALLOW FILTERING", + db::consistency_level::LOCAL_ONE, + cql3::query_processor::cache_internal::no + ); + + using connections_per_user = std::unordered_map; + using connections_per_scheduling_group = std::unordered_map; + connections_per_scheduling_group result; + + for (auto it = connections->begin(); it != connections->end(); it++) { + auto user = it->get_as("username"); + auto shg = it->get_as("scheduling_group"); + + if (result.contains(shg)) { + result[shg][user]++; + } + else { + result[shg] = {{user, 1}}; + } + } + + co_return result; + }); + +} + + + + +} \ No newline at end of file diff --git a/api/service_levels.hh b/api/service_levels.hh new file mode 100644 index 0000000000..e2e3993774 --- /dev/null +++ b/api/service_levels.hh @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2023-present ScyllaDB + */ + +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +#pragma once + +#include "api.hh" + +namespace api { + +void set_service_levels(http_context& ctx, httpd::routes& r, cql_transport::controller& ctl, sharded& qp); + +} \ No newline at end of file diff --git a/client_data.hh b/client_data.hh index 91a77c13b7..f2f6a2370f 100644 --- a/client_data.hh +++ b/client_data.hh @@ -45,6 +45,7 @@ struct client_data { std::optional ssl_enabled; std::optional ssl_protocol; std::optional username; + std::optional scheduling_group_name; sstring stage_str() const { return to_string(connection_stage); } sstring client_type_str() const { return to_string(ct); } diff --git a/configure.py b/configure.py index be09e24aa7..c89cd6c4ce 100755 --- a/configure.py +++ b/configure.py @@ -1160,6 +1160,7 @@ scylla_core = (['message/messaging_service.cc', 'service/topology_coordinator.cc', 'node_ops/node_ops_ctl.cc', 'node_ops/task_manager_module.cc', + 'reader_concurrency_semaphore_group.cc', ] + [Antlr3Grammar('cql3/Cql.g')] \ + 
scylla_raft_core ) @@ -1214,6 +1215,8 @@ api = ['api/api.cc', Json2Code('api/api-doc/raft.json'), Json2Code('api/api-doc/cql_server_test.json'), 'api/cql_server_test.cc', + 'api/service_levels.cc', + Json2Code('api/api-doc/service_levels.json'), ] alternator = [ @@ -1871,7 +1874,7 @@ def configure_seastar(build_dir, mode, mode_config): '-DSeastar_DEPRECATED_OSTREAM_FORMATTERS=OFF', '-DSeastar_UNUSED_RESULT_ERROR=ON', '-DCMAKE_EXPORT_COMPILE_COMMANDS=ON', - '-DSeastar_SCHEDULING_GROUPS_COUNT=16', + '-DSeastar_SCHEDULING_GROUPS_COUNT=19', '-DSeastar_IO_URING=ON', ] diff --git a/cql3/Cql.g b/cql3/Cql.g index f62298d14c..628c823737 100644 --- a/cql3/Cql.g +++ b/cql3/Cql.g @@ -2127,6 +2127,7 @@ basic_unreserved_keyword returns [sstring str] | K_SERVICE_LEVELS | K_ATTACHED | K_FOR + | K_SHARES | K_GROUP | K_TIMEOUT | K_SERVICE @@ -2335,6 +2336,7 @@ K_SERVICE: S E R V I C E; K_LEVEL: L E V E L; K_LEVELS: L E V E L S; K_EFFECTIVE: E F F E C T I V E; +K_SHARES: S H A R E S; K_SCYLLA_TIMEUUID_LIST_INDEX: S C Y L L A '_' T I M E U U I D '_' L I S T '_' I N D E X; K_SCYLLA_COUNTER_SHARD_LIST: S C Y L L A '_' C O U N T E R '_' S H A R D '_' L I S T; diff --git a/cql3/statements/alter_service_level_statement.cc b/cql3/statements/alter_service_level_statement.cc index c1a9a15b61..b47b4460f1 100644 --- a/cql3/statements/alter_service_level_statement.cc +++ b/cql3/statements/alter_service_level_statement.cc @@ -38,6 +38,7 @@ alter_service_level_statement::execute(query_processor& qp, service::query_state &state, const query_options &, std::optional guard) const { service::group0_batch mc{std::move(guard)}; + validate_shares_option(qp, _slo); qos::service_level& sl = state.get_service_level_controller().get_service_level(_service_level); qos::service_level_options slo = _slo.replace_defaults(sl.slo); auto& slc = state.get_service_level_controller(); diff --git a/cql3/statements/create_service_level_statement.cc b/cql3/statements/create_service_level_statement.cc index 
394741685e..5f67b0bcec 100644 --- a/cql3/statements/create_service_level_statement.cc +++ b/cql3/statements/create_service_level_statement.cc @@ -10,9 +10,11 @@ #include "exceptions/exceptions.hh" #include "seastarx.hh" #include "cql3/statements/create_service_level_statement.hh" +#include "service/qos/qos_common.hh" #include "service/qos/service_level_controller.hh" #include "service/client_state.hh" #include "service/query_state.hh" +#include "utils/error_injection.hh" namespace cql3 { @@ -44,7 +46,14 @@ create_service_level_statement::execute(query_processor& qp, } service::group0_batch mc{std::move(guard)}; - qos::service_level_options slo = _slo.replace_defaults(qos::service_level_options{}); + validate_shares_option(qp, _slo); + + auto default_slo = qos::service_level_options{.shares = qos::service_level_controller::default_shares}; + if (utils::get_local_injector().is_enabled("create_service_levels_without_default_shares")) { + default_slo.shares = qos::service_level_options::unset_marker{}; + } + qos::service_level_options slo = _slo.replace_defaults(default_slo); + auto& sl = state.get_service_level_controller(); co_await sl.add_distributed_service_level(_service_level, slo, _if_not_exists, mc); co_await sl.commit_mutations(std::move(mc)); diff --git a/cql3/statements/list_effective_service_level_statement.cc b/cql3/statements/list_effective_service_level_statement.cc index 918c6dbf10..9059662d3d 100644 --- a/cql3/statements/list_effective_service_level_statement.cc +++ b/cql3/statements/list_effective_service_level_statement.cc @@ -53,6 +53,20 @@ static bytes_opt decompose_timeout (const qos::service_level_options::timeout_ty }, duration); }; +static bytes_opt decompose_shares(const qos::service_level_options::shares_type& shares) { + return std::visit(overloaded_functor{ + [&] (const qos::service_level_options::unset_marker&) { + return bytes_opt(); + }, + [&] (const qos::service_level_options::delete_marker&) { + return bytes_opt(); + }, + [&] (const 
int32_t& s) -> bytes_opt { + return utf8_type->decompose(fmt::format("{}", s)); + }, + }, shares); +}; + future<::shared_ptr> list_effective_service_level_statement::execute(query_processor& qp, service::query_state& state, const query_options&, std::optional) const { static thread_local const std::vector> metadata({ @@ -84,6 +98,11 @@ list_effective_service_level_statement::execute(query_processor& qp, service::qu utf8_type->decompose(slo->effective_names->timeout), decompose_timeout(slo->timeout) }); + rs->add_row({ + utf8_type->decompose("shares"), + utf8_type->decompose(slo->effective_names->shares), + decompose_shares(slo->shares) + }); auto rows = ::make_shared(result(std::move(std::move(rs)))); co_return ::static_pointer_cast(rows); diff --git a/cql3/statements/list_service_level_statement.cc b/cql3/statements/list_service_level_statement.cc index a9b861678d..2a7fd94288 100644 --- a/cql3/statements/list_service_level_statement.cc +++ b/cql3/statements/list_service_level_statement.cc @@ -47,10 +47,14 @@ list_service_level_statement::execute(query_processor& qp, type); }; - static thread_local const std::vector> metadata({make_column("service_level", utf8_type), + std::vector> metadata({make_column("service_level", utf8_type), make_column("timeout", duration_type), - make_column("workload_type", utf8_type) + make_column("workload_type", utf8_type), + make_column("shares", int32_type), }); + if (_describe_all) { + metadata.push_back(make_column("percentage of all service level shares", utf8_type)); + } return make_ready_future().then([this, &state] () { if (_describe_all) { @@ -59,7 +63,7 @@ list_service_level_statement::execute(query_processor& qp, return state.get_service_level_controller().get_distributed_service_level(_service_level); } }) - .then([] (qos::service_levels_info sl_info) { + .then([this, metadata = std::move(metadata)] (qos::service_levels_info sl_info) { auto d = [] (const qos::service_level_options::timeout_type& duration) -> bytes_opt { 
return std::visit(overloaded_functor{ [&] (const qos::service_level_options::unset_marker&) { @@ -74,15 +78,51 @@ list_service_level_statement::execute(query_processor& qp, }, }, duration); }; + auto dd = [] (const std::variant& v) -> bytes_opt { + return std::visit(overloaded_functor{ + [&] (const qos::service_level_options::unset_marker&) { + return bytes_opt(); + }, + [&] (const qos::service_level_options::delete_marker&) { + return bytes_opt(); + }, + [&] (const T& v) -> bytes_opt { + return data_type_for()->decompose(v); + }, + }, v); + }; + auto get_shares_value = [] (const std::variant& shares) { + if (std::holds_alternative(shares)) { + return std::get(shares); + } else { + return qos::service_level_controller::default_shares; + } + }; + + int32_t sum_of_shares = 0; + if (_describe_all) { + for (auto &&[_, slo]: sl_info) { + sum_of_shares += get_shares_value(slo.shares); + } + } + auto rs = std::make_unique(metadata); for (auto &&[sl_name, slo] : sl_info) { bytes_opt workload = slo.workload == qos::service_level_options::workload_type::unspecified ? 
bytes_opt() : utf8_type->decompose(qos::service_level_options::to_string(slo.workload)); - rs->add_row(std::vector{ + + auto row = std::vector{ utf8_type->decompose(sl_name), d(slo.timeout), - workload}); + workload, + dd(slo.shares)}; + if (_describe_all) { + row.push_back(utf8_type->decompose( + fmt::format("{:.2f}%", 100.0f * get_shares_value(slo.shares) / sum_of_shares) + )); + } + rs->add_row(std::move(row)); } auto rows = ::make_shared(result(std::move(std::move(rs)))); diff --git a/cql3/statements/service_level_statement.cc b/cql3/statements/service_level_statement.cc index d0e48b6454..a91acbeb76 100644 --- a/cql3/statements/service_level_statement.cc +++ b/cql3/statements/service_level_statement.cc @@ -7,6 +7,8 @@ */ #include "service_level_statement.hh" +#include "service/storage_proxy.hh" +#include "gms/feature_service.hh" namespace cql3 { @@ -28,5 +30,11 @@ bool service_level_statement::needs_guard(query_processor&, service::query_state return state.get_service_level_controller().is_v2(); } +void service_level_statement::validate_shares_option(const query_processor& qp, const qos::service_level_options& slo) const { + if (!std::holds_alternative(slo.shares) && !qp.proxy().features().workload_prioritization) { + throw exceptions::invalid_request_exception("`shares` option can only be used when the cluster is fully upgraded to enterprise"); + } +} + } } diff --git a/cql3/statements/service_level_statement.hh b/cql3/statements/service_level_statement.hh index 354fb2c4ee..7aad668114 100644 --- a/cql3/statements/service_level_statement.hh +++ b/cql3/statements/service_level_statement.hh @@ -11,6 +11,7 @@ #include "cql3/cql_statement.hh" #include "cql3/query_processor.hh" #include "raw/parsed_statement.hh" +#include "service/qos/qos_common.hh" #include "service/query_state.hh" namespace cql3 { @@ -49,6 +50,8 @@ public: bool depends_on(std::string_view ks_name, std::optional cf_name) const override; future<> check_access(query_processor& qp, const 
service::client_state& state) const override; +protected: + void validate_shares_option(const query_processor& qp, const qos::service_level_options& slo) const; }; } diff --git a/cql3/statements/sl_prop_defs.cc b/cql3/statements/sl_prop_defs.cc index 5e9230feae..a35840ffed 100644 --- a/cql3/statements/sl_prop_defs.cc +++ b/cql3/statements/sl_prop_defs.cc @@ -17,7 +17,7 @@ namespace statements { void sl_prop_defs::validate() { static std::set timeout_props { - "timeout", "workload_type" + "timeout", "workload_type", sstring(KW_SHARES), }; auto get_duration = [&] (const std::optional& repr) -> qos::service_level_options::timeout_type { if (!repr) { @@ -42,6 +42,7 @@ void sl_prop_defs::validate() { property_definitions::validate(timeout_props); _slo.timeout = get_duration(get_simple("timeout")); + auto workload_string_opt = get_simple("workload_type"); if (workload_string_opt) { auto workload = qos::service_level_options::parse_workload_type(*workload_string_opt); @@ -55,6 +56,15 @@ void sl_prop_defs::validate() { _slo.workload = qos::service_level_options::workload_type::delete_marker; } } + + if (has_property(KW_SHARES)) { + auto shares = get_int(KW_SHARES, SHARES_DEFAULT_VAL); + if ((shares < SHARES_MIN_VAL) || (shares > SHARES_MAX_VAL )) { + throw exceptions::syntax_exception(format("'SHARES' can only take values of {}-{} (given {})", + SHARES_MIN_VAL, SHARES_MAX_VAL, shares)); + } + _slo.shares = shares; + } } qos::service_level_options sl_prop_defs::get_service_level_options() const { diff --git a/cql3/statements/sl_prop_defs.hh b/cql3/statements/sl_prop_defs.hh index 072eb29e86..27669603e0 100644 --- a/cql3/statements/sl_prop_defs.hh +++ b/cql3/statements/sl_prop_defs.hh @@ -18,8 +18,13 @@ namespace statements { class sl_prop_defs : public property_definitions { qos::service_level_options _slo; public: - void validate(); + + static constexpr auto KW_SHARES = "shares"; + static constexpr int SHARES_DEFAULT_VAL = 1000; + static constexpr int SHARES_MIN_VAL = 1; + 
static constexpr int SHARES_MAX_VAL = 1000; + qos::service_level_options get_service_level_options() const; }; diff --git a/db/system_distributed_keyspace.cc b/db/system_distributed_keyspace.cc index adae10f3b5..0703fee777 100644 --- a/db/system_distributed_keyspace.cc +++ b/db/system_distributed_keyspace.cc @@ -13,6 +13,7 @@ #include "replica/database.hh" #include "db/consistency_level_type.hh" #include "db/system_keyspace.hh" +#include "db/config.hh" #include "schema/schema_builder.hh" #include "timeout_config.hh" #include "types/types.hh" @@ -21,6 +22,8 @@ #include "cdc/generation.hh" #include "cql3/query_processor.hh" #include "service/storage_proxy.hh" +#include "gms/feature_service.hh" + #include "service/migration_manager.hh" #include "locator/host_id.hh" @@ -152,8 +155,14 @@ static const sstring CDC_TIMESTAMPS_KEY = "timestamps"; schema_ptr service_levels() { static thread_local auto schema = [] { auto id = generate_legacy_id(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS); - return schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS, std::make_optional(id)) + auto builder = schema_builder(system_distributed_keyspace::NAME, system_distributed_keyspace::SERVICE_LEVELS, std::make_optional(id)) .with_column("service_level", utf8_type, column_kind::partition_key) + .with_column("shares", int32_type); + if (utils::get_local_injector().is_enabled("service_levels_v1_table_without_shares")) { + builder.remove_column("shares"); + } + + return builder .with_hash_version() .build(); }(); @@ -207,9 +216,12 @@ system_distributed_keyspace::system_distributed_keyspace(cql3::query_processor& , _sp(sp) { } -static thread_local std::pair new_columns[] { - {"timeout", duration_type}, - {"workload_type", utf8_type} +static std::vector> new_service_levels_columns(bool workload_prioritization_enabled) { + std::vector> new_columns {{"timeout", duration_type}, {"workload_type", utf8_type}}; + if 
(workload_prioritization_enabled) { + new_columns.push_back({"shares", int32_type}); + } + return new_columns; }; static schema_ptr get_current_service_levels(data_dictionary::database db) { @@ -218,11 +230,11 @@ static schema_ptr get_current_service_levels(data_dictionary::database db) { : service_levels(); } -static schema_ptr get_updated_service_levels(data_dictionary::database db) { +static schema_ptr get_updated_service_levels(data_dictionary::database db, bool workload_prioritization_enabled) { SCYLLA_ASSERT(this_shard_id() == 0); auto schema = get_current_service_levels(db); schema_builder b(schema); - for (const auto& col : new_columns) { + for (const auto& col : new_service_levels_columns(workload_prioritization_enabled)) { auto& [col_name, col_type] = col; bytes options_name = to_bytes(col_name.data()); if (schema->get_column_definition(options_name)) { @@ -234,20 +246,20 @@ static schema_ptr get_updated_service_levels(data_dictionary::database db) { return b.build(); } -future<> system_distributed_keyspace::start() { +future<> system_distributed_keyspace::create_tables(std::vector tables) { if (this_shard_id() != 0) { _started = true; co_return; } auto db = _sp.data_dictionary(); - auto tables = ensured_tables(); while (true) { // Check if there is any work to do before taking the group 0 guard. 
+ bool workload_prioritization_enabled = _sp.features().workload_prioritization; bool keyspaces_setup = db.has_keyspace(NAME) && db.has_keyspace(NAME_EVERYWHERE); bool tables_setup = std::all_of(tables.begin(), tables.end(), [db] (schema_ptr t) { return db.has_schema(t->ks_name(), t->cf_name()); } ); - bool service_levels_up_to_date = get_current_service_levels(db)->equal_columns(*get_updated_service_levels(db)); + bool service_levels_up_to_date = get_current_service_levels(db)->equal_columns(*get_updated_service_levels(db, workload_prioritization_enabled)); if (keyspaces_setup && tables_setup && service_levels_up_to_date) { dlogger.info("system_distributed(_everywhere) keyspaces and tables are up-to-date. Not creating"); _started = true; @@ -287,12 +299,12 @@ future<> system_distributed_keyspace::start() { // Get mutations for creating and updating tables. auto num_keyspace_mutations = mutations.size(); co_await coroutine::parallel_for_each(ensured_tables(), - [this, &mutations, db, ts, sd_ksm, sde_ksm] (auto&& table) -> future<> { + [this, &mutations, db, ts, sd_ksm, sde_ksm, workload_prioritization_enabled] (auto&& table) -> future<> { auto ksm = table->ks_name() == NAME ? sd_ksm : sde_ksm; // Ensure that the service_levels table contains new columns. 
if (table->cf_name() == SERVICE_LEVELS) { - table = get_updated_service_levels(db); + table = get_updated_service_levels(db, workload_prioritization_enabled); } if (!db.has_schema(table->ks_name(), table->cf_name())) { @@ -325,6 +337,24 @@ future<> system_distributed_keyspace::start() { } } + future<> system_distributed_keyspace::start_workload_prioritization() { + if (this_shard_id() != 0) { + co_return; + } + if (_qp.db().features().workload_prioritization) { + co_await create_tables({get_updated_service_levels(_qp.db(), true)}); + } +} + +future<> system_distributed_keyspace::start() { + if (this_shard_id() != 0) { + _started = true; + co_return; + } + + co_await create_tables(ensured_tables()); +} + future<> system_distributed_keyspace::stop() { return make_ready_future<>(); } @@ -740,6 +770,13 @@ system_distributed_keyspace::get_cdc_desc_v1_timestamps(context ctx) { co_return res; } +bool system_distributed_keyspace::workload_prioritization_tables_exists() { + auto wp_table = get_updated_service_levels(_qp.db(), true); + auto table = _qp.db().try_find_table(NAME, wp_table->cf_name()); + + return table && table->schema()->equal_columns(*wp_table); +} + future system_distributed_keyspace::get_service_levels(qos::query_context ctx) const { return qos::get_service_levels(_qp, NAME, SERVICE_LEVELS, db::consistency_level::ONE, ctx); } @@ -766,6 +803,19 @@ future<> system_distributed_keyspace::set_service_level(sstring service_level_na }, }, tv); }; + auto to_data_value_g = [&] (const std::variant& v) { + return std::visit(overloaded_functor { + [&] (const qos::service_level_options::unset_marker&) { + return data_value::make_null(data_type_for()); + }, + [&] (const qos::service_level_options::delete_marker&) { + return data_value::make_null(data_type_for()); + }, + [&] (const T& v) { + return data_value(v); + }, + }, v); + }; data_value workload = slo.workload == qos::service_level_options::workload_type::unspecified ? 
data_value::make_null(utf8_type) : data_value(qos::service_level_options::to_string(slo.workload)); @@ -776,6 +826,11 @@ future<> system_distributed_keyspace::set_service_level(sstring service_level_na workload, service_level_name}, cql3::query_processor::cache_internal::no); + co_await _qp.execute_internal(format("UPDATE {}.{} SET shares = ? WHERE service_level = ?;", NAME, SERVICE_LEVELS), + db::consistency_level::ONE, + internal_distributed_query_state(), + {to_data_value_g(slo.shares), service_level_name}, + cql3::query_processor::cache_internal::no); } future<> system_distributed_keyspace::drop_service_level(sstring service_level_name) const { diff --git a/db/system_distributed_keyspace.hh b/db/system_distributed_keyspace.hh index 6a3a9ab766..20a8418f49 100644 --- a/db/system_distributed_keyspace.hh +++ b/db/system_distributed_keyspace.hh @@ -82,6 +82,7 @@ public: system_distributed_keyspace(cql3::query_processor&, service::migration_manager&, service::storage_proxy&); future<> start(); + future<> start_workload_prioritization(); future<> stop(); bool started() const { return _started; } @@ -116,6 +117,10 @@ public: future get_service_level(sstring service_level_name) const; future<> set_service_level(sstring service_level_name, qos::service_level_options slo) const; future<> drop_service_level(sstring service_level_name) const; + bool workload_prioritization_tables_exists(); + +private: + future<> create_tables(std::vector tables); }; } diff --git a/db/system_keyspace.cc b/db/system_keyspace.cc index 93531455bc..1304601472 100644 --- a/db/system_keyspace.cc +++ b/db/system_keyspace.cc @@ -1157,6 +1157,7 @@ schema_ptr system_keyspace::service_levels_v2() { .with_column("service_level", utf8_type, column_kind::partition_key) .with_column("timeout", duration_type) .with_column("workload_type", utf8_type) + .with_column("shares", int32_type) .with_hash_version() .build(); }(); diff --git a/db/virtual_tables.cc b/db/virtual_tables.cc index 03f9bbc1e5..b42be72948 
100644 --- a/db/virtual_tables.cc +++ b/db/virtual_tables.cc @@ -736,6 +736,7 @@ class clients_table : public streaming_virtual_table { .with_column("ssl_enabled", boolean_type) .with_column("ssl_protocol", utf8_type) .with_column("username", utf8_type) + .with_column("scheduling_group", utf8_type) .with_hash_version() .build(); } @@ -842,6 +843,9 @@ class clients_table : public streaming_virtual_table { set_cell(cr.cells(), "ssl_protocol", *cd.ssl_protocol); } set_cell(cr.cells(), "username", cd.username ? *cd.username : sstring("anonymous")); + if (cd.scheduling_group_name) { + set_cell(cr.cells(), "scheduling_group", *cd.scheduling_group_name); + } co_await result.emit_row(std::move(cr)); } co_await result.emit_partition_end(); diff --git a/docs/alternator/compatibility.md b/docs/alternator/compatibility.md index 956bdf07a7..0ea3f95207 100644 --- a/docs/alternator/compatibility.md +++ b/docs/alternator/compatibility.md @@ -159,6 +159,37 @@ If you don't know the name of the table, you can try a forbidden operation and the AccessDeniedException error will contain the name of the table that was lacking permissions. +## Workload Isolation + +In DynamoDB, the read/write capacity of each table can be set either to a fixed +value (provisioned mode) or to be adaptive (on-demand). On top of that, requests +are also subject to per-table and per-account quotas. + +Due to the nature of an Alternator deployment, the whole cluster is available to serve +user requests and the underlying hardware can be utilized to its full capacity. When +there is a need to allocate more resources to a given workload at the expense of a competing +one, we offer a feature called **Workload Prioritization**. + +To use this feature, define a service level with a fixed amount of shares +(a higher value means proportionally more capacity) and attach it to a role +which will then be used to authorize requests. 
This can currently be done +only via the CQL API; here is an example of how to do that: +```cql +CREATE ROLE alice WITH PASSWORD = 'abcd' AND LOGIN = true; +CREATE ROLE bob WITH PASSWORD = 'abcd' AND LOGIN = true; + +CREATE SERVICE_LEVEL IF NOT EXISTS olap WITH SHARES = 100; +CREATE SERVICE_LEVEL IF NOT EXISTS oltp WITH SHARES = 1000; + +ATTACH SERVICE_LEVEL olap TO alice; +ATTACH SERVICE_LEVEL oltp TO bob; +``` +Note that `alternator_enforce_authorization` has to be enabled in the Scylla configuration. + +See the [Authorization](#authorization) section to learn more about roles and authorization. +See the Workload Prioritization feature documentation +to read about **Workload Prioritization** in detail. + ## Metrics Scylla has an advanced and extensive monitoring framework for inspecting diff --git a/docs/dev/service_levels.md b/docs/dev/service_levels.md index cf1fcff63f..dbfe35d31d 100644 --- a/docs/dev/service_levels.md +++ b/docs/dev/service_levels.md @@ -33,7 +33,8 @@ SELECT * FROM system.role_attributes WHERE role='r' and attribute_name='service CREATE TABLE system_distributed.service_levels ( service_level text PRIMARY KEY, timeout duration, - workload_type text) + workload_type text, + shares int); ``` The table is used to store and distribute the service levels configuration. @@ -41,6 +42,7 @@ The table column names meanings are: *service_level* - the name of the service level. *timeout* - timeout for operations performed by users under this service level *workload_type* - type of workload declared for this service level (NULL, interactive or batch) +*shares* - a number that represents this service level's priority in relation to other service levels. 
``` select * from system_distributed.service_levels ; @@ -136,6 +138,35 @@ the conflicts are resolved as follows: - `X` vs `NULL` -> `X` - `batch` vs `interactive` -> `batch` - under the assumption that `batch` is safer, because it would not trigger load shedding as eagerly as `interactive` + So, for example, to create a service level that is twice as important as the default service + level (which has 1000 shares), one can run: + + ``` + INSERT INTO system_distributed.service_levels (service_level, shares) VALUES ('double_importance',2000); + ``` + +## Service levels REST API + +In its current state, Service Levels/Workload Prioritization has some flaws, one of which is the requirement to restart connections in order to apply changes to users' service levels. + +Until we improve the service levels controller to apply the changes automatically, here is a REST API to ease the work of maintaining and managing service levels and connections. + +A `tenant` (used below) is the scheduling group under which a connection is working. + +### Switch tenants + +The `/service_levels/switch_tenants` endpoint triggers a tenant switch on all opened CQL connections on a single node without interrupting or restarting them. +The response is returned immediately but the actual work might take up to tens of seconds. + +### Inspecting current scheduling group of connections + +The `/service_levels/count_connections` endpoint is a tool to inspect the status of all opened CQL connections. It returns a map with the connection count per scheduling group, per user: +``` +{'sl:default': {'cassandra': 3}, 'sl:sl1': {'test_user': 3}} +``` + +In fact, this endpoint is a wrapper which executes a simple query on the `system.clients` table and aggregates the result. The table has a new `scheduling_group` column, so a particular connection can be inspected by looking it up directly in the `system.clients` table.
+ ### Effective service level Actual values of service level's options may come from different service levels, not only from the one user is assigned with. This can be achieved by assigning one role to another. diff --git a/docs/features/index.rst b/docs/features/index.rst index 34f1e520be..2d561f380c 100644 --- a/docs/features/index.rst +++ b/docs/features/index.rst @@ -14,6 +14,7 @@ This document highlights ScyllaDB's key data modeling features. Counters Change Data Capture Workload Attributes + Workload Prioritization .. panel-box:: :title: ScyllaDB Features diff --git a/docs/features/workload-attributes.rst b/docs/features/workload-attributes.rst index b2eb24a495..9f1a904b12 100644 --- a/docs/features/workload-attributes.rst +++ b/docs/features/workload-attributes.rst @@ -13,7 +13,7 @@ You can define a workload's attribute using the *service level* concept. The ser attributes to users and roles. When a user logs into the system, all of the attributes attached to that user and to the roles granted to that user are combined and become a set of workload attributes. -See `Service Level Management `_ for more information about service levels. +See :ref:`Service Level Management ` for more information about service levels. Prerequisites --------------- @@ -126,7 +126,7 @@ Available Workload Types * - ``unspecified`` - A generic workload without any specific characteristics (default). * - ``interactive`` - - A workload sensitive to latency, expected to have high/unbounded concurrency, with dynamic characteristics. For example, a workload assigned to users clicking on a website and generating events with their clicks. + - A workload sensitive to latency, expected to have high/unbounded concurrency, with dynamic characteristics, :doc:`OLTP `. For example, a workload assigned to users clicking on a website and generating events with their clicks. 
* - ``batch`` - - A workload for processing large amounts of data, not sensitive to latency, expected to have fixed concurrency. For example, a workload assigned to processing billions of historical sales records to generate statistics. + - A workload for processing large amounts of data, not sensitive to latency, expected to have fixed concurrency, :doc:`OLAP `. For example, a workload assigned to processing billions of historical sales records to generate statistics. diff --git a/docs/features/workload-prioritization.rst b/docs/features/workload-prioritization.rst new file mode 100644 index 0000000000..b2b17cf40b --- /dev/null +++ b/docs/features/workload-prioritization.rst @@ -0,0 +1,448 @@ +======================== +Workload Prioritization +======================== + +:label-tip:`ScyllaDB Enterprise` + +In a typical database there are numerous workloads running at the same time. +Each workload type dictates a different acceptable level of latency and throughput. +For example, consider the following two workloads: + +* OLTP ( Online Transaction Processing) - backend database for your application + + - High volume of requests + - Fast processing + - In essence - Latency sensitive + +* OLAP (Online Analytical Processing ) - performs data analytics in the background + + - High volume of data + - Slow queries + - In essence - Latency agnostic + +Using Service Level CQL commands, database administrators (working on Scylla Enterprise) can set different workload prioritization levels (levels of service) for each workload without sacrificing latency or throughput. +By assigning each service level to the different roles within your organization, DBAs ensure that each role_ receives the level of service the role requires. + +.. _`role` : /operating-scylla/security/rbac_usecase/ + +Prerequisites +============= +To create a level of service and assign it to a role, you need: + +* An :doc:`authenticated ` and :doc:`authorized ` user +* At least one :ref:`role created `. 
+ +Work by Example +--------------- + +To follow the examples in this document, create the roles `spark` and `web`. You can assign permissions to these roles later, if needed. + +**Procedure** + +Run the following: + +.. code-block:: cql + + CREATE ROLE Spark; + CREATE ROLE Web; + +Workload Prioritization Workflow +================================ + +1. `Create a Service Level`_ +2. `Assign a Service Level to a Role`_ + +.. _workload-priorization-service-level-management: + +Service Level Management +======================== + +These commands set, list, and edit the level of service. + +Create a Service Level +---------------------- + +When you create a service level, you allocate a percentage of resources to the service level. Remember to use the correct naming convention to name your service level. If you decide to use :doc:`Reserved Keywords `, enclose them in either single or double quotes (for example ``'primary'``). + +**Syntax** + +.. code-block:: none + + CREATE SERVICE_LEVEL [IF NOT EXISTS] [WITH SHARES = ]; + +Where: + +* ``service_level_name`` - Specifies the name of the service you're creating. This can be any string without spaces. The name should be meaningful, such as class names (silver, gold, diamond, platinum), or categories (OLAP or OLTP), etc. +* ``shares_number`` - The number of shares of the resources you're granting to the service level name. You can use any number within the range from 1 to 1000. **Default : 1000** + +Example +....... + +There are 3 service levels (OLAP, OLTP, Default) where: (the percentage of resources = (Assigned Shares / Total Shares) x 100). Total Shares in this case is the total of all allocated shares + the Default SLA (1000). The percentage of resources would be: + +.. 
list-table:: + :widths: 30 30 30 + :header-rows: 1 + + * - Service Level Name + - Shares + - Percentage of Resources + * - OLAP + - 100 + - 4% + * - OLTP + - 1000 + - 48% + * - Default + - 1000 + - 48% + * - Total + - 2100 + - 100% + +**Procedure** + +1. To create these service levels, run the following CQL commands: + +.. code-block:: cql + + CREATE SERVICE_LEVEL IF NOT EXISTS OLAP WITH SHARES = 100; + CREATE SERVICE_LEVEL IF NOT EXISTS OLTP WITH SHARES = 1000; + +2. Confirm the service level change reflects the new service level allocations: + +.. code-block:: cql + + LIST ALL SERVICE_LEVELS; + + service_level | shares + --------------+------- + olap | 100 + --------------+------- + oltp | 1000 + (2 rows) + +Change Resource Allocation for a Service Level +----------------------------------------------- + +You can change resource allocation for a given service level. If you don't specify the number of shares, the default setting (1000) is used. + +**Syntax** + +.. code-block:: none + + ALTER SERVICE_LEVEL <service_level_name> + WITH SHARES = <shares_number>; + + +Where: + +* ``service_level_name`` - Specifies the name of the service level you created. See `Create a Service Level`_. +* ``shares_number`` - The number of shares in the CPU that you're granting to the service level name. You can use any number within the range from 1 to 1000. **Default : 1000** + + +.. warning:: + + Altering the SERVICE LEVEL does not affect active sessions (see `#12923 `_). + + To apply a new timeout to existing clients, execute a :doc:`rolling restart ` after the ALTER command. + + +Example +........ + +Analysts are complaining that they don't have enough resources. To increase the resources, you change the service level attributes for the OLAP service level. + +**Procedure** + +1. Run the following: + +.. code-block:: cql + + ALTER SERVICE_LEVEL OLAP WITH SHARES = 500; + +2. Confirm the service level change reflects the new service level allocation: + +..
code-block:: cql + + LIST SERVICE_LEVEL OLAP; + + service_level | shares + --------------+------- + olap | 500 + (1 rows) + +3. To change it back to the original setting (or to remain consistent for the examples that follow) change the shares amount back to the original. + +.. code-block:: cql + + ALTER SERVICE_LEVEL OLAP WITH SHARES = 100; + +Display Specified Service Level Parameters +------------------------------------------ + +Lists the specified service level with its class parameters. If the service level is attached to a role it does not appear in this list. + +**Syntax** + +.. code-block:: none + + LIST SERVICE_LEVEL ; + +Where: + +* ``service_level_name`` - Specifies the name of the service level you created. See `Create a Service Level`_. + +Example +....... + +In this example you list the service level parameters for OLTP. + +**Procedure** + +Run the following: + +.. code-block:: cql + + LIST SERVICE_LEVEL OLTP; + + service_level | shares + --------------+------- + oltp | 1000 + (1 rows) + +Display All Service Levels and Parameters +----------------------------------------- + +Lists all service levels with their class parameters. This list contains all service levels including those which are assigned to roles. + +**Syntax** + +.. code-block:: none + + LIST ALL SERVICE_LEVELS; + +Example +....... + +In this example, you list all service levels and their parameters. + +**Procedure** + +Run the following: + +.. code-block:: cql + + LIST ALL SERVICE_LEVELS; + + service_level | shares + ---------------+-------- + olap | 100 + oltp | 1000 + (2 rows) + + +Delete a Service Level +---------------------- + +Permanently removes the service level. Any role attached to this service level is automatically assigned to the Default SLA if there is no other service level attached to the role. + +**Syntax** + +.. code-block:: none + + DROP SERVICE_LEVEL IF EXISTS ; + +Where: + +* ``service_level_name`` - Specifies the name of the service level you created. 
See `Create a Service Level`_. +* ``IF EXISTS`` - If the service level does not exist and IF EXISTS is not used an error is returned. + + +Example +....... + +In this example you drop the OLTP service level. + +**Procedure** + +Run the following: + +.. code-block:: cql + + DROP SERVICE_LEVEL IF EXISTS OLTP; + +Manage Roles with Service Levels +================================ + +Once you have created roles and service levels you can attach and remove the service levels from the roles and list which roles are attached to which service levels. + +Assign a Service Level to a Role +-------------------------------- + +If you have created a role and a service level, you can attach the service level to the role. + +.. note:: A role can only be assigned **one** service level. However, the same service level can be attached to many roles. If a role inherits a service level from another role, the highest level of service from all the roles wins. + +**Syntax** + +.. code-block:: none + + ATTACH SERVICE_LEVEL TO ; + +Where: + +* ``service_level_name`` - Specifies the name of the service level you created. See `Create a Service Level`_. +* ``role_name`` - Specifies the role that you want to use the service level on. This is the role you created with :ref:`create role `. + +.. note:: Any role which does not have an SLA attached to it, receives the default SLA. + +Example +....... + +Continuing from the example in `Create a Service Level`_, you can attach the service levels that you created to different roles in your organization as follows: + +.. list-table:: + :widths: 50 50 + :header-rows: 1 + + * - Service Level Name + - Role Name + * - OLAP + - Spark + * - OLTP + - Web + + +**Procedure** + +To assign these service levels to the roles, run the following CQL commands: + +.. 
code-block:: cql + + ATTACH SERVICE_LEVEL OLAP TO Spark; + ATTACH SERVICE_LEVEL OLTP TO Web; + +List All Attached Service Levels for All Roles +---------------------------------------------- + +Lists all directly attached service levels for all roles. This does not include any service level which the role inherits from other roles. + +**Syntax** + +.. code-block:: none + + LIST ALL ATTACHED SERVICE_LEVELS; + +Example +....... + +In this example you list all service levels attached to any role. + +**Procedure** + +Run the following: + +.. code-block:: cql + + LIST ALL ATTACHED SERVICE_LEVELS; + + role | service_level + -------+--------------- + spark | olap + -------+--------------- + web | oltp + + (2 rows) + +List the Roles Assigned to a Specific Service Level +---------------------------------------------------- + +Lists all roles directly attached to a service level. This does not include any service level which the role inherits from other roles. + +**Syntax** + +.. code-block:: none + + LIST ATTACHED SERVICE_LEVEL OF ; + +Where: + +* ``role_name`` - Specifies the role that you want to use the service level on. This is the role you created with :ref:`create role `. + +Example +....... + +In this example, you list all of Roles which are assigned to the OLAP Service Level. + +**Procedure** + +Run the following: + +.. code-block:: cql + + LIST ATTACHED SERVICE_LEVEL OF Spark; + + role | service_level + -------+--------------- + spark | olap + + (1 rows) + +Remove a Service Level from a Role +---------------------------------- + +Removes a service level from a specified role. Once the service level is removed from a role, if there are other service levels attached to roles which that role inherits, the service level in the hierarchy with the most amount of shares wins. + +**Syntax** + +.. code-block:: none + + DETACH SERVICE_LEVEL FROM ; + +Where: + +* ``role_name`` - Specifies the role that you want to use the service level on. 
This is the role you created with :ref:`create role `. + +Example +....... + +In this example, you re-assign the Spark role to a different level of service by detaching it from one level of service and attaching it to another. + +**Procedure** + +Run the following: + +.. code-block:: cql + + DETACH SERVICE_LEVEL FROM Spark; + +At this point, the Spark role receives the Default SLA, until it is assigned another service level. You assign a new service level to this role using `Assign a Service Level to a Role`_. + +Using Workload Prioritization with your Application +=================================================== + +In order for workload prioritization to take effect, application users need to be assigned to a relevant role. In addition, each role you create needs to be assigned to a specific Service Level. Any user that signs into the application without a role is automatically assigned the `Default` service level. This is always the case with users who sign in anonymously. + + +Limits +====== +Scylla Enterprise is limited to 8 service levels, including the default one; this means you can create up to 7 service levels. + + +Additional References +===================== + +`OLAP or OLTP? Why Not Both?
`_ Session by Glauber Costa from Scylla Summit 2018 + +`Scylla University: Workload Prioritization lesson `_ - The lesson covers: + +* The evolving requirements for operational (OLTP) and analytics (OLAP) workloads in the modern datacenter +* How Scylla provides built-in control over workload priority and makes it easy for administrators to configure workload priorities +* The impact of minimizing integrations and maintenance tasks, while also shrinking the datacenter footprint and maximizing utilization +* Test results of how it performs in real-world settings + + + + + diff --git a/docs/operating-scylla/security/rbac-usecase.rst b/docs/operating-scylla/security/rbac-usecase.rst index 79bfd78b98..059da7f561 100644 --- a/docs/operating-scylla/security/rbac-usecase.rst +++ b/docs/operating-scylla/security/rbac-usecase.rst @@ -22,7 +22,7 @@ In the same manner, should someone leave the organization, all you would have to Should someone change positions at the company, just assign the new employee to the new role and revoke roles no longer required for the new position. To build an RBAC environment, you need to create the roles and their associated permissions and then assign or grant the roles to the individual users. Roles inherit the permissions of any other roles that they are granted. The hierarchy of roles can be either simple or extremely complex. This gives great flexibility to database administrators, where they can create specific permission conditions without incurring a huge administrative burden. -In addition to standard roles, `ScyllaDB Enterprise `_ users can implement `Workload Prioritization `_, which allows you to attach roles to Service Levels, thus granting resources to roles as the role demands. +In addition to standard roles, ScyllaDB Enterprise users can implement :doc:`Workload Prioritization `, which allows you to attach roles to Service Levels, thus granting resources to roles as the role demands. .. 
_rbac-usecase-grant-roles-and-permissions: @@ -213,4 +213,4 @@ Additional References * :doc:`Authorization` * :doc:`CQLSh the CQL shell` -* `Workload Prioritization `_ - to attach a service level to a role. Only available in `ScyllaDB Enterprise `_. +* :doc:`Workload Prioritization ` - to attach a service level to a role diff --git a/generic_server.cc b/generic_server.cc index 96da666fe9..9076034bb8 100644 --- a/generic_server.cc +++ b/generic_server.cc @@ -38,6 +38,19 @@ connection::~connection() _server._connections_list.erase(iter); } +connection::execute_under_tenant_type +connection::no_tenant() { + // return a function that runs the process loop with no scheduling group games + return [] (connection_process_loop loop) { + return loop(); + }; +} + +void connection::switch_tenant(execute_under_tenant_type exec) { + _execute_under_current_tenant = std::move(exec); + _tenant_switch = true; +} + future<> server::for_each_gently(noncopyable_function fn) { _gentle_iterators.emplace_front(*this); std::list::iterator gi = _gentle_iterators.begin(); @@ -63,13 +76,26 @@ static bool is_broken_pipe_or_connection_reset(std::exception_ptr ep) { return false; } +future<> connection::process_until_tenant_switch() { + _tenant_switch = false; + { + return do_until([this] { + return _read_buf.eof() || _tenant_switch; + }, [this] { + return process_request(); + }); + } +} + future<> connection::process() { return with_gate(_pending_requests_gate, [this] { return do_until([this] { return _read_buf.eof(); }, [this] { - return process_request(); + return _execute_under_current_tenant([this] { + return process_until_tenant_switch(); + }); }).then_wrapped([this] (future<> f) { handle_error(std::move(f)); }); diff --git a/generic_server.hh b/generic_server.hh index 9a1db7ab92..5d856f9b87 100644 --- a/generic_server.hh +++ b/generic_server.hh @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,11 @@ class server; // member function to perform request 
processing. This base class provides a // `_read_buf` and a `_write_buf` for reading requests and writing responses. class connection : public boost::intrusive::list_base_hook<> { +public: + using connection_process_loop = noncopyable_function ()>; + using execute_under_tenant_type = noncopyable_function (connection_process_loop)>; + bool _tenant_switch = false; + execute_under_tenant_type _execute_under_current_tenant = no_tenant(); protected: server& _server; connected_socket _fd; @@ -44,6 +50,8 @@ protected: seastar::gate _pending_requests_gate; seastar::gate::holder _hold_server; +private: + future<> process_until_tenant_switch(); public: connection(server& server, connected_socket&& fd); virtual ~connection(); @@ -57,6 +65,10 @@ public: virtual void on_connection_close(); virtual future<> shutdown(); + + void switch_tenant(execute_under_tenant_type execute); + + static execute_under_tenant_type no_tenant(); }; // A generic TCP socket server. diff --git a/gms/feature_service.hh b/gms/feature_service.hh index dd585cb793..d52056d5df 100644 --- a/gms/feature_service.hh +++ b/gms/feature_service.hh @@ -156,6 +156,7 @@ public: gms::feature test_only_feature { *this, "TEST_ONLY_FEATURE"sv }; gms::feature address_nodes_by_host_ids { *this, "ADDRESS_NODES_BY_HOST_IDS"sv }; + gms::feature workload_prioritization { *this, "WORKLOAD_PRIORITIZATION"sv }; gms::feature compression_dicts { *this, "COMPRESSION_DICTS"sv }; public: diff --git a/main.cc b/main.cc index a0c9e4dbd5..cd4da43c52 100644 --- a/main.cc +++ b/main.cc @@ -1211,6 +1211,20 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl sstm.stop().get(); }); + static sharded auth_service; + static sharded maintenance_auth_service; + static sharded sl_controller; + debug::the_sl_controller = &sl_controller; + + //starting service level controller + qos::service_level_options default_service_level_configuration; + default_service_level_configuration.shares = 1000; + 
sl_controller.start(std::ref(auth_service), std::ref(token_metadata), std::ref(stop_signal.as_sharded_abort_source()), default_service_level_configuration, dbcfg.statement_scheduling_group).get(); + sl_controller.invoke_on_all(&qos::service_level_controller::start).get(); + auto stop_sl_controller = defer_verbose_shutdown("service level controller", [] { + sl_controller.stop().get(); + }); + lang::manager::config lang_config; lang_config.lua.max_bytes = cfg->user_defined_function_allocation_limit_bytes(); lang_config.lua.max_contiguous = cfg->user_defined_function_contiguous_allocation_limit_bytes(); @@ -1247,7 +1261,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl // because it obtains the list of pre-existing segments for replay, which must // not include reserve segments created by active commitlogs. db.local().init_commitlog().get(); - db.invoke_on_all(&replica::database::start).get(); + db.invoke_on_all(&replica::database::start, std::ref(sl_controller)).get(); ::sigquit_handler sigquit_handler(db); @@ -1339,19 +1353,6 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl api::unset_server_config(ctx).get(); }); - static sharded auth_service; - static sharded maintenance_auth_service; - static sharded sl_controller; - debug::the_sl_controller = &sl_controller; - - //starting service level controller - qos::service_level_options default_service_level_configuration; - sl_controller.start(std::ref(auth_service), std::ref(token_metadata), std::ref(stop_signal.as_sharded_abort_source()), default_service_level_configuration).get(); - sl_controller.invoke_on_all(&qos::service_level_controller::start).get(); - auto stop_sl_controller = defer_verbose_shutdown("service level controller", [] { - sl_controller.stop().get(); - }); - static sharded sys_dist_ks; static sharded sys_ks; static sharded view_update_generator; @@ -1515,7 +1516,6 @@ To start the scylla server proper, simply invoke as: scylla server 
(or just scyl netw::messaging_service::scheduling_config scfg; scfg.statement_tenants = { - {dbcfg.statement_scheduling_group, "$user"}, {default_scheduling_group(), "$system"}, {dbcfg.streaming_scheduling_group, "$maintenance", false} }; @@ -1534,7 +1534,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl } // Delay listening messaging_service until gossip message handlers are registered - messaging.start(mscfg, scfg, creds, std::ref(feature_service), std::ref(gossip_address_map), std::ref(compressor_tracker)).get(); + messaging.start(mscfg, scfg, creds, std::ref(feature_service), std::ref(gossip_address_map), std::ref(compressor_tracker), std::ref(sl_controller)).get(); auto stop_ms = defer_verbose_shutdown("messaging service", [&messaging] { messaging.invoke_on_all(&netw::messaging_service::stop).get(); }); @@ -2286,6 +2286,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl // Register controllers after drain_on_shutdown() below, so that even on start // failure drain is called and stops controllers cql_transport::controller cql_server_ctl(auth_service, mm_notifier, gossiper, qp, service_memory_limiter, sl_controller, lifecycle_notifier, *cfg, cql_sg_stats_key, maintenance_socket_enabled::no, dbcfg.statement_scheduling_group); + + api::set_server_service_levels(ctx, cql_server_ctl, qp).get(); + alternator::controller alternator_ctl(gossiper, proxy, mm, sys_dist_ks, cdc_generation_service, service_memory_limiter, auth_service, sl_controller, *cfg, dbcfg.statement_scheduling_group); redis::controller redis_ctl(proxy, auth_service, mm, *cfg, gossiper, dbcfg.statement_scheduling_group); diff --git a/message/messaging_service.cc b/message/messaging_service.cc index 158a2edbc6..31e8692aa5 100644 --- a/message/messaging_service.cc +++ b/message/messaging_service.cc @@ -18,6 +18,8 @@ #include "message/messaging_service.hh" #include #include "gms/gossiper.hh" +#include "service/storage_service.hh" 
+#include "service/qos/service_level_controller.hh" #include "streaming/prepare_message.hh" #include "gms/gossip_digest_syn.hh" #include "gms/gossip_digest_ack.hh" @@ -275,10 +277,11 @@ messaging_service::messaging_service( uint16_t port, gms::feature_service& feature_service, gms::gossip_address_map& address_map, - utils::walltime_compressor_tracker& wct) + utils::walltime_compressor_tracker& wct, + qos::service_level_controller& sl_controller) : messaging_service(config{std::move(id), ip, ip, port}, scheduling_config{{{{}, "$default"}}, {}, {}}, - nullptr, feature_service, address_map, wct) + nullptr, feature_service, address_map, wct, sl_controller) {} static @@ -386,9 +389,10 @@ void messaging_service::do_start_listen() { // the first by wrapping its server_socket, but not the second. auto limits = rpc_resource_limits(_cfg.rpc_memory_limit); limits.isolate_connection = [this] (sstring isolation_cookie) { - rpc::isolation_config cfg; - cfg.sched_group = scheduling_group_for_isolation_cookie(isolation_cookie); - return cfg; + + return scheduling_group_for_isolation_cookie(isolation_cookie).then([] (scheduling_group sg) { + return rpc::isolation_config{.sched_group = sg}; + }); }; if (!_server[0] && _cfg.encrypt != encrypt_what::all && _cfg.port) { auto listen = [&] (const gms::inet_address& a, rpc::streaming_domain_type sdomain) { @@ -467,7 +471,7 @@ void messaging_service::do_start_listen() { } messaging_service::messaging_service(config cfg, scheduling_config scfg, std::shared_ptr credentials, gms::feature_service& feature_service, - gms::gossip_address_map& address_map, utils::walltime_compressor_tracker& arct) + gms::gossip_address_map& address_map, utils::walltime_compressor_tracker& arct, qos::service_level_controller& sl_controller) : _cfg(std::move(cfg)) , _rpc(new rpc_protocol_wrapper(serializer { })) , _credentials_builder(credentials ? 
std::make_unique(*credentials) : nullptr) @@ -476,6 +480,7 @@ messaging_service::messaging_service(config cfg, scheduling_config scfg, std::sh , _scheduling_config(scfg) , _scheduling_info_for_connection_index(initial_scheduling_info()) , _feature_service(feature_service) + , _sl_controller(sl_controller) , _compressor_factory_wrapper(std::make_unique(arct, _cfg.enable_advanced_rpc_compression)) , _address_map(address_map) { @@ -743,13 +748,21 @@ msg_addr messaging_service::addr_for_host_id(locator::host_id hid) { } unsigned -messaging_service::get_rpc_client_idx(messaging_verb verb) const { +messaging_service::get_rpc_client_idx(messaging_verb verb) { auto idx = s_rpc_client_idx_table[static_cast(verb)]; if (idx < PER_SHARD_CONNECTION_COUNT) { return idx; } + // this is just a workaround for a wrong initialization order in messaging_service's + // constructor that causes _connection_index_for_tenant to be queried before it is + // initialized. This WA makes the behaviour match OSS in this case and it should be + // removed once it is fixed in OSS. If it isn't removed the behaviour will still be + // correct but we will lose cycles on an unnecessary check. + if (_connection_index_for_tenant.size() == 0) { + return idx; + } const auto curr_sched_group = current_scheduling_group(); for (unsigned i = 0; i < _connection_index_for_tenant.size(); ++i) { if (_connection_index_for_tenant[i].sched_group == curr_sched_group) { @@ -757,16 +770,42 @@ messaging_service::get_rpc_client_idx(messaging_verb verb) const { // i == 0: the default tenant maps to the default client indexes belonging to the interval // [PER_SHARD_CONNECTION_COUNT, PER_SHARD_CONNECTION_COUNT + PER_TENANT_CONNECTION_COUNT). idx += i * PER_TENANT_CONNECTION_COUNT; - break; + return idx; } else { // If the tenant is disable, immediately return current index to // use $system tenant. return idx; } } + } - return idx; + // if we got here - it means that two conditions are met: + // 1.
We are trying to get a client for a statement/statement_ack verb. + // 2. We are running in a scheduling group that is not assigned to one of the + // static tenants (e.g. $system) + // If this scheduling group is of one of the system's static statement tenants we + // would have caught it in the loop above. + // The other possibility is that we are running in a scheduling group that belongs to + // a service level, maybe a deleted one, this is why it is possible that we will + // not find the service level name. + + std::optional service_level = _sl_controller.get_active_service_level(); + scheduling_group sg_for_tenant = curr_sched_group; + if (!service_level) { + service_level = qos::service_level_controller::default_service_level_name; + sg_for_tenant = _sl_controller.get_default_scheduling_group(); + } + auto it = _dynamic_tenants_to_client_idx.find(*service_level); + // the second part of this condition checks that the service level didn't "suddenly" + // change its scheduling group. If it did, it probably means that it was dropped and + // added again, if it happens we will update its connection indexes since it is + // basically a new tenant with the same name. + if (it == _dynamic_tenants_to_client_idx.end() || + _scheduling_info_for_connection_index[it->second].sched_group != sg_for_tenant) { + return add_statement_tenant(*service_level,sg_for_tenant) + (idx - PER_SHARD_CONNECTION_COUNT); + } + return it->second; } std::vector @@ -802,25 +841,96 @@ messaging_service::scheduling_group_for_verb(messaging_verb verb) const { return _scheduling_info_for_connection_index[idx].sched_group; } -scheduling_group +future messaging_service::scheduling_group_for_isolation_cookie(const sstring& isolation_cookie) const { // Once per connection, so a loop is fine.
for (auto&& info : _scheduling_info_for_connection_index) { if (info.isolation_cookie == isolation_cookie) { - return info.sched_group; + return make_ready_future(info.sched_group); } } - // Check for the case of the client using a connection class we don't - // recognize, but we know its a tenant, not a system connection. - // Fall-back to the default tenant in this case. - for (auto&& connection_prefix : _connection_types_prefix) { - if (isolation_cookie.find(connection_prefix.data()) == 0) { - return _scheduling_config.statement_tenants.front().sched_group; + + // We first check if this is a statement isolation cookie - if it is, we will search for the + // appropriate service level in the service_level_controller since it can be that + // _scheduling_info_for_connection_index is not yet updated (drop and re-add case, for example). + // In the future we will only fall back here for new service levels that haven't been referenced + // before. + // It is safe to assume that an unknown connection type can be rejected since a connection + // with an unknown purpose on the inbound side is useless. + // However, until we get rid of the backward compatibility code below, we can't reject the + // connection since there is a slight chance that this connection comes from an old node that + // still doesn't use the "connection type prefix" convention. + auto tenant_connection = [] (const sstring& isolation_cookie) -> bool { + for (auto&& connection_prefix : _connection_types_prefix) { + if(isolation_cookie.find(connection_prefix.data()) == 0) { + return true; + } } + return false; + }; + + std::string service_level_name = ""; + if (tenant_connection(isolation_cookie)) { + // Extract the service level name from the connection isolation cookie.
+ service_level_name = isolation_cookie.substr(std::string(isolation_cookie).find_first_of(':') + 1); + } else if (_sl_controller.has_service_level(isolation_cookie)) { + // Backward Compatibility Code - This entire "else if" block should be removed + // in the major version that follows the one that contains this code. + // When upgrading from an older enterprise version the isolation cookie is not + // prefixed with "statement:" or any other connection type prefix, so an isolation cookie + // that comes from an older node will simply contain the service level name. + // we do an extra step to be also future proof and make sure it is indeed a service + // level's name, since if this is the older version and we upgrade to a new one + // we could have more connection classes (eg: streaming,gossip etc...) and we wouldn't + // want it to overload the default statement's scheduling group. + // it is not bulletproof in the sense that if a new tenant class happens to have the exact + // name as one of the service levels it will be diverted to the default statement scheduling + // group but it has a small chance of happening. + service_level_name = isolation_cookie; + mlogger.info("Trying to allow an rpc connection from an older node for service level {}", service_level_name); + } else { + // Client is using a new connection class that the server doesn't recognize yet. + // Assume it's important, after server upgrade we'll recognize it. + service_level_name = isolation_cookie; + mlogger.warn("Assuming an unknown cookie is from an older node and represent some not yet discovered service level {} - Trying to allow it.", service_level_name); + } + + if (_sl_controller.has_service_level(service_level_name)) { + return make_ready_future(_sl_controller.get_scheduling_group(service_level_name)); + } else if (service_level_name.starts_with('$')) { + // Tenant names starting with '$' are reserved for internal ones.
If the tenant is not recognized + // to this point, it means we are in the middle of cluster upgrade and we don't know this tenant yet. + // Hardwire it to the default service level to keep things simple. + // This also includes `$user` tenant which is used in OSS and may appear in mixed OSS/Enterprise cluster. + return make_ready_future(_sl_controller.get_default_scheduling_group()); + } else { + mlogger.info("Service level {} is still unknown, will try to create it now and allow the RPC connection.", service_level_name); + // If the service level doesn't exist there are two possibilities, it is either created but still not known by this + // node. Or it has been deleted and the initiating node hasn't caught up yet, in both cases it is safe to __try__ and + // create a new service level (internally), it will naturally catch up eventually and by creating it here we prevent + // an rpc connection for a valid service level to permanently get stuck in the default service level scheduling group. + // If we can't create the service level (we already have too many service levels), we will reject the connection by returning + // an exceptional future. + qos::service_level_options slo; + // We put here the minimal amount of shares for this service level to be functional. When the node catches up it will + // be either deleted or the number of shares and other configuration options will be updated. + slo.shares.emplace(1000); + slo.shares_name.emplace(service_level_name); + return _sl_controller.add_service_level(service_level_name, slo).then([this, service_level_name] () { + if (_sl_controller.has_service_level(service_level_name)) { + return make_ready_future(_sl_controller.get_scheduling_group(service_level_name)); + } else { + // The code until here is best effort, to provide fast catchup in case the configuration changes very quickly and being used + // before this node caught up, or alternatively during startup while the configuration table hasn't been consulted yet.
+ // If for some reason we couldn't add the service level, it is better to wait for the configuration to settle, + // this occasion should be rare enough, even if it happen, two paths are possible, either the initiating node will + // catch up, figure out the service level has been deleted and will not reattempt this rpc connection, or that this node will + // eventually catch up with the correct configuration (mainly some service levels that have been deleted and "made room" for this service level) and + // will eventually allow the connection. + return make_exception_future(std::runtime_error(fmt::format("Rejecting RPC connection for service level: {}, probably only a transitional effect", service_level_name))); + } + }); } - // Client is using a new connection class that the server doesn't recognize yet. - // Assume it's important, after server upgrade we'll recognize it. - return default_scheduling_group(); } @@ -1248,6 +1358,37 @@ future<> messaging_service::unregister_repair_get_full_row_hashes_with_rpc_strea // Wrappers for verbs +unsigned messaging_service::add_statement_tenant(sstring tenant_name, scheduling_group sg) { + auto idx = _clients.size(); + auto scheduling_info_for_connection_index_size = _scheduling_info_for_connection_index.size(); + auto undo = defer([&] { + _clients.resize(idx); + _clients_with_host_id.resize(idx); + _scheduling_info_for_connection_index.resize(scheduling_info_for_connection_index_size); + }); + _clients.resize(_clients.size() + PER_TENANT_CONNECTION_COUNT); + _clients_with_host_id.resize(_clients_with_host_id.size() + PER_TENANT_CONNECTION_COUNT); + // this functions as a way to delete an obsolete tenant with the same name but keeping _clients + // indexing and _scheduling_info_for_connection_index indexing in sync. 
+ sstring first_cookie = sstring(_connection_types_prefix[0]) + tenant_name; + for (unsigned i = 0; i < _scheduling_info_for_connection_index.size(); i++) { + if (_scheduling_info_for_connection_index[i].isolation_cookie == first_cookie) { + // remove all connections associated with this tenant, since we are reinserting it. + for (size_t j = 0; j < _connection_types_prefix.size() ; j++) { + _scheduling_info_for_connection_index[i + j].isolation_cookie = ""; + } + break; + } + } + for (auto&& connection_prefix : _connection_types_prefix) { + sstring isolation_cookie = sstring(connection_prefix) + tenant_name; + _scheduling_info_for_connection_index.emplace_back(scheduling_info_for_connection_index{sg, isolation_cookie}); + } + _dynamic_tenants_to_client_idx.insert_or_assign(tenant_name, idx); + undo.cancel(); + return idx; +} + // Wrapper for TASKS_CHILDREN_REQUEST void messaging_service::register_tasks_get_children(std::function (const rpc::client_info& cinfo, tasks::get_children_request)>&& func) { register_handler(this, messaging_verb::TASKS_GET_CHILDREN, std::move(func)); diff --git a/message/messaging_service.hh b/message/messaging_service.hh index a982ce912b..eb5c52613c 100644 --- a/message/messaging_service.hh +++ b/message/messaging_service.hh @@ -119,6 +119,10 @@ using get_children_request = task_id; using get_children_response = std::vector; } +namespace qos { + class service_level_controller; +} + namespace netw { /* All verb handler identifiers */ @@ -353,6 +357,8 @@ private: std::vector _scheduling_info_for_connection_index; std::vector _connection_index_for_tenant; gms::feature_service& _feature_service; + std::unordered_map _dynamic_tenants_to_client_idx; + qos::service_level_controller& _sl_controller; std::unique_ptr _compressor_factory_wrapper; struct connection_ref; @@ -370,9 +376,9 @@ public: using clock_type = lowres_clock; messaging_service(locator::host_id id, gms::inet_address ip, uint16_t port, - gms::feature_service&, 
gms::gossip_address_map&, utils::walltime_compressor_tracker&); + gms::feature_service&, gms::gossip_address_map&, utils::walltime_compressor_tracker&, qos::service_level_controller&); messaging_service(config cfg, scheduling_config scfg, std::shared_ptr, - gms::feature_service&, gms::gossip_address_map&, utils::walltime_compressor_tracker&); + gms::feature_service&, gms::gossip_address_map&, utils::walltime_compressor_tracker&, qos::service_level_controller&); ~messaging_service(); future<> start(); @@ -468,10 +474,11 @@ public: std::unique_ptr& rpc(); static msg_addr get_source(const rpc::client_info& client); scheduling_group scheduling_group_for_verb(messaging_verb verb) const; - scheduling_group scheduling_group_for_isolation_cookie(const sstring& isolation_cookie) const; + future scheduling_group_for_isolation_cookie(const sstring& isolation_cookie) const; std::vector initial_scheduling_info() const; - unsigned get_rpc_client_idx(messaging_verb verb) const; + unsigned get_rpc_client_idx(messaging_verb verb); static constexpr std::array _connection_types_prefix = {"statement:", "statement-ack:", "forward:"}; // "forward" is the old name for "mapreduce" + unsigned add_statement_tenant(sstring tenant_name, scheduling_group sg); void init_feature_listeners(); private: diff --git a/reader_concurrency_semaphore.hh b/reader_concurrency_semaphore.hh index 44aba5437b..b52afefc9a 100644 --- a/reader_concurrency_semaphore.hh +++ b/reader_concurrency_semaphore.hh @@ -124,6 +124,8 @@ public: uint64_t sstables_read = 0; // Permits waiting on something: admission, memory or execution uint64_t waiters = 0; + + friend auto operator<=>(const stats&, const stats&) = default; }; using permit_list_type = bi::list< diff --git a/reader_concurrency_semaphore_group.cc b/reader_concurrency_semaphore_group.cc new file mode 100644 index 0000000000..fcc5ea7894 --- /dev/null +++ b/reader_concurrency_semaphore_group.cc @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2021-present ScyllaDB + */ + 
+/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +#include "reader_concurrency_semaphore_group.hh" + +// Calling adjust is serialized since 2 adjustments can't happen simultaneously, +// if they did the behaviour would be undefined. +future<> reader_concurrency_semaphore_group::adjust() { + return with_semaphore(_operations_serializer, 1, [this] () { + ssize_t distributed_memory = 0; + for (auto& [sg, wsem] : _semaphores) { + const ssize_t memory_share = std::floor((double(wsem.weight) / double(_total_weight)) * _total_memory); + wsem.sem.set_resources({_max_concurrent_reads, memory_share}); + distributed_memory += memory_share; + } + // Slap the remainder on one of the semaphores. + // This will be a few bytes, doesn't matter where we add it. + auto& sem = _semaphores.begin()->second.sem; + sem.set_resources(sem.initial_resources() + reader_resources{0, _total_memory - distributed_memory}); + }); +} + +// The call to change_weight is serialized as a consequence of the call to adjust.
+future<> reader_concurrency_semaphore_group::change_weight(weighted_reader_concurrency_semaphore& sem, size_t new_weight) { + auto diff = new_weight - sem.weight; + if (diff) { + sem.weight += diff; + _total_weight += diff; + return adjust(); + } + return make_ready_future<>(); +} + +future<> reader_concurrency_semaphore_group::wait_adjust_complete() { + return with_semaphore(_operations_serializer, 1, [] { + return make_ready_future<>(); + }); +} + +future<> reader_concurrency_semaphore_group::stop() noexcept { + return parallel_for_each(_semaphores, [] (auto&& item) { + return item.second.sem.stop(); + }).then([this] { + _semaphores.clear(); + }); +} + +reader_concurrency_semaphore& reader_concurrency_semaphore_group::get(scheduling_group sg) { + return _semaphores.at(sg).sem; +} +reader_concurrency_semaphore* reader_concurrency_semaphore_group::get_or_null(scheduling_group sg) { + auto it = _semaphores.find(sg); + if (it == _semaphores.end()) { + return nullptr; + } else { + return &(it->second.sem); + } +} +reader_concurrency_semaphore& reader_concurrency_semaphore_group::add_or_update(scheduling_group sg, size_t shares) { + auto result = _semaphores.try_emplace( + sg, + 0, + _max_concurrent_reads, + _name_prefix ? format("{}_{}", *_name_prefix, sg.name()) : sg.name(), + _max_queue_length, + _serialize_limit_multiplier, + _kill_limit_multiplier, + _cpu_concurrency + ); + auto&& it = result.first; + // since we serialize all group changes this change will be queued and no further operations + // will be executed until this adjustment ends.
+ (void)change_weight(it->second, shares); + return it->second.sem; +} + +future<> reader_concurrency_semaphore_group::remove(scheduling_group sg) { + auto node_handle = _semaphores.extract(sg); + if (!node_handle.empty()) { + weighted_reader_concurrency_semaphore& sem = node_handle.mapped(); + return sem.sem.stop().then([this, &sem] { + return change_weight(sem, 0); + }).finally([node_handle = std::move(node_handle)] () { + // this holds on to the node handle until we destroy it only after the semaphore + // is stopped properly. + }); + } + return make_ready_future(); +} + +size_t reader_concurrency_semaphore_group::size() { + return _semaphores.size(); +} + +void reader_concurrency_semaphore_group::foreach_semaphore(std::function func) { + for (auto& [sg, wsem] : _semaphores) { + func(sg, wsem.sem); + } +} + +future<> +reader_concurrency_semaphore_group::foreach_semaphore_async(std::function (scheduling_group, reader_concurrency_semaphore&)> func) { + auto units = co_await get_units(_operations_serializer, 1); + for (auto& [sg, wsem] : _semaphores) { + co_await func(sg, wsem.sem); + } +} diff --git a/reader_concurrency_semaphore_group.hh b/reader_concurrency_semaphore_group.hh new file mode 100644 index 0000000000..6d073ef472 --- /dev/null +++ b/reader_concurrency_semaphore_group.hh @@ -0,0 +1,90 @@ +/* + * SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + */ + +/* + * Copyright (C) 2021-present ScyllaDB + */ + +#pragma once + +#include +#include +#include "reader_concurrency_semaphore.hh" +#include +#include + +// The reader_concurrency_semaphore_group is a group of semaphores that shares a common pool of memory, +// the memory is dynamically divided between them according to a relative slice of shares each semaphore +// is given. +// All of the mutating operations on the group are asynchronous and serialized. The semaphores are created +// and managed by the group.
+ +class reader_concurrency_semaphore_group { + size_t _total_memory; + size_t _total_weight; + size_t _max_concurrent_reads; + size_t _max_queue_length; + utils::updateable_value _serialize_limit_multiplier; + utils::updateable_value _kill_limit_multiplier; + utils::updateable_value _cpu_concurrency; + + friend class database_test_wrapper; + + struct weighted_reader_concurrency_semaphore { + size_t weight; + ssize_t memory_share; + reader_concurrency_semaphore sem; + weighted_reader_concurrency_semaphore(size_t shares, int count, sstring name, size_t max_queue_length, + utils::updateable_value serialize_limit_multiplier, + utils::updateable_value kill_limit_multiplier, + utils::updateable_value cpu_concurrency) + : weight(shares) + , memory_share(0) + , sem(utils::updateable_value(count), 0, name, max_queue_length, std::move(serialize_limit_multiplier), std::move(kill_limit_multiplier), + std::move(cpu_concurrency), reader_concurrency_semaphore::register_metrics::yes) {} + }; + + std::unordered_map _semaphores; + seastar::semaphore _operations_serializer; + std::optional _name_prefix; + + future<> change_weight(weighted_reader_concurrency_semaphore& sem, size_t new_weight); + +public: + reader_concurrency_semaphore_group(size_t memory, size_t max_concurrent_reads, size_t max_queue_length, + utils::updateable_value serialize_limit_multiplier, + utils::updateable_value kill_limit_multiplier, + utils::updateable_value cpu_concurrency, + std::optional name_prefix = std::nullopt) + : _total_memory(memory) + , _total_weight(0) + , _max_concurrent_reads(max_concurrent_reads) + , _max_queue_length(max_queue_length) + , _serialize_limit_multiplier(std::move(serialize_limit_multiplier)) + , _kill_limit_multiplier(std::move(kill_limit_multiplier)) + , _cpu_concurrency(std::move(cpu_concurrency)) + , _operations_serializer(1) + , _name_prefix(std::move(name_prefix)) { } + + ~reader_concurrency_semaphore_group() { + assert(_semaphores.empty()); + } + future<> adjust(); + 
future<> wait_adjust_complete(); + + future<> stop() noexcept; + reader_concurrency_semaphore& get(scheduling_group sg); + reader_concurrency_semaphore* get_or_null(scheduling_group sg); + reader_concurrency_semaphore& add_or_update(scheduling_group sg, size_t shares); + future<> remove(scheduling_group sg); + size_t size(); + void foreach_semaphore(std::function func); + + future<> foreach_semaphore_async(std::function (scheduling_group, reader_concurrency_semaphore&)> func); + + auto sum_read_concurrency_sem_var(std::invocable auto member) { + using ret_type = std::invoke_result_t; + return boost::accumulate(_semaphores | boost::adaptors::map_values | boost::adaptors::transformed([=] (weighted_reader_concurrency_semaphore& wrcs) { return std::invoke(member, wrcs.sem); }), ret_type(0)); + } +}; diff --git a/replica/database.cc b/replica/database.cc index 87da7731d3..234f2991be 100644 --- a/replica/database.cc +++ b/replica/database.cc @@ -67,6 +67,7 @@ #include "locator/abstract_replication_strategy.hh" #include "timeout_config.hh" #include "tombstone_gc.hh" +#include "service/qos/service_level_controller.hh" #include "replica/data_dictionary_impl.hh" #include "replica/global_table_ptr.hh" @@ -220,14 +221,8 @@ void database::setup_scylla_memory_diagnostics_producer() { writeln("Replica:\n"); writeln(" Read Concurrency Semaphores:\n"); - const std::pair semaphores[] = { - {"user", _read_concurrency_sem}, - {"streaming", _streaming_concurrency_sem}, - {"system", _system_read_concurrency_sem}, - {"compaction", _compaction_concurrency_sem}, - {"view update", _view_update_read_concurrency_sem}, - }; - for (const auto& [name, sem] : semaphores) { + + static auto semaphore_dump = [&writeln] (const sstring& name, const reader_concurrency_semaphore& sem) { const auto initial_res = sem.initial_resources(); const auto available_res = sem.available_resources(); if (sem.is_unlimited()) { @@ -245,7 +240,17 @@ void database::setup_scylla_memory_diagnostics_producer() { 
utils::to_hr_size(initial_res.memory), sem.get_stats().waiters); } - } + }; + + semaphore_dump("streaming", _streaming_concurrency_sem); + semaphore_dump("system", _system_read_concurrency_sem); + semaphore_dump("compaction", _compaction_concurrency_sem); + _reader_concurrency_semaphores_group.foreach_semaphore([] (scheduling_group sg, reader_concurrency_semaphore& sem) { + semaphore_dump(sg.name(), sem); + }); + _view_update_read_concurrency_semaphores_group.foreach_semaphore([] (scheduling_group sg, reader_concurrency_semaphore& sem) { + semaphore_dump(sg.name(), sem); + }); writeln(" Execution Stages:\n"); const std::pair execution_stage_summaries[] = { @@ -311,6 +316,42 @@ public: } }; +reader_concurrency_semaphore& +database::read_concurrency_sem() { + reader_concurrency_semaphore* sem = _reader_concurrency_semaphores_group.get_or_null(current_scheduling_group()); + if (!sem) { + // this line is commented out, however we shouldn't get here because it means that a user query or even worse, + // some random query was triggered from an unanticipated scheduling groups and this violates the isolation we are trying to achieve. + // It is commented out for two reasons: + // 1. So we will be able to ease into this new system, first testing functionality and effect and only then mix in exceptions and asserts. + // 2. So the series containing those changes will be backportable without causing too harsh regressions (aborts) on one hand and without forcing + // extensive changes on the other hand. + // Follow Up: uncomment this line and run extensive testing. Handle every case of abort. + // seastar::on_internal_error(dblog, format("Tried to run a user query in a wrong scheduling group (scheduling group: '{}')", current_scheduling_group().name())); + sem = _reader_concurrency_semaphores_group.get_or_null(_default_read_concurrency_group); + if (!sem) { + // If we got here - the initialization went very wrong and we can't do anything about it. 
+ // This can only happen if someone touched the initialization code which is assumed to initialize at least + // this default semaphore. + seastar::on_internal_error(dblog, "Default read concurrency semaphore wasn't found, something probably went wrong during database::start"); + } + } + return *sem; +} + +// With same concerns as read_concurrency_sem(). +reader_concurrency_semaphore& +database::view_update_read_concurrency_sem() { + reader_concurrency_semaphore* sem = _view_update_read_concurrency_semaphores_group.get_or_null(current_scheduling_group()); + if (!sem) { + sem = _view_update_read_concurrency_semaphores_group.get_or_null(_default_read_concurrency_group); + if (!sem) { + seastar::on_internal_error(dblog, "Default view update read concurrency semaphore wasn't found, something probably went wrong during database::start"); + } + } + return *sem; +} + database::database(const db::config& cfg, database_config dbcfg, service::migration_notifier& mn, gms::feature_service& feat, const locator::shared_token_metadata& stm, compaction_manager& cm, sstables::storage_manager& sstm, lang::manager& langm, sstables::directory_semaphore& sst_dir_sem, const abort_source& abort, utils::cross_shard_barrier barrier) : _stats(make_lw_shared()) @@ -329,15 +370,6 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat } return backlog; })) - , _read_concurrency_sem( - utils::updateable_value(max_count_concurrent_reads), - max_memory_concurrent_reads(), - "user", - max_inactive_queue_length(), - _cfg.reader_concurrency_semaphore_serialize_limit_multiplier, - _cfg.reader_concurrency_semaphore_kill_limit_multiplier, - _cfg.reader_concurrency_semaphore_cpu_concurrency, - reader_concurrency_semaphore::register_metrics::yes) // No timeouts or queue length limits - a failure here can kill an entire repair. // Trust the caller to limit concurrency. 
, _streaming_concurrency_sem( @@ -360,15 +392,14 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat utils::updateable_value(std::numeric_limits::max()), utils::updateable_value(std::numeric_limits::max()), reader_concurrency_semaphore::register_metrics::yes) - , _view_update_read_concurrency_sem( - utils::updateable_value(max_count_concurrent_view_update_reads), + , _view_update_read_concurrency_semaphores_group( max_memory_concurrent_view_update_reads(), - "view_update", + utils::updateable_value(max_count_concurrent_view_update_reads), max_inactive_view_update_queue_length(), _cfg.view_update_reader_concurrency_semaphore_serialize_limit_multiplier, _cfg.view_update_reader_concurrency_semaphore_kill_limit_multiplier, _cfg.view_update_reader_concurrency_semaphore_cpu_concurrency, - reader_concurrency_semaphore::register_metrics::yes) + "view_update") , _row_cache_tracker(_cfg.index_cache_fraction.operator utils::updateable_value(), cache_tracker::register_metrics::yes) , _apply_stage("db_apply", &database::do_apply) , _version(empty_version) @@ -392,6 +423,10 @@ database::database(const db::config& cfg, database_config dbcfg, service::migrat , _feat(feat) , _shared_token_metadata(stm) , _lang_manager(langm) + , _reader_concurrency_semaphores_group(max_memory_concurrent_reads(), max_count_concurrent_reads, max_inactive_queue_length(), + _cfg.reader_concurrency_semaphore_serialize_limit_multiplier, + _cfg.reader_concurrency_semaphore_kill_limit_multiplier, + _cfg.reader_concurrency_semaphore_cpu_concurrency) , _stop_barrier(std::move(barrier)) , _update_memtable_flush_static_shares_action([this, &cfg] { return _memtable_controller.update_static_shares(cfg.memtable_flush_static_shares()); }) , _memtable_flush_static_shares_observer(cfg.memtable_flush_static_shares.observe(_update_memtable_flush_static_shares_action.make_observer())) @@ -485,6 +520,12 @@ namespace replica { static const metrics::label class_label("class"); + +auto 
+database::sum_read_concurrency_sem_stat(std::invocable auto stats_member) { + return _reader_concurrency_semaphores_group.sum_read_concurrency_sem_var([&] (reader_concurrency_semaphore& rcs) { return std::invoke(stats_member, rcs.get_stats()); }); +} + void database::setup_metrics() { _dirty_memory_manager.setup_collectd("regular"); @@ -1605,7 +1646,7 @@ query::max_result_size database::get_query_max_result_size() const { reader_concurrency_semaphore& database::get_reader_concurrency_semaphore() { switch (classify_request(_dbcfg)) { - case request_class::user: return _read_concurrency_sem; + case request_class::user: return read_concurrency_sem(); case request_class::system: return _system_read_concurrency_sem; case request_class::maintenance: return _streaming_concurrency_sem; } @@ -1634,9 +1675,15 @@ future<> database::clear_inactive_reads_for_tablet(table_id table, dht::token_ra } future<> database::foreach_reader_concurrency_semaphore(std::function(reader_concurrency_semaphore&)> func) { - for (auto* sem : {&_read_concurrency_sem, &_streaming_concurrency_sem, &_compaction_concurrency_sem, &_system_read_concurrency_sem, &_view_update_read_concurrency_sem}) { + for (auto* sem : {&_streaming_concurrency_sem, &_compaction_concurrency_sem, &_system_read_concurrency_sem}) { co_await func(*sem); } + co_await _reader_concurrency_semaphores_group.foreach_semaphore_async([&] (scheduling_group sg, reader_concurrency_semaphore& sem) -> future<> { + co_await func(sem); + }); + co_await _view_update_read_concurrency_semaphores_group.foreach_semaphore_async([&] (scheduling_group sg, reader_concurrency_semaphore& sem) -> future<> { + co_await func(sem); + }); } std::ostream& operator<<(std::ostream& out, const column_family& cf) { @@ -1935,7 +1982,7 @@ future<> database::do_apply(schema_ptr s, const frozen_mutation& m, tracing::tra co_await coroutine::return_exception(std::runtime_error("view update generator not plugged to push updates")); } - auto lock_f = co_await 
coroutine::as_future(cf.push_view_replica_updates(_view_update_generator, s, m, timeout, std::move(tr_state), _view_update_read_concurrency_sem)); + auto lock_f = co_await coroutine::as_future(cf.push_view_replica_updates(_view_update_generator, s, m, timeout, std::move(tr_state), view_update_read_concurrency_sem())); if (lock_f.failed()) { auto ex = lock_f.get_exception(); if (is_timeout_exception(ex)) { @@ -2182,7 +2229,54 @@ void database::revert_initial_system_read_concurrency_boost() { dblog.debug("Reverted system read concurrency from initial {} to normal {}", database::max_count_concurrent_reads, database::max_count_system_concurrent_reads); } -future<> database::start() { +future<> database::start(sharded& sl_controller) { + sl_controller.local().register_subscriber(this); + _unsubscribe_qos_configuration_change = [this, &sl_controller] () { + return sl_controller.local().unregister_subscriber(this); + }; + qos::service_level default_service_level = sl_controller.local().get_service_level(qos::service_level_controller::default_service_level_name); + int32_t default_shares = 1000; + if (int32_t* default_shares_p = std::get_if(&(default_service_level.slo.shares))) { + default_shares = *default_shares_p; + } else { + on_internal_error(dblog, "The default service_level should always contain shares value"); + } + + // The former _dbcfg.statement_scheduling_group and the later can be the same group, so we want + // the later to be the accurate one. 
+ _default_read_concurrency_group = default_service_level.sg; + _reader_concurrency_semaphores_group.add_or_update(default_service_level.sg, default_shares); + _view_update_read_concurrency_semaphores_group.add_or_update(default_service_level.sg, default_shares); + + // let's insert the statement scheduling group only if we haven't reused it in sl_controller, + // but it shouldn't happen + if (!_reader_concurrency_semaphores_group.get_or_null(_dbcfg.statement_scheduling_group)) { + // This is super ugly, we need to either force the database to use system scheduling group for non-user queries + // or, if we have user queries running on this scheduling group make its definition more robust (what runs in it). + // Another ugly thing here is that we have to have a pre-existing knowledge about the shares amount this group was + // built with. I think we should have a followup that makes this more robust. + _reader_concurrency_semaphores_group.add_or_update(_dbcfg.statement_scheduling_group, 1000); + _view_update_read_concurrency_semaphores_group.add_or_update(_dbcfg.statement_scheduling_group, 1000); + } + + // This will wait for the semaphores to be given some memory. + // We need this since the below statements (get_distributed_service_levels in particular) will need + // to run queries and for this they will need to admit some memory.
+ co_await _reader_concurrency_semaphores_group.wait_adjust_complete(); + co_await _view_update_read_concurrency_semaphores_group.wait_adjust_complete(); + + auto service_levels = co_await sl_controller.local().get_distributed_service_levels(qos::query_context::group0); + for (auto&& service_level_record : service_levels) { + auto service_level = sl_controller.local().get_service_level(service_level_record.first); + if (service_level.slo.shares_name && *service_level.slo.shares_name != qos::service_level_controller::default_service_level_name) { + // We know slo.shares is valid because we know that slo.shares_name is valid + _reader_concurrency_semaphores_group.add_or_update(service_level.sg, std::get(service_level.slo.shares)); + _view_update_read_concurrency_semaphores_group.add_or_update(service_level.sg, std::get(service_level.slo.shares)); + } + } + + co_await _reader_concurrency_semaphores_group.adjust(); + co_await _view_update_read_concurrency_semaphores_group.adjust(); _large_data_handler->start(); // We need the compaction manager ready early so we can reshard.
_compaction_manager.enable(); @@ -2215,10 +2309,12 @@ future<> database::shutdown() { } future<> database::stop() { + if (_unsubscribe_qos_configuration_change) { + co_await std::exchange(_unsubscribe_qos_configuration_change, {})(); + } if (!_shutdown) { co_await shutdown(); } - // try to ensure that CL has done disk flushing if (_commitlog) { dblog.info("Shutting down commitlog"); @@ -2250,11 +2346,11 @@ future<> database::stop() { dblog.info("Stopping querier cache"); co_await _querier_cache.stop(); dblog.info("Stopping concurrency semaphores"); - co_await _read_concurrency_sem.stop(); + co_await _reader_concurrency_semaphores_group.stop(); + co_await _view_update_read_concurrency_semaphores_group.stop(); co_await _streaming_concurrency_sem.stop(); co_await _compaction_concurrency_sem.stop(); co_await _system_read_concurrency_sem.stop(); - co_await _view_update_read_concurrency_sem.stop(); dblog.info("Joining memtable update action"); co_await _update_memtable_flush_static_shares_action.join(); } @@ -3029,3 +3125,41 @@ future>> query_data( } } // namespace replica + +namespace replica { + +/** This callback is going to be called just before the service level is available **/ +future<> database::on_before_service_level_add(qos::service_level_options slo, qos::service_level_info sl_info) { + if (auto shares_p = std::get_if(&slo.shares)) { + _reader_concurrency_semaphores_group.add_or_update(sl_info.sg, *shares_p); + _view_update_read_concurrency_semaphores_group.add_or_update(sl_info.sg, *shares_p); + // the call to add_or_update_read_concurrency_sem will take the semaphore until the adjustment + // is completed, we need to wait for the operation to complete. 
+ co_await _reader_concurrency_semaphores_group.wait_adjust_complete(); + co_await _view_update_read_concurrency_semaphores_group.wait_adjust_complete(); + } +} +/** This callback is going to be called just after the service level is removed **/ +future<> database::on_after_service_level_remove(qos::service_level_info sl_info) { + co_await _reader_concurrency_semaphores_group.remove(sl_info.sg); + co_await _view_update_read_concurrency_semaphores_group.remove(sl_info.sg); +} +/** This callback is going to be called just before the service level is changed **/ +future<> database::on_before_service_level_change(qos::service_level_options slo_before, qos::service_level_options slo_after, + qos::service_level_info sl_info) { + if (auto shares_p = std::get_if(&slo_after.shares)) { + _reader_concurrency_semaphores_group.add_or_update(sl_info.sg, *shares_p); + _view_update_read_concurrency_semaphores_group.add_or_update(sl_info.sg, *shares_p); + // the call to add_or_update_read_concurrency_sem will take the semaphore until the adjustment + // is completed, we need to wait for the operation to complete. 
+ co_await _reader_concurrency_semaphores_group.wait_adjust_complete(); + co_await _view_update_read_concurrency_semaphores_group.wait_adjust_complete(); + } +} + +future<> +database::on_effective_service_levels_cache_reloaded() { + co_return; +} + +} diff --git a/replica/database.hh b/replica/database.hh index d0e2534384..94fc42e372 100644 --- a/replica/database.hh +++ b/replica/database.hh @@ -47,7 +47,7 @@ #include "utils/phased_barrier.hh" #include "backlog_controller.hh" #include "dirty_memory_manager.hh" -#include "reader_concurrency_semaphore.hh" +#include "reader_concurrency_semaphore_group.hh" #include "db/timeout_clock.hh" #include "querier.hh" #include "cache_temperature.hh" @@ -67,6 +67,7 @@ #include "utils/serialized_action.hh" #include "compaction/compaction_fwd.hh" #include "compaction_group.hh" +#include "service/qos/qos_configuration_change_subscriber.hh" class cell_locker; class cell_locker_stats; @@ -137,6 +138,10 @@ class view_update_generator; } +namespace qos { + class service_level_controller; +} + class mutation_reordered_with_truncate_exception : public std::exception {}; class column_family_test; @@ -1383,7 +1388,7 @@ class db_user_types_storage; // local metadata reads // use table::shard_for_reads()/table::shard_for_writes() for data -class database : public peering_sharded_service { +class database : public peering_sharded_service, qos::qos_configuration_change_subscriber { friend class ::database_test_wrapper; public: enum class table_kind { @@ -1487,13 +1492,13 @@ private: flush_controller _memtable_controller; drain_progress _drain_progress {}; - reader_concurrency_semaphore _read_concurrency_sem; + reader_concurrency_semaphore _streaming_concurrency_sem; reader_concurrency_semaphore _compaction_concurrency_sem; reader_concurrency_semaphore _system_read_concurrency_sem; - // The view update read concurrency semaphore used for view updates coming from user writes. 
- reader_concurrency_semaphore _view_update_read_concurrency_sem; + // The view update read concurrency semaphores used for view updates coming from user writes. + reader_concurrency_semaphore_group _view_update_read_concurrency_semaphores_group; db::timeout_semaphore _view_update_concurrency_sem{max_memory_pending_view_updates()}; cache_tracker _row_cache_tracker; @@ -1540,6 +1545,10 @@ private: const locator::shared_token_metadata& _shared_token_metadata; lang::manager& _lang_manager; + reader_concurrency_semaphore_group _reader_concurrency_semaphores_group; + scheduling_group _default_read_concurrency_group; + noncopyable_function()> _unsubscribe_qos_configuration_change; + utils::cross_shard_barrier _stop_barrier; db::rate_limiter _rate_limiter; @@ -1579,6 +1588,10 @@ private: future<> create_in_memory_keyspace(const lw_shared_ptr& ksm, locator::effective_replication_map_factory& erm_factory, system_keyspace system); void setup_metrics(); void setup_scylla_memory_diagnostics_producer(); + reader_concurrency_semaphore& read_concurrency_sem(); + reader_concurrency_semaphore& view_update_read_concurrency_sem(); + auto sum_read_concurrency_sem_var(std::invocable auto member); + auto sum_read_concurrency_sem_stat(std::invocable auto stats_member); future<> do_apply(schema_ptr, const frozen_mutation&, tracing::trace_state_ptr tr_state, db::timeout_clock::time_point timeout, db::commitlog_force_sync sync, db::per_partition_rate_limit::info rate_limit_info); future<> do_apply_many(const std::vector&, db::timeout_clock::time_point timeout); @@ -1714,7 +1727,7 @@ public: /// reads, to speed up startup. After startup this should be reverted to /// the normal concurrency. 
void revert_initial_system_read_concurrency_boost(); - future<> start(); + future<> start(sharded&); future<> shutdown(); future<> stop(); future<> close_tables(table_kind kind_to_close); @@ -1906,6 +1919,14 @@ public: } future<> clear_inactive_reads_for_tablet(table_id table, dht::token_range tablet_range); + + /** This callback is going to be called just before the service level is available **/ + virtual future<> on_before_service_level_add(qos::service_level_options slo, qos::service_level_info sl_info) override; + /** This callback is going to be called just after the service level is removed **/ + virtual future<> on_after_service_level_remove(qos::service_level_info sl_info) override; + /** This callback is going to be called just before the service level is changed **/ + virtual future<> on_before_service_level_change(qos::service_level_options slo_before, qos::service_level_options slo_after, qos::service_level_info sl_info) override; + virtual future<> on_effective_service_levels_cache_reloaded() override; }; // A helper function to parse the directory name back diff --git a/scylla-gdb.py b/scylla-gdb.py index 92082943a9..d098c5be6e 100755 --- a/scylla-gdb.py +++ b/scylla-gdb.py @@ -2311,13 +2311,21 @@ class scylla_memory(gdb.Command): if not db: return + per_service_level_sem = [] + for sg, sem in unordered_map(db["_reader_concurrency_semaphores_group"]["_semaphores"]): + per_service_level_sem.append(scylla_memory.format_semaphore_stats(sem["sem"])) + + per_service_level_vu_sem = [] + for sg, sem in unordered_map(db["_view_update_read_concurrency_semaphores_group"]["_semaphores"]): + per_service_level_vu_sem.append(scylla_memory.format_semaphore_stats(sem["sem"])) + database_typename = lookup_type(['replica::database', 'database'])[1].name gdb.write('Replica:\n') gdb.write(' Read Concurrency Semaphores:\n {}\n {}\n {}\n {}\n'.format( - scylla_memory.format_semaphore_stats(db['_read_concurrency_sem']), + '\n '.join(per_service_level_sem), 
scylla_memory.format_semaphore_stats(db['_streaming_concurrency_sem']), scylla_memory.format_semaphore_stats(db['_system_read_concurrency_sem']), - scylla_memory.format_semaphore_stats(db['_view_update_read_concurrency_sem']))) + '\n '.join(per_service_level_vu_sem))) gdb.write(' Execution Stages:\n') for es_path in [('_apply_stage',)]: @@ -5809,12 +5817,17 @@ class scylla_read_stats(gdb.Command): semaphores = [gdb.parse_and_eval(arg) for arg in args.split(' ')] else: db = find_db() - semaphores = [db["_read_concurrency_sem"], db["_streaming_concurrency_sem"], db["_system_read_concurrency_sem"]] + semaphores = [db["_streaming_concurrency_sem"], db["_system_read_concurrency_sem"]] semaphores.append(db["_compaction_concurrency_sem"]) try: - semaphores.append(db["_view_update_read_concurrency_sem"]) + semaphores += [weighted_sem["sem"] for (_, weighted_sem) in unordered_map(db["_reader_concurrency_semaphores_group"]["_semaphores"])] except gdb.error: - # 6.2 compatibility + # compatibility with code before per-scheduling-group semaphore + pass + try: + semaphores += [weighted_sem["sem"] for (_, weighted_sem) in unordered_map(db["_view_update_read_concurrency_semaphores_group"]["_semaphores"])] + except gdb.error: + # 2024.2 compatibility pass for semaphore in semaphores: diff --git a/service/qos/qos_common.cc b/service/qos/qos_common.cc index ad782ded1c..e95733aa28 100644 --- a/service/qos/qos_common.cc +++ b/service/qos/qos_common.cc @@ -7,10 +7,13 @@ */ #include "qos_common.hh" +#include "service/qos/service_level_controller.hh" #include "utils/overloaded_functor.hh" #include "cql3/query_processor.hh" #include "cql3/result_set.hh" #include "cql3/untyped_result_set.hh" +#include +#include #include namespace qos { @@ -43,6 +46,19 @@ service_level_options service_level_options::replace_defaults(const service_leve // no-op break; } + std::visit(overloaded_functor { + [&] (const unset_marker& um) { + // reset the value to the default one + ret.shares = 
default_values.shares; + }, + [&] (const delete_marker& dm) { + // remove the value + ret.shares = unset_marker{}; + }, + [&] (const int32_t&) { + // leave the value as is + }, + }, ret.shares); return ret; } @@ -52,6 +68,11 @@ service_level_options service_level_options::merge_with(const service_level_opti slo.effective_names->timeout = other.effective_names->timeout; } }; + auto maybe_update_shares_name = [] (service_level_options& slo, const service_level_options& other) { + if (slo.effective_names && other.effective_names) { + slo.effective_names->shares = other.effective_names->shares; + } + }; auto maybe_update_workload_name = [] (service_level_options& slo, const service_level_options& other) { if (slo.effective_names && other.effective_names) { slo.effective_names->workload = other.effective_names->workload; @@ -91,9 +112,39 @@ service_level_options service_level_options::merge_with(const service_level_opti maybe_update_workload_name(ret, other); } + std::visit(overloaded_functor { + [&] (const unset_marker& um) { + ret.shares = other.shares; + maybe_update_shares_name(ret, other); + }, + [&] (const delete_marker& dm) { + ret.shares = other.shares; + maybe_update_shares_name(ret, other); + }, + [&] (const int32_t& s) { + if (auto* other_shares = std::get_if(&other.shares)) { + auto prev_shares = ret.shares; + ret.shares = std::min(s, *other_shares); + + if (prev_shares != ret.shares) { + ret.shares_name = other.shares_name; + maybe_update_shares_name(ret, other); + } + } + }, + }, ret.shares); + return ret; } +sstring service_level_options::to_string(timeout_type tt) { + return std::visit(make_visitor( + [] (unset_marker) -> sstring { return "null"; }, + [] (delete_marker) -> sstring { return ""; }, + [] (lowres_clock::duration value) { return seastar::format("{}", value); } + ), tt); +} + std::string_view service_level_options::to_string(const workload_type& wt) { switch (wt) { case workload_type::unspecified: return "unspecified"; @@ -119,10 +170,19 @@ 
std::optional service_level_options::parse return std::nullopt; } +sstring service_level_options::to_string(shares_type st) { + return std::visit(make_visitor( + [] (unset_marker) -> sstring { return "default"; }, + [] (delete_marker) -> sstring { return ""; }, + [] (int32_t value) { return seastar::format("{}", value); } + ), st); +} + void service_level_options::init_effective_names(std::string_view service_level_name) { effective_names = service_level_options::slo_effective_names { .timeout = sstring(service_level_name), - .workload = sstring(service_level_name) + .workload = sstring(service_level_name), + .shares = sstring(service_level_name), }; } @@ -144,8 +204,23 @@ static service_level_options::timeout_type get_duration(const cql3::untyped_resu return std::chrono::duration_cast(std::chrono::nanoseconds(dur_opt->nanoseconds)); }; +static qos::service_level_options::shares_type get_shares(const cql3::untyped_result_set_row& row, std::string_view col_name) { + auto shares_opt = row.get_opt(col_name); + if (!shares_opt) { + return qos::service_level_controller::default_shares; + } + return *shares_opt; +} + +static sstring get_columns(cql3::query_processor& qp, std::string_view ks_name, std::string_view cf_name) { + auto schema = qp.db().find_schema(ks_name, cf_name); + return boost::algorithm::join(schema->all_columns() | boost::adaptors::transformed([] (const auto& col) { + return col.name_as_cql_string(); + }), " ,"); +} + future get_service_levels(cql3::query_processor& qp, std::string_view ks_name, std::string_view cf_name, db::consistency_level cl, qos::query_context ctx) { - sstring prepared_query = seastar::format("SELECT * FROM {}.{};", ks_name, cf_name); + sstring prepared_query = seastar::format("SELECT {} FROM {}.{};", get_columns(qp, ks_name, cf_name), ks_name, cf_name); auto result_set = co_await qp.execute_internal(prepared_query, cl, qos_query_state(ctx), cql3::query_processor::cache_internal::yes); qos::service_levels_info service_levels; @@ 
-156,6 +231,7 @@ future get_service_levels(cql3::query_processor& qp, s qos::service_level_options slo{ .timeout = get_duration(row, "timeout"), .workload = workload.value_or(qos::service_level_options::workload_type::unspecified), + .shares = get_shares(row, "shares"), }; service_levels.emplace(service_level_name, slo); } catch (...) { @@ -167,7 +243,7 @@ future get_service_levels(cql3::query_processor& qp, s } future get_service_level(cql3::query_processor& qp, std::string_view ks_name, std::string_view cf_name, sstring service_level_name, db::consistency_level cl) { - sstring prepared_query = seastar::format("SELECT * FROM {}.{} WHERE service_level = ?;", ks_name, cf_name); + sstring prepared_query = seastar::format("SELECT {} FROM {}.{} WHERE service_level = ?;", get_columns(qp, ks_name, cf_name), ks_name, cf_name); auto result_set = co_await qp.execute_internal(prepared_query, cl, qos_query_state(), {service_level_name}, cql3::query_processor::cache_internal::yes); qos::service_levels_info service_levels; @@ -178,6 +254,7 @@ future get_service_level(cql3::query_processor& qp, std::st qos::service_level_options slo{ .timeout = get_duration(row, "timeout"), .workload = workload.value_or(qos::service_level_options::workload_type::unspecified), + .shares = get_shares(row, "shares"), }; service_levels.emplace(service_level_name, slo); } catch (...) 
{ diff --git a/service/qos/qos_common.hh b/service/qos/qos_common.hh index 16efa26037..0991bd2170 100644 --- a/service/qos/qos_common.hh +++ b/service/qos/qos_common.hh @@ -17,6 +17,8 @@ #include #include #include +#include +#include "exceptions/exceptions.hh" namespace cql3 { class query_processor; @@ -61,6 +63,11 @@ struct service_level_options { timeout_type timeout = unset_marker{}; workload_type workload = workload_type::unspecified; + using shares_type = std::variant; + shares_type shares = unset_marker{}; + + std::optional shares_name; // service level name, if shares is set + service_level_options replace_defaults(const service_level_options& other) const; // Merges the values of two service level options. The semantics depends // on the type of the parameter - e.g. for timeouts, a min value is preferred. @@ -68,12 +75,17 @@ struct service_level_options { bool operator==(const service_level_options& other) const = default; + static sstring to_string(timeout_type); + static std::string_view to_string(const workload_type& wt); static std::optional parse_workload_type(std::string_view sv); + static sstring to_string(shares_type); + struct slo_effective_names { sstring timeout; sstring workload; + sstring shares; bool operator==(const slo_effective_names& other) const = default; bool operator!=(const slo_effective_names& other) const = default; @@ -90,9 +102,9 @@ using service_levels_info = std::map; /// /// A logical argument error for a service_level statement operation. 
/// -class service_level_argument_exception : public std::invalid_argument { +class service_level_argument_exception : public exceptions::invalid_request_exception { public: - using std::invalid_argument::invalid_argument; + using exceptions::invalid_request_exception::invalid_request_exception; }; /// @@ -110,10 +122,29 @@ service::query_state& qos_query_state(qos::query_context ctx = qos::query_contex future get_service_levels(cql3::query_processor& qp, std::string_view ks_name, std::string_view cf_name, db::consistency_level cl, qos::query_context ctx); future get_service_level(cql3::query_processor& qp, std::string_view ks_name, std::string_view cf_name, sstring service_level_name, db::consistency_level cl); +class service_level_scheduling_groups_exhausted : public std::runtime_error { +public: + static constexpr const char* msg = "Can't create scheduling group for {}, consider removing this service level or some other service level"; + service_level_scheduling_groups_exhausted(sstring name) : std::runtime_error(format(msg, name)) { + } +}; + } +template <> struct fmt::formatter : fmt::formatter { + auto format(qos::service_level_options::timeout_type tt, fmt::format_context& ctx) const { + return formatter::format(qos::service_level_options::to_string(tt), ctx); + } +}; + template <> struct fmt::formatter : fmt::formatter { auto format(qos::service_level_options::workload_type wt, fmt::format_context& ctx) const { return formatter::format(qos::service_level_options::to_string(wt), ctx); } }; + +template <> struct fmt::formatter : fmt::formatter { + auto format(qos::service_level_options::shares_type st, fmt::format_context& ctx) const { + return formatter::format(qos::service_level_options::to_string(st), ctx); + } +}; diff --git a/service/qos/qos_configuration_change_subscriber.hh b/service/qos/qos_configuration_change_subscriber.hh index 0804e4609d..b26fc30489 100644 --- a/service/qos/qos_configuration_change_subscriber.hh +++ 
b/service/qos/qos_configuration_change_subscriber.hh @@ -15,6 +15,7 @@ namespace qos { struct service_level_info { sstring name; + seastar::scheduling_group sg; }; class qos_configuration_change_subscriber { public: diff --git a/service/qos/raft_service_level_distributed_data_accessor.cc b/service/qos/raft_service_level_distributed_data_accessor.cc index 588081f379..3bcf124cd1 100644 --- a/service/qos/raft_service_level_distributed_data_accessor.cc +++ b/service/qos/raft_service_level_distributed_data_accessor.cc @@ -60,11 +60,26 @@ future<> raft_service_level_distributed_data_accessor::set_service_level(sstring validate_state(_group0_client); static sstring insert_query = format("INSERT INTO {}.{} (service_level, timeout, workload_type) VALUES (?, ?, ?);", db::system_keyspace::NAME, db::system_keyspace::SERVICE_LEVELS_V2); + static sstring update_shares_query = format("UPDATE {}.{} SET shares = ? WHERE service_level = ?", db::system_keyspace::NAME, db::system_keyspace::SERVICE_LEVELS_V2); data_value workload = slo.workload == qos::service_level_options::workload_type::unspecified ? 
data_value::make_null(utf8_type) : data_value(qos::service_level_options::to_string(slo.workload)); auto muts = co_await _qp.get_mutations_internal(insert_query, qos_query_state(), mc.write_timestamp(), {service_level_name, timeout_to_data_value(slo.timeout), workload}); + auto muts_shares = co_await std::visit(overloaded_functor { + [&] (const service_level_options::unset_marker& um) -> future> { + co_return std::vector(); + }, + [&] (const service_level_options::delete_marker& dm) -> future> { + co_return co_await _qp.get_mutations_internal(update_shares_query, qos_query_state(), mc.write_timestamp(), {data_value::make_null(int32_type), data_value(service_level_name)}); + }, + [&] (const int32_t& s) -> future> { + co_return co_await _qp.get_mutations_internal(update_shares_query, qos_query_state(), mc.write_timestamp(), {data_value(s), data_value(service_level_name)}); + } + }, slo.shares); + + muts.insert(muts.end(), muts_shares.begin(), muts_shares.end()); + mc.add_mutations(std::move(muts), format("service levels internal statement: {}", insert_query)); } diff --git a/service/qos/service_level_controller.cc b/service/qos/service_level_controller.cc index 3b32843422..3399c8ab4a 100644 --- a/service/qos/service_level_controller.cc +++ b/service/qos/service_level_controller.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include "cql3/untyped_result_set.hh" #include "db/config.hh" #include "db/consistency_level_type.hh" @@ -33,26 +34,34 @@ #include "service/storage_service.hh" #include "service/topology_state_machine.hh" #include "utils/sorting.hh" +#include namespace qos { static logging::logger sl_logger("service_level_controller"); sstring service_level_controller::default_service_level_name = "default"; +constexpr const char* scheduling_group_name_pattern = "sl:{}"; +constexpr const char* deleted_scheduling_group_name_pattern = "sl_deleted:{}"; +constexpr const char* temp_scheduling_group_name_pattern = "sl_temp:{}"; - - 
-service_level_controller::service_level_controller(sharded& auth_service, locator::shared_token_metadata& tm, abort_source& as, service_level_options default_service_level_config): - _sl_data_accessor(nullptr), - _auth_service(auth_service), - _token_metadata(tm), - _last_successful_config_update(seastar::lowres_clock::now()), - _logged_intervals(0), - _early_abort_subscription(as.subscribe([this] () noexcept { do_abort(); })) - +service_level_controller::service_level_controller(sharded& auth_service, locator::shared_token_metadata& tm, abort_source& as, service_level_options default_service_level_config, scheduling_group default_scheduling_group, bool destroy_default_sg_on_drain) + : _sl_data_accessor(nullptr) + , _auth_service(auth_service) + , _token_metadata(tm) + , _last_successful_config_update(seastar::lowres_clock::now()) + , _logged_intervals(0) + , _early_abort_subscription(as.subscribe([this] () noexcept { do_abort(); })) { + // We can't rename the system default scheduling group so we have to reject it. + assert(default_scheduling_group != get_default_scheduling_group()); if (this_shard_id() == global_controller) { _global_controller_db = std::make_unique(); _global_controller_db->default_service_level_config = default_service_level_config; + _global_controller_db->default_sg = default_scheduling_group; + _global_controller_db->destroy_default_sg = destroy_default_sg_on_drain; + // since the first thing that is being done is adding the default service level, we only + // need to throw the given group to the pool of scheduling groups for reuse. 
+ _global_controller_db->deleted_scheduling_groups.emplace_back(default_scheduling_group); } } @@ -132,12 +141,47 @@ future<> service_level_controller::stop() { _global_controller_db->notifications_serializer.broken(); try { - co_await std::exchange(_global_controller_db->distributed_data_update, make_ready_future<>()); + auto f = co_await coroutine::as_future(std::exchange(_global_controller_db->distributed_data_update, make_ready_future<>())); + // delete all sg's in _service_levels_db, leaving it empty. + for (auto it = _service_levels_db.begin(); it != _service_levels_db.end(); ) { + _global_controller_db->deleted_scheduling_groups.emplace_back(it->second.sg); + it = _service_levels_db.erase(it); + } + f.get(); } catch (const broken_semaphore& ignored) { } catch (const sleep_aborted& ignored) { } catch (const exceptions::unavailable_exception& ignored) { } catch (const exceptions::read_timeout_exception& ignored) { } + + // exclude scheduling groups we shouldn't destroy + std::erase_if(_global_controller_db->deleted_scheduling_groups, [this] (scheduling_group& sg) { + if (sg == default_scheduling_group()) { + return true; + } else if (!_global_controller_db->destroy_default_sg && _global_controller_db->default_sg == sg) { + return true; + } else { + return false; + } + }); + + // destroy all sg's in _global_controller_db->deleted_scheduling_groups, leaving it empty + // if any destroy_scheduling_group call fails, return one of the exceptions + std::deque deleted_scheduling_groups = std::move(_global_controller_db->deleted_scheduling_groups); + std::exception_ptr ex; + + while (!deleted_scheduling_groups.empty()) { + auto f = co_await coroutine::as_future(destroy_scheduling_group(deleted_scheduling_groups.front())); + if (f.failed()) { + auto e = f.get_exception(); + sl_logger.error("Destroying scheduling group \"{}\" on stop failed: {}. 
Ignored.", deleted_scheduling_groups.front().name(), e); + ex = std::move(e); + } + deleted_scheduling_groups.pop_front(); + } + if (ex) { + std::rethrow_exception(std::move(ex)); + } } void service_level_controller::abort_group0_operations() { @@ -163,7 +207,8 @@ future<> service_level_controller::update_service_levels_cache(qos::query_contex // detects it the scan query done inside this call is failing. service_levels = _sl_data_accessor->get_service_levels(ctx).get(); - service_levels_info service_levels_for_add_or_update; + service_levels_info service_levels_for_update; + service_levels_info service_levels_for_add; service_levels_info service_levels_for_delete; auto current_it = _service_levels_db.begin(); @@ -187,7 +232,7 @@ future<> service_level_controller::update_service_levels_cache(qos::query_contex if (current_it->second.slo != new_state_it->second) { // The service level configuration is different // in the new state and the old state, meaning it needs to be updated. - service_levels_for_add_or_update.insert(*new_state_it); + service_levels_for_update.insert(*new_state_it); } current_it++; new_state_it++; @@ -196,36 +241,63 @@ future<> service_level_controller::update_service_levels_cache(qos::query_contex //removed, but only if it is not static since static configurations dont //come from the distributed keyspace but from code. if (!current_it->second.is_static) { - sl_logger.info("service level \"{}\" was deleted.", current_it->first.c_str()); service_levels_for_delete.emplace(current_it->first, current_it->second.slo); } current_it++; } else { /*new_it->first < current_it->first */ // The service level exits in the new state but not in the old state // so it needs to be added. 
- sl_logger.info("service level \"{}\" was added.", new_state_it->first.c_str()); - service_levels_for_add_or_update.insert(*new_state_it); + service_levels_for_add.insert(*new_state_it); new_state_it++; } } for (; current_it != _service_levels_db.end(); current_it++) { if (!current_it->second.is_static) { - sl_logger.info("service level \"{}\" was deleted.", current_it->first.c_str()); service_levels_for_delete.emplace(current_it->first, current_it->second.slo); } } for (; new_state_it != service_levels.end(); new_state_it++) { - sl_logger.info("service level \"{}\" was added.", new_state_it->first.c_str()); - service_levels_for_add_or_update.emplace(new_state_it->first, new_state_it->second); + service_levels_for_add.emplace(new_state_it->first, new_state_it->second); } for (auto&& sl : service_levels_for_delete) { do_remove_service_level(sl.first, false).get(); + sl_logger.info("service level \"{}\" was deleted.", sl.first.c_str()); } - for (auto&& sl : service_levels_for_add_or_update) { + for (auto&& sl : service_levels_for_update) { do_add_service_level(sl.first, sl.second).get(); + sl_logger.info("service level \"{}\" was updated. 
New values: (timeout: {}, workload_type: {}, shares: {})", + sl.first, sl.second.timeout, sl.second.workload, sl.second.shares); } + _effective_service_levels_db.clear(); + for (auto&& sl : service_levels_for_add) { + bool make_room = false; + std::map::reverse_iterator it; + try { + do_add_service_level(sl.first, sl.second).get(); + sl_logger.info("service level \"{}\" was added.", sl.first.c_str()); + } catch (service_level_scheduling_groups_exhausted &ex) { + it = _service_levels_db.rbegin(); + if (it->first == default_service_level_name) { + it++; + } + if (it->first.compare(sl.first) > 0) { + make_room = true; + } else { + _effectively_dropped_sls.insert(sl.first); + sl_logger.warn("{}", ex.what()); + } + } + if (make_room) { + sl_logger.warn("service level \"{}\" will be effectively dropped to make scheduling group available to \"{}\", please consider removing a service level." + , it->first, sl.first ); + do_remove_service_level(it->first, false).get(); + _effectively_dropped_sls.insert(it->first); + do_add_service_level(sl.first, sl.second).get(); + } + } + }); }); } @@ -258,9 +330,16 @@ future<> service_level_controller::update_effective_service_levels_cache() { std::optional sl_options; if (auto sl_name_it = attributes.find(role); sl_name_it != attributes.end()) { - auto sl = _service_levels_db.at(sl_name_it->second); - sl_options = sl.slo; - sl_options->init_effective_names(sl_name_it->second); + if (auto sl_it = _service_levels_db.find(sl_name_it->second); sl_it != _service_levels_db.end()) { + sl_options = sl_it->second.slo; + sl_options->init_effective_names(sl_name_it->second); + sl_options->shares_name = sl_name_it->second; + } else if (_effectively_dropped_sls.contains(sl_name_it->second)) { + // service level might be effective dropped, then it's not present in `_service_levels_db` + sl_logger.warn("Service level {} is effectively dropped and its values are ignored.", sl_name_it->second); + } else { + sl_logger.error("Couldn't find service level 
{} in first level cache", sl_name_it->second); + } } auto [it, it_end] = hierarchy.equal_range(role); @@ -330,7 +409,9 @@ future> service_level_controller::find_effe } sl_it->second.slo.init_effective_names(*sl_name); - return sl_it->second.slo; + auto slo = sl_it->second.slo; + slo.shares_name = sl_name; + return slo; } catch (...) { // when we fail, we act as if the attribute does not exist so the node // will not be brought down. return std::nullopt; @@ -361,14 +442,23 @@ std::optional service_level_controller::find_cached_effec future<> service_level_controller::notify_service_level_added(sstring name, service_level sl_data) { return seastar::async( [this, name, sl_data] { - _subscribers.thread_for_each([name, sl_data] (qos_configuration_change_subscriber* subscriber) { + service_level_info sl_info = { + .name = name, + .sg = sl_data.sg, + }; + _subscribers.thread_for_each([name, sl_data, sl_info] (qos_configuration_change_subscriber* subscriber) { try { - subscriber->on_before_service_level_add(sl_data.slo, {name}).get(); + subscriber->on_before_service_level_add(sl_data.slo, sl_info).get(); } catch (...) 
{ sl_logger.error("notify_service_level_added: exception occurred in one of the observers callbacks {}", std::current_exception()); } }); - _service_levels_db.emplace(name, sl_data); + auto result= _service_levels_db.emplace(name, sl_data); + if (result.second) { + unsigned sl_idx = internal::scheduling_group_index(sl_data.sg); + _sl_lookup[sl_idx].first = &(result.first->first); + _sl_lookup[sl_idx].second = &(result.first->second); + } }); } @@ -379,13 +469,26 @@ future<> service_level_controller::notify_service_level_updated(sstring name, se if (sl_it != _service_levels_db.end()) { service_level_options slo_before = sl_it->second.slo; return seastar::async( [this,sl_it, name, slo_before, slo] { - _subscribers.thread_for_each([name, slo_before, slo] (qos_configuration_change_subscriber* subscriber) { + future<> f = make_ready_future(); + service_level_info sl_info = { + .name = name, + .sg = sl_it->second.sg, + }; + _subscribers.thread_for_each([name, slo_before, slo, sl_info] (qos_configuration_change_subscriber* subscriber) { try { - subscriber->on_before_service_level_change(slo_before, slo, {name}).get(); + subscriber->on_before_service_level_change(slo_before, slo, sl_info).get(); } catch (...) 
{ sl_logger.error("notify_service_level_updated: exception occurred in one of the observers callbacks {}", std::current_exception()); } }); + if (sl_it->second.slo.shares != slo.shares) { + int32_t new_shares = default_shares; + if (auto new_shares_p = std::get_if(&slo.shares)) { + new_shares = *new_shares_p; + } + sl_it->second.sg.set_shares(new_shares); + } + sl_it->second.slo = slo; }); } @@ -395,11 +498,22 @@ future<> service_level_controller::notify_service_level_updated(sstring name, se future<> service_level_controller::notify_service_level_removed(sstring name) { auto sl_it = _service_levels_db.find(name); if (sl_it != _service_levels_db.end()) { + unsigned sl_idx = internal::scheduling_group_index(sl_it->second.sg); + _sl_lookup[sl_idx].first = nullptr; + _sl_lookup[sl_idx].second = nullptr; + if (this_shard_id() == global_controller) { + _global_controller_db->deleted_scheduling_groups.emplace_back(sl_it->second.sg); + co_await rename_scheduling_group(sl_it->second.sg, seastar::format(deleted_scheduling_group_name_pattern, sl_it->first)); + } + service_level_info sl_info = { + .name = name, + .sg = sl_it->second.sg, + }; _service_levels_db.erase(sl_it); - co_return co_await seastar::async( [this, name] { - _subscribers.thread_for_each([name] (qos_configuration_change_subscriber* subscriber) { + co_return co_await seastar::async( [this, name, sl_info] { + _subscribers.thread_for_each([name, sl_info] (qos_configuration_change_subscriber* subscriber) { try { - subscriber->on_after_service_level_remove({name}).get(); + subscriber->on_after_service_level_remove(sl_info).get(); } catch (...) 
{ sl_logger.error("notify_service_level_removed: exception occurred in one of the observers callbacks {}", std::current_exception()); } @@ -409,6 +523,39 @@ future<> service_level_controller::notify_service_level_removed(sstring name) { co_return; } +scheduling_group service_level_controller::get_default_scheduling_group() { + return _default_service_level.sg; +} + +scheduling_group service_level_controller::get_scheduling_group(sstring service_level_name) { + auto service_level_it = _service_levels_db.find(service_level_name); + if (service_level_it != _service_levels_db.end()) { + return service_level_it->second.sg; + } else { + return get_default_scheduling_group(); + } +} + +future service_level_controller::get_user_scheduling_group(const std::optional& usr) { + if (usr && usr->name) { + auto sl_opt = co_await find_effective_service_level(*usr->name); + auto& sl_name = (sl_opt && sl_opt->shares_name) ? *sl_opt->shares_name : default_service_level_name; + co_return get_scheduling_group(sl_name); + } + else { + co_return get_default_scheduling_group(); + } +} + +std::optional service_level_controller::get_active_service_level() { + unsigned sched_idx = internal::scheduling_group_index(current_scheduling_group()); + if (_sl_lookup[sched_idx].first) { + return sstring(*_sl_lookup[sched_idx].first); + } else { + return std::nullopt; + } +} + future<> service_level_controller::notify_effective_service_levels_cache_reloaded() { co_await _subscribers.for_each([] (qos_configuration_change_subscriber* subscriber) -> future<> { return subscriber->on_effective_service_levels_cache_reloaded(); @@ -518,6 +665,25 @@ future service_level_controller::get_distributed_service_le return _sl_data_accessor ? 
_sl_data_accessor->get_service_level(service_level_name) : make_ready_future(); } +future service_level_controller::validate_before_service_level_add() { + assert(this_shard_id() == global_controller); + if (_global_controller_db->deleted_scheduling_groups.size() > 0) { + return make_ready_future(true); + } else if (_global_controller_db->scheduling_groups_exhausted) { + return make_ready_future(false); + } else { + return create_scheduling_group(seastar::format(temp_scheduling_group_name_pattern, _global_controller_db->unique_group_counter++), 1).then_wrapped([this] (future new_sg_f) { + if (new_sg_f.failed()) { + new_sg_f.ignore_ready_future(); + _global_controller_db->scheduling_groups_exhausted = true; + return make_ready_future(false); + } + _global_controller_db->deleted_scheduling_groups.emplace_back(new_sg_f.get()); + return make_ready_future(true); + }); + } +} + future<> service_level_controller::set_distributed_service_level(sstring name, service_level_options slo, set_service_level_op_type op_type, service::group0_batch& mc) { auto sl_info = co_await _sl_data_accessor->get_service_levels(); auto it = sl_info.find(name); @@ -533,6 +699,13 @@ future<> service_level_controller::set_distributed_service_level(sstring name, s co_return; } } + + if (op_type != set_service_level_op_type::alter) { + bool validation_result = co_await container().invoke_on(global_controller, &service_level_controller::validate_before_service_level_add); + if (!validation_result&& !utils::get_local_injector().enter("allow_service_level_over_limit")) { + throw exceptions::invalid_request_exception("Can't create service level - no more scheduling groups exist"); + } + } co_return co_await _sl_data_accessor->set_service_level(name, slo, mc); } @@ -554,8 +727,49 @@ future<> service_level_controller::do_add_service_level(sstring name, service_le return make_ready_future(); } } else { - return do_with(service_level(slo, is_static), std::move(name), [this] (service_level& sl, sstring& 
name) { - return container().invoke_on_all(&service_level_controller::notify_service_level_added, name, sl); + return do_with(service_level(slo, is_static, default_scheduling_group()), + std::move(name), [this] (service_level& sl, sstring& name) { + return make_ready_future().then([this, &sl, &name] () mutable { + int32_t share_count = default_shares; + if (auto* maybe_shares = std::get_if(&sl.slo.shares)) { + share_count = *maybe_shares; + } + + if (!_global_controller_db->deleted_scheduling_groups.empty()) { + auto&& it = std::find_if(_global_controller_db->deleted_scheduling_groups.begin() + , _global_controller_db->deleted_scheduling_groups.end() + , [sg_name_to_find = seastar::format(deleted_scheduling_group_name_pattern, name)] (const scheduling_group& sg) { + return (sg.name() == sg_name_to_find); + }); + if (it != _global_controller_db->deleted_scheduling_groups.end()) { + sl.sg = *it; + _global_controller_db->deleted_scheduling_groups.erase(it); + } else { + sl.sg = _global_controller_db->deleted_scheduling_groups.front(); + _global_controller_db->deleted_scheduling_groups.pop_front(); + } + return container().invoke_on_all([&sl, share_count] (service_level_controller& service) { + scheduling_group non_const_sg = sl.sg; + return non_const_sg.set_shares((float)share_count); + }).then([&sl, &name] { + return rename_scheduling_group(sl.sg, seastar::format(scheduling_group_name_pattern, name)); + }); + } else if (_global_controller_db->scheduling_groups_exhausted) { + return make_exception_future<>(service_level_scheduling_groups_exhausted(name)); + } else { + return create_scheduling_group(seastar::format(scheduling_group_name_pattern, name), share_count).then_wrapped([this, name, &sl] (future sg_fut) { + if (sg_fut.failed()) { + sg_fut.ignore_ready_future(); + _global_controller_db->scheduling_groups_exhausted = true; + return make_exception_future<>(service_level_scheduling_groups_exhausted(name)); + } + sl.sg = sg_fut.get(); + return make_ready_future<>(); 
+ }); + } + }).then([this, &sl, &name] () { + return container().invoke_on_all(&service_level_controller::notify_service_level_added, name, sl); + }); }); } return make_ready_future(); @@ -703,7 +917,7 @@ future<> service_level_controller::unregister_subscriber(qos_configuration_chang static sstring describe_service_level(std::string_view sl_name, const service_level_options& sl_opts) { using slo = service_level_options; - utils::small_vector opts{}; + utils::small_vector opts{}; const sstring sl_name_formatted = cql3::util::maybe_quote(sl_name); @@ -729,6 +943,10 @@ static sstring describe_service_level(std::string_view sl_name, const service_le on_internal_error(sl_logger, "Unexpected workload type"); } + if (auto* maybe_shares = std::get_if(&sl_opts.shares)) { + opts.push_back(seastar::format("SHARES = {}", *maybe_shares)); + } + if (opts.size() == 0) { return seastar::format("CREATE SERVICE LEVEL {};", sl_name_formatted); } diff --git a/service/qos/service_level_controller.hh b/service/qos/service_level_controller.hh index 46b2082791..9004a1625c 100644 --- a/service/qos/service_level_controller.hh +++ b/service/qos/service_level_controller.hh @@ -50,12 +50,14 @@ namespace qos { struct service_level { service_level_options slo; bool is_static = false; + scheduling_group sg; service_level() = default; - service_level(service_level_options slo, bool is_static) + service_level(service_level_options slo, bool is_static, scheduling_group sg) : slo(std::move(slo)) , is_static(is_static) + , sg(sg) {} }; @@ -68,7 +70,8 @@ using update_both_cache_levels = bool_class; * 1. Global controller which is responsible for all of the data and plumbing * manipulation. * 2. Local controllers that act upon the data and facilitates execution in - * the service level context + * the service level context: i.e functions in their service level's + * scheduling group and io operations with their correct io priority. 
* * Definitions: * service level - User creates service level with some parameters (timeout/workload type). @@ -99,6 +102,8 @@ using update_both_cache_levels = bool_class; */ class service_level_controller : public peering_sharded_service, public service::endpoint_lifecycle_subscriber { public: + static inline const int32_t default_shares = 1000; + class service_level_distributed_data_accessor { public: virtual future get_service_levels(qos::query_context ctx = qos::query_context::unspecified) const = 0; @@ -116,8 +121,7 @@ public: private: struct global_controller_data { service_levels_info static_configurations{}; - int schedg_group_cnt = 0; - int io_priority_cnt = 0; + std::deque deleted_scheduling_groups{}; service_level_options default_service_level_config; // The below future is used to serialize work so no reordering can occur. // This is needed so for example: delete(x), add(x) will not reverse yielding @@ -127,6 +131,13 @@ private: future<> distributed_data_update = make_ready_future(); abort_source dist_data_update_aborter; abort_source group0_aborter; + scheduling_group default_sg; + bool destroy_default_sg = false; + // a counter for making unique temp scheduling groups names + int unique_group_counter = 0; + // A flag that indicates that we exhausted all of our scheduling groups + // and we can't create new ones. + bool scheduling_groups_exhausted = false; }; std::unique_ptr _global_controller_db; @@ -137,6 +148,9 @@ private: std::map _service_levels_db; // role name -> effective service_level_options std::map _effective_service_levels_db; + // Keeps names of effectively dropped service levels. 
Those service levels exist in the table but are not present in _service_levels_db cache + std::set _effectively_dropped_sls; + std::pair _sl_lookup[max_scheduling_groups()]; service_level _default_service_level; service_level_distributed_data_accessor_ptr _sl_data_accessor; sharded& _auth_service; @@ -147,7 +161,8 @@ private: optimized_optional _early_abort_subscription; void do_abort() noexcept; public: - service_level_controller(sharded& auth_service, locator::shared_token_metadata& tm, abort_source& as, service_level_options default_service_level_config); + service_level_controller(sharded& auth_service, locator::shared_token_metadata& tm, abort_source& as, service_level_options default_service_level_config, + scheduling_group default_scheduling_group, bool destroy_default_sg_on_drain = false); /** * this function must be called *once* from any shard before any other functions are called. @@ -191,6 +206,69 @@ public: void abort_group0_operations(); + /** + * this is an executor of a function with arguments under a service level + * that corresponds to a given user. + * @param usr - the user for determining the service level + * @param func - the function to be executed + * @return a future that is resolved when the function's operation is resolved + * (if it returns a future). or a ready future containing the returned value + * from the function. + */ + template > + requires std::invocable + futurize_t with_user_service_level(const std::optional& usr, Func&& func) { + if (usr && usr->name) { + return find_effective_service_level(*usr->name).then([this, func = std::move(func)] (std::optional opts) mutable { + auto& service_level_name = (opts && opts->shares_name) ? 
*opts->shares_name : default_service_level_name; + return with_service_level(service_level_name, std::move(func)); + }); + } else { + return with_service_level(default_service_level_name, std::move(func)); + } + } + + /** + * this is an executor of a function with arguments under a specific + * service level. + * @param service_level_name + * @param func - the function to be executed + * @param args - the arguments to pass to the function. + * @return a future that is resolved when the function's operation is resolved + * (if it returns a future). or a ready future containing the returned value + * from the function/ + */ + template > + requires std::invocable + futurize_t with_service_level(sstring service_level_name, Func&& func) { + service_level& sl = get_service_level(service_level_name); + return with_scheduling_group(sl.sg, std::move(func)); + } + + /** + * @return the default service level scheduling group (see service_level_controller::initialize). + */ + scheduling_group get_default_scheduling_group(); + /** + * Get the scheduling group for a specific service level. + * @param service_level_name - the service level which it's scheduling group + * should be returned. + * @return if the service level exists the service level's scheduling group. else + * get_scheduling_group("default") + */ + scheduling_group get_scheduling_group(sstring service_level_name); + /** + * Get the scheduling group of a specific user + * @param user - the user for determining the service level + * @return if the user is authenticated the user's scheduling group. otherwise get_scheduling_group("default") + */ + future get_user_scheduling_group(const std::optional& usr); + /** + * @return the name of the currently active service level if such exists or an empty + * optional if no active service level. 
+ */ + std::optional get_active_service_level(); + /** * Start legacy update loop if RAFT_SERVICE_LEVELS_CHANGE feature is not enabled yet * or the cluster is in recovery mode @@ -332,6 +410,10 @@ private: alter }; + /** Validate that we can handle an addition of another service level + * Must be called from on the global controller + */ + future validate_before_service_level_add(); future<> set_distributed_service_level(sstring name, service_level_options slo, set_service_level_op_type op_type, service::group0_batch& mc); future> describe_created_service_levels() const; diff --git a/service/qos/standard_service_level_distributed_data_accessor.hh b/service/qos/standard_service_level_distributed_data_accessor.hh index a308ffd63b..c37faeabcf 100644 --- a/service/qos/standard_service_level_distributed_data_accessor.hh +++ b/service/qos/standard_service_level_distributed_data_accessor.hh @@ -19,8 +19,7 @@ namespace db { class system_distributed_keyspace; } namespace qos { -class standard_service_level_distributed_data_accessor : public service_level_controller::service_level_distributed_data_accessor, - public ::enable_shared_from_this { +class standard_service_level_distributed_data_accessor : public service_level_controller::service_level_distributed_data_accessor { private: db::system_distributed_keyspace& _sys_dist_ks; public: diff --git a/service/storage_service.cc b/service/storage_service.cc index f4fcc6d91e..fed207321a 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -108,6 +108,8 @@ #include "service/topology_mutation.hh" #include "service/topology_coordinator.hh" #include "cql3/query_processor.hh" +#include "service/qos/service_level_controller.hh" +#include "service/qos/standard_service_level_distributed_data_accessor.hh" #include #include @@ -2043,6 +2045,35 @@ future<> storage_service::join_topology(sharded co_await _sys_ks.local().cdc_set_rewritten(std::nullopt); } + // now, that the system distributed keyspace is initialized 
and started, + // pass an accessor to the service level controller so it can interact with it + // but only if the conditions are right (the cluster supports or has supported + // workload prioritization before): + if (!sys_dist_ks.local().workload_prioritization_tables_exists()) { + // if we got here, it means that the workload prioritization didn't exist before and + // also that the cluster currently doesn't support workload prioritization. + // we delay the creation of the tables and accessing them until it does. + // + // the callback might be run immediately and it uses async methods, so the thread is needed + co_await seastar::async([&] { + _workload_prioritization_registration = _feature_service.workload_prioritization.when_enabled([&sys_dist_ks] () { + // since we are creating tables here and we wouldn't want to have a race condition + // we will first wait for a random period of time and only then start the routine + // the race condition can happen because the feature flag will "light up" in about + // the same time on all nodes. The more nodes there are, the higher the chance for + // a race. + std::random_device seed_gen; + std::default_random_engine rnd_engine(seed_gen()); + std::uniform_int_distribution<> delay_generator(0,5000000); + sleep(std::chrono::microseconds(delay_generator(rnd_engine))).get(); + sys_dist_ks.invoke_on_all(&db::system_distributed_keyspace::start_workload_prioritization).get(); + slogger.info("Workload prioritization v1 started."); + }); + }); + } else { + slogger.info("Workload prioritization v1 is already started."); + } + if (!cdc_gen_id) { // If we didn't observe any CDC generation at this point, then either // 1. 
we're replacing a node, diff --git a/service/storage_service.hh b/service/storage_service.hh index 2b65798528..0d44ed864c 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -28,6 +28,7 @@ #include "dht/token_range_endpoints.hh" #include #include "gms/application_state.hh" +#include "gms/feature.hh" #include #include #include "replica/database_fwd.hh" @@ -174,6 +175,7 @@ private: using client_shutdown_hook = noncopyable_function; std::vector _protocol_servers; std::vector _listeners; + gms::feature::listener_registration _workload_prioritization_registration; gate _async_gate; condition_variable _tablet_split_monitor_event; diff --git a/test.py b/test.py index e9a4868e71..4052150c08 100755 --- a/test.py +++ b/test.py @@ -1135,8 +1135,13 @@ class PythonTest(Test): try: cluster.before_test(self.uname) prepare_cql = self.suite.cfg.get("prepare_cql", None) - if prepare_cql: - next(iter(cluster.running.values())).control_connection.execute(prepare_cql) + if prepare_cql and not hasattr(cluster, 'prepare_cql_executed'): + cc = next(iter(cluster.running.values())).control_connection + if isinstance(prepare_cql, str): # a str is itself Iterable, so a lone statement must be wrapped explicitly + prepare_cql = [prepare_cql] + for stmt in prepare_cql: + cc.execute(stmt) + cluster.prepare_cql_executed = True logger.info("Leasing Scylla cluster %s for test %s", cluster, self.uname) self.args.insert(0, "--host={}".format(cluster.endpoint())) self.is_before_test_ok = True diff --git a/test/alternator/conftest.py b/test/alternator/conftest.py index 7ea2040151..0bf69410ae 100644 --- a/test/alternator/conftest.py +++ b/test/alternator/conftest.py @@ -68,7 +68,7 @@ def pytest_collection_modifyitems(config, items): # from the appropriate system table, but can't do it with Alternator (because # we don't know yet the secret key!), so we need to do it with CQL. 
@cache -def get_valid_alternator_role(url): +def get_valid_alternator_role(url, role='cassandra'): from cassandra.cluster import Cluster from cassandra.auth import PlainTextAuthProvider auth_provider = PlainTextAuthProvider( @@ -85,7 +85,6 @@ def get_valid_alternator_role(url): # We could have looked for any role/salted_hash pair, but we # already know a role "cassandra" exists (we just used it to # connect to CQL!), so let's just use that role. - role = 'cassandra' salted_hash = list(session.execute(f"SELECT salted_hash FROM {ks}.roles WHERE role = '{role}'"))[0].salted_hash if salted_hash is None: break @@ -129,7 +128,7 @@ def dynamodb(request): region_name='us-east-1', aws_access_key_id=user, aws_secret_access_key=secret, config=boto_config.merge(botocore.client.Config(retries={"max_attempts": 0}, read_timeout=300))) -def new_dynamodb_session(request, dynamodb): +def new_dynamodb_session(request, dynamodb, user='cassandra', password='secret_pass'): ses = boto3.Session() host = urlparse(dynamodb.meta.client._endpoint.host) conf = botocore.client.Config(parameter_validation=False) @@ -137,7 +136,7 @@ def new_dynamodb_session(request, dynamodb): return boto3.resource('dynamodb', config=conf) if host.hostname == 'localhost': conf = conf.merge(botocore.client.Config(retries={"max_attempts": 0}, read_timeout=300)) - user, secret = get_valid_alternator_role(dynamodb.meta.client._endpoint.host) + user, secret = get_valid_alternator_role(dynamodb.meta.client._endpoint.host, role=user) return ses.resource('dynamodb', endpoint_url=dynamodb.meta.client._endpoint.host, verify=host.scheme != 'http', region_name='us-east-1', aws_access_key_id=user, aws_secret_access_key=secret, config=conf) diff --git a/test/alternator/run b/test/alternator/run index 64131dad6d..797edf142c 100755 --- a/test/alternator/run +++ b/test/alternator/run @@ -115,6 +115,19 @@ run.wait_for_services(pid, [ lambda: check_alternator(alternator_url), ]) +# Set up the the proper authentication credentials 
needed by the Alternator +# test. Currently this can only be done through CQL, which is why above we +# needed to make sure CQL is available. +cluster = run.get_cql_cluster(ip) +cql = cluster.connect() + +# Additional role and service level are created to test the feature properly (alternator doesn't have it's own API to set it up so we need to use CQL). +cql.execute("INSERT INTO system_auth_v2.roles (role, salted_hash) VALUES ('alternator_custom_sl', 'secret_pass')") +cql.execute("CREATE SERVICE LEVEL sl_alternator") +cql.execute("ATTACH SERVICE LEVEL sl_alternator TO alternator_custom_sl") + +cluster.shutdown() + # Finally run pytest: success = run.run_pytest(sys.path[0], ['--url', alternator_url] + sys.argv[1:]) diff --git a/test/alternator/suite.yaml b/test/alternator/suite.yaml index ce11ad8094..9264015b1e 100644 --- a/test/alternator/suite.yaml +++ b/test/alternator/suite.yaml @@ -1,5 +1,10 @@ type: Python pool_size: 6 +prepare_cql: + - INSERT INTO system.roles (role, can_login, salted_hash) VALUES ('alternator_custom_sl', true, 'secret_pass') + - CREATE SERVICE LEVEL sl_alternator + - ATTACH SERVICE LEVEL sl_alternator TO alternator_custom_sl + run_first: - test_streams - test_scan diff --git a/test/alternator/test_metrics.py b/test/alternator/test_metrics.py index 9ba9008b03..c2e18c9c18 100644 --- a/test/alternator/test_metrics.py +++ b/test/alternator/test_metrics.py @@ -95,13 +95,13 @@ def get_metric(metrics, name, requested_labels=None, the_metrics=None): # of the specified metrics. Helps reduce the amount of code duplication # below. 
@contextmanager -def check_increases_metric(metrics, metric_names): +def check_increases_metric(metrics, metric_names, requested_labels=None): the_metrics = get_metrics(metrics) - saved_metrics = { x: get_metric(metrics, x, None, the_metrics) for x in metric_names } + saved_metrics = { x: get_metric(metrics, x, requested_labels, the_metrics) for x in metric_names } yield the_metrics = get_metrics(metrics) for n in metric_names: - assert saved_metrics[n] < get_metric(metrics, n, None, the_metrics), f'metric {n} did not increase' + assert saved_metrics[n] < get_metric(metrics, n, requested_labels, the_metrics), f'metric {n} did not increase' @contextmanager def check_increases_metric_exact(metrics, metric_name, increase_value): diff --git a/test/alternator/test_service_levels.py b/test/alternator/test_service_levels.py new file mode 100644 index 0000000000..9689d57b06 --- /dev/null +++ b/test/alternator/test_service_levels.py @@ -0,0 +1,95 @@ +# Copyright 2023-present ScyllaDB +# +# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + +import pytest +from test.alternator.util import random_string, is_aws +from test.alternator.conftest import new_dynamodb_session +from test.alternator.test_metrics import metrics, get_metrics, check_increases_metric +from contextlib import contextmanager +from cassandra.auth import PlainTextAuthProvider +from cassandra.cluster import Cluster, ExecutionProfile, EXEC_PROFILE_DEFAULT, ConsistencyLevel, NoHostAvailable +from cassandra.policies import RoundRobinPolicy +import time +import re + +# Quote an identifier if it needs to be double-quoted in CQL. Quoting is +# *not* needed if the identifier matches [a-z][a-z0-9_]*, otherwise it does. +# double-quotes ('"') in the string are doubled. +def maybe_quote(identifier): + if re.match('^[a-z][a-z0-9_]*$', identifier): + return identifier + return '"' + identifier.replace('"', '""') + '"' + +# Convenience context manager for temporarily GRANTing some permission and +# then revoking it. 
+@contextmanager +def temporary_grant(cql, permission, resource, role): + role = maybe_quote(role) + cql.execute(f"GRANT {permission} ON {resource} TO {role}") + try: + yield + finally: + cql.execute(f"REVOKE {permission} ON {resource} FROM {role}") + +# Convenience function for getting the full CQL table name (ksname.cfname) +# for the given Alternator table. This uses our insider knowledge that +# table named "x" is stored in keyspace called "alternator_x", and if we +# ever change this we'll need to change this function too. +def cql_table_name(tab): + return maybe_quote('alternator_' + tab.name) + '.' + maybe_quote(tab.name) + +# This file is all about testing RBAC as configured via CQL, so we need to +# connect to CQL to set these tests up. The "cql" fixture below enables that. +# If we're not testing Scylla, or the CQL port is not available on the same +# IP address as the Alternator IP address, a test using this fixture will +# be skipped with a message about the CQL API not being available. 
+@pytest.fixture(scope="module") +def cql(dynamodb): + if is_aws(dynamodb): + pytest.skip('Scylla-only CQL API not supported by AWS') + url = dynamodb.meta.client._endpoint.host + host, = re.search(r'.*://([^:]*):', url).groups() + profile = ExecutionProfile( + load_balancing_policy=RoundRobinPolicy(), + consistency_level=ConsistencyLevel.LOCAL_QUORUM, + serial_consistency_level=ConsistencyLevel.LOCAL_SERIAL) + cluster = Cluster(execution_profiles={EXEC_PROFILE_DEFAULT: profile}, + contact_points=[host], + port=9042, + protocol_version=4, + auth_provider=PlainTextAuthProvider(username='cassandra', password='cassandra'), + ) + try: + ret = cluster.connect() + # "BEGIN BATCH APPLY BATCH" is the closest to do-nothing I could find + ret.execute("BEGIN BATCH APPLY BATCH") + except NoHostAvailable: + pytest.skip('Could not connect to Scylla-only CQL API') + yield ret + cluster.shutdown() + +def test_service_level_metrics(test_table, request, dynamodb, cql, metrics): + print("Please make sure authorization is enforced in your Scylla installation: alternator_enforce_authorization: true") + p = random_string() + c = random_string() + _ = get_metrics(metrics) + # Use additional user created by test/alternator/run to execute write under sl_alternator service level. + ses = new_dynamodb_session(request, dynamodb, user='alternator_custom_sl') + # service_level_controler acts asynchronously in a loop so we can fail metric check + # if it hasn't processed service level update yet. It can take as long as 10 seconds. 
+ started = time.time() + timeout = 30 + while True: + try: + with temporary_grant(cql, 'MODIFY', cql_table_name(test_table), 'alternator_custom_sl'): + with check_increases_metric(metrics, + ['scylla_storage_proxy_coordinator_write_latency_count'], + {'scheduling_group_name': 'sl:sl_alternator'}): + ses.meta.client.put_item(TableName=test_table.name, Item={'p': p, 'c': c}) + break # no exception, test passed + except Exception: # retry ordinary failures only; let KeyboardInterrupt/SystemExit propagate + if time.time() - started > timeout: + raise + else: + time.sleep(0.5) # retry diff --git a/test/auth_cluster/test_raft_service_levels.py b/test/auth_cluster/test_raft_service_levels.py index 2e715f1bc0..b2ebb6bfed 100644 --- a/test/auth_cluster/test_raft_service_levels.py +++ b/test/auth_cluster/test_raft_service_levels.py @@ -11,7 +11,7 @@ from test.pylib.rest_client import get_host_api_address, read_barrier from test.pylib.util import unique_name, wait_for_cql_and_get_hosts from test.pylib.manager_client import ManagerClient from test.topology.util import trigger_snapshot, wait_until_topology_upgrade_finishes, enter_recovery_state, reconnect_driver, \ - delete_raft_topology_state, delete_raft_data_and_upgrade_state, wait_until_upgrade_finishes + delete_raft_topology_state, delete_raft_data_and_upgrade_state, wait_until_upgrade_finishes, wait_for_token_ring_and_group0_consistency from test.topology.conftest import skip_mode from cassandra import ConsistencyLevel from cassandra.query import SimpleStatement @@ -189,9 +189,9 @@ def create_roles_stmts(): def create_service_levels_stmts(): return [ - "CREATE SERVICE LEVEL sl1 WITH timeout=30m AND workload_type='interactive'", - "CREATE SERVICE LEVEL sl2 WITH timeout=1h AND workload_type='batch'", - "CREATE SERVICE LEVEL sl3 WITH timeout=30s", + "CREATE SERVICE LEVEL sl1 WITH timeout=30m AND workload_type='interactive' AND shares=1000", + "CREATE SERVICE LEVEL sl2 WITH timeout=1h AND workload_type='batch' AND shares=500", + "CREATE SERVICE LEVEL sl3 WITH timeout=30s AND shares=800", ] def 
attach_service_levels_stms(): @@ -230,6 +230,7 @@ async def assert_connections_params(manager: ManagerClient, hosts, expect): continue assert param["workload_type"] == expect[role]["workload_type"] assert param["timeout"] == expect[role]["timeout"] + assert param["scheduling_group"] == expect[role]["scheduling_group"] @pytest.mark.asyncio @skip_mode('release', 'cql server testing REST API is not supported in release mode') @@ -248,14 +249,17 @@ async def test_connections_parameters_auto_update(manager: ManagerClient, build_ "r1": { "workload_type": "unspecified", "timeout": default_timeout(build_mode), + "scheduling_group": "sl:default", }, "r2": { "workload_type": "unspecified", "timeout": default_timeout(build_mode), + "scheduling_group": "sl:default", }, "r3": { "workload_type": "unspecified", "timeout": default_timeout(build_mode), + "scheduling_group": "sl:default", }, }) @@ -271,14 +275,17 @@ async def test_connections_parameters_auto_update(manager: ManagerClient, build_ "r1": { "workload_type": "interactive", "timeout": "30m", + "scheduling_group": "sl:sl1", }, "r2": { "workload_type": "batch", "timeout": "1h", + "scheduling_group": "sl:sl2", }, "r3": { "workload_type": "unspecified", "timeout": "30s", + "scheduling_group": "sl:sl3", }, }) @@ -292,14 +299,17 @@ async def test_connections_parameters_auto_update(manager: ManagerClient, build_ "r1": { "workload_type": "batch", "timeout": "30s", + "scheduling_group": "sl:sl2", }, "r2": { "workload_type": "batch", "timeout": "30s", + "scheduling_group": "sl:sl2", }, "r3": { "workload_type": "unspecified", "timeout": "30s", + "scheduling_group": "sl:sl3", }, }) @@ -332,3 +342,133 @@ async def test_service_level_cache_after_restart(manager: ManagerClient): result = await cql.run_async("SELECT workload_type FROM system.service_levels_v2") assert len(result) == 1 and result[0].workload_type == 'batch' + +@pytest.mark.asyncio +@skip_mode('release', 'error injection is disabled in release mode') +async def test_shares_check(manager: ManagerClient): + 
srv = await manager.server_add(config={ + "error_injections_at_startup": [ + { "name": "suppress_features", "value": "WORKLOAD_PRIORITIZATION"} + ] + }) + await manager.server_start(srv.server_id) + + sl1 = f"sl_{unique_name()}" + sl2 = f"sl_{unique_name()}" + cql = manager.get_cql() + + await cql.run_async(f"CREATE SERVICE LEVEL {sl1}") + with pytest.raises(InvalidRequest, match="`shares` option can only be used when the cluster is fully upgraded to enterprise"): + await cql.run_async(f"CREATE SERVICE LEVEL {sl2} WITH shares=500") + with pytest.raises(InvalidRequest, match="`shares` option can only be used when the cluster is fully upgraded to enterprise"): + await cql.run_async(f"ALTER SERVICE LEVEL {sl1} WITH shares=100") + + await manager.server_stop_gracefully(srv.server_id) + await manager.server_update_config(srv.server_id, "error_injections_at_startup", []) + await manager.server_start(srv.server_id) + await wait_for_cql_and_get_hosts(manager.get_cql(), [srv], time.time() + 60) + + cql = manager.get_cql() + await cql.run_async(f"CREATE SERVICE LEVEL {sl2} WITH shares=500") + await cql.run_async(f"ALTER SERVICE LEVEL {sl1} WITH shares=100") + +@pytest.mark.asyncio +@skip_mode('release', 'error injection is not supported in release mode') +async def test_workload_prioritization_upgrade(manager: ManagerClient): + # This test simulates OSS->enterprise upgrade in v1 service levels. + # Using error injection, the test disables WORKLOAD_PRIORITIZATION feature + # and removes `shares` column from system_distributed.service_levels table. 
+ config = { + 'authenticator': 'AllowAllAuthenticator', + 'authorizer': 'AllowAllAuthorizer', + 'force_gossip_topology_changes': True, + 'error_injections_at_startup': [ + { + 'name': 'suppress_features', + 'value': 'WORKLOAD_PRIORITIZATION' + }, + { + 'name': 'service_levels_v1_table_without_shares' + } + ] + } + servers = [await manager.server_add(config=config) for _ in range(3)] + cql = manager.get_cql() + hosts = await wait_for_cql_and_get_hosts(cql, servers, time.time() + 60) + + # Validate that service levels' table has no `shares` column + sl_schema = await cql.run_async("DESC TABLE system_distributed.service_levels") + assert "shares int" not in sl_schema[0].create_statement + with pytest.raises(InvalidRequest): + await cql.run_async("CREATE SERVICE LEVEL sl1 WITH shares = 100") + + # Do rolling restart of the cluster and remove error injections + for server in servers: + await manager.server_update_config(server.server_id, 'error_injections_at_startup', []) + await manager.rolling_restart(servers) + + # Validate that `shares` column was added + logs = [await manager.server_open_log(server.server_id) for server in servers] + await logs[0].wait_for("Workload prioritization v1 started|Workload prioritization v1 is already started", timeout=10) + sl_schema_upgraded = await cql.run_async("DESC TABLE system_distributed.service_levels") + assert "shares int" in sl_schema_upgraded[0].create_statement + await cql.run_async("CREATE SERVICE LEVEL sl2 WITH shares = 100") + +@pytest.mark.asyncio +@skip_mode('release', 'error injection is disabled in release mode') +async def test_service_levels_over_limit(manager: ManagerClient): + srv = await manager.server_add(config={ + "error_injections_at_startup": ['allow_service_level_over_limit'] + }) + await manager.server_start(srv.server_id) + cql = manager.get_cql() + hosts = await wait_for_cql_and_get_hosts(cql, [srv], time.time() + 60) + + SL_LIMIT = 7 + sls = [] + for i in range(SL_LIMIT + 1): + sl = 
f"sl_{i}_{unique_name()}" + sls.append(sl) + await cql.run_async(f"CREATE SERVICE LEVEL {sl}") + + log = await manager.server_open_log(srv.server_id) + mark = await log.mark() + await cql.run_async(f"ATTACH SERVICE LEVEL {sls[-1]} TO CASSANDRA") + await log.wait_for(f"Service level {sls[-1]} is effectively dropped and its values are ignored.", timeout=10, from_mark=mark) + + mark = await log.mark() + # When service levels exceed the limit, last service levels in alphabetical order are effectively dropped + sl_name = f"aaa_sl_{unique_name()}" + await cql.run_async(f"CREATE SERVICE LEVEL {sl_name}") + await log.wait_for(f"service level \"{sls[-2]}\" will be effectively dropped to make scheduling group available to \"{sl_name}\", please consider removing a service level.", timeout=10, from_mark=mark) + +# Reproduces issue scylla-enterprise#4912 +@pytest.mark.asyncio +async def test_service_level_metric_name_change(manager: ManagerClient) -> None: + s = await manager.server_add() + await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30) + cql = manager.get_cql() + + sl1 = unique_name() + sl2 = unique_name() + + # creates scheduling group `sl:sl1` + await cql.run_async(f"CREATE SERVICE LEVEL {sl1}") + # renames scheduling group `sl:sl1` to `sl_deleted:sl1` + await cql.run_async(f"DROP SERVICE LEVEL {sl1}") + # renames scheduling group `sl_deleted:sl1` to `sl:sl2` + await cql.run_async(f"CREATE SERVICE LEVEL {sl2}") + # creates scheduling group `sl:sl1` + await cql.run_async(f"CREATE SERVICE LEVEL {sl1}") + # In issue #4912, service_level_controller thought there was no room + # for `sl:sl1` scheduling group because create_scheduling_group() failed due to + # `seastar::metrics::double_registration (registering metrics twice for metrics: transport_cql_requests_count)` + # but the scheduling group was actually created. 
+ # When sl2 is dropped, service_level_controller tries to rename its + # scheduling group to `sl:sl1`, triggering + # `seastar::metrics::double_registration (registering metrics twice for metrics: scheduler_runtime_ms)` + await cql.run_async(f"DROP SERVICE LEVEL {sl2}") + + # Check if group0 is healthy + s2 = await manager.server_add() + await wait_for_token_ring_and_group0_consistency(manager, time.time() + 30) diff --git a/test/boost/cql_query_test.cc b/test/boost/cql_query_test.cc index 008b075250..95a7308081 100644 --- a/test/boost/cql_query_test.cc +++ b/test/boost/cql_query_test.cc @@ -40,6 +40,7 @@ #include "db/extensions.hh" #include "cql3/cql_config.hh" #include "test/lib/exception_utils.hh" +#include "service/qos/qos_common.hh" #include "utils/rjson.hh" #include "schema/schema_builder.hh" #include "service/migration_manager.hh" @@ -5061,14 +5062,21 @@ SEASTAR_TEST_CASE(test_user_based_sla_queries) { e.execute_cql("CREATE SERVICE_LEVEL sl_1;").get(); auto msg = e.execute_cql("LIST SERVICE_LEVEL sl_1;").get(); assert_that(msg).is_rows().with_rows({ - {utf8_type->decompose("sl_1"), {}, {}}, + {utf8_type->decompose("sl_1"), {}, {}, int32_type->decompose(1000)}, + }); + //create and alter service levels + e.execute_cql("CREATE SERVICE_LEVEL sl_2 WITH SHARES = 200;").get(); + e.execute_cql("ALTER SERVICE_LEVEL sl_1 WITH SHARES = 111;").get(); + msg = e.execute_cql("LIST ALL SERVICE_LEVELS;").get(); + assert_that(msg).is_rows().with_rows({ + {utf8_type->decompose("sl_1"), {}, {}, int32_type->decompose(111), utf8_type->decompose("35.69%")}, + {utf8_type->decompose("sl_2"), {}, {}, int32_type->decompose(200), utf8_type->decompose("64.31%")}, }); - e.execute_cql("CREATE SERVICE_LEVEL sl_2;").get(); //drop service levels e.execute_cql("DROP SERVICE_LEVEL sl_1;").get(); msg = e.execute_cql("LIST ALL SERVICE_LEVELS;").get(); assert_that(msg).is_rows().with_rows({ - {utf8_type->decompose("sl_2"), {}, {}}, + {utf8_type->decompose("sl_2"), {}, {}, 
int32_type->decompose(200), utf8_type->decompose("100.00%")}, }); // validate exceptions (illegal requests) @@ -5076,9 +5084,12 @@ SEASTAR_TEST_CASE(test_user_based_sla_queries) { e.execute_cql("DROP SERVICE_LEVEL IF EXISTS sl_1;").get(); BOOST_REQUIRE_THROW(e.execute_cql("CREATE SERVICE_LEVEL sl_2;").get(), exceptions::invalid_request_exception); - BOOST_REQUIRE_THROW(e.execute_cql("CREATE SERVICE_LEVEL sl_2;").get(), exceptions::invalid_request_exception); + BOOST_REQUIRE_THROW(e.execute_cql("CREATE SERVICE_LEVEL sl_2 WITH SHARES = 999;").get(), exceptions::invalid_request_exception); e.execute_cql("CREATE SERVICE_LEVEL IF NOT EXISTS sl_2;").get(); + BOOST_REQUIRE_THROW(e.execute_cql("CREATE SERVICE_LEVEL sl_1 WITH SHARES = 0;").get(), exceptions::syntax_exception); + BOOST_REQUIRE_THROW(e.execute_cql("CREATE SERVICE_LEVEL sl_1 WITH SHARES = 1001;").get(), exceptions::syntax_exception); + // test attach role e.execute_cql("ATTACH SERVICE_LEVEL sl_2 TO tester").get(); msg = e.execute_cql("LIST ATTACHED SERVICE_LEVEL OF tester;").get(); @@ -5096,7 +5107,7 @@ SEASTAR_TEST_CASE(test_user_based_sla_queries) { BOOST_CHECK(true); // tests detaching service levels e.execute_cql("CREATE ROLE tester2;").get(); - e.execute_cql("CREATE SERVICE_LEVEL sl_1;").get(); + e.execute_cql("CREATE SERVICE_LEVEL sl_1 WITH SHARES = 998;").get(); e.execute_cql("ATTACH SERVICE_LEVEL sl_1 TO tester2;").get(); e.execute_cql("DETACH SERVICE_LEVEL FROM tester;").get(); msg = e.execute_cql("LIST ATTACHED SERVICE_LEVEL OF tester2;").get(); @@ -5130,6 +5141,7 @@ SEASTAR_TEST_CASE(test_user_based_sla_queries) { msg = e.execute_cql("LIST ALL ATTACHED SERVICE_LEVELS;").get(); assert_that(msg).is_rows().with_rows({ }); + BOOST_REQUIRE_THROW(e.execute_cql("ALTER SERVICE_LEVEL i_do_not_exist WITH shares = 1;").get(), exceptions::invalid_request_exception); }); } diff --git a/test/boost/database_test.cc b/test/boost/database_test.cc index 10a2381d67..647c44da5c 100644 --- a/test/boost/database_test.cc 
+++ b/test/boost/database_test.cc @@ -61,7 +61,7 @@ public: explicit database_test_wrapper(replica::database& db) : _db(db) { } reader_concurrency_semaphore& get_user_read_concurrency_semaphore() { - return _db._read_concurrency_sem; + return _db.read_concurrency_sem(); } reader_concurrency_semaphore& get_streaming_read_concurrency_semaphore() { return _db._streaming_concurrency_sem; @@ -69,6 +69,14 @@ public: reader_concurrency_semaphore& get_system_read_concurrency_semaphore() { return _db._system_read_concurrency_sem; } + + size_t get_total_user_reader_concurrency_semaphore_memory() { + return _db._reader_concurrency_semaphores_group._total_memory; + } + + size_t get_total_user_reader_concurrency_semaphore_weight() { + return _db._reader_concurrency_semaphores_group._total_weight; + } }; static future<> apply_mutation(sharded& sharded_db, table_id uuid, const mutation& m, bool do_flush = false, @@ -1151,7 +1159,8 @@ SEASTAR_THREAD_TEST_CASE(reader_concurrency_semaphore_selection_test) { auto& db = e.local_db(); database_test_wrapper tdb(db); for (const auto& [sched_group, expected_sem_getter] : scheduling_group_and_expected_semaphore) { - with_scheduling_group(sched_group, [&db, sched_group = sched_group, expected_sem_ptr = &expected_sem_getter(tdb)] { + with_scheduling_group(sched_group, [&db, sched_group = sched_group, &tdb, &expected_sem_getter = expected_sem_getter] { + auto expected_sem_ptr = &expected_sem_getter(tdb); auto& sem = db.get_reader_concurrency_semaphore(); if (&sem != expected_sem_ptr) { BOOST_FAIL(fmt::format("Unexpected semaphore for scheduling group {}, expected {}, got {}", sched_group.name(), expected_sem_ptr->name(), sem.name())); @@ -1296,6 +1305,92 @@ SEASTAR_TEST_CASE(upgrade_sstables) { }); } +SEASTAR_THREAD_TEST_CASE(per_service_level_reader_concurrency_semaphore_test) { + cql_test_config cfg; + do_with_cql_env_thread([] (cql_test_env& e) { + const size_t num_service_levels = 3; + const size_t num_keys_to_insert = 10; + const size_t 
num_individual_reads_to_test = 50; + auto& db = e.local_db(); + database_test_wrapper dbt(db); + size_t total_memory = dbt.get_total_user_reader_concurrency_semaphore_memory(); + sharded& sl_controller = e.service_level_controller_service(); + std::array sl_names; + qos::service_level_options slo; + size_t expected_total_weight = 0; + auto index_to_weight = [] (size_t i) -> size_t { + return (i + 1)*100; + }; + + // make the default service level take as little memory as possible + slo.shares.emplace(1); + expected_total_weight += 1; + sl_controller.local().add_service_level(qos::service_level_controller::default_service_level_name, slo).get(); + + // Just to make the code more readable. + auto get_reader_concurrency_semaphore_for_sl = [&] (sstring sl_name) -> reader_concurrency_semaphore& { + return *sl_controller.local().with_service_level(sl_name, noncopyable_function([&] { + return &db.get_reader_concurrency_semaphore(); + })).get(); + }; + + for (unsigned i = 0; i < num_service_levels; i++) { + sstring sl_name = format("sl{}", i); + slo.shares.emplace(index_to_weight(i)); + sl_controller.local().add_service_level(sl_name, slo).get(); + expected_total_weight += index_to_weight(i); + // Make sure that the total weight is tracked correctly in the semaphore group + BOOST_REQUIRE_EQUAL(expected_total_weight, dbt.get_total_user_reader_concurrency_semaphore_weight()); + sl_names[i] = sl_name; + size_t total_distributed_memory = 0; + for (unsigned j = 0 ; j <= i ; j++) { + reader_concurrency_semaphore& sem = get_reader_concurrency_semaphore_for_sl(sl_names[j]); + // Make sure that all semaphores that has been created until now - have the right amount of available memory + // after the operation has ended. + // We allow for a small delta of up to num_service_levels. This allows an off-by-one for each semaphore, + // the remainder being added to one of the semaphores. + // We make sure this didn't leak/create memory by checking the total below. 
+ const auto delta = std::abs(ssize_t((index_to_weight(j) * total_memory) / expected_total_weight) - sem.available_resources().memory); + BOOST_REQUIRE_LE(delta, num_service_levels); + total_distributed_memory += sem.available_resources().memory; + } + total_distributed_memory += get_reader_concurrency_semaphore_for_sl(qos::service_level_controller::default_service_level_name).available_resources().memory; + BOOST_REQUIRE_EQUAL(total_distributed_memory, total_memory); + } + + auto get_semaphores_stats_snapshot = [&] () { + std::unordered_map snapshot; + for (auto&& sl_name : sl_names) { + snapshot[sl_name] = get_reader_concurrency_semaphore_for_sl(sl_name).get_stats(); + } + return snapshot; + }; + e.execute_cql("CREATE TABLE tbl (a int, b int, PRIMARY KEY (a));").get(); + + for (unsigned i = 0; i < num_keys_to_insert; i++) { + for (unsigned j = 0; j < num_keys_to_insert; j++) { + e.execute_cql(format("INSERT INTO tbl(a, b) VALUES ({}, {});", i, j)).get(); + } + } + + for (unsigned i = 0; i < num_individual_reads_to_test; i++) { + int random_service_level = tests::random::get_int(num_service_levels - 1); + auto snapshot_before = get_semaphores_stats_snapshot(); + + sl_controller.local().with_service_level(sl_names[random_service_level], noncopyable_function()> ([&] { + return e.execute_cql("SELECT * FROM tbl;").discard_result(); + })).get(); + auto snapshot_after = get_semaphores_stats_snapshot(); + for (auto& [sl_name, stats] : snapshot_before) { + // Make sure that the only semaphore that experienced any activity (at least measured activity) is + // the semaphore that belongs to the current service level. 
+ BOOST_REQUIRE((stats == snapshot_after[sl_name] && sl_name != sl_names[random_service_level]) || + (stats != snapshot_after[sl_name] && sl_name == sl_names[random_service_level])); + } + } + }, std::move(cfg)).get(); +} + SEASTAR_TEST_CASE(populate_from_quarantine_works) { auto tmpdir_for_data = make_lw_shared(); auto db_cfg_ptr = make_shared(); diff --git a/test/boost/reader_concurrency_semaphore_test.cc b/test/boost/reader_concurrency_semaphore_test.cc index cb39f0990f..be0283916d 100644 --- a/test/boost/reader_concurrency_semaphore_test.cc +++ b/test/boost/reader_concurrency_semaphore_test.cc @@ -11,6 +11,7 @@ #include #include "reader_concurrency_semaphore.hh" #include "sstables/sstables_manager.hh" +#include "reader_concurrency_semaphore_group.hh" #include "test/lib/log.hh" #include "test/lib/simple_schema.hh" #include "test/lib/cql_assertions.hh" @@ -1219,6 +1220,113 @@ SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_set_resources) { } // namespace reader_concurrency_semaphore_test +SEASTAR_THREAD_TEST_CASE(test_reader_concurrency_semaphore_group) { + const auto initial_resources = reader_resources{100, 100 * 1024}; + auto serialize_multiplier = utils::updateable_value_source(2); + auto kill_multiplier = utils::updateable_value_source(3); + auto cpu_concurrency = utils::updateable_value_source(1); + + reader_concurrency_semaphore_group sem_group(initial_resources.memory, initial_resources.count, 1000, + utils::updateable_value(serialize_multiplier), + utils::updateable_value(kill_multiplier), + utils::updateable_value(cpu_concurrency)); + auto stop_sem = deferred_stop(sem_group); + + circular_buffer recycle_bin; + + const auto initial_shares = 1000; + struct scheduling_group_with_shares { + scheduling_group sg; + size_t shares; + + scheduling_group_with_shares(scheduling_group sg, size_t shares) : sg(sg), shares(shares) { } + }; + std::vector scheduling_groups; + const auto max_sched_groups = 8; + + auto check_sem_group = [&] { + const auto 
total_shares = boost::accumulate(scheduling_groups + | boost::adaptors::transformed([] (const scheduling_group_with_shares& sgs) { return sgs.shares; }), size_t(0)); + ssize_t total_memory = 0; + sem_group.foreach_semaphore([&] (scheduling_group sg, reader_concurrency_semaphore& sem) { + const auto res = sem.available_resources(); + BOOST_CHECK_EQUAL(res.count, initial_resources.count); // currently count is not partitioned among the semaphores + auto it = std::find_if(scheduling_groups.begin(), scheduling_groups.end(), [sg] (const scheduling_group_with_shares& sgs) { return sgs.sg == sg; }); + BOOST_REQUIRE(it != scheduling_groups.end()); + const auto shares = it->shares; + const ssize_t expected_memory = std::floor((double(shares) / double(total_shares)) * initial_resources.memory); + const auto memory_diff = std::abs(res.memory - expected_memory); + testlog.trace("{}: {}/{} (shares) -> {}/{} (memory) | res.memory: {}", sg.name(), shares, total_shares, expected_memory, initial_resources.memory, res.memory); + BOOST_CHECK_LE(memory_diff, scheduling_groups.size()); // due to integer division, we allow for ceil/floor (off-by-one), the remainder being added to any semaphore + total_memory += res.memory; + }); + BOOST_CHECK_EQUAL(total_memory, initial_resources.memory); // no off-by-one allowed on the total + }; + + auto add_sg = [&, sgi = 0] () mutable { + if (scheduling_groups.size() >= max_sched_groups) { + return false; + } + testlog.debug("create sg{}", sgi); + scheduling_group sg; + const auto sg_name = format("sg{}", sgi++); + if (recycle_bin.empty()) { + sg = create_scheduling_group(sg_name, initial_shares).get(); + } else { + sg = recycle_bin.front(); + recycle_bin.pop_front(); + rename_scheduling_group(sg, sg_name).get(); + } + scheduling_groups.emplace_back(sg, initial_shares); + sem_group.add_or_update(sg, initial_shares); + sem_group.wait_adjust_complete().get(); + return true; + }; + + while (add_sg()) { + check_sem_group(); + } + + for (size_t i = 0; i 
< 32; ++i) { + testlog.debug("iteration {}", i); + std::shuffle(scheduling_groups.begin(), scheduling_groups.end(), tests::random::gen()); + switch (tests::random::get_int(0, 3)) { + case 0: // add + { + testlog.debug("maybe add sg"); + if (add_sg()) { + break; + } + [[fallthrough]]; + } + case 1: //remove + { + const auto& sgs = scheduling_groups.back(); + testlog.debug("maybe remove {}", sgs.sg.name()); + if (scheduling_groups.size() > 1) { + testlog.debug("remove {}", sgs.sg.name()); + sem_group.remove(sgs.sg).get(); + recycle_bin.push_back(sgs.sg); + scheduling_groups.pop_back(); + break; + } + [[fallthrough]]; + } + default: //update + { + auto& sgs = scheduling_groups.back(); + const auto new_shares = tests::random::get_int(100, 1000); + sgs.shares = new_shares; + testlog.debug("update {}: {}->{}", sgs.sg.name(), sgs.shares, new_shares); + sem_group.add_or_update(sgs.sg, new_shares); + sem_group.wait_adjust_complete().get(); + break; + } + } + check_sem_group(); + } +} + namespace { class allocating_reader { diff --git a/test/boost/service_level_controller_test.cc b/test/boost/service_level_controller_test.cc index 453470d2a4..5375cedaaf 100644 --- a/test/boost/service_level_controller_test.cc +++ b/test/boost/service_level_controller_test.cc @@ -15,6 +15,7 @@ #include #include "seastarx.hh" +#include "service/qos/qos_common.hh" #include "test/lib/scylla_test_case.hh" #include "test/lib/test_utils.hh" #include @@ -102,24 +103,30 @@ template <> struct fmt::formatter : fmt::formatter sl_controller; sharded auth_service; + service_level_options sl_options; + sl_options.shares.emplace(1000); + scheduling_group default_scheduling_group = create_scheduling_group("sl_default_sg", 1.0).get(); locator::shared_token_metadata tm({}, {locator::topology::config{ .local_dc_rack = locator::endpoint_dc_rack::default_location }}); sharded as; as.start().get(); auto stop_as = defer([&as] { as.stop().get(); }); - sl_controller.start(std::ref(auth_service), std::ref(tm), 
std::ref(as), service_level_options{}).get(); + sl_controller.start(std::ref(auth_service), std::ref(tm), std::ref(as), sl_options, default_scheduling_group).get(); qos_configuration_change_suscriber_simple ccss; sl_controller.local().register_subscriber(&ccss); - sl_controller.local().add_service_level("sl1", service_level_options{}).get(); - sl_controller.local().add_service_level("sl2", service_level_options{}).get(); + sl_controller.local().add_service_level("sl1", sl_options).get(); + sl_controller.local().add_service_level("sl2", sl_options).get(); + sl_controller.local().add_service_level("sl3", service_level_options{}).get(); service_level_options slo; + slo.shares.emplace(500); slo.workload = service_level_options::workload_type::interactive; sl_controller.local().add_service_level("sl1", slo).get(); sl_controller.local().remove_service_level("sl2", false).get(); std::vector expected_result = { - add_op{"sl1", service_level_options{}}, - add_op{"sl2", service_level_options{}}, - change_op{"sl1", service_level_options{}, slo}, + add_op{"sl1", sl_options}, + add_op{"sl2", sl_options}, + add_op{"sl3", service_level_options{}}, + change_op{"sl1", sl_options, slo}, remove_op{"sl2"}, }; @@ -128,3 +135,180 @@ SEASTAR_THREAD_TEST_CASE(subscriber_simple) { as.invoke_on_all([] (auto& as) { as.request_abort(); }).get(); sl_controller.stop().get(); } + +SEASTAR_THREAD_TEST_CASE(too_many_service_levels) { + class data_accessor : public service_level_controller::service_level_distributed_data_accessor { + public: + mutable service_levels_info configuration; + future get_service_levels(qos::query_context) const override { + return make_ready_future(configuration); + } + future get_service_level(sstring service_level_name) const override { + service_levels_info ret; + if (configuration.contains(service_level_name)) { + ret[service_level_name] = configuration[service_level_name]; + } + return make_ready_future(ret); + } + future<> set_service_level(sstring 
service_level_name, qos::service_level_options slo, service::group0_batch&) const override { + configuration[service_level_name] = slo; + return make_ready_future<>(); + } + future<> drop_service_level(sstring service_level_name, service::group0_batch&) const override { + if (configuration.contains(service_level_name)) { + configuration.erase(service_level_name); + } + return make_ready_future<>(); + } + virtual bool is_v2() const override { + return true; + } + virtual ::shared_ptr upgrade_to_v2(cql3::query_processor& qp, service::raft_group0_client& group0_client) const override { + return make_shared(); + } + virtual future<> commit_mutations(service::group0_batch&& mc, abort_source& as) const override { + return make_ready_future<>(); + } + + }; + + shared_ptr test_accessor = make_shared(); + sharded sl_controller; + sharded auth_service; + service_level_options sl_options; + sl_options.shares.emplace(1000); + sl_options.workload = service_level_options::workload_type::interactive; + scheduling_group default_scheduling_group = create_scheduling_group("sl_default_sg1", 1.0).get(); + locator::shared_token_metadata tm({}, {locator::topology::config{ .local_dc_rack = locator::endpoint_dc_rack::default_location }}); + sharded as; + as.start().get(); + auto stop_as = defer([&as] { as.stop().get(); }); + sl_controller.start(std::ref(auth_service), std::ref(tm), std::ref(as), sl_options, default_scheduling_group, true).get(); + sl_controller.local().set_distributed_data_accessor(test_accessor); + int service_level_id = 0; + unsigned service_level_count = 0; + std::vector expected_service_levels; + while (service_level_count <= max_scheduling_groups()) { + try { + sstring sl_name = format("sl{:020}",service_level_id); + sl_controller.local().add_service_level(sl_name, sl_options).get(); + test_accessor->configuration[sl_name] = sl_options; + expected_service_levels.emplace_back(sl_name); + // create the service levels with gaps, this will allow to later "push" another 
service + // level between two others if odd id numbers are used. + service_level_id+=2; + service_level_count++; + } catch (std::runtime_error) { + break; + } + } + // If we have failed to create at least 2 service levels the test can pass but it will + // not really test anything. We know that there are a lot more available scheduling groups + // than only two. + BOOST_REQUIRE(service_level_count >= 2); + // make sure the service levels we believe to be active really have been created. + sl_controller.local().update_service_levels_cache().get(); + for (auto&& sl : expected_service_levels) { + BOOST_REQUIRE(sl_controller.local().has_service_level(sl)); + } + // Squeeze a service level between id 0 and id 2 - only to the configuration since + // we know that a creation of another service level will fail. + test_accessor->configuration[format("sl{:020}",1)] = sl_options; + + // do a config poll round + // we expect a failure to apply the configuration since it contains more service levels + // than available scheduling groups. + try { + sl_controller.local().update_service_levels_cache().get(); + } catch (std::runtime_error) { + } + expected_service_levels.clear(); + // Record the state of service levels after a configuration round (with a bad configuration). + for (auto&& sl : test_accessor->configuration) { + const auto& [sl_name, slo] = sl; + if (sl_controller.local().has_service_level(sl_name)) { + expected_service_levels.emplace_back(sl_name); + } + } + sl_controller.stop().get(); + // Simulate a rebooted node which hasn't "witnessed" the configuration change and only knows + // the current configuration. 
+ sharded new_sl_controller; + default_scheduling_group = create_scheduling_group("sl_default_sg2", 1.0).get(); + new_sl_controller.start(std::ref(auth_service), std::ref(tm), std::ref(as), sl_options, default_scheduling_group, true).get(); + new_sl_controller.local().set_distributed_data_accessor(test_accessor); + try { + new_sl_controller.local().update_service_levels_cache().get(); + } catch (std::runtime_error) { + } + // Finally, make sure that this rebooted node have the same service levels as the node + // that did "witness" the configuration change. + for (auto&& sl : expected_service_levels) { + BOOST_REQUIRE(new_sl_controller.local().has_service_level(sl)); + } + new_sl_controller.stop().get(); +} + +SEASTAR_THREAD_TEST_CASE(add_remove_bad_sequence) { + sharded sl_controller; + sharded auth_service; + service_level_options sl_options; + sl_options.shares.emplace(1000); + scheduling_group default_scheduling_group = create_scheduling_group("sl_default_sg3", 1.0).get(); + locator::shared_token_metadata tm({}, {locator::topology::config{ .local_dc_rack = locator::endpoint_dc_rack::default_location }}); + sharded as; + as.start().get(); + auto stop_as = defer([&as] { as.stop().get(); }); + sl_controller.start(std::ref(auth_service), std::ref(tm), std::ref(as), sl_options, default_scheduling_group, true).get(); + service_level_options slo; + slo.shares.emplace(500); + slo.workload = service_level_options::workload_type::interactive; + sl_controller.local().add_service_level("a", slo).get(); + sl_controller.local().add_service_level("b", slo).get(); + sl_controller.local().remove_service_level("b", false).get(); + sl_controller.local().remove_service_level("a", false).get(); + sl_controller.local().add_service_level("a", slo).get(); + sl_controller.local().remove_service_level("a", false).get(); + sl_controller.stop().get(); +} + +SEASTAR_THREAD_TEST_CASE(verify_unset_shares_in_cache_when_service_level_created_without_shares) { + using 
std::literals::chrono_literals::operator""ms; + + sharded sl_controller; + sharded auth_service; + + service_level_options sl_options; + sl_options.shares.emplace(1000); + scheduling_group default_scheduling_group = create_scheduling_group("sl_default_sg", 1.0).get(); + locator::shared_token_metadata tm({}, {locator::topology::config{ .local_dc_rack = locator::endpoint_dc_rack::default_location }}); + sharded as; + + as.start().get(); + auto stop_as = defer([&as] { as.stop().get(); }); + sl_controller.start(std::ref(auth_service), std::ref(tm), std::ref(as), sl_options, default_scheduling_group).get(); + + using timeout_duration = typename seastar::lowres_clock::duration; + using workload_type = typename service_level_options::workload_type; + + std::pair configs[] = { + {"sl_all_default", service_level_options{}}, + {"sl_timeout_set", service_level_options{.timeout = timeout_duration(10ms)}}, + {"sl_workload_set", service_level_options{.workload = workload_type::batch}}, + {"sl_shares_set", service_level_options {.shares = 100}}, + {"sl_timeout_and_workload_set", service_level_options{.timeout = timeout_duration(100ms), .workload = workload_type::interactive}}, + {"sl_timeout_and_shares_set", service_level_options{.timeout = timeout_duration(200ms), .shares = 50}}, + {"sl_workload_and_shares_set", service_level_options{.workload = workload_type::interactive, .shares = 250}}, + {"sl_everything_set", service_level_options{.timeout = timeout_duration(50ms), .workload = workload_type::interactive, .shares = 700}} + }; + + for (const auto& [name, opts] : configs) { + sl_controller.local().add_service_level(name, opts).get(); + const auto& sl = sl_controller.local().get_service_level(name); + BOOST_REQUIRE_MESSAGE(opts == sl.slo, seastar::format("Comparing options of {}", name)); + sl_controller.local().remove_service_level(name, false).get(); + } + + as.invoke_on_all([] (auto& as) { as.request_abort(); }).get(); + sl_controller.stop().get(); +} diff --git 
a/test/cqlpy/test_describe.py b/test/cqlpy/test_describe.py index 618c90a0aa..f79d50d3e5 100644 --- a/test/cqlpy/test_describe.py +++ b/test/cqlpy/test_describe.py @@ -1488,19 +1488,28 @@ class AuthSLContext: self.cql.execute(f"DROP SERVICE LEVEL {make_identifier(sl, quotation_mark='"')}") class ServiceLevel: - def __init__(self, name: str, timeout: int|None = None, wl_type: str|None = None): + default_shares_value = 1000 + + def __init__(self, name: str, timeout: int|None = None, wl_type: str|None = None, shares: int|None = None): self.name = name self.timeout = timeout self.wl_type = wl_type + self.shares = shares - def get_create_stmt(self) -> str: + # replace_default_shares - Scylla automatically assigns default value of shares if they are not + # specified. Set this argument to True to include the default shares in create statement + # to match describe result. + def get_create_stmt(self, replace_default_shares = False) -> str: # Note: `CREATE SERVICE LEVEL` statements returned by `DESC SCHEMA WITH INTERNALS` always uses # `std::chrono::milliseconds` as its resolution. For that reason, we use milliseconds in # create statements too so that they're easy to compare with Scylla's output. 
timeout = None if not self.timeout else f"TIMEOUT = {self.timeout}ms" wl_type = None if not self.wl_type else f"WORKLOAD_TYPE = '{self.wl_type}'" + shares = None if not self.shares else f"SHARES = {self.shares}" + if shares is None and replace_default_shares: + shares = f"SHARES = {self.default_shares_value}" - opts = [opt for opt in [timeout, wl_type] if opt is not None] + opts = [opt for opt in [timeout, wl_type, shares] if opt is not None] if opts: return f"CREATE SERVICE LEVEL {self.name} WITH {" AND ".join(opts)};" @@ -2358,7 +2367,7 @@ def test_desc_service_levels_format(cql): assert result.keyspace_name == None assert result.type == "service_level" assert result.name == sl.name - assert result.create_statement == sl.get_create_stmt() + assert result.create_statement == sl.get_create_stmt(replace_default_shares=True) def test_desc_service_levels_quotation_marks(cql): """ @@ -2387,8 +2396,8 @@ def test_desc_service_levels_quotation_marks(cql): desc_iter = extract_create_statements(desc_elements) expected_result = { - sl1_double_quote.get_create_stmt(), - sl2_double_quote.get_create_stmt() + sl1_double_quote.get_create_stmt(replace_default_shares=True), + sl2_double_quote.get_create_stmt(replace_default_shares=True) } assert set(desc_iter) == expected_result @@ -2412,7 +2421,7 @@ def test_desc_service_levels_uppercase(cql): assert list(sl_iter) == [sl.name] desc_iter = extract_create_statements(desc_elements) - assert list(desc_iter) == [sl.get_create_stmt()] + assert list(desc_iter) == [sl.get_create_stmt(replace_default_shares=True)] def test_desc_service_levels_unicode(cql): """ @@ -2433,7 +2442,7 @@ def test_desc_service_levels_unicode(cql): assert list(sl_iter) == [sl.name] desc_iter = extract_create_statements(desc_elements) - assert list(desc_iter) == [sl.get_create_stmt()] + assert list(desc_iter) == [sl.get_create_stmt(replace_default_shares=True)] def test_desc_auth_service_levels(cql): """ @@ -2452,8 +2461,12 @@ def 
test_desc_auth_service_levels(cql): # Timeout and workload parameter. ServiceLevel("sl7", timeout=25000, wl_type="interactive") } + service_levels |= { ServiceLevel(sl.name + 's', wl_type=sl.wl_type, timeout=sl.timeout, shares=400) for sl in service_levels } - sl_create_stmts = set(map(lambda sl: sl.get_create_stmt(), service_levels)) + sl_create_stmts = set(map(lambda sl: sl.get_create_stmt(replace_default_shares=True), service_levels)) + + # Enterprise is limited in the number of service levels it supports + sl_create_stmts = set(random.sample(list(sl_create_stmts), k=5)) for stmt in sl_create_stmts: cql.execute(stmt) @@ -2464,6 +2477,33 @@ def test_desc_auth_service_levels(cql): assert sl_create_stmts == set(desc_iter) +def test_desc_service_levels_default_shares(cql): + """ + Verify that DESCRIBE handles the default value of shares correctly: + (a) when a service level is created without specifying the number of shares, + we should get a create statement with the default number of shares, + (b) when a service level is created with the default number of shares but specified explicitly, + we should get a create statement with that number of shares too. 
+ """ + + with AuthSLContext(cql): + default_share_count = 1000 + + stmts = [ + "CREATE SERVICE LEVEL sl_default;", + f"CREATE SERVICE LEVEL sl_set WITH SHARES = {default_share_count};", + ] + + for stmt in stmts: + cql.execute(stmt) + + desc_iter = cql.execute("DESC SCHEMA WITH INTERNALS") + desc_iter = filter_service_levels(desc_iter) + desc_iter = extract_create_statements(desc_iter) + + stmts[0] = f"CREATE SERVICE LEVEL sl_default WITH SHARES = {default_share_count};" + assert stmts == list(desc_iter) + def test_desc_attach_service_level_format(cql): """ Verify that the format of the output of `DESC SCHEMA WITH INTERNALS` corresponding to diff --git a/test/cqlpy/test_service_level_api.py b/test/cqlpy/test_service_level_api.py new file mode 100644 index 0000000000..6a6ae25d9c --- /dev/null +++ b/test/cqlpy/test_service_level_api.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- +# Copyright 2024-present ScyllaDB +# +# SPDX-License-Identifier: LicenseRef-ScyllaDB-Source-Available-1.0 + +######################################## +# Tests for the service levels HTTP API. 
+######################################## + +import pytest +from .rest_api import get_request, post_request +from .util import new_session, unique_name +import time + +def count_opened_connections(cql, retry_unauthenticated=True): + response = get_request(cql, "service_levels/count_connections") + return response + +def switch_tenants(cql): + return post_request(cql, "service_levels/switch_tenants") + +def count_opened_connections_from_table(cql): + connections = cql.execute("SELECT username, scheduling_group FROM system.clients WHERE client_type='cql' ALLOW FILTERING") + result = {} + for row in connections: + user = row[0] + shg = row[1] + + if shg in result: + if user in result[shg]: + result[shg][user] += 1 + else: + result[shg][user] = 1 + else: + result[shg] = {user: 1} + + return result + +def wait_until_all_connections_authenticated(cql, wait_s = 1, timeout_s = 30): + start_time = time.time() + while time.time() - start_time < timeout_s: + result = cql.execute("SELECT COUNT(*) FROM system.clients WHERE username='anonymous' ALLOW FILTERING") + if result.one()[0] == 0: + return + else: + time.sleep(wait_s) + + raise RuntimeError(f"Awaiting for connections authentication timed out.") + +def wait_for_scheduling_group_assignment(cql, user, scheduling_group, wait_s = 2, timeout_s = 60): + start_time = time.time() + while time.time() - start_time < timeout_s: + connections = cql.execute(f"SELECT username, scheduling_group FROM system.clients WHERE client_type='cql' AND username='{user}' ALLOW FILTERING") + + require_wait = False + for row in connections: + if row[1] != f"sl:{scheduling_group}": + require_wait = True + break + if require_wait: + time.sleep(wait_s) + continue + return + + raise RuntimeError(f"Awaiting for user '{user}' to switch tenant to scheduling group '{scheduling_group}' timed out.") + +# Test if `/service_levels/count_connections` prints counted CQL connections +# per scheduling group per user. 
+def test_count_opened_cql_connections(cql): + user = f"test_user_{unique_name()}" + sl = f"sl_{unique_name()}" + + cql.execute(f"CREATE ROLE {user} WITH login = true AND password='{user}'") + cql.execute(f"CREATE SERVICE LEVEL {sl} WITH shares = 100") + cql.execute(f"ATTACH SERVICE LEVEL {sl} TO {user}") + + # Service level controller updates at a 10-second interval, so wait + # for the service level to be assigned to the test user + time.sleep(10) + try: + with new_session(cql, user): # a new session is created only to open the user's connection to Scylla + wait_until_all_connections_authenticated(cql) + wait_for_scheduling_group_assignment(cql, user, sl) + + api_response = count_opened_connections(cql) + assert f"sl:{sl}" in api_response + assert user in api_response[f"sl:{sl}"] + + table_response = count_opened_connections_from_table(cql) + assert api_response == table_response + finally: + cql.execute(f"DETACH SERVICE LEVEL FROM {user}") + cql.execute(f"DROP ROLE {user}") + cql.execute(f"DROP SERVICE LEVEL {sl}") + +# Test if `/service_levels/switch_tenants` updates scheduling group +# of CQL connections without restarting them. +# +# This test creates a `test_user` and 2 service levels `sl1` and `sl2`. +# First the user is attached to `sl1` and their connection is created. +# Then the test changes the user's service level to `sl2` and the +# `/service_levels/switch_tenants` endpoint is called. 
+def test_switch_tenants(cql): + user = f"test_user_{unique_name()}" + sl1 = f"sl1_{unique_name()}" + sl2 = f"sl2_{unique_name()}" + + + cql.execute(f"CREATE ROLE {user} WITH login = true AND password='{user}'") + cql.execute(f"CREATE SERVICE LEVEL {sl1} WITH shares = 100") + cql.execute(f"CREATE SERVICE LEVEL {sl2} WITH shares = 200") + cql.execute(f"ATTACH SERVICE LEVEL {sl1} TO {user}") + + # Service level controller updates at a 10-second interval, so wait + # for sl1 to be assigned to the test user + time.sleep(10) + try: + with new_session(cql, user): # a new session is created only to open the user's connection to Scylla + wait_until_all_connections_authenticated(cql) + wait_for_scheduling_group_assignment(cql, user, sl1) + + user_connections_sl1 = cql.execute(f"SELECT scheduling_group FROM system.clients WHERE username='{user}' ALLOW FILTERING") + for conn in user_connections_sl1: + assert conn[0] == f"sl:{sl1}" + + cql.execute(f"DETACH SERVICE LEVEL FROM {user}") + cql.execute(f"ATTACH SERVICE LEVEL {sl2} TO {user}") + # Again wait for the service level controller to notice the change + time.sleep(10) + + switch_tenants(cql) + wait_for_scheduling_group_assignment(cql, user, sl2) + + user_connections_sl2 = cql.execute(f"SELECT scheduling_group FROM system.clients WHERE username='{user}' ALLOW FILTERING") + print(count_opened_connections(cql)) + for conn in user_connections_sl2: + assert conn[0] == f"sl:{sl2}" + finally: + cql.execute(f"DETACH SERVICE LEVEL FROM {user}") + cql.execute(f"DROP ROLE {user}") + cql.execute(f"DROP SERVICE LEVEL {sl1}") + cql.execute(f"DROP SERVICE LEVEL {sl2}") + + + + + + diff --git a/test/cqlpy/test_service_levels.py b/test/cqlpy/test_service_levels.py index f4307ec2c3..efb5ae5605 100644 --- a/test/cqlpy/test_service_levels.py +++ b/test/cqlpy/test_service_levels.py @@ -8,8 +8,9 @@ # to roles in order to apply various role-specific parameters, like timeouts. 
############################################################################# -from contextlib import contextmanager +from contextlib import contextmanager, ExitStack from .util import unique_name, new_test_table, new_user +from .rest_api import scylla_inject_error from cassandra.protocol import InvalidRequest, ReadTimeout from cassandra.util import Duration @@ -18,14 +19,30 @@ import pytest import time @contextmanager -def new_service_level(cql, timeout=None, workload_type=None, role=None): +def new_service_level(cql, timeout=None, workload_type=None, shares=None, role=None): params = "" - if timeout and workload_type: - params = f"WITH timeout = {timeout} AND workload_type = '{workload_type}'" - elif timeout: - params = f"WITH timeout = {timeout}" - elif workload_type: - params = f"WITH workload_type = '{workload_type}'" + if timeout or workload_type or shares: + params = "WITH " + first = True + + if timeout: + if first: + first = False + else: + params += "AND " + params += f"timeout = {timeout} " + if workload_type: + if first: + first = False + else: + params += "AND " + params += f"workload_type = '{workload_type}' " + if shares: + if first: + first = False + else: + params += "AND " + params += f"shares = {shares} " attach_to = role if role else cql.cluster.auth_provider.username @@ -96,3 +113,64 @@ def test_list_effective_service_level(scylla_only, cql): if row.service_level_option == "workload_type": assert row.effective_service_level == sl2 assert row.value == "batch" + +def test_list_effective_service_level_shares(scylla_only, cql): + sl1 = "sl1" + sl2 = "sl2" + shares1 = 500 + shares2 = 200 + + with new_user(cql, "r1") as r1: + with new_user(cql, "r2") as r2: + with new_service_level(cql, shares=shares1, role=r1) as sl1: + with new_service_level(cql, shares=shares2, role=r2) as sl2: + cql.execute(f"GRANT {r2} TO {r1}") + + list_r1 = cql.execute(f"LIST EFFECTIVE SERVICE LEVEL OF {r1}") + for row in list_r1: + if row.service_level_option == "shares": + 
assert row.effective_service_level == sl2 + assert row.value == f"{shares2}" + list_r2 = cql.execute(f"LIST EFFECTIVE SERVICE LEVEL OF {r2}") + for row in list_r2: + if row.service_level_option == "shares": + assert row.effective_service_level == sl2 + assert row.value == f"{shares2}" + +def test_list_effective_service_level_without_attached(scylla_only, cql): + with new_user(cql) as role: + with pytest.raises(InvalidRequest, match=f"Role {role} doesn't have assigned any service level"): + cql.execute(f"LIST EFFECTIVE SERVICE LEVEL OF {role}") + +# Scylla Enterprise limits the number of service levels to a small number (8 including 1 default service level). +# This test verifies that attempting to create more service levels than that results in an InvalidRequest error +# and doesn't silently succeed. +# The test also has a regression check if a user can create exactly 7 service levels. +# In case you are adding a new internal scheduling group and this test failed, you should increase `SCHEDULING_GROUPS_COUNT` +# +# Reproduces enterprise issue #4481. +# Reproduces enterprise issue #5014. 
+def test_scheduling_groups_limit(scylla_only, cql): + sl_count = 100 + created_count = 0 + + with pytest.raises(InvalidRequest, match="Can't create service level - no more scheduling groups exist"): + with ExitStack() as stack: + for i in range(sl_count): + stack.enter_context(new_service_level(cql)) + created_count = created_count + 1 + + assert created_count > 0 + assert created_count == 7 # regression check + +def test_default_shares_in_listings(scylla_only, cql): + with scylla_inject_error(cql, "create_service_levels_without_default_shares", one_shot=False), \ + new_user(cql) as role: + with new_service_level(cql, role=role) as sl: + list_effective = cql.execute(f"LIST EFFECTIVE SERVICE LEVEL OF {role}") + shares_info = [row for row in list_effective if row.service_level_option == "shares"][0] + assert shares_info.value == "1000" + assert shares_info.effective_service_level == sl + + list_sl = cql.execute(f"LIST SERVICE LEVEL {sl}").one() + assert list_sl.shares == 1000 diff --git a/test/lib/cql_test_env.cc b/test/lib/cql_test_env.cc index cef4094bad..c1c906859e 100644 --- a/test/lib/cql_test_env.cc +++ b/test/lib/cql_test_env.cc @@ -593,6 +593,10 @@ private: _sstm.start(std::ref(*cfg), sstables::storage_manager::config{}).get(); auto stop_sstm = deferred_stop(_sstm); + _sl_controller.start(std::ref(_auth_service), std::ref(_token_metadata), std::ref(abort_sources), qos::service_level_options{.shares = 1000}, scheduling_groups.statement_scheduling_group).get(); + auto stop_sl_controller = defer([this] { _sl_controller.stop().get(); }); + _sl_controller.invoke_on_all(&qos::service_level_controller::start).get(); + lang::manager::config lang_config; lang_config.lua.max_bytes = cfg->user_defined_function_allocation_limit_bytes(); lang_config.lua.max_contiguous = cfg->user_defined_function_contiguous_allocation_limit_bytes(); @@ -618,7 +622,7 @@ private: _db.stop().get(); }); - _db.invoke_on_all(&replica::database::start).get(); + 
_db.invoke_on_all(&replica::database::start, std::ref(_sl_controller)).get(); smp::invoke_on_all([blocked_reactor_notify_ms] { engine().update_blocked_reactor_notify_ms(blocked_reactor_notify_ms); @@ -659,9 +663,6 @@ private: set_abort_on_internal_error(true); const gms::inet_address listen("127.0.0.1"); - _sl_controller.start(std::ref(_auth_service), std::ref(_token_metadata), std::ref(abort_sources), qos::service_level_options{}).get(); - auto stop_sl_controller = defer([this] { _sl_controller.stop().get(); }); - _sl_controller.invoke_on_all(&qos::service_level_controller::start).get(); _sys_ks.start(std::ref(_qp), std::ref(_db)).get(); auto stop_sys_kd = defer([this] { @@ -730,7 +731,8 @@ private: } // Don't start listening so tests can be run in parallel if cfg_in.ms_listen is not set to true explicitly. _ms.start(host_id, listen, std::move(port), std::ref(_feature_service), - std::ref(_gossip_address_map), std::ref(_compressor_tracker)).get(); + std::ref(_gossip_address_map), std::ref(_compressor_tracker), + std::ref(_sl_controller)).get(); stop_ms = defer(stop_type(stop_ms_func)); if (cfg_in.ms_listen) { @@ -1105,6 +1107,10 @@ public: return cql_transport::messages::propagate_exception_as_future(std::move(msg)); }); } + + virtual sharded& service_level_controller_service() override { + return _sl_controller; + } }; std::atomic single_node_cql_env::active = { false }; diff --git a/test/lib/cql_test_env.hh b/test/lib/cql_test_env.hh index 040684ca82..22e7527353 100644 --- a/test/lib/cql_test_env.hh +++ b/test/lib/cql_test_env.hh @@ -16,6 +16,7 @@ #include #include +#include "service/qos/service_level_controller.hh" #include "replica/database.hh" #include "transport/messages/result_message_base.hh" #include "cql3/query_options_fwd.hh" @@ -184,6 +185,8 @@ public: virtual sharded& get_task_manager() = 0; data_dictionary::database data_dictionary(); + + virtual sharded& service_level_controller_service() = 0; }; future<> 
do_with_cql_env(std::function(cql_test_env&)> func, cql_test_config = {}, std::optional = {}); diff --git a/test/manual/gossip.cc b/test/manual/gossip.cc index a0ee2fe315..806e290084 100644 --- a/test/manual/gossip.cc +++ b/test/manual/gossip.cc @@ -18,11 +18,13 @@ #include "message/messaging_service.hh" #include "gms/gossiper.hh" #include "gms/application_state.hh" +#include "service/qos/service_level_controller.hh" #include "utils/log.hh" #include #include #include "db/schema_tables.hh" + namespace bpo = boost::program_options; // === How to run the test @@ -62,6 +64,7 @@ int main(int ac, char ** av) { sharded feature_service; sharded gossip_address_map; sharded messaging; + sharded auth_service; abort_sources.start().get(); auto stop_abort_source = defer([&] { abort_sources.stop().get(); }); @@ -72,6 +75,13 @@ int main(int ac, char ** av) { tm_cfg.topo_cfg.this_cql_address = my_address; token_metadata.start([] () noexcept { return db::schema_tables::hold_merge_lock(); }, tm_cfg).get(); auto stop_token_mgr = defer([&] { token_metadata.stop().get(); }); + locator::shared_token_metadata tm({}, {}); + sharded sl_controller; + scheduling_group default_scheduling_group = create_scheduling_group("sl_default_sg", 1.0).get(); + sharded as; + as.start().get(); + auto stop_as = defer([&as] { as.stop().get(); }); + sl_controller.start(std::ref(auth_service), std::ref(tm), std::ref(as), qos::service_level_options{.shares = 1000}, default_scheduling_group).get(); compressor_tracker.start([] { return utils::walltime_compressor_tracker::config{}; }).get(); auto stop_compressor_tracker = deferred_stop(compressor_tracker); @@ -81,7 +91,8 @@ int main(int ac, char ** av) { gossip_address_map.start().get(); messaging.start(locator::host_id{}, listen, 7000, std::ref(feature_service), - std::ref(gossip_address_map), std::ref(compressor_tracker)).get(); + std::ref(gossip_address_map), std::ref(compressor_tracker), + std::ref(sl_controller)).get(); auto stop_messaging = 
deferred_stop(messaging); gms::gossip_config gcfg; diff --git a/test/manual/message.cc b/test/manual/message.cc index 73226d0b8f..7e153edd3e 100644 --- a/test/manual/message.cc +++ b/test/manual/message.cc @@ -27,6 +27,7 @@ #include "locator/token_metadata.hh" #include "db/schema_tables.hh" #include "idl/gossip.dist.hh" +#include "service/qos/service_level_controller.hh" using namespace std::chrono_literals; using namespace netw; @@ -181,9 +182,12 @@ int main(int ac, char ** av) { ("cpuid", bpo::value()->default_value(0), "Server cpuid"); distributed db; + sharded auth_service; + locator::shared_token_metadata tm({}, {}); + distributed sl_controller; - return app.run_deprecated(ac, av, [&app] { - return seastar::async([&app] { + return app.run_deprecated(ac, av, [&app, &auth_service, &tm, &sl_controller] { + return seastar::async([&app, &auth_service, &tm, &sl_controller] { auto config = app.configuration(); bool stay_alive = config["stay-alive"].as(); const gms::inet_address listen = gms::inet_address(config["listen-address"].as()); @@ -193,6 +197,11 @@ int main(int ac, char ** av) { sharded token_metadata; token_metadata.start([] () noexcept { return db::schema_tables::hold_merge_lock(); }, tm_cfg).get(); auto stop_tm = deferred_stop(token_metadata); + auto default_scheduling_group = create_scheduling_group("sl_default_sg", 1.0).get(); + sharded as; + as.start().get(); + auto stop_as = defer([&as] { as.stop().get(); }); + sl_controller.start(std::ref(auth_service), std::ref(tm), std::ref(as), qos::service_level_options{.shares = 1000}, default_scheduling_group).get(); seastar::sharded compressor_tracker; compressor_tracker.start([] { return utils::walltime_compressor_tracker::config{}; }).get(); auto stop_compressor_tracker = deferred_stop(compressor_tracker); @@ -203,7 +212,8 @@ int main(int ac, char ** av) { gossip_address_map.start().get(); seastar::sharded messaging; messaging.start(locator::host_id{}, listen, 7000, std::ref(feature_service), - 
std::ref(gossip_address_map), std::ref(compressor_tracker)).get(); + std::ref(gossip_address_map), std::ref(compressor_tracker), + std::ref(sl_controller)).get(); auto stop_messaging = deferred_stop(messaging); seastar::sharded testers; testers.start(std::ref(messaging)).get(); diff --git a/test/topology_custom/test_alternator.py b/test/topology_custom/test_alternator.py index 3f5287df6b..05080477ed 100644 --- a/test/topology_custom/test_alternator.py +++ b/test/topology_custom/test_alternator.py @@ -140,7 +140,8 @@ async def test_alternator_ttl_scheduling_group(manager: ManagerClient): for ip in ips: metrics = await manager.metrics.query(ip) ms_streaming += metrics.get('scylla_scheduler_runtime_ms', {'group': 'streaming'}) - ms_statement += metrics.get('scylla_scheduler_runtime_ms', {'group': 'statement'}) + # in enterprise, default execution is in sl:default, not statement + ms_statement += metrics.get('scylla_scheduler_runtime_ms', {'group': 'sl:default'}) return (ms_streaming, ms_statement) ms_streaming_before, ms_statement_before = await get_cpu_metrics() diff --git a/tracing/trace_keyspace_helper.cc b/tracing/trace_keyspace_helper.cc index 286e5459b5..c6067e253f 100644 --- a/tracing/trace_keyspace_helper.cc +++ b/tracing/trace_keyspace_helper.cc @@ -380,7 +380,7 @@ std::vector trace_keyspace_helper::make_event_mutation_data(gms cql3::raw_value::make_value(utf8_type->decompose(record.message)), cql3::raw_value::make_value(inet_addr_type->decompose(my_address.addr())), cql3::raw_value::make_value(int32_type->decompose(elapsed_to_micros(record.elapsed))), - cql3::raw_value::make_value(utf8_type->decompose(_local_tracing.get_thread_name())), + cql3::raw_value::make_value(utf8_type->decompose(fmt::format("{}/{}", _local_tracing.get_thread_name(), record.scheduling_group_name))), cql3::raw_value::make_value(long_type->decompose(int64_t(session_records.parent_id.get_id()))), 
cql3::raw_value::make_value(long_type->decompose(int64_t(session_records.my_span_id.get_id()))), cql3::raw_value::make_value(int32_type->decompose((int32_t)(session_records.ttl.count()))) diff --git a/tracing/tracing.hh b/tracing/tracing.hh index 66267bc051..1209c384bf 100644 --- a/tracing/tracing.hh +++ b/tracing/tracing.hh @@ -175,6 +175,7 @@ struct event_record { std::string message; elapsed_clock::duration elapsed; i_tracing_backend_helper::wall_clock::time_point event_time_point; + sstring scheduling_group_name = current_scheduling_group().name(); event_record(sstring message_, elapsed_clock::duration elapsed_, i_tracing_backend_helper::wall_clock::time_point event_time_point_) : message(std::move(message_)) diff --git a/transport/controller.cc b/transport/controller.cc index e3b3fd435a..458ba7beeb 100644 --- a/transport/controller.cc +++ b/transport/controller.cc @@ -351,6 +351,16 @@ future> controller::get_client_data() { return _server ? _server->local().get_client_data() : protocol_server::get_client_data(); } +future<> controller::update_connections_scheduling_group() { + if (!_server) { + co_return; + } + + co_await _server->invoke_on_all([] (auto& server) { + return server.update_connections_scheduling_group(); + }); +} + future> controller::get_connections_service_level_params() { if (!_server) { co_return std::vector(); diff --git a/transport/controller.hh b/transport/controller.hh index 30d02077b4..35ab8fdbb7 100644 --- a/transport/controller.hh +++ b/transport/controller.hh @@ -79,6 +79,7 @@ public: virtual future<> stop_server() override; virtual future<> request_stop_server() override; virtual future> get_client_data() override; + future<> update_connections_scheduling_group(); future> get_connections_service_level_params(); }; diff --git a/transport/server.cc b/transport/server.cc index 64690029c2..90c08f1787 100644 --- a/transport/server.cc +++ b/transport/server.cc @@ -16,14 +16,17 @@ #include "cql3/statements/batch_statement.hh" #include 
"cql3/statements/modification_statement.hh" +#include "seastar/core/scheduling.hh" #include "types/collection.hh" #include "types/list.hh" #include "types/set.hh" #include "types/map.hh" #include "dht/token-sharding.hh" #include "service/migration_manager.hh" +#include "service/storage_service.hh" #include "service/memory_limiter.hh" #include "service/storage_proxy.hh" +#include "service/qos/service_level_controller.hh" #include "db/consistency_level_type.hh" #include "db/write_type.hh" #include @@ -199,12 +202,15 @@ cql_sg_stats::cql_sg_stats(maintenance_socket_enabled used_by_maintenance_socket if (std::find(vector_ref.begin(), vector_ref.end(), current_scheduling_group().name()) != vector_ref.end()) { return; } + + _use_metrics = true; register_metrics(); } void cql_sg_stats::register_metrics() { namespace sm = seastar::metrics; + auto new_metrics = sm::metric_groups(); std::vector transport_metrics; auto cur_sg_name = current_scheduling_group().name(); @@ -230,7 +236,14 @@ void cql_sg_stats::register_metrics() ); } - _metrics.add_group("transport", std::move(transport_metrics)); + new_metrics.add_group("transport", std::move(transport_metrics)); + _metrics = std::exchange(new_metrics, {}); +} + +void cql_sg_stats::rename_metrics() { + if (_use_metrics) { + register_metrics(); + } } cql_server::cql_server(distributed& qp, auth::service& auth_service, @@ -605,6 +618,7 @@ cql_server::connection::connection(cql_server& server, socket_address server_add , _server(server) , _server_addr(server_addr) , _client_state(service::client_state::external_tag{}, server._auth_service, &server._sl_controller, server.timeout_config(), addr) + , _current_scheduling_group(default_scheduling_group()) { _shedding_timer.set_callback([this] { clogger.debug("Shedding all incoming requests due to overload"); @@ -640,6 +654,7 @@ client_data cql_server::connection::make_client_data() const { } else if (_authenticating) { cd.connection_stage = client_connection_stage::authenticating; } + 
cd.scheduling_group_name = _current_scheduling_group.name(); return cd; } @@ -933,6 +948,14 @@ future> cql_server::connection::process_st co_return res; } +void cql_server::connection::update_scheduling_group() { + switch_tenant([this] (noncopyable_function ()> process_loop) -> future<> { + auto shg = co_await _server._sl_controller.get_user_scheduling_group(_client_state.user()); + _current_scheduling_group = shg; + co_return co_await _server._sl_controller.with_user_service_level(_client_state.user(), std::move(process_loop)); + }); +} + future> cql_server::connection::process_auth_response(uint16_t stream, request_reader in, service::client_state& client_state, tracing::trace_state_ptr trace_state) { auto sasl_challenge = client_state.get_auth_service()->underlying_authenticator().new_sasl_challenge(); @@ -941,6 +964,7 @@ future> cql_server::connection::process_au if (sasl_challenge->is_complete()) { return sasl_challenge->get_authenticated_user().then([this, sasl_challenge, stream, &client_state, challenge = std::move(challenge), trace_state](auth::authenticated_user user) mutable { client_state.set_login(std::move(user)); + update_scheduling_group(); auto f = client_state.check_user_can_login(); f = f.then([&client_state] { return client_state.maybe_update_per_service_level_params(); @@ -1230,7 +1254,6 @@ process_batch_internal(service::client_state& client_state, distributed(ps->statement.get()) == nullptr) { throw exceptions::invalid_request_exception("Invalid statement in batch: only UPDATE, INSERT and DELETE statements are allowed."); } - ::shared_ptr modif_statement_ptr = static_pointer_cast(ps->statement); if (init_trace) { tracing::add_table_name(trace_state, modif_statement_ptr->keyspace(), modif_statement_ptr->column_family()); @@ -2053,6 +2076,13 @@ future> cql_server::get_client_data() { co_return ret; } +future<> cql_server::update_connections_scheduling_group() { + return for_each_gently([] (generic_server::connection& conn) { + connection& 
cql_conn = dynamic_cast(conn); + cql_conn.update_scheduling_group(); + }); +} + future<> cql_server::update_connections_service_level_params() { if (!_sl_controller.is_v2()) { // Auto update of connections' service level params requires @@ -2071,6 +2101,7 @@ future<> cql_server::update_connections_service_level_params() { cs.update_per_service_level_params(*slo); } } + cql_conn.update_scheduling_group(); }); } @@ -2084,7 +2115,7 @@ future> cql_server::get_connections ? (user->name ? *(user->name) : "ANONYMOUS") : "UNAUTHENTICATED"; - sl_params.emplace_back(std::move(role_name), client_state.get_timeout_config(), client_state.get_workload_type()); + sl_params.emplace_back(std::move(role_name), client_state.get_timeout_config(), client_state.get_workload_type(), cql_conn.get_scheduling_group().name()); }); co_return sl_params; } diff --git a/transport/server.hh b/transport/server.hh index 527ea508e7..546f03ebd8 100644 --- a/transport/server.hh +++ b/transport/server.hh @@ -10,6 +10,7 @@ #include "auth/service.hh" #include +#include "seastar/core/scheduling.hh" #include "service/endpoint_lifecycle_subscriber.hh" #include "service/migration_listener.hh" #include "auth/authenticator.hh" @@ -130,7 +131,9 @@ struct cql_sg_stats { cql_sg_stats(maintenance_socket_enabled); request_kind_stats& get_cql_opcode_stats(cql_binary_opcode op) { return _cql_requests_stats[static_cast(op)]; } void register_metrics(); + void rename_metrics(); private: + bool _use_metrics = false; seastar::metrics::metric_groups _metrics; std::vector _cql_requests_stats; }; @@ -139,6 +142,7 @@ struct connection_service_level_params { sstring role_name; timeout_config timeout_config; qos::service_level_options::workload_type workload_type; + sstring scheduling_group_name; }; class cql_server : public seastar::peering_sharded_service, public generic_server::server { @@ -198,6 +202,7 @@ public: } future> get_client_data(); + future<> update_connections_scheduling_group(); future<> 
update_connections_service_level_params(); future> get_connections_service_level_params(); private: @@ -214,10 +219,12 @@ private: cql_compression _compression = cql_compression::none; service::client_state _client_state; timer _shedding_timer; + scheduling_group _current_scheduling_group; bool _shed_incoming_requests = false; unsigned _request_cpu = 0; bool _ready = false; bool _authenticating = false; + bool _tenant_switch = false; enum class tracing_request_type : uint8_t { not_requested, @@ -244,7 +251,9 @@ private: static std::pair make_client_key(const service::client_state& cli_state); client_data make_client_data() const; const service::client_state& get_client_state() const { return _client_state; } + void update_scheduling_group(); service::client_state& get_client_state() { return _client_state; } + scheduling_group get_scheduling_group() const { return _current_scheduling_group; } private: friend class process_request_executor; future>> process_request_one(fragmented_temporary_buffer::istream buf, uint8_t op, uint16_t stream, service::client_state& client_state, tracing_request_type tracing_request, service_permit permit);