From b92fb3537abac4639a8b70eeaa6f4888402df9e8 Mon Sep 17 00:00:00 2001 From: Dawid Medrek Date: Thu, 21 Dec 2023 15:35:44 +0100 Subject: [PATCH] main: Postpone start-up of hint manager In this commit, we postpone the start-up of the hint manager until we obtain information about other nodes in the cluster. When we start the hint managers, one of the things that happen is creating endpoint managers -- structures managed by db::hints::manager. Whether we create an instance of endpoint manager depends on the value returned by host_filter::can_hint_for, which, in turn, may depend on the current state of locator::topology. If locator::topology is incomplete, some endpoint managers may not be started even though they should (because the target node IS part of the cluster and we SHOULD send hints to it if there are some). The situation like that can happen because we start the hint managers too early. This commit aims to solve that problem. We only start the hint managers when we've gathered information about the other nodes in the cluster and created the locator::topology using it. Hinted Handoff is not negatively affected by these changes since in between the previous point of starting the hint managers and the current one, all of the mutations performed by service::storage_proxy target the local node, so no hints would need to be generated anyway. Fixes scylladb/scylladb#11870 Closes scylladb/scylladb#16511 --- db/hints/manager.hh | 2 ++ main.cc | 5 ++--- service/storage_proxy.cc | 6 +++++- service/storage_service.cc | 20 +++++++++++++++++--- service/storage_service.hh | 9 +++++++-- test/lib/cql_test_env.cc | 2 +- 6 files changed, 34 insertions(+), 10 deletions(-) diff --git a/db/hints/manager.hh b/db/hints/manager.hh index 4c5f80b2c6..31d75071c8 100644 --- a/db/hints/manager.hh +++ b/db/hints/manager.hh @@ -295,10 +295,12 @@ private: _state.set(state::stopping); } +public: bool started() const noexcept { return _state.contains(state::started); } +private: void set_started() noexcept { _state.set(state::started); } diff --git a/main.cc b/main.cc index bf0d1f1020..ec6eb9621d 100644 --- a/main.cc +++ b/main.cc @@ -1559,9 +1559,8 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl } view_hints_dir_initializer.ensure_rebalanced().get(); - proxy.invoke_on_all([&lifecycle_notifier, &gossiper] (service::storage_proxy& local_proxy) { + proxy.invoke_on_all([&lifecycle_notifier] (service::storage_proxy& local_proxy) { lifecycle_notifier.local().register_subscriber(&local_proxy); - return local_proxy.start_hints_manager(gossiper.local().shared_from_this()); }).get(); auto drain_proxy = defer_verbose_shutdown("drain storage proxy", [&proxy, &lifecycle_notifier] { @@ -1792,7 +1791,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl }).get(); with_scheduling_group(maintenance_scheduling_group, [&] { - return ss.local().join_cluster(sys_dist_ks, proxy); + return ss.local().join_cluster(sys_dist_ks, proxy, gossiper, service::start_hint_manager::yes); }).get(); sl_controller.invoke_on_all([&lifecycle_notifier] (qos::service_level_controller& controller) { diff --git a/service/storage_proxy.cc b/service/storage_proxy.cc index 87c87e42ef..af340f69a9 100644 --- a/service/storage_proxy.cc +++ b/service/storage_proxy.cc @@ -3069,6 +3069,8 @@ storage_proxy::create_write_response_handler_helper(schema_ptr s, const dht::tok auto all = boost::range::join(natural_endpoints, pending_endpoints); + // If the manager hasn't started yet, no mutation will be performed to another node. + // No hint will need to be stored. if (cannot_hint(all, type)) { get_stats().writes_failed_due_to_too_many_in_flight_hints++; // avoid OOMing due to excess hints. we need to do this check even for "live" nodes, since we can @@ -3795,7 +3797,9 @@ mutation storage_proxy::do_get_batchlog_mutation_for(schema_ptr schema, const st template bool storage_proxy::cannot_hint(const Range& targets, db::write_type type) const { // if hints are disabled we "can always hint" since there's going to be no hint generated in this case - return hints_enabled(type) && boost::algorithm::any_of(targets, std::bind(&db::hints::manager::too_many_in_flight_hints_for, &_hints_manager, std::placeholders::_1)); + return hints_enabled(type) && + _hints_manager.started() && + boost::algorithm::any_of(targets, std::bind(&db::hints::manager::too_many_in_flight_hints_for, &_hints_manager, std::placeholders::_1)); } future<> storage_proxy::send_to_endpoint( diff --git a/service/storage_service.cc b/service/storage_service.cc index 05fac747ad..39a34113c3 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -1107,10 +1107,12 @@ future<> storage_service::update_topology_with_local_metadata(raft::server& raft future<> storage_service::join_token_ring(sharded& sys_dist_ks, sharded& proxy, + sharded& gossiper, std::unordered_set initial_contact_nodes, std::unordered_set loaded_endpoints, std::unordered_map loaded_peer_features, - std::chrono::milliseconds delay) { + std::chrono::milliseconds delay, + start_hint_manager start_hm) { std::unordered_set bootstrap_tokens; gms::application_state_map app_states; /* The timestamp of the CDC streams generation that this node has proposed when joining. @@ -1378,6 +1380,16 @@ future<> storage_service::join_token_ring(sharded storage_service::join_cluster(sharded& sys_dist_ks, sharded& proxy) { +future<> storage_service::join_cluster(sharded& sys_dist_ks, sharded& proxy, + sharded& gossiper, start_hint_manager start_hm) { assert(this_shard_id() == 0); set_mode(mode::STARTING); @@ -2429,7 +2442,8 @@ future<> storage_service::join_cluster(sharded& for (auto& x : loaded_peer_features) { slogger.info("peer={}, supported_features={}", x.first, x.second); } - co_return co_await join_token_ring(sys_dist_ks, proxy, std::move(initial_contact_nodes), std::move(loaded_endpoints), std::move(loaded_peer_features), get_ring_delay()); + co_return co_await join_token_ring(sys_dist_ks, proxy, gossiper, std::move(initial_contact_nodes), + std::move(loaded_endpoints), std::move(loaded_peer_features), get_ring_delay(), start_hm); } future<> storage_service::replicate_to_all_cores(mutable_token_metadata_ptr tmptr) noexcept { diff --git a/service/storage_service.hh b/service/storage_service.hh index ebecdb2729..d77c8e06ab 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -102,6 +102,8 @@ enum class disk_error { regular, commit }; class node_ops_meta_data; +using start_hint_manager = seastar::bool_class; + /** * This abstraction contains the token/identifier of this node * on the identifier space. This token gets gossiped around. @@ -333,7 +335,8 @@ public: future<> check_for_endpoint_collision(std::unordered_set initial_contact_nodes, const std::unordered_map& loaded_peer_features); - future<> join_cluster(sharded& sys_dist_ks, sharded& proxy); + future<> join_cluster(sharded& sys_dist_ks, sharded& proxy, + sharded& gossiper_ptr, start_hint_manager start_hm); void set_group0(service::raft_group0&, bool raft_topology_change_enabled); @@ -354,10 +357,12 @@ private: bool is_first_node(); future<> join_token_ring(sharded& sys_dist_ks, sharded& proxy, + sharded& gossiper, std::unordered_set initial_contact_nodes, std::unordered_set loaded_endpoints, std::unordered_map loaded_peer_features, - std::chrono::milliseconds); + std::chrono::milliseconds, + start_hint_manager start_hm); future<> start_sys_dist_ks(); public: diff --git a/test/lib/cql_test_env.cc b/test/lib/cql_test_env.cc index b7f0747185..9baf991d6d 100644 --- a/test/lib/cql_test_env.cc +++ b/test/lib/cql_test_env.cc @@ -903,7 +903,7 @@ private: }); try { - _ss.local().join_cluster(_sys_dist_ks, _proxy).get(); + _ss.local().join_cluster(_sys_dist_ks, _proxy, _gossiper, service::start_hint_manager::no).get(); } catch (std::exception& e) { // if any of the defers crashes too, we'll never see // the error