From 5459a8b9a56a86a6a92c4e888f15c66fcbe2b326 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 24 Jan 2024 10:17:12 +0200 Subject: [PATCH 1/3] storage_service: drop redundant check from notify_joined() notify_joined() is called from handle_state_normal only, so there is no point checking that the state is normal inside the function as well. --- service/storage_service.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index c4241d27f1..41f3bbc684 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -8323,10 +8323,6 @@ future<> endpoint_lifecycle_notifier::notify_joined(gms::inet_address endpoint) } future<> storage_service::notify_joined(inet_address endpoint) { - if (!_gossiper.is_normal(endpoint)) { - co_return; - } - co_await utils::get_local_injector().inject( "storage_service_notify_joined_sleep", std::chrono::milliseconds{500}); From b97ff54a413abc1f4b28005cc075c968cc597964 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 24 Jan 2024 10:21:01 +0200 Subject: [PATCH 2/3] storage service: topology coordinator: call notify_left() when a node leaves a cluster When the topology coordinator is used for topology changes the gossiper based code that calls notify_left() is not called. The coordinator needs to call it itself. --- service/storage_service.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/service/storage_service.cc b/service/storage_service.cc index 41f3bbc684..bac7a66cf3 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -398,6 +398,7 @@ future<> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tm if (_gossiper.get_endpoint_state_ptr(*ip) && !get_used_ips().contains(*ip)) { co_await _gossiper.force_remove_endpoint(*ip, gms::null_permit_id); + co_await notify_left(*ip); } } From adf70aae1548a01df109a8e88ee621ce35943c9d Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 24 Jan 2024 10:24:40 +0200 Subject: [PATCH 3/3] storage service: topology coordinator: call notify_joined() when a node joins a cluster When the topology coordinator is used for topology changes the gossiper based code that calls notify_joined() is not called. The coordinator needs to call it itself. But it needs to call it only once when node becomes normal. For that the patch changes state loading code to remember the old set of nodes in normal state to check if a node that is normal after new state is loaded was not in the normal state before. --- service/storage_service.cc | 11 ++++++++--- service/storage_service.hh | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index bac7a66cf3..651369562e 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -366,7 +366,7 @@ static locator::node::state to_topology_node_state(node_state ns) { // Synchronizes the local node state (token_metadata, system.peers/system.local tables, // gossiper) to align it with the other raft topology nodes. -future<> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::optional target_node) { +future<> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::optional target_node, std::unordered_set prev_normal) { const auto& am = _group0->address_map(); const auto& t = _topology_state_machine._topology; @@ -437,6 +437,9 @@ future<> storage_service::sync_raft_topology_nodes(mutable_token_metadata_ptr tm info.host_id = id.uuid(); info.release_version = rs.release_version; co_await _sys_ks.local().update_peer_info(*ip, info); + if (!prev_normal.contains(id)) { + co_await notify_joined(*ip); + } } update_topology(host_id, ip, rs); co_await tmptr->update_normal_tokens(rs.ring.value().tokens, host_id); @@ -552,6 +555,8 @@ future<> storage_service::topology_state_load() { } rtlogger.debug("reload raft topology state"); + std::unordered_set prev_normal = boost::copy_range>(_topology_state_machine._topology.normal_nodes | boost::adaptors::map_keys); + // read topology state from disk and recreate token_metadata from it _topology_state_machine._topology = co_await _sys_ks.local().load_topology_state(); @@ -595,7 +600,7 @@ future<> storage_service::topology_state_load() { }, _topology_state_machine._topology.tstate); tmptr->set_read_new(read_new); - co_await sync_raft_topology_nodes(tmptr, std::nullopt); + co_await sync_raft_topology_nodes(tmptr, std::nullopt, std::move(prev_normal)); if (_db.local().get_config().check_experimental(db::experimental_features_t::feature::TABLETS)) { tmptr->set_tablets(co_await replica::read_tablet_metadata(_qp)); @@ -703,7 +708,7 @@ class storage_service::raft_ip_address_updater: public gms::i_endpoint_state_cha const auto hid = locator::host_id{id.uuid()}; if (_address_map.find(id) == endpoint && _ss.get_token_metadata().get_endpoint_for_host_id_if_known(hid) != endpoint) { return _ss.mutate_token_metadata([this, hid](mutable_token_metadata_ptr t) { - return _ss.sync_raft_topology_nodes(std::move(t), hid); + return _ss.sync_raft_topology_nodes(std::move(t), hid, {}); }).finally([g = std::move(g)]{}); } return make_ready_future<>(); diff --git a/service/storage_service.hh b/service/storage_service.hh index e43ad4ad1a..0f511789e6 100644 --- a/service/storage_service.hh +++ b/service/storage_service.hh @@ -789,7 +789,7 @@ private: // Synchronizes the local node state (token_metadata, system.peers/system.local tables, // gossiper) to align it with the other raft topology nodes. // Optional target_node can be provided to restrict the synchronization to the specified node. - future<> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::optional target_node); + future<> sync_raft_topology_nodes(mutable_token_metadata_ptr tmptr, std::optional target_node, std::unordered_set prev_normal); // load topology state machine snapshot into memory // raft_group0_client::_read_apply_mutex must be held future<> topology_state_load();