From 377f87c91acbb5253237047d111b84b3c0712a8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Tue, 1 Aug 2023 12:53:25 +0200 Subject: [PATCH 1/3] raft: make a removed node a non-voter early For removenode, we make a removed node a non-voter early. There is no downside to it because the node is already dead. Moreover, it improves availability in some situations. Consider a 4-node cluster with one dead none. If we make the dead node a non-voter at the beginning of removenode, group 0 will survive the death of another node in the middle of removenode. --- service/storage_service.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/service/storage_service.cc b/service/storage_service.cc index 83066bc5ab..aceea7e8d8 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -1433,6 +1433,17 @@ class topology_coordinator { node = std::move(f).get(); } + if (_group0.is_member(node.id, true)) { + // If we remove a node, we make it a non-voter early to improve availability in some situations. + // There is no downside to it because the removed node is already considered dead by us. + // + // FIXME: removenode may be aborted and the already dead node can be resurrected. We should consider + // restoring its voter state on the recovery path. + if (node.rs->state == node_state::removing) { + co_await _group0.make_nonvoter(node.id); + } + } + raft_topology_cmd cmd{raft_topology_cmd::command::stream_ranges}; if (node.rs->state == node_state::removing) { // tell all nodes to stream data of the removed node to new range owners From 20b13f89a1e829c948768110a5c0a32687fd44a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Tue, 1 Aug 2023 12:38:58 +0200 Subject: [PATCH 2/3] raft: topology_coordinator: implement step_down_as_nonvoter We move the logic that makes topology_coordinator a non-voter to a separate function called step_down_as_nonvoter to avoid code duplication. We use this function in the next commit. --- service/storage_service.cc | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/service/storage_service.cc b/service/storage_service.cc index aceea7e8d8..39a6d97754 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -1032,6 +1032,20 @@ class topology_coordinator { slogger.info("raft topology: node {} removed from group 0 configuration", id); } + future<> step_down_as_nonvoter() { + // Become a nonvoter which triggers a leader stepdown. + co_await _group0.become_nonvoter(); + if (_raft.is_leader()) { + co_await _raft.wait_for_state_change(&_as); + } + + // throw term_changed_error so we leave the coordinator loop instead of trying another + // read_barrier which may fail with an (harmless, but unnecessary and annoying) error + // telling us we're not in the configuration anymore (we'll get removed by the new + // coordinator) + throw term_changed_error{}; + } + struct bootstrapping_info { const std::unordered_set& bootstrap_tokens; const replica_state& rs; @@ -1672,17 +1686,7 @@ class topology_coordinator { // Someone else needs to coordinate the rest of the decommission process, // because the decommissioning node is going to shut down in the middle of this state. slogger.info("raft topology: coordinator is decommissioning; giving up leadership"); - // Become a nonvoter which triggers a leader stepdown. - co_await _group0.become_nonvoter(); - if (_raft.is_leader()) { - co_await _raft.wait_for_state_change(&_as); - } - - // throw term_changed_error so we leave the coordinator loop instead of trying another - // read_barrier which may fail with an (harmless, but unnecessary and annoying) error - // telling us we're not in the configuration anymore (we'll get removed by the new - // coordinator) - throw term_changed_error{}; + co_await step_down_as_nonvoter(); // Note: if we restart after this point and become a voter // and then a coordinator again, it's fine - we'll just repeat this step. From d9137c7bdc99b1b39245ae1fe9ec39eb210c0ac7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Tue, 1 Aug 2023 14:11:10 +0200 Subject: [PATCH 3/3] raft: make a decommissioning node a non-voter early If we decommission a node when the number of nodes is even, we make it a non-voter early to improve availability. All majorities containing this node will remain majorities when we make this node a non-voter and remove it from the set because the required size of a majority decreases. --- service/storage_service.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/service/storage_service.cc b/service/storage_service.cc index 39a6d97754..7d2f30dc49 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -1456,6 +1456,24 @@ class topology_coordinator { if (node.rs->state == node_state::removing) { co_await _group0.make_nonvoter(node.id); } + + // If we decommission a node when the number of nodes is even, we make it a non-voter early. + // All majorities containing this node will remain majorities when we make this node a non-voter + // and remove it from the set because the required size of a majority decreases. + // + // FIXME: when a node restarts and notices it's a non-voter, it will become a voter again. If the + // node restarts during a decommission, and we want the decommission to continue (e.g. because it's + // at a finishing non-abortable step), we must ensure that the node doesn't become a voter. + if (node.rs->state == node_state::decommissioning + && raft::configuration::voter_count(_group0.group0_server().get_configuration().current) % 2 == 0) { + if (node.id == _raft.id()) { + slogger.info("raft topology: coordinator is decommissioning and becomes a non-voter; " + "giving up leadership"); + co_await step_down_as_nonvoter(); + } else { + co_await _group0.make_nonvoter(node.id); + } + } } raft_topology_cmd cmd{raft_topology_cmd::command::stream_ranges};