Merge 'raft: make a removed/decommissioning node a non-voter early' from Patryk Jędrzejczak
For `removenode`, we make a removed node a non-voter early. There is no downside to it because the node is already dead. Moreover, it improves availability in some situations. For `decommission`, if we decommission a node when the number of nodes is even, we make it a non-voter early to improve availability. All majorities containing this node will remain majorities when we make this node a non-voter and remove it from the set because the required size of a majority decreases. We don't change `decommission` when the number of nodes is odd since this may reduce availability. Fixes #13959 Closes #14911 * github.com:scylladb/scylladb: raft: make a decommissioning node a non-voter early raft: topology_coordinator: implement step_down_as_nonvoter raft: make a removed node a non-voter early
This commit is contained in:
@@ -1043,6 +1043,20 @@ class topology_coordinator {
|
||||
slogger.info("raft topology: node {} removed from group 0 configuration", id);
|
||||
}
|
||||
|
||||
future<> step_down_as_nonvoter() {
|
||||
// Become a nonvoter which triggers a leader stepdown.
|
||||
co_await _group0.become_nonvoter();
|
||||
if (_raft.is_leader()) {
|
||||
co_await _raft.wait_for_state_change(&_as);
|
||||
}
|
||||
|
||||
// throw term_changed_error so we leave the coordinator loop instead of trying another
|
||||
// read_barrier which may fail with an (harmless, but unnecessary and annoying) error
|
||||
// telling us we're not in the configuration anymore (we'll get removed by the new
|
||||
// coordinator)
|
||||
throw term_changed_error{};
|
||||
}
|
||||
|
||||
struct bootstrapping_info {
|
||||
const std::unordered_set<token>& bootstrap_tokens;
|
||||
const replica_state& rs;
|
||||
@@ -1607,6 +1621,35 @@ class topology_coordinator {
|
||||
node = std::move(f).get();
|
||||
}
|
||||
|
||||
if (_group0.is_member(node.id, true)) {
|
||||
// If we remove a node, we make it a non-voter early to improve availability in some situations.
|
||||
// There is no downside to it because the removed node is already considered dead by us.
|
||||
//
|
||||
// FIXME: removenode may be aborted and the already dead node can be resurrected. We should consider
|
||||
// restoring its voter state on the recovery path.
|
||||
if (node.rs->state == node_state::removing) {
|
||||
co_await _group0.make_nonvoter(node.id);
|
||||
}
|
||||
|
||||
// If we decommission a node when the number of nodes is even, we make it a non-voter early.
|
||||
// All majorities containing this node will remain majorities when we make this node a non-voter
|
||||
// and remove it from the set because the required size of a majority decreases.
|
||||
//
|
||||
// FIXME: when a node restarts and notices it's a non-voter, it will become a voter again. If the
|
||||
// node restarts during a decommission, and we want the decommission to continue (e.g. because it's
|
||||
// at a finishing non-abortable step), we must ensure that the node doesn't become a voter.
|
||||
if (node.rs->state == node_state::decommissioning
|
||||
&& raft::configuration::voter_count(_group0.group0_server().get_configuration().current) % 2 == 0) {
|
||||
if (node.id == _raft.id()) {
|
||||
slogger.info("raft topology: coordinator is decommissioning and becomes a non-voter; "
|
||||
"giving up leadership");
|
||||
co_await step_down_as_nonvoter();
|
||||
} else {
|
||||
co_await _group0.make_nonvoter(node.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
raft_topology_cmd cmd{raft_topology_cmd::command::stream_ranges};
|
||||
if (node.rs->state == node_state::removing) {
|
||||
// tell all nodes to stream data of the removed node to new range owners
|
||||
@@ -1819,17 +1862,7 @@ class topology_coordinator {
|
||||
// Someone else needs to coordinate the rest of the decommission process,
|
||||
// because the decommissioning node is going to shut down in the middle of this state.
|
||||
slogger.info("raft topology: coordinator is decommissioning; giving up leadership");
|
||||
// Become a nonvoter which triggers a leader stepdown.
|
||||
co_await _group0.become_nonvoter();
|
||||
if (_raft.is_leader()) {
|
||||
co_await _raft.wait_for_state_change(&_as);
|
||||
}
|
||||
|
||||
// throw term_changed_error so we leave the coordinator loop instead of trying another
|
||||
// read_barrier which may fail with an (harmless, but unnecessary and annoying) error
|
||||
// telling us we're not in the configuration anymore (we'll get removed by the new
|
||||
// coordinator)
|
||||
throw term_changed_error{};
|
||||
co_await step_down_as_nonvoter();
|
||||
|
||||
// Note: if we restart after this point and become a voter
|
||||
// and then a coordinator again, it's fine - we'll just repeat this step.
|
||||
|
||||
Reference in New Issue
Block a user