raft: fix the shutdown phase being stuck
Some of the calls inside the `raft_group0_client::start_operation()`
method were missing the abort source parameter. This caused the repair
test to get stuck in the shutdown phase - the abort source had been
triggered, but the operations were not checking it.
This was in particular the case for operations that try to take
ownership of the raft group semaphore (`get_units(semaphore)`) - these
waits should be cancelled when the abort source is triggered.
This should fix the following tests, which were failing in a small
percentage of dtest runs (about 1-3 out of 100):
* TestRepairAdditional::test_repair_kill_1
* TestRepairAdditional::test_repair_kill_3
Fixes scylladb/scylladb#19223
(cherry picked from commit 5dfc50d354)
This commit is contained in:
@@ -251,12 +251,12 @@ future<group0_guard> raft_group0_client::start_operation(seastar::abort_source&
|
||||
auto [upgrade_lock_holder, upgrade_state] = co_await get_group0_upgrade_state();
|
||||
switch (upgrade_state) {
|
||||
case group0_upgrade_state::use_post_raft_procedures: {
|
||||
auto operation_holder = co_await get_units(_operation_mutex, 1);
|
||||
auto operation_holder = co_await get_units(_operation_mutex, 1, as);
|
||||
co_await _raft_gr.group0_with_timeouts().read_barrier(&as, timeout);
|
||||
|
||||
// Take `_group0_read_apply_mutex` *after* read barrier.
|
||||
// Read barrier may wait for `group0_state_machine::apply` which also takes this mutex.
|
||||
auto read_apply_holder = co_await hold_read_apply_mutex();
|
||||
auto read_apply_holder = co_await hold_read_apply_mutex(as);
|
||||
|
||||
auto observed_group0_state_id = co_await _sys_ks.get_last_group0_state_id();
|
||||
auto new_group0_state_id = generate_group0_state_id(observed_group0_state_id);
|
||||
|
||||
Reference in New Issue
Block a user