From 1cbd0da5193b67798383b8f64e3072372ab92e63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Wed, 4 Mar 2026 15:13:19 +0100 Subject: [PATCH 1/4] raft: log: clarify the specification of term_for When `idx > last_idx()`, the function does an out-of-bounds access to `_log`. This may look contradictory to the current specification. --- raft/log.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raft/log.hh b/raft/log.hh index 6916c3a826..f97118bbb5 100644 --- a/raft/log.hh +++ b/raft/log.hh @@ -167,7 +167,7 @@ public: // and non matching term is in second std::pair match_term(index_t idx, term_t term) const; // Return term number of the entry matching the index. If the - // entry is not in the log and does not match snapshot index, + // entry is in the truncated part of the log and does not match snapshot index, // return an empty optional. // Used to validate the log matching rule. std::optional term_for(index_t idx) const; From 1ae2ae50a64f50fb736049ff4ef4f86e4e0e8fad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Wed, 4 Mar 2026 11:23:01 +0100 Subject: [PATCH 2/4] raft: read_barrier: update local commit_idx to read_idx when it's safe When the local entry with `read_idx` belongs to the current term, it's safe to update the local `commit_idx` to `read_idx`. The argument for safety is in the new comment above `maybe_update_commit_idx_for_read`. The motivation for this change is to speed up read barriers. `wait_for_apply` executed at the end of `read_barrier` is delayed until the follower learns that the entry with `read_idx` is committed. It usually happens quickly in the `read_quorum` message. However, non-voters don't receive this message, so they have to wait for `append_entries`. If no new entries are being added, `append_entries` can come only from `fsm::tick_leader()`. For group0, this happens once every 100ms. The issue above significantly slows down cluster setups in tests. Nodes join group0 as non-voters, and then they are met with several read barriers just after a write to group0. One example is `global_token_metadata_barrier` in `write_both_read_new` performed just after `update_topology_state` in `write_both_read_old`. Writing a test for this change would be difficult, so we trust the nemesis tests to do the job. They have already found consistency issues in read barriers. See #10578. --- raft/fsm.cc | 8 ++++++++ raft/fsm.hh | 9 +++++++++ raft/server.cc | 1 + 3 files changed, 18 insertions(+) diff --git a/raft/fsm.cc b/raft/fsm.cc index 952334e5d8..5f10187ffb 100644 --- a/raft/fsm.cc +++ b/raft/fsm.cc @@ -1108,6 +1108,14 @@ std::optional> fsm::start_read_barrier(server_id req return std::make_pair(id, _commit_idx); } +void fsm::maybe_update_commit_idx_for_read(index_t read_idx) { + // read_idx from the leader might not be replicated to the local node yet. + const bool in_local_log = read_idx <= _log.last_idx(); + if (in_local_log && log_term_for(read_idx) == get_current_term()) { + advance_commit_idx(read_idx); + } +} + void fsm::stop() { if (is_leader()) { // Become follower to stop accepting requests diff --git a/raft/fsm.hh b/raft/fsm.hh index 9d1ab9980f..6e14140d86 100644 --- a/raft/fsm.hh +++ b/raft/fsm.hh @@ -480,6 +480,15 @@ public: std::optional> start_read_barrier(server_id requester); + // Update the commit index to the read index (a read barrier result from the leader) if the local entry with the + // read index belongs to the current term. + // + // Satisfying the condition above guarantees that the local log matches the current leader's log up to the read + // index (the Log Matching Property), so the current leader won't drop the local entry with the read index. + // Moreover, this entry has been committed by the leader, so future leaders also won't drop it (the Leader + // Completeness Property). Hence, updating the commit index is safe. + void maybe_update_commit_idx_for_read(index_t read_idx); + size_t in_memory_log_size() const { return _log.in_memory_size(); } diff --git a/raft/server.cc b/raft/server.cc index e3c34f68d6..0b4ffb2dfa 100644 --- a/raft/server.cc +++ b/raft/server.cc @@ -1553,6 +1553,7 @@ future<> server_impl::read_barrier(seastar::abort_source* as) { co_return stop_iteration::no; } read_idx = std::get(res); + _fsm->maybe_update_commit_idx_for_read(read_idx); co_return stop_iteration::yes; }); From 5a43695f6a3f0fda67d4fe2fb88893d531b855ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20J=C4=99drzejczak?= Date: Wed, 4 Mar 2026 15:18:03 +0100 Subject: [PATCH 3/4] raft: clarify the comment about read_barrier_reply The comment could be misleading. It could suggest that the returned index is already safe to read. That's not necessarily true. The entry with the returned index could, for example, be dropped by the leader if the leader's entry with this index had a different term. --- raft/raft.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raft/raft.hh b/raft/raft.hh index 5ab31c1973..5963d6170e 100644 --- a/raft/raft.hh +++ b/raft/raft.hh @@ -505,7 +505,7 @@ using add_entry_reply = std::variant; using rpc_message = std::variant Date: Wed, 4 Mar 2026 15:23:19 +0100 Subject: [PATCH 4/4] raft: server: fix the repeating typo --- raft/server.hh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/raft/server.hh b/raft/server.hh index e956eb460a..7b1e5475d5 100644 --- a/raft/server.hh +++ b/raft/server.hh @@ -77,7 +77,7 @@ public: // committed locally means simply that the commit index is beyond this entry's index. // // The caller may pass a pointer to an abort_source to make the operation abortable. - // It it passes nullptr, the operation is unabortable. + // If it passes nullptr, the operation is unabortable. // // Successful `add_entry` with `wait_type::committed` does not guarantee that `state_machine::apply` will be called // locally for this entry. Between the commit and the application we may receive a snapshot containing this entry, @@ -125,7 +125,7 @@ public: // returned even in case of a successful config change. // // The caller may pass a pointer to an abort_source to make the operation abortable. - // It it passes nullptr, the operation is unabortable. + // If it passes nullptr, the operation is unabortable. // // Exceptions: // raft::conf_change_in_progress @@ -206,7 +206,7 @@ public: // future has resolved successfully. // // The caller may pass a pointer to an abort_source to make the operation abortable. - // It it passes nullptr, the operation is unabortable. + // If it passes nullptr, the operation is unabortable. // // Exceptions: // raft::request_aborted @@ -251,7 +251,7 @@ public: // the call as before, but term should be different. // // The caller may pass a pointer to an abort_source to make the function abortable. - // It it passes nullptr, the function is unabortable. + // If it passes nullptr, the function is unabortable. // // Exceptions: // raft::request_aborted @@ -265,7 +265,7 @@ public: // `raft::server_id`. // // The caller may pass a pointer to an abort_source to make the function abortable. - // It it passes nullptr, the function is unabortable. + // If it passes nullptr, the function is unabortable. // // Exceptions: // raft::request_aborted