From dccfd09d8375f56043638abaaf0a99268b55fbcf Mon Sep 17 00:00:00 2001 From: Kamil Braun Date: Tue, 16 Jan 2024 15:06:43 +0100 Subject: [PATCH] raft: pass `max_trailing_entries` through `fsm_output` to `store_snapshot_descriptor` This parameter says how many entries at most should be left trailing before the snapshot index. There are multiple places where this decision is made: - in `applier_fiber` when the server locally decides to take a snapshot due to log size pressure; this applies to the in-memory log - in `fsm::step` when the server received an `install_snapshot` message from the leader; this also applies to the in-memory log - and in `io_fiber` when calling `store_snapshot_descriptor`; this applies to the on-disk log. The logic of how many entries should be left trailing is calculated twice: - first, in `applier_fiber` or in `fsm::step` when truncating the in-memory log - and then again as the snapshot descriptor is being persisted. The logic is to take `_config.snapshot_trailing` for locally generated snapshots (coming from `applier_fiber`) and `0` for remote snapshots (from `fsm::step`). But there is already an error injection that changes the behavior of `applier_fiber` to leave `0` trailing entries. However, this doesn't affect the following `store_snapshot_descriptor` call which still uses `_config.snapshot_trailing`. So if the server got restarted, the entries which were truncated in-memory would get "revived" from disk. Fortunately, this is test-only code. However in future commits we'd like to change the logic of `applier_fiber` even further. So instead of having a separate calculation of trailing entries inside `io_fiber`, it's better for it to use the number that was already calculated once. This number is passed to `fsm::apply_snapshot` (by `applier_fiber` or `fsm::step`) and can then be received by `io_fiber` from `fsm_output` to use it inside `store_snapshot_descriptor`. --- raft/fsm.cc | 2 +- raft/fsm.hh | 5 +++++ raft/server.cc | 4 ++-- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/raft/fsm.cc b/raft/fsm.cc index b519a68d2e..3df087734b 100644 --- a/raft/fsm.cc +++ b/raft/fsm.cc @@ -1029,7 +1029,7 @@ bool fsm::apply_snapshot(snapshot_descriptor snp, size_t max_trailing_entries, s // If the snapshot is local, _commit_idx is larger than snp.idx. // Otherwise snp.idx becomes the new commit index. _commit_idx = std::max(_commit_idx, snp.idx); - _output.snp.emplace(fsm_output::applied_snapshot{snp, local}); + _output.snp.emplace(fsm_output::applied_snapshot{snp, local, max_trailing_entries}); size_t units = _log.apply_snapshot(std::move(snp), max_trailing_entries, max_trailing_bytes); if (is_leader()) { logger.trace("apply_snapshot[{}]: signal {} available units", _my_id, units); diff --git a/raft/fsm.hh b/raft/fsm.hh index 84498f7054..a72c5adaae 100644 --- a/raft/fsm.hh +++ b/raft/fsm.hh @@ -21,6 +21,11 @@ struct fsm_output { struct applied_snapshot { snapshot_descriptor snp; bool is_local; + + // Always 0 for non-local snapshots. + size_t max_trailing_entries; + + // FIXME: include max_trailing_bytes here and in store_snapshot_descriptor }; std::optional> term_and_vote; std::vector log_entries; diff --git a/raft/server.cc b/raft/server.cc index 580674d758..cb9be0df70 100644 --- a/raft/server.cc +++ b/raft/server.cc @@ -1003,10 +1003,10 @@ future<> server_impl::io_fiber(index_t last_stable) { } if (batch.snp) { - auto& [snp, is_local] = *batch.snp; + auto& [snp, is_local, max_trailing_entries] = *batch.snp; logger.trace("[{}] io_fiber storing snapshot {}", _id, snp.id); // Persist the snapshot - co_await _persistence->store_snapshot_descriptor(snp, is_local ? _config.snapshot_trailing : 0); + co_await _persistence->store_snapshot_descriptor(snp, max_trailing_entries); _stats.store_snapshot++; // If this is locally generated snapshot there is no need to // load it.