diff --git a/raft/raft.hh b/raft/raft.hh index 7e1138f027..39338e10bb 100644 --- a/raft/raft.hh +++ b/raft/raft.hh @@ -750,6 +750,18 @@ public: // apply call 'state_machine::load_snapshot(snapshot::id)' // Called during Raft server initialization only, should not // run in parallel with store. + // + // If you want to create a Raft cluster with a non-empty state + // machine, so that joining servers always receive a snapshot, + // you should: + // - make sure that members of the initial configuration have + // the same state machine state, + // - set the initial snapshot index on members of the initial + // configuration to 1, + // - set the initial snapshot index on all subsequently joining + // servers to 0. + // This also works if you start with an empty state machine, + // so consider it as the go-to default. virtual future load_snapshot_descriptor() = 0; // Persist given log entries. diff --git a/service/raft/raft_group0.cc b/service/raft/raft_group0.cc index dc444ad215..2587d2d5b1 100644 --- a/service/raft/raft_group0.cc +++ b/service/raft/raft_group0.cc @@ -420,14 +420,22 @@ future<> raft_group0::join_group0(std::vector seeds, bool as_ if (server == nullptr) { // This is the first time discovery is run. Create and start a Raft server for group 0 on this node. raft::configuration initial_configuration; + bool nontrivial_snapshot = false; if (g0_info.id == my_id) { // We were chosen as the discovery leader. // We should start a new group with this node as voter. group0_log.info("Server {} chosen as discovery leader; bootstrapping group 0 from scratch", my_id); initial_configuration.current.emplace(my_addr, true); + // Force snapshot transfer from us to subsequently joining servers. + // This is important for upgrade and recovery, where the group 0 state machine + // (schema tables in particular) is nonempty. + // In a fresh cluster this will trigger an empty snapshot transfer which is redundant but correct. + // See #14066. + nontrivial_snapshot = true; } // Bootstrap the initial configuration - co_await raft_sys_table_storage(qp, group0_id, my_id).bootstrap(std::move(initial_configuration)); + co_await raft_sys_table_storage(qp, group0_id, my_id) + .bootstrap(std::move(initial_configuration), nontrivial_snapshot); co_await start_server_for_group0(group0_id, ss, qp, mm, cdc_gen_service); server = &_raft_gr.group0(); // FIXME if we crash now or after getting added to the config but before storing group 0 ID, diff --git a/service/raft/raft_sys_table_storage.cc b/service/raft/raft_sys_table_storage.cc index 2c601cdf01..c24a321cfa 100644 --- a/service/raft/raft_sys_table_storage.cc +++ b/service/raft/raft_sys_table_storage.cc @@ -323,8 +323,9 @@ future<> raft_sys_table_storage::execute_with_linearization_point(std::function< } } -future<> raft_sys_table_storage::bootstrap(raft::configuration initial_configuation) { - raft::snapshot_descriptor snapshot; +future<> raft_sys_table_storage::bootstrap(raft::configuration initial_configuation, bool nontrivial_snapshot) { + auto init_index = nontrivial_snapshot ? raft::index_t{1} : raft::index_t{0}; + raft::snapshot_descriptor snapshot{.idx{init_index}}; snapshot.id = raft::snapshot_id::create_random_id(); snapshot.config = std::move(initial_configuation); co_await store_snapshot_descriptor(snapshot, 0); diff --git a/service/raft/raft_sys_table_storage.hh b/service/raft/raft_sys_table_storage.hh index d82af8f153..482d68e71f 100644 --- a/service/raft/raft_sys_table_storage.hh +++ b/service/raft/raft_sys_table_storage.hh @@ -73,9 +73,14 @@ public: // Persist initial configuration of a new Raft group. // To be called before start for the new group. - // Uses a special snapshot id (0) to identify the snapshot - // descriptor. - future<> bootstrap(raft::configuration initial_configuation); + // + // If `nontrivial_snapshot` is true, the initial snapshot will have index 1 instead of 0, + // which will trigger a snapshot transfer to servers which start with snapshot index 0. + // This should be set for the first group 0 server during upgrade or recovery, which + // will force snapshot transfers for subsequently joining nodes (so we can transfer initial + // schema etc.). It's also correct to do it when booting a cluster from + // scratch with Raft, although not necessary (it will force an empty snapshot transfer). + future<> bootstrap(raft::configuration initial_configuation, bool nontrivial_snapshot); private: future do_store_log_entries_one_batch(const std::vector& entries, size_t start_idx); diff --git a/test/topology_raft_disabled/test_raft_upgrade_basic.py b/test/topology_raft_disabled/test_raft_upgrade_basic.py index d2e0a33555..31300d4c78 100644 --- a/test/topology_raft_disabled/test_raft_upgrade_basic.py +++ b/test/topology_raft_disabled/test_raft_upgrade_basic.py @@ -24,9 +24,6 @@ from test.topology_raft_disabled.util import restart, enable_raft, \ @log_run_time @pytest.mark.replication_factor(1) async def test_raft_upgrade_basic(manager: ManagerClient, random_tables: RandomTables): - """ - kbr-: the test takes about 7 seconds in dev mode on my laptop. - """ servers = await manager.running_servers() cql = manager.cql assert(cql) @@ -53,3 +50,8 @@ async def test_raft_upgrade_basic(manager: ManagerClient, random_tables: RandomT assert(rs) logging.info(f"group0_history entry description: '{rs[0].description}'") assert(table.full_name in rs[0].description) + + logging.info("Booting new node") + await manager.server_add(config={ + 'consistent_cluster_management': True + }) diff --git a/test/topology_raft_disabled/test_raft_upgrade_majority_loss.py b/test/topology_raft_disabled/test_raft_upgrade_majority_loss.py index cac0064aa5..36bfe45bab 100644 --- a/test/topology_raft_disabled/test_raft_upgrade_majority_loss.py +++ b/test/topology_raft_disabled/test_raft_upgrade_majority_loss.py @@ -30,8 +30,6 @@ async def test_recovery_after_majority_loss(manager: ManagerClient, random_table used to recover group 0 might have missed them. However in this test the driver waits for schema agreement to complete before proceeding, so we know that every server learned about the schema changes. - - kbr-: the test takes about 22 seconds in dev mode on my laptop. """ servers = await manager.running_servers() @@ -85,3 +83,8 @@ async def test_recovery_after_majority_loss(manager: ManagerClient, random_table logging.info("Creating another table") await random_tables.add_table(ncolumns=5) + + logging.info("Booting new node") + await manager.server_add(config={ + 'consistent_cluster_management': True + })