Merge 'Fix crash during restart of a single node with topology over raft' from Gleb

This is a regression introduced in f26179cd27.

Fixes: #14136

* 'gleb/set_group0' of github.com:scylladb/scylla-dev:
  test: restart first node to see if it can boot after restart
  service: move setting of group0 point in storage_service earlier
2023-06-07 10:19:27 +02:00
parent 84683c3549 e50f96fc4e
commit 2dbf6f32cd
5 changed files with 25 additions and 9 deletions
--- a/main.cc
+++ b/main.cc
@@ -1603,6 +1603,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
                group0_service.abort().get();
            });

+            // Set up group0 service earlier since it is needed by group0 setup just below
+            ss.local().set_group0(group0_service);
+
            // Setup group0 early in case the node is bootsrapped already and the group exists
            // Need to do it before allowing incomming messaging service connections since
            // storage proxy's and migration manager's verbs may access group0
@@ -1615,7 +1618,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
            }).get();

            with_scheduling_group(maintenance_scheduling_group, [&] {
-                return ss.local().join_cluster(cdc_generation_service.local(), sys_dist_ks, proxy, group0_service, qp.local());
+                return ss.local().join_cluster(cdc_generation_service.local(), sys_dist_ks, proxy, qp.local());
            }).get();

            sl_controller.invoke_on_all([&lifecycle_notifier] (qos::service_level_controller& controller) {
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -2717,12 +2717,14 @@ future<> storage_service::uninit_messaging_service_part() {
    return container().invoke_on_all(&service::storage_service::uninit_messaging_service);
 }

-future<> storage_service::join_cluster(cdc::generation_service& cdc_gen_service,
-        sharded<db::system_distributed_keyspace>& sys_dist_ks, sharded<service::storage_proxy>& proxy, raft_group0& group0, cql3::query_processor& qp) {
-    assert(this_shard_id() == 0);
-
+void storage_service::set_group0(raft_group0& group0) {
    _group0 = &group0;
    _raft_topology_change_enabled = _group0->is_raft_enabled() && _db.local().get_config().check_experimental(db::experimental_features_t::feature::RAFT);
+}
+
+future<> storage_service::join_cluster(cdc::generation_service& cdc_gen_service,
+        sharded<db::system_distributed_keyspace>& sys_dist_ks, sharded<service::storage_proxy>& proxy, cql3::query_processor& qp) {
+    assert(this_shard_id() == 0);

    set_mode(mode::STARTING);

--- a/service/storage_service.hh
+++ b/service/storage_service.hh
@@ -333,7 +333,9 @@ public:
     * \see init_messaging_service_part
     */
    future<> join_cluster(cdc::generation_service& cdc_gen_service,
-            sharded<db::system_distributed_keyspace>& sys_dist_ks, sharded<service::storage_proxy>& proxy, service::raft_group0&, cql3::query_processor& qp);
+            sharded<db::system_distributed_keyspace>& sys_dist_ks, sharded<service::storage_proxy>& proxy, cql3::query_processor& qp);
+
+    void set_group0(service::raft_group0&);

    future<> drain_on_shutdown();

--- a/test/lib/cql_test_env.cc
+++ b/test/lib/cql_test_env.cc
@@ -917,8 +917,10 @@ public:
                group0_service.abort().get();
            });

+            ss.local().set_group0(group0_service);
+
            try {
-                ss.local().join_cluster(cdc_generation_service.local(), sys_dist_ks, proxy, group0_service, qp.local()).get();
+                ss.local().join_cluster(cdc_generation_service.local(), sys_dist_ks, proxy, qp.local()).get();
            } catch (std::exception& e) {
                // if any of the defers crashes too, we'll never see
                // the error
--- a/test/topology_experimental_raft/test_topology_ops.py
+++ b/test/topology_experimental_raft/test_topology_ops.py
@@ -24,8 +24,15 @@ logger = logging.getLogger(__name__)
@pytest.mark.asyncio
 async def test_topology_ops(request, manager: ManagerClient):
    """Test basic topology operations using the topology coordinator."""
-    logger.info("Bootstrapping cluster")
-    servers = [await manager.server_add(), await manager.server_add(), await manager.server_add()]
+    logger.info("Bootstrapping first node")
+    servers = [await manager.server_add()]
+
+    logger.info(f"Restarting node {servers[0]}")
+    await manager.server_stop_gracefully(servers[0].server_id)
+    await manager.server_start(servers[0].server_id)
+
+    logger.info("Bootstrapping other nodes")
+    servers += [await manager.server_add(), await manager.server_add()]

    logger.info(f"Stopping node {servers[0]}")
    await manager.server_stop_gracefully(servers[0].server_id)