Merge 'Fix crash during restart of a single node with topology over raft' from Gleb
This is a regression introduced in f26179cd27.
Fixes: #14136
* 'gleb/set_group0' of github.com:scylladb/scylla-dev:
test: restart first node to see if it can boot after restart
service: move setting of group0 point in storage_service earlier
This commit is contained in:
5
main.cc
5
main.cc
@@ -1603,6 +1603,9 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
group0_service.abort().get();
|
||||
});
|
||||
|
||||
// Set up group0 service earlier since it is needed by group0 setup just below
|
||||
ss.local().set_group0(group0_service);
|
||||
|
||||
// Set up group0 early in case the node is bootstrapped already and the group exists
|
||||
// Need to do it before allowing incoming messaging service connections since
|
||||
// storage proxy's and migration manager's verbs may access group0
|
||||
@@ -1615,7 +1618,7 @@ To start the scylla server proper, simply invoke as: scylla server (or just scyl
|
||||
}).get();
|
||||
|
||||
with_scheduling_group(maintenance_scheduling_group, [&] {
|
||||
return ss.local().join_cluster(cdc_generation_service.local(), sys_dist_ks, proxy, group0_service, qp.local());
|
||||
return ss.local().join_cluster(cdc_generation_service.local(), sys_dist_ks, proxy, qp.local());
|
||||
}).get();
|
||||
|
||||
sl_controller.invoke_on_all([&lifecycle_notifier] (qos::service_level_controller& controller) {
|
||||
|
||||
@@ -2717,12 +2717,14 @@ future<> storage_service::uninit_messaging_service_part() {
|
||||
return container().invoke_on_all(&service::storage_service::uninit_messaging_service);
|
||||
}
|
||||
|
||||
future<> storage_service::join_cluster(cdc::generation_service& cdc_gen_service,
|
||||
sharded<db::system_distributed_keyspace>& sys_dist_ks, sharded<service::storage_proxy>& proxy, raft_group0& group0, cql3::query_processor& qp) {
|
||||
assert(this_shard_id() == 0);
|
||||
|
||||
void storage_service::set_group0(raft_group0& group0) {
|
||||
_group0 = &group0;
|
||||
_raft_topology_change_enabled = _group0->is_raft_enabled() && _db.local().get_config().check_experimental(db::experimental_features_t::feature::RAFT);
|
||||
}
|
||||
|
||||
future<> storage_service::join_cluster(cdc::generation_service& cdc_gen_service,
|
||||
sharded<db::system_distributed_keyspace>& sys_dist_ks, sharded<service::storage_proxy>& proxy, cql3::query_processor& qp) {
|
||||
assert(this_shard_id() == 0);
|
||||
|
||||
set_mode(mode::STARTING);
|
||||
|
||||
|
||||
@@ -333,7 +333,9 @@ public:
|
||||
* \see init_messaging_service_part
|
||||
*/
|
||||
future<> join_cluster(cdc::generation_service& cdc_gen_service,
|
||||
sharded<db::system_distributed_keyspace>& sys_dist_ks, sharded<service::storage_proxy>& proxy, service::raft_group0&, cql3::query_processor& qp);
|
||||
sharded<db::system_distributed_keyspace>& sys_dist_ks, sharded<service::storage_proxy>& proxy, cql3::query_processor& qp);
|
||||
|
||||
void set_group0(service::raft_group0&);
|
||||
|
||||
future<> drain_on_shutdown();
|
||||
|
||||
|
||||
@@ -917,8 +917,10 @@ public:
|
||||
group0_service.abort().get();
|
||||
});
|
||||
|
||||
ss.local().set_group0(group0_service);
|
||||
|
||||
try {
|
||||
ss.local().join_cluster(cdc_generation_service.local(), sys_dist_ks, proxy, group0_service, qp.local()).get();
|
||||
ss.local().join_cluster(cdc_generation_service.local(), sys_dist_ks, proxy, qp.local()).get();
|
||||
} catch (std::exception& e) {
|
||||
// if any of the defers crashes too, we'll never see
|
||||
// the error
|
||||
|
||||
@@ -24,8 +24,15 @@ logger = logging.getLogger(__name__)
|
||||
@pytest.mark.asyncio
|
||||
async def test_topology_ops(request, manager: ManagerClient):
|
||||
"""Test basic topology operations using the topology coordinator."""
|
||||
logger.info("Bootstrapping cluster")
|
||||
servers = [await manager.server_add(), await manager.server_add(), await manager.server_add()]
|
||||
logger.info("Bootstrapping first node")
|
||||
servers = [await manager.server_add()]
|
||||
|
||||
logger.info(f"Restarting node {servers[0]}")
|
||||
await manager.server_stop_gracefully(servers[0].server_id)
|
||||
await manager.server_start(servers[0].server_id)
|
||||
|
||||
logger.info("Bootstrapping other nodes")
|
||||
servers += [await manager.server_add(), await manager.server_add()]
|
||||
|
||||
logger.info(f"Stopping node {servers[0]}")
|
||||
await manager.server_stop_gracefully(servers[0].server_id)
|
||||
|
||||
Reference in New Issue
Block a user