test: tablets: Restart cluster in a graceful manner to avoid connection drop in the middle of request serving

After restarting each node, we should wait for other nodes to notice
the node is UP before restarting the next server. Otherwise, the next
node we restart may not send the shutdown notification to the
previously restarted node, if it still sees it as down when we
initiate its shutdown. In this case, the node will learn about the
restart from gossip later, possibly after we have already started CQL
requests. When a node learns that another node has restarted while it
still considers it UP, it will close connections to that node. This will
fail RPCs sent to that node, which will cause CQL requests to time out.

Fixes #14746

Closes scylladb/scylladb#16010
This commit is contained in:
Tomasz Grabiec
2023-11-09 01:05:04 +01:00
committed by Kamil Braun
parent 63e4d6c965
commit 84ea8b32b2
2 changed files with 23 additions and 2 deletions

View File

@@ -164,6 +164,28 @@ class ManagerClient():
await self.server_sees_others(server_id, wait_others, interval = wait_interval)
self._driver_update()
async def rolling_restart(self, servers) -> None:
    """Restart the given servers one at a time, in the order given.

    For each server: stop it gracefully, wait until every other server
    in ``servers`` no longer sees it, start it again, and then wait until
    every other server in ``servers`` sees it before moving on to the
    next one.

    Args:
        servers: indexable sequence of server descriptors; each element
            must expose ``server_id`` and ``ip_addr`` attributes.
    """
    for idx, s in enumerate(servers):
        # Every server except the one being restarted. Exclusion is by
        # position (not identity or equality) so that only the entry at
        # ``idx`` itself is ever skipped.
        peers = [peer for peer_idx, peer in enumerate(servers) if peer_idx != idx]

        await self.server_stop_gracefully(s.server_id)

        # Wait for other servers to see the server to be stopped
        # so that the later server_sees_other_server() call will not
        # exit immediately, making it moot.
        for peer in peers:
            await self.server_not_sees_other_server(peer.ip_addr, s.ip_addr)

        await self.server_start(s.server_id)

        # Wait for other servers to see the restarted server.
        # Otherwise, the next server we are going to restart may not yet see "s" as restarted
        # and will not send graceful shutdown message to it. Server "s" may learn about the
        # restart from gossip later and close connections while we already sent CQL requests
        # to it, which will cause them to time out. Refs #14746.
        for peer in peers:
            await self.server_sees_other_server(peer.ip_addr, s.ip_addr)
async def server_pause(self, server_id: ServerNum) -> None:
"""Pause the specified server."""
logger.debug("ManagerClient pausing %s", server_id)

View File

@@ -110,8 +110,7 @@ async def test_tablet_metadata_propagates_with_schema_changes_in_snapshot_mode(m
conn_logger.setLevel(logging.DEBUG)
try:
# Check that after rolling restart the tablet metadata is still there
for s in servers:
await manager.server_restart(s.server_id, wait_others=2)
await manager.rolling_restart(servers)
cql = await reconnect_driver(manager)