From 0b9f221f2ad70cf0c18d52e2f465ebf95aa15f47 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Sun, 27 Aug 2023 19:03:39 +0300 Subject: [PATCH] gossiper: wait_for_live_nodes_to_show_up: increase timeout This function is too flaky with the 30-second timeout. For example, the following was seen locally with `test_updated_shards_during_add_decommission_node` in dev mode: alternator_stream_tests.py::TestAlternatorStreams::test_updated_shards_during_add_decommission_node/node6.log: ``` INFO 2023-08-27 15:47:25,753 [shard 0] gossip - Waiting for 2 live nodes to show up in gossip, currently 1 present... INFO 2023-08-27 15:47:30,754 [shard 0] gossip - (rate limiting dropped 498 similar messages) Waiting for 2 live nodes to show up in gossip, currently 1 present... INFO 2023-08-27 15:47:35,761 [shard 0] gossip - (rate limiting dropped 495 similar messages) Waiting for 2 live nodes to show up in gossip, currently 1 present... INFO 2023-08-27 15:47:40,766 [shard 0] gossip - (rate limiting dropped 498 similar messages) Waiting for 2 live nodes to show up in gossip, currently 1 present... INFO 2023-08-27 15:47:45,768 [shard 0] gossip - (rate limiting dropped 497 similar messages) Waiting for 2 live nodes to show up in gossip, currently 1 present... INFO 2023-08-27 15:47:50,768 [shard 0] gossip - (rate limiting dropped 497 similar messages) Waiting for 2 live nodes to show up in gossip, currently 1 present... ERROR 2023-08-27 15:47:55,758 [shard 0] gossip - Timed out waiting for 2 live nodes to show up in gossip INFO 2023-08-27 15:47:55,759 [shard 0] init - Shutting down group 0 service ``` alternator_stream_tests.py::TestAlternatorStreams::test_updated_shards_during_add_decommission_node/node1.log: ``` INFO 2023-08-27 15:48:02,532 [shard 0] gossip - InetAddress 127.0.43.6 is now UP, status = UNKNOWN ... 
WARN 2023-08-27 15:48:03,552 [shard 0] gossip - failure_detector_loop: Send echo to node 127.0.43.6, status = failed: seastar::rpc::closed_error (connection is closed) ``` Note that node1 saw node6 as UP after node6 already timed out and was shutting down. Increase the timeout to 3 minutes in all modes to reduce flakiness. Fixes #15185 Signed-off-by: Benny Halevy Closes #15186 --- gms/gossiper.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/gms/gossiper.cc b/gms/gossiper.cc index d4d1595ddc..fac74db3be 100644 --- a/gms/gossiper.cc +++ b/gms/gossiper.cc @@ -2332,12 +2332,8 @@ future<> gossiper::wait_alive(std::vector nodes, std::chrono: future<> gossiper::wait_for_live_nodes_to_show_up(size_t n) { logger::rate_limit rate_limit{std::chrono::seconds{5}}; -#ifdef SEASTAR_DEBUG - // Account for debug slowness. 3 minutes is probably overkill but we don't want flaky tests. + // Account for gossip slowness. 3 minutes is probably overkill but we don't want flaky tests. constexpr auto timeout_delay = std::chrono::minutes{3}; -#else - constexpr auto timeout_delay = std::chrono::seconds{30}; -#endif auto timeout = gossiper::clk::now() + timeout_delay; while (get_live_members().size() < n) { if (timeout <= gossiper::clk::now()) {