tablets: Equalize per-table balance when allocating tablets for a new table

Fixes the following scenario: 1. Scale out adds new nodes to each rack 2. Table is created - all tablets are allocated to new nodes because they have low load 3. Rebalancing moves tablets from old nodes to new nodes - table balance for the new table is not fixed We're wrong to try to equalize global load when allocating tablets, and we should equalize per-table load instead, and let background load balancing fix it in a fair way. It will add to the allocated storage imbalance, but: 1. The table is initially empty, so doesn't impact actual storage imbalance. 2. It's more important to avoid overloading CPU on the nodes - imbalance hurts this aspect immediately. 3. If the table was created before imbalance was formed, we would end up in the same situation in the problematic scenario after the patch. 4. It's the job of the load balancing to keep up with storage growing, and if it's not, scale out should kick in. Before we have CPU-aware tablet allocation, and thus can prove we have CPU capacity on the small nodes, we should respect per-table balance as this is the way in which we achieve full CPU utilization. Fixes #23631 (cherry picked from commit 1e407ab4d2)
2025-04-11 14:34:54 +02:00
parent e73954da80
commit ba3d53be55
2 changed files with 47 additions and 1 deletions
--- a/locator/network_topology_strategy.cc
+++ b/locator/network_topology_strategy.cc
@@ -348,7 +348,7 @@ future<tablet_map> network_topology_strategy::allocate_tablets_for_new_table(sch
 future<tablet_map> network_topology_strategy::reallocate_tablets(schema_ptr s, token_metadata_ptr tm, tablet_map tablets) const {
    natural_endpoints_tracker::check_enough_endpoints(*tm, _dc_rep_factor);
    load_sketch load(tm);
-    co_await load.populate();
+    co_await load.populate(std::nullopt, s->id());

    tablet_logger.debug("Allocating tablets for {}.{} ({}): dc_rep_factor={} tablet_count={}", s->ks_name(), s->cf_name(), s->id(), _dc_rep_factor, tablets.tablet_count());

--- a/test/boost/tablets_test.cc
+++ b/test/boost/tablets_test.cc
@@ -2780,6 +2780,52 @@ SEASTAR_THREAD_TEST_CASE(test_imbalance_in_hetero_cluster_with_two_tables) {
    }).get();
 }

+// Reproduces https://github.com/scylladb/scylladb/issues/23631
+SEASTAR_THREAD_TEST_CASE(test_imbalance_in_hetero_cluster_with_two_tables_imbalanced) {
+    do_with_cql_env_thread([] (auto& e) {
+        topology_builder topo(e);
+        shared_load_stats& load_stats = topo.get_shared_load_stats();
+
+        auto rack1 = topo.rack();
+        auto rack2 = topo.start_new_rack();
+        auto rack3 = topo.start_new_rack();
+
+        topo.add_i4i_2xlarge(rack1);
+        topo.add_i4i_2xlarge(rack2);
+        topo.add_i4i_2xlarge(rack3);
+
+        auto& stm = e.shared_token_metadata().local();
+
+        auto ks_name = add_keyspace(e, {{topo.dc(), 3}}, 512);
+        auto table1 = add_table(e, ks_name).get();
+        load_stats.set_size(table1, topo.get_capacity() * 0.8 / 3);
+        testlog.info("Initial cluster ready");
+
+        topo.add_i4i_large(rack1);
+        topo.add_i4i_large(rack2);
+        topo.add_i4i_large(rack3);
+        testlog.info("Expanded capacity");
+
+        auto ks2_name = add_keyspace(e, {{topo.dc(), 3}});
+        auto table2 = add_table(e, ks2_name).get();
+
+        auto& hosts = topo.hosts();
+
+        {
+            load_sketch load(stm.get());
+            load.populate(std::nullopt, table2).get();
+
+            min_max_tracker<double> node_utilization;
+            for (auto h : hosts) {
+                auto u = load.get_allocated_utilization(h, *topo.get_load_stats(), default_target_tablet_size);
+                testlog.info("table2: {}: {}", h, u);
+                node_utilization.update(u.value_or(0));
+            }
+            BOOST_REQUIRE_LT(node_utilization.max() - node_utilization.min(), 0.13);
+        }
+    }).get();
+}
+
 SEASTAR_TEST_CASE(test_tablet_id_and_range_side) {
    static constexpr size_t tablet_count = 128;
    locator::tablet_map tmap(tablet_count);