From 94b5165ac7fd6cdbb0d946f6f97af1e4d20baf11 Mon Sep 17 00:00:00 2001 From: Tomasz Grabiec Date: Wed, 22 Jan 2025 22:14:32 +0100 Subject: [PATCH] tablets: Use scheduler's make_sizing_plan() to decide about tablet count of a new table This makes decisions made by the scheduler consistent with decisions made on table creation, with regard to tablet count. We want to avoid over-allocation of tablets when table is created, which would then be reduced by the scheduler's scaling logic. Not just to avoid wasteful migrations post table creation, but to respect the per-shard goal. To respect the per-shard goal, the algorithm will no longer be as simple as looking at hints, and we want to share the algorithm between the scheduler and initial tablet allocator. So invoke the scheduler to get the tablet count when table is created. --- locator/network_topology_strategy.cc | 4 +-- locator/network_topology_strategy.hh | 2 +- locator/tablet_replication_strategy.hh | 2 +- service/tablet_allocator.cc | 32 +++++++++++++++++--- test/boost/network_topology_strategy_test.cc | 4 +-- test/boost/tablets_test.cc | 2 +- test/perf/tablet_load_balancing.cc | 2 +- 7 files changed, 34 insertions(+), 14 deletions(-) diff --git a/locator/network_topology_strategy.cc b/locator/network_topology_strategy.cc index 3083de3276..52b32ac36b 100644 --- a/locator/network_topology_strategy.cc +++ b/locator/network_topology_strategy.cc @@ -344,14 +344,12 @@ future network_topology_strategy::calculate_min_tablet_count(schema_ptr co_return tablet_count; } -future network_topology_strategy::allocate_tablets_for_new_table(schema_ptr s, token_metadata_ptr tm, uint64_t target_tablet_size, std::optional initial_scale) const { - size_t tablet_count = co_await calculate_min_tablet_count(s, tm, target_tablet_size, initial_scale); +future network_topology_strategy::allocate_tablets_for_new_table(schema_ptr s, token_metadata_ptr tm, size_t tablet_count) const { auto aligned_tablet_count = 1ul << log2ceil(tablet_count); if (tablet_count != aligned_tablet_count) { rslogger.info("Rounding up tablet count from {} to {} for table {}.{}", tablet_count, aligned_tablet_count, s->ks_name(), s->cf_name()); tablet_count = aligned_tablet_count; } - co_return co_await reallocate_tablets(std::move(s), std::move(tm), tablet_map(tablet_count)); } diff --git a/locator/network_topology_strategy.hh b/locator/network_topology_strategy.hh index 59bbaa6e9b..332fbbbecf 100644 --- a/locator/network_topology_strategy.hh +++ b/locator/network_topology_strategy.hh @@ -47,7 +47,7 @@ public: public: // tablet_aware_replication_strategy virtual effective_replication_map_ptr make_replication_map(table_id, token_metadata_ptr) const override; virtual future calculate_min_tablet_count(schema_ptr s, token_metadata_ptr tm, uint64_t target_tablet_size, std::optional initial_scale) const override; - virtual future allocate_tablets_for_new_table(schema_ptr, token_metadata_ptr, uint64_t target_tablet_size, std::optional initial_scale = std::nullopt) const override; + virtual future allocate_tablets_for_new_table(schema_ptr, token_metadata_ptr, size_t tablet_count) const override; virtual future reallocate_tablets(schema_ptr, token_metadata_ptr, tablet_map cur_tablets) const override; protected: /** diff --git a/locator/tablet_replication_strategy.hh b/locator/tablet_replication_strategy.hh index a729df3a36..e63c8fa480 100644 --- a/locator/tablet_replication_strategy.hh +++ b/locator/tablet_replication_strategy.hh @@ -42,7 +42,7 @@ public: /// Generates tablet_map for a new table. /// Runs under group0 guard. - virtual future allocate_tablets_for_new_table(schema_ptr, token_metadata_ptr, uint64_t target_tablet_size, std::optional initial_scale = std::nullopt) const = 0; + virtual future allocate_tablets_for_new_table(schema_ptr, token_metadata_ptr, size_t tablet_count) const = 0; /// Generates tablet_map for a new table or when increasing replication factor. /// For a new table, cur_tablets is initialized with the tablet_count, diff --git a/service/tablet_allocator.cc b/service/tablet_allocator.cc index 98b4c5abc7..7a4457d25c 100644 --- a/service/tablet_allocator.cc +++ b/service/tablet_allocator.cc @@ -1067,7 +1067,7 @@ public: std::unordered_map tables; }; - future make_sizing_plan() { + future make_sizing_plan(schema_ptr new_table = nullptr, const tablet_aware_replication_strategy* new_rs = nullptr) { sizing_plan plan; auto process_table = [&] (table_id table, schema_ptr s, const tablet_aware_replication_strategy* rs, size_t tablet_count) -> future<> { @@ -1128,6 +1128,10 @@ public: co_await process_table(table, s, rs, tmap->tablet_count()); } + if (new_table) { + co_await process_table(new_table->id(), new_table, new_rs, 0); + } + co_return std::move(plan); } @@ -2765,6 +2769,18 @@ class tablet_allocator_impl : public tablet_allocator::impl load_balancer_stats_manager _load_balancer_stats; bool _stopped = false; bool _use_tablet_aware_balancing = true; +private: + load_balancer make_load_balancer(token_metadata_ptr tm, + locator::load_stats_ptr table_load_stats, + std::unordered_set skiplist) { + load_balancer lb(_db, tm, std::move(table_load_stats), _load_balancer_stats, + _db.get_config().target_tablet_size_in_bytes(), + _db.get_config().tablets_per_shard_goal(), + std::move(skiplist)); + lb.set_use_table_aware_balancing(_use_tablet_aware_balancing); + lb.set_initial_scale(_config.initial_tablets_scale); + return lb; + } public: tablet_allocator_impl(tablet_allocator::config cfg, service::migration_notifier& mn, replica::database& db) : _config(std::move(cfg)) @@ -2789,9 +2805,7 @@ public: } future balance_tablets(token_metadata_ptr tm, locator::load_stats_ptr table_load_stats, std::unordered_set skiplist) { - load_balancer lb(_db, tm, std::move(table_load_stats), _load_balancer_stats, _db.get_config().target_tablet_size_in_bytes(), std::move(skiplist)); - lb.set_use_table_aware_balancing(_use_tablet_aware_balancing); - lb.set_initial_scale(_config.initial_tablets_scale); + auto lb = make_load_balancer(tm, std::move(table_load_stats), std::move(skiplist)); co_return co_await lb.make_plan(); } @@ -2805,7 +2819,15 @@ public: if (auto&& tablet_rs = rs->maybe_as_tablet_aware()) { auto tm = _db.get_shared_token_metadata().get(); lblogger.debug("Creating tablets for {}.{} id={}", s.ks_name(), s.cf_name(), s.id()); - auto map = tablet_rs->allocate_tablets_for_new_table(s.shared_from_this(), tm, _db.get_config().target_tablet_size_in_bytes(), _config.initial_tablets_scale).get(); + auto lb = make_load_balancer(tm, nullptr, {}); + auto plan = lb.make_sizing_plan(s.shared_from_this(), tablet_rs).get(); + auto& table_plan = plan.tables[s.id()]; + if (table_plan.target_tablet_count_aligned != table_plan.target_tablet_count) { + lblogger.info("Rounding up tablet count from {} to {} for table {}.{}", table_plan.target_tablet_count, + table_plan.target_tablet_count_aligned, s.ks_name(), s.cf_name()); + } + auto tablet_count = table_plan.target_tablet_count_aligned; + auto map = tablet_rs->allocate_tablets_for_new_table(s.shared_from_this(), tm, tablet_count).get(); muts.emplace_back(tablet_map_to_mutation(map, s.id(), s.ks_name(), s.cf_name(), ts, _db.features()).get()); } } diff --git a/test/boost/network_topology_strategy_test.cc b/test/boost/network_topology_strategy_test.cc index 553bd6bf8c..a2a8506a45 100644 --- a/test/boost/network_topology_strategy_test.cc +++ b/test/boost/network_topology_strategy_test.cc @@ -519,7 +519,7 @@ SEASTAR_THREAD_TEST_CASE(NetworkTopologyStrategy_tablets_test) { "NetworkTopologyStrategy", params); auto tab_awr_ptr = ars_ptr->maybe_as_tablet_aware(); BOOST_REQUIRE(tab_awr_ptr); - auto tmap = tab_awr_ptr->allocate_tablets_for_new_table(s, stm.get(), service::default_target_tablet_size).get(); + auto tmap = tab_awr_ptr->allocate_tablets_for_new_table(s, stm.get(), tablet_count).get(); full_ring_check(tmap, ars_ptr, stm.get()); // Test reallocate_tablets after randomizing a different set of options @@ -611,7 +611,7 @@ static void test_random_balancing(sharded& snitch, gms::inet_address auto nts_ptr = dynamic_cast(ars_ptr.get()); auto tab_awr_ptr = ars_ptr->maybe_as_tablet_aware(); BOOST_REQUIRE(tab_awr_ptr); - auto tmap = tab_awr_ptr->allocate_tablets_for_new_table(s, tmptr, 1).get(); + auto tmap = tab_awr_ptr->allocate_tablets_for_new_table(s, tmptr, tablet_count).get(); full_ring_check(tmap, ars_ptr, stm.get()); check_tablets_balance(tmap, nts_ptr, topo); diff --git a/test/boost/tablets_test.cc b/test/boost/tablets_test.cc index f32ad3a941..c83cae0443 100644 --- a/test/boost/tablets_test.cc +++ b/test/boost/tablets_test.cc @@ -3096,7 +3096,7 @@ static void execute_tablet_for_new_rf_test(calculate_tablet_replicas_for_new_rf_ return make_ready_future<>(); }).get(); - auto allocated_map = tablet_aware_ptr->allocate_tablets_for_new_table(s, stm.get(), 0).get(); + auto allocated_map = tablet_aware_ptr->allocate_tablets_for_new_table(s, stm.get(), tablet_count).get(); BOOST_REQUIRE_EQUAL(allocated_map.tablet_count(), tablet_count); diff --git a/test/perf/tablet_load_balancing.cc b/test/perf/tablet_load_balancing.cc index babc933a3c..5039d2e5ec 100644 --- a/test/perf/tablet_load_balancing.cc +++ b/test/perf/tablet_load_balancing.cc @@ -300,7 +300,7 @@ future test_load_balancing_with_many_tables(params p, bool tablet_aware opts[rack1.dc] = format("{}", rf); network_topology_strategy tablet_rs(replication_strategy_params(opts, initial_tablets)); stm.mutate_token_metadata([&] (token_metadata& tm) -> future<> { - auto map = co_await tablet_rs.allocate_tablets_for_new_table(s, stm.get(), service::default_target_tablet_size); + auto map = co_await tablet_rs.allocate_tablets_for_new_table(s, stm.get(), initial_tablets.value_or(1)); tm.tablets().set_tablet_map(s->id(), std::move(map)); }).get(); };