tablets: Balance tablets concurrently with active migrations

After this change, the load balancer can make progress while migrations are active. If the algorithm is called with active tablet migrations in tablet metadata, the load balancer treats them as if they were already completed. This allows the algorithm to incrementally make decisions which, when executed alongside the active migrations, will produce the desired result. Overload of shards is limited by the fact that the algorithm tracks streaming concurrency on both the source and target shards of active migrations and takes the concurrency limit into account when producing new migrations. The coordinator executes the load balancer on the edges of tablet state machine transitions. This allows new migrations to be started as soon as tablets finish streaming. The load balancer is also invoked continuously as long as it produces a non-empty plan. This is in order to saturate the cluster with streaming. A single make_plan() call is still not saturating, due to the way the algorithm is implemented.
2023-07-24 23:55:27 +02:00
parent c9ea215ce1
commit fe181b3bac
7 changed files with 214 additions and 51 deletions
--- a/locator/load_sketch.hh
+++ b/locator/load_sketch.hh
@@ -57,6 +57,13 @@ class load_sketch {
    };
    std::unordered_map<host_id, node_load> _nodes;
    token_metadata_ptr _tm;
+private:
+    tablet_replica_set get_replicas_for_tablet_load(const tablet_info& ti, const tablet_transition_info* trinfo) const {
+        // We reflect migrations in the load as if they already happened,
+        // optimistically assuming that they will succeed.
+        return trinfo ? trinfo->next : ti.replicas;
+    }
+
 public:
    load_sketch(token_metadata_ptr tm)
        : _tm(std::move(tm)) {
@@ -65,10 +72,10 @@ public:
    future<> populate(std::optional<host_id> host = std::nullopt) {
        const topology& topo = _tm->get_topology();
        co_await utils::clear_gently(_nodes);
-        for (auto&& [table, tmap] : _tm->tablets().all_tables()) {
-            for (const tablet_info& ti : tmap.tablets()) {
-                co_await coroutine::maybe_yield();
-                for (auto&& replica : ti.replicas) {
+        for (auto&& [table, tmap_] : _tm->tablets().all_tables()) {
+            auto& tmap = tmap_;
+            co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) {
+                for (auto&& replica : get_replicas_for_tablet_load(ti, tmap.get_tablet_transition_info(tid))) {
                    if (host && *host != replica.host) {
                        continue;
                    }
@@ -80,7 +87,7 @@ public:
                        n._shards[replica.shard].load += 1;
                    }
                }
-            }
+            });
        }
        for (auto&& n : _nodes) {
            std::make_heap(n.second._shards.begin(), n.second._shards.end(), shard_load_cmp());
--- a/locator/tablets.cc
+++ b/locator/tablets.cc
@@ -75,6 +75,20 @@ tablet_transition_info::tablet_transition_info(tablet_transition_stage stage, ta
    , reads(get_selector_for_reads(stage))
 { }

+tablet_migration_streaming_info get_migration_streaming_info(const tablet_info& tinfo, const tablet_transition_info& trinfo) {
+    tablet_migration_streaming_info result = {
+        .read_from = std::unordered_set<tablet_replica>(tinfo.replicas.begin(), tinfo.replicas.end()),
+        .written_to = std::unordered_set<tablet_replica>(trinfo.next.begin(), trinfo.next.end())
+    };
+    for (auto&& r : trinfo.next) {
+        result.read_from.erase(r);
+    }
+    for (auto&& r : tinfo.replicas) {
+        result.written_to.erase(r);
+    }
+    return result;
+}
+
 tablet_replica get_leaving_replica(const tablet_info& tinfo, const tablet_transition_info& trinfo) {
    std::unordered_set<tablet_replica> leaving(tinfo.replicas.begin(), tinfo.replicas.end());
    for (auto&& r : trinfo.next) {
--- a/locator/tablets.hh
+++ b/locator/tablets.hh
@@ -171,6 +171,14 @@ struct tablet_transition_info {
 // Returns the leaving replica for a given transition.
 tablet_replica get_leaving_replica(const tablet_info&, const tablet_transition_info&);

+/// Describes streaming required for a given tablet transition.
+struct tablet_migration_streaming_info {
+    std::unordered_set<tablet_replica> read_from;
+    std::unordered_set<tablet_replica> written_to;
+};
+
+tablet_migration_streaming_info get_migration_streaming_info(const tablet_info&, const tablet_transition_info&);
+
 /// Stores information about tablets of a single table.
 ///
 /// The map contains a constant number of tablets, tablet_count().