tablets: Balance tablets concurrently with active migrations

After this change, the load balancer can make progress with active
migrations. If the algorithm is called with active tablet migrations
in tablet metadata, those are treated by the load balancer as if they
were already completed. This allows the algorithm to incrementally make
decisions which, when executed alongside the active migrations, will
produce the desired result.

Overload of shards is limited by the fact that the algorithm tracks
streaming concurrency on both source and target shards of active
migrations and takes concurrency limit into account when producing new
migrations.

The coordinator executes the load balancer on edges of the tablet state
machine transitions. This allows new migrations to be started as soon
as tablets finish streaming.

The load balancer is also continuously invoked as long as it produces
a non-empty plan. This is in order to saturate the cluster with
streaming. A single make_plan() call is still not saturating, due
to the way the algorithm is implemented.
This commit is contained in:
Tomasz Grabiec
2023-07-24 23:55:27 +02:00
parent c9ea215ce1
commit fe181b3bac
7 changed files with 214 additions and 51 deletions

View File

@@ -57,6 +57,13 @@ class load_sketch {
};
std::unordered_map<host_id, node_load> _nodes;
token_metadata_ptr _tm;
private:
tablet_replica_set get_replicas_for_tablet_load(const tablet_info& ti, const tablet_transition_info* trinfo) const {
    // No transition in flight for this tablet: its load is wherever
    // the current replicas are.
    if (!trinfo) {
        return ti.replicas;
    }
    // A migration is in progress. Account for it as though it had already
    // finished, optimistically assuming that it will succeed.
    return trinfo->next;
}
public:
load_sketch(token_metadata_ptr tm)
: _tm(std::move(tm)) {
@@ -65,10 +72,10 @@ public:
future<> populate(std::optional<host_id> host = std::nullopt) {
const topology& topo = _tm->get_topology();
co_await utils::clear_gently(_nodes);
for (auto&& [table, tmap] : _tm->tablets().all_tables()) {
for (const tablet_info& ti : tmap.tablets()) {
co_await coroutine::maybe_yield();
for (auto&& replica : ti.replicas) {
for (auto&& [table, tmap_] : _tm->tablets().all_tables()) {
auto& tmap = tmap_;
co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) {
for (auto&& replica : get_replicas_for_tablet_load(ti, tmap.get_tablet_transition_info(tid))) {
if (host && *host != replica.host) {
continue;
}
@@ -80,7 +87,7 @@ public:
n._shards[replica.shard].load += 1;
}
}
}
});
}
for (auto&& n : _nodes) {
std::make_heap(n.second._shards.begin(), n.second._shards.end(), shard_load_cmp());

View File

@@ -75,6 +75,20 @@ tablet_transition_info::tablet_transition_info(tablet_transition_stage stage, ta
, reads(get_selector_for_reads(stage))
{ }
// Computes which replicas take part in streaming for a transition:
//   read_from  = current replicas (tinfo.replicas) not present in trinfo.next
//   written_to = replicas in trinfo.next not present in tinfo.replicas
tablet_migration_streaming_info get_migration_streaming_info(const tablet_info& tinfo, const tablet_transition_info& trinfo) {
    using replica_set = std::unordered_set<tablet_replica>;

    // Start from the full current / next replica sets...
    replica_set sources(tinfo.replicas.begin(), tinfo.replicas.end());
    replica_set destinations(trinfo.next.begin(), trinfo.next.end());

    // ...and drop every replica that appears on the other side of the
    // transition, leaving only the set differences.
    for (const auto& staying : trinfo.next) {
        sources.erase(staying);
    }
    for (const auto& existing : tinfo.replicas) {
        destinations.erase(existing);
    }

    // Member order of the aggregate is {read_from, written_to}.
    return tablet_migration_streaming_info{std::move(sources), std::move(destinations)};
}
tablet_replica get_leaving_replica(const tablet_info& tinfo, const tablet_transition_info& trinfo) {
std::unordered_set<tablet_replica> leaving(tinfo.replicas.begin(), tinfo.replicas.end());
for (auto&& r : trinfo.next) {

View File

@@ -171,6 +171,14 @@ struct tablet_transition_info {
// Returns the leaving replica for a given transition.
tablet_replica get_leaving_replica(const tablet_info&, const tablet_transition_info&);
/// Describes streaming required for a given tablet transition.
///
/// Values of this type are produced by get_migration_streaming_info().
struct tablet_migration_streaming_info {
/// Replicas that are in the tablet's current replica set but not in the
/// transition's next replica set.
std::unordered_set<tablet_replica> read_from;
/// Replicas that are in the transition's next replica set but not in the
/// tablet's current replica set.
std::unordered_set<tablet_replica> written_to;
};
tablet_migration_streaming_info get_migration_streaming_info(const tablet_info&, const tablet_transition_info&);
/// Stores information about tablets of a single table.
///
/// The map contains a constant number of tablets, tablet_count().