Merge 'Concurrent tablet migration and balancing' from Tomasz Grabiec

This change makes tablet load balancing more efficient by performing
migrations independently for different tablets, and making new load
balancing plans concurrently with active migrations.

The migration track is interrupted by pending topology change operations.

The coordinator executes the load balancer on edges of tablet state
machine transitions. This allows new migrations to be started as soon
as tablets finish streaming.

The load balancer is also continuously invoked as long as it produces
a non-empty plan. This is in order to saturate the cluster with
streaming. A single make_plan() call is still not saturating, due
to the way algorithm is implemented.

Overload of shards is limited by the fact that load balancer algorithm tracks
streaming concurrency on both source and target shards of active
migrations and takes concurrency limit into account when producing new
migrations.

Closes #14851

* github.com:scylladb/scylladb:
  tablets: load_balancer: Remove double logging
  tests: tablets: Check that load balancing is interrupted by topology change
  tests: tablets: Add test for load balancing with active migrations
  tablets: Balance tablets concurrently with active migrations
  storage_service, tablets: Extract generate_migration_updates()
  storage_service, tablets: Move get_leaving_replica() to tablets.cc
  locator: tablets: Move std::hash definition earlier
  storage_service: Advance tablets independently
  topology_coordinator: Fix missed notification on abort
  tablets: Add formatter for tablet_migration_info
This commit is contained in:
Avi Kivity
2023-07-31 16:44:33 +03:00
11 changed files with 698 additions and 287 deletions

View File

@@ -57,6 +57,13 @@ class load_sketch {
};
std::unordered_map<host_id, node_load> _nodes;
token_metadata_ptr _tm;
private:
tablet_replica_set get_replicas_for_tablet_load(const tablet_info& ti, const tablet_transition_info* trinfo) const {
// We reflect migrations in the load as if they already happened,
// optimistically assuming that they will succeed.
return trinfo ? trinfo->next : ti.replicas;
}
public:
load_sketch(token_metadata_ptr tm)
: _tm(std::move(tm)) {
@@ -65,10 +72,10 @@ public:
future<> populate(std::optional<host_id> host = std::nullopt) {
const topology& topo = _tm->get_topology();
co_await utils::clear_gently(_nodes);
for (auto&& [table, tmap] : _tm->tablets().all_tables()) {
for (const tablet_info& ti : tmap.tablets()) {
co_await coroutine::maybe_yield();
for (auto&& replica : ti.replicas) {
for (auto&& [table, tmap_] : _tm->tablets().all_tables()) {
auto& tmap = tmap_;
co_await tmap.for_each_tablet([&] (tablet_id tid, const tablet_info& ti) {
for (auto&& replica : get_replicas_for_tablet_load(ti, tmap.get_tablet_transition_info(tid))) {
if (host && *host != replica.host) {
continue;
}
@@ -80,7 +87,7 @@ public:
n._shards[replica.shard].load += 1;
}
}
}
});
}
for (auto&& n : _nodes) {
std::make_heap(n.second._shards.begin(), n.second._shards.end(), shard_load_cmp());

View File

@@ -75,6 +75,34 @@ tablet_transition_info::tablet_transition_info(tablet_transition_stage stage, ta
, reads(get_selector_for_reads(stage))
{ }
tablet_migration_streaming_info get_migration_streaming_info(const tablet_info& tinfo, const tablet_transition_info& trinfo) {
tablet_migration_streaming_info result = {
.read_from = std::unordered_set<tablet_replica>(tinfo.replicas.begin(), tinfo.replicas.end()),
.written_to = std::unordered_set<tablet_replica>(trinfo.next.begin(), trinfo.next.end())
};
for (auto&& r : trinfo.next) {
result.read_from.erase(r);
}
for (auto&& r : tinfo.replicas) {
result.written_to.erase(r);
}
return result;
}
tablet_replica get_leaving_replica(const tablet_info& tinfo, const tablet_transition_info& trinfo) {
std::unordered_set<tablet_replica> leaving(tinfo.replicas.begin(), tinfo.replicas.end());
for (auto&& r : trinfo.next) {
leaving.erase(r);
}
if (leaving.empty()) {
throw std::runtime_error(format("No leaving replicas"));
}
if (leaving.size() > 1) {
throw std::runtime_error(format("More than one leaving replica"));
}
return *leaving.begin();
}
const tablet_map& tablet_metadata::get_tablet_map(table_id id) const {
try {
return _tablets.at(id);
@@ -162,6 +190,10 @@ future<> tablet_map::for_each_tablet(seastar::noncopyable_function<void(tablet_i
}
}
void tablet_map::clear_transitions() {
_transitions.clear();
}
std::optional<shard_id> tablet_map::get_shard(tablet_id tid, host_id host) const {
auto&& info = get_tablet_info(tid);

View File

@@ -45,21 +45,6 @@ struct tablet_id {
bool operator<=>(const tablet_id&) const = default;
};
}
namespace std {
template<>
struct hash<locator::tablet_id> {
size_t operator()(const locator::tablet_id& id) const {
return std::hash<size_t>()(id.value());
}
};
}
namespace locator {
/// Identifies tablet (not be confused with tablet replica) in the scope of the whole cluster.
struct global_tablet_id {
table_id table;
@@ -80,6 +65,39 @@ std::ostream& operator<<(std::ostream&, const tablet_replica&);
using tablet_replica_set = utils::small_vector<tablet_replica, 3>;
}
namespace std {
template<>
struct hash<locator::tablet_id> {
size_t operator()(const locator::tablet_id& id) const {
return std::hash<size_t>()(id.value());
}
};
template<>
struct hash<locator::tablet_replica> {
size_t operator()(const locator::tablet_replica& r) const {
return utils::hash_combine(
std::hash<locator::host_id>()(r.host),
std::hash<shard_id>()(r.shard));
}
};
template<>
struct hash<locator::global_tablet_id> {
size_t operator()(const locator::global_tablet_id& id) const {
return utils::hash_combine(
std::hash<table_id>()(id.table),
std::hash<locator::tablet_id>()(id.tablet));
}
};
}
namespace locator {
/// Creates a new replica set with old_replica replaced by new_replica.
/// If there is no old_replica, the set is returned unchanged.
inline
@@ -150,6 +168,17 @@ struct tablet_transition_info {
bool operator==(const tablet_transition_info&) const = default;
};
// Returns the leaving replica for a given transition.
tablet_replica get_leaving_replica(const tablet_info&, const tablet_transition_info&);
/// Describes streaming required for a given tablet transition.
struct tablet_migration_streaming_info {
std::unordered_set<tablet_replica> read_from;
std::unordered_set<tablet_replica> written_to;
};
tablet_migration_streaming_info get_migration_streaming_info(const tablet_info&, const tablet_transition_info&);
/// Stores information about tablets of a single table.
///
/// The map contains a constant number of tablets, tablet_count().
@@ -264,6 +293,7 @@ public:
public:
void set_tablet(tablet_id, tablet_info);
void set_tablet_transition_info(tablet_id, tablet_transition_info);
void clear_transitions();
// Destroys gently.
// The tablet map is not usable after this call and should be destroyed.
@@ -297,6 +327,7 @@ private:
public:
const tablet_map& get_tablet_map(table_id id) const;
const table_to_tablet_map& all_tables() const { return _tablets; }
table_to_tablet_map& all_tables() { return _tablets; }
size_t external_memory_usage() const;
public:
void set_tablet_map(table_id, tablet_map);
@@ -309,28 +340,6 @@ public:
}
namespace std {
template<>
struct hash<locator::tablet_replica> {
size_t operator()(const locator::tablet_replica& r) const {
return utils::hash_combine(
std::hash<locator::host_id>()(r.host),
std::hash<shard_id>()(r.shard));
}
};
template<>
struct hash<locator::global_tablet_id> {
size_t operator()(const locator::global_tablet_id& id) const {
return utils::hash_combine(
std::hash<table_id>()(id.table),
std::hash<locator::tablet_id>()(id.tablet));
}
};
}
template <>
struct fmt::formatter<locator::tablet_transition_stage> : fmt::formatter<std::string_view> {
auto format(const locator::tablet_transition_stage&, fmt::format_context& ctx) const -> decltype(ctx.out());