Merge 'Track tablet streaming under global sessions to prevent side-effects of failed streaming' from Tomasz Grabiec
Tablet streaming involves asynchronous RPCs to other replicas which transfer writes. We want side-effects from streaming only within the migration stage in which the streaming was started. This is currently not guaranteed on failure. When streaming master fails (e.g. due to RPC failing), it can be that some streaming work is still alive somewhere (e.g. RPC on wire) and will have side-effects at some point later. This PR implements tracking of all operations involved in streaming which may have side-effects, which allows the topology change coordinator to fence them and wait for them to complete if they were already admitted. The tracking and fencing is implemented by using global "sessions", created for streaming of a single tablet. Session is globally identified by UUID. The identifier is assigned by the topology change coordinator, and stored in system.tablets. Sessions are created and closed based on group0 state (tablet metadata) by the barrier command sent to each replica, which we already do on transitions between stages. Also, each barrier waits for sessions which have been closed to be drained. The barrier is blocked only if there is some session with work which was left behind by unsuccessful streaming. In which case it should not be blocked for long, because streaming process checks often if the guard was left behind and stops if it was. This mechanism of tracking is fault-tolerant: session id is stored in group0, so coordinator can make progress on failover. The barriers guarantee that session exists on all replicas, and that it will be closed on all replicas. Closes scylladb/scylladb#15847 * github.com:scylladb/scylladb: test: tablets: Add test for failed streaming being fenced away error_injection: Introduce poll_for_message() error_injection: Make is_enabled() public api: Add API to kill connection to a particular host range_streamer: Do not block topology change barriers around streaming range_streamer, tablets: Do not keep token metadata around streaming tablets: Fail gracefully when migrating tablet has no pending replica storage_service, api: Add API to disable tablet balancing storage_service, api: Add API to migrate a tablet storage_service, raft topology: Run streaming under session topology guard storage_service, tablets: Use session to guard tablet streaming tablets: Add per-tablet session id field to tablet metadata service: range_streamer: Propagate topology_guard to receivers streaming: Always close the rpc::sink storage_service: Introduce concept of a topology_guard storage_service: Introduce session concept tablets: Fix topology_metadata_guard holding on to the old erm docs: Document the topology_guard mechanism
This commit is contained in:
@@ -71,10 +71,14 @@ read_replica_set_selector get_selector_for_reads(tablet_transition_stage stage)
|
||||
on_internal_error(tablet_logger, format("Invalid tablet transition stage: {}", static_cast<int>(stage)));
|
||||
}
|
||||
|
||||
tablet_transition_info::tablet_transition_info(tablet_transition_stage stage, tablet_replica_set next, tablet_replica pending_replica)
|
||||
tablet_transition_info::tablet_transition_info(tablet_transition_stage stage,
|
||||
tablet_replica_set next,
|
||||
tablet_replica pending_replica,
|
||||
service::session_id session_id)
|
||||
: stage(stage)
|
||||
, next(std::move(next))
|
||||
, pending_replica(std::move(pending_replica))
|
||||
, session_id(session_id)
|
||||
, writes(get_selector_for_writes(stage))
|
||||
, reads(get_selector_for_reads(stage))
|
||||
{ }
|
||||
@@ -280,6 +284,9 @@ std::ostream& operator<<(std::ostream& out, const tablet_map& r) {
|
||||
out << format("\n [{}]: last_token={}, replicas={}", tid, r.get_last_token(tid), tablet.replicas);
|
||||
if (auto tr = r.get_tablet_transition_info(tid)) {
|
||||
out << format(", stage={}, new_replicas={}, pending={}", tr->stage, tr->next, tr->pending_replica);
|
||||
if (tr->session_id) {
|
||||
out << format(", session={}", tr->session_id);
|
||||
}
|
||||
}
|
||||
first = false;
|
||||
tid = *r.next_tablet(tid);
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include "dht/token.hh"
|
||||
#include "utils/small_vector.hh"
|
||||
#include "locator/host_id.hh"
|
||||
#include "service/session.hh"
|
||||
#include "dht/i_partitioner_fwd.hh"
|
||||
#include "schema/schema_fwd.hh"
|
||||
#include "utils/chunked_vector.hh"
|
||||
@@ -124,6 +125,11 @@ bool contains(const tablet_replica_set& rs, host_id host) {
|
||||
return false;
|
||||
}
|
||||
|
||||
inline
|
||||
bool contains(const tablet_replica_set& rs, const tablet_replica& r) {
|
||||
return std::ranges::any_of(rs, [&] (auto&& r_) { return r_ == r; });
|
||||
}
|
||||
|
||||
/// Stores information about a single tablet.
|
||||
struct tablet_info {
|
||||
tablet_replica_set replicas;
|
||||
@@ -171,10 +177,12 @@ struct tablet_transition_info {
|
||||
tablet_transition_stage stage;
|
||||
tablet_replica_set next;
|
||||
tablet_replica pending_replica; // Optimization (next - tablet_info::replicas)
|
||||
service::session_id session_id;
|
||||
write_replica_set_selector writes;
|
||||
read_replica_set_selector reads;
|
||||
|
||||
tablet_transition_info(tablet_transition_stage stage, tablet_replica_set next, tablet_replica pending_replica);
|
||||
tablet_transition_info(tablet_transition_stage stage, tablet_replica_set next, tablet_replica pending_replica,
|
||||
service::session_id session_id = {});
|
||||
|
||||
bool operator==(const tablet_transition_info&) const = default;
|
||||
};
|
||||
@@ -335,12 +343,17 @@ public:
|
||||
using table_to_tablet_map = std::unordered_map<table_id, tablet_map>;
|
||||
private:
|
||||
table_to_tablet_map _tablets;
|
||||
|
||||
// When false, tablet load balancer will not try to rebalance tablets.
|
||||
bool _balancing_enabled = true;
|
||||
public:
|
||||
bool balancing_enabled() const { return _balancing_enabled; }
|
||||
const tablet_map& get_tablet_map(table_id id) const;
|
||||
const table_to_tablet_map& all_tables() const { return _tablets; }
|
||||
table_to_tablet_map& all_tables() { return _tablets; }
|
||||
size_t external_memory_usage() const;
|
||||
public:
|
||||
void set_balancing_enabled(bool value) { _balancing_enabled = value; }
|
||||
void set_tablet_map(table_id, tablet_map);
|
||||
tablet_map& get_tablet_map(table_id id);
|
||||
future<> clear_gently();
|
||||
|
||||
Reference in New Issue
Block a user