Files
scylla/repair/repair.hh
Avi Kivity ae3a360725 database: Move database, keyspace, table classes to replica/ directory
The database, keyspace, and table classes represent the replica-only
part of the objects after which they are named. Reading from a table
doesn't give you the full data, just the replica's view, and it is not
consistent since reconciliation is applied on the coordinator.

As a first step in acknowledging this, move the related files to
a replica/ subdirectory.
2022-01-06 17:07:30 +02:00

538 lines
18 KiB
C++

/*
* Copyright (C) 2015-present ScyllaDB
*/
/*
* This file is part of Scylla.
*
* Scylla is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Scylla is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Scylla. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <unordered_set>
#include <unordered_map>
#include <exception>
#include <absl/container/btree_set.h>
#include <seastar/core/abort_source.hh>
#include <seastar/core/sstring.hh>
#include <seastar/core/sharded.hh>
#include <seastar/core/future.hh>
#include <seastar/core/condition-variable.hh>
#include <seastar/core/gate.hh>
#include "replica/database_fwd.hh"
#include "frozen_mutation.hh"
#include "utils/UUID.hh"
#include "utils/hash.hh"
#include "streaming/stream_reason.hh"
#include "locator/token_metadata.hh"
// Forward declarations, so that this header can name these types without
// including their full definitions.
class flat_mutation_reader;
class database;
class repair_service;
namespace db {
namespace view {
class view_update_generator;
}
class system_distributed_keyspace;
}
namespace netw { class messaging_service; }
namespace service {
class migration_manager;
}
namespace gms { class gossiper; }
// Exception type that carries a repair error message.
// The message given at construction is stored and reported by what().
// repair_stopped_exception derives from this class.
class repair_exception : public std::exception {
    sstring _what;
public:
    repair_exception(sstring msg)
        : _what(std::move(msg))
    { }

    // Returns the message this exception was constructed with.
    const char* what() const noexcept override {
        return _what.c_str();
    }
};
// A repair_exception whose message is always "Repair stopped".
class repair_stopped_exception : public repair_exception {
public:
    repair_stopped_exception()
        : repair_exception("Repair stopped")
    { }
};
// Identifies a repair job by both an integer id and a UUID.
struct repair_uniq_id {
// The integer ID used to identify a repair job. It is currently used by nodetool and the HTTP API.
int id;
// A UUID to identify a repair job. We will transition to using the UUID instead of the integer ID.
utils::UUID uuid;
};
// Stream-output operator for repair_uniq_id; declared here, defined elsewhere.
std::ostream& operator<<(std::ostream& os, const repair_uniq_id& x);
// Bookkeeping for a node operation: its UUID, an abort flag and a list of
// nodes to ignore.
struct node_ops_info {
// UUID of the node operation.
utils::UUID ops_uuid;
// Abort flag; starts out false.
bool abort = false;
// Nodes to ignore for this operation.
std::list<gms::inet_address> ignore_nodes;
// Declared here, defined elsewhere.
// NOTE(review): presumably throws when `abort` is set — confirm against the definition.
void check_abort();
};
// Requests abort of the repair work associated with the node operation
// identified by ops_uuid. Returns a future; defined elsewhere.
// NOTE(review): which shards are affected is not visible here — confirm in the definition.
future<> abort_repair_node_ops(utils::UUID ops_uuid);
// NOTE: repair_start() can be run on any node, but starts a node-global
// operation.
// repair_start() starts the requested repair on this node. It returns an
// integer id which can be used to query the repair's status with
// repair_get_status(). The returned future<int> becomes available quickly,
// as soon as repair_get_status() can be used - it doesn't wait for the
// repair to complete.
//
// Parameters:
//   repair   - the sharded repair_service to run the repair on.
//   keyspace - name of the keyspace to repair.
//   options  - string-to-string map of repair options (the accepted keys are
//              defined by the implementation, not by this header).
future<int> repair_start(seastar::sharded<repair_service>& repair,
sstring keyspace, std::unordered_map<sstring, sstring> options);
// Coarse state of a repair job, as reported by repair_get_status().
// TODO: Have the repair progress contain a percentage progress estimator
// instead of just "RUNNING".
enum class repair_status {
    RUNNING = 0,
    SUCCESSFUL = 1,
    FAILED = 2,
};
// repair_get_status() returns a future because it needs to run code on a
// different CPU (cpu 0) and that might be a deferring operation.
// `id` is the integer id of the repair job, as returned by repair_start().
future<repair_status> repair_get_status(seastar::sharded<database>& db, int id);
// Waits for the repair job identified by `id` to finish.
// If the repair job is finished (SUCCESSFUL or FAILED), it returns immediately.
// If the repair job is still RUNNING, it blocks until the job finishes or
// `timeout` (a steady_clock time point) is reached.
future<repair_status> repair_await_completion(seastar::sharded<database>& db, int id, std::chrono::steady_clock::time_point timeout);
// Returns (as a future) a vector with the integer ids of the active repairs.
future<std::vector<int>> get_active_repairs(seastar::sharded<database>& db);
// Declared here, defined elsewhere.
// NOTE(review): presumably throws if repair is being shut down — confirm against the definition.
void check_in_shutdown();
// Abort all the repairs. The returned future resolves when the request has
// been processed (see the definition for what exactly is waited for).
future<> repair_abort_all(seastar::sharded<database>& db);
// Kind of checksum used by repair.
// NOTE(review): the numeric values are spelled out explicitly, which suggests
// they are persisted or exchanged between nodes — confirm before renumbering.
enum class repair_checksum {
legacy = 0,
streamed = 1,
};
// Counters collected while repairing. All scalar counters start at zero and
// start_time is initialized to lowres_clock::now() when the object is created.
class repair_stats {
public:
// Round counters (total, and split by the path taken in the round).
uint64_t round_nr = 0;
uint64_t round_nr_fast_path_already_synced = 0;
uint64_t round_nr_fast_path_same_combined_hashes= 0;
uint64_t round_nr_slow_path = 0;
// Number of RPC calls.
uint64_t rpc_call_nr = 0;
// Number of hashes sent (tx) and received (rx).
uint64_t tx_hashes_nr = 0;
uint64_t rx_hashes_nr = 0;
// Number of rows sent (tx) and received (rx), and their sizes in bytes.
uint64_t tx_row_nr = 0;
uint64_t rx_row_nr = 0;
uint64_t tx_row_bytes = 0;
uint64_t rx_row_bytes = 0;
// Per-node counters, keyed by node address.
std::map<gms::inet_address, uint64_t> row_from_disk_bytes;
std::map<gms::inet_address, uint64_t> row_from_disk_nr;
std::map<gms::inet_address, uint64_t> tx_row_nr_peer;
std::map<gms::inet_address, uint64_t> rx_row_nr_peer;
// Time at which this stats object was created.
lowres_clock::time_point start_time = lowres_clock::now();
public:
// Declared here, defined elsewhere.
// NOTE(review): presumably accumulates the counters of `o` into this object — confirm.
void add(const repair_stats& o);
// Declared here, defined elsewhere; returns the stats as an sstring.
sstring get_stats();
};
// Two lists of neighbor node addresses: `all` and `mandatory`.
// NOTE(review): presumably `mandatory` lists the nodes a repair cannot
// proceed without — confirm against the users of this class.
class repair_neighbors {
public:
    std::vector<gms::inet_address> all;
    std::vector<gms::inet_address> mandatory;

    // Both lists empty.
    repair_neighbors() = default;

    // Only `all` is populated; `mandatory` stays empty.
    explicit repair_neighbors(std::vector<gms::inet_address> a)
        : all(std::move(a))
    { }

    // Populates `all` from `a` and `mandatory` from `m`.
    repair_neighbors(std::vector<gms::inet_address> a, std::vector<gms::inet_address> m)
        : all(std::move(a)), mandatory(std::move(m))
    { }
};
// State and parameters of a single repair job (identified by `id`): references
// to the services it uses, the keyspace/tables/ranges being repaired, and
// progress counters.
class repair_info {
public:
    // Services and subsystems used by the repair. Held by reference, so they
    // must outlive this object.
    repair_service& rs;
    seastar::sharded<database>& db;
    seastar::sharded<netw::messaging_service>& messaging;
    sharded<db::system_distributed_keyspace>& sys_dist_ks;
    sharded<db::view::view_update_generator>& view_update_generator;
    service::migration_manager& mm;
    gms::gossiper& gossiper;
    const dht::sharder& sharder;

    // What is being repaired: keyspace, token ranges, and the tables
    // (`cfs` holds the table names returned by table_names()).
    sstring keyspace;
    dht::token_range_vector ranges;
    std::vector<sstring> cfs;
    std::vector<utils::UUID> table_ids;

    // Identity of the job and the shard it is associated with.
    repair_uniq_id id;
    shard_id shard;

    // Node selection: data centers, hosts and nodes to ignore.
    std::vector<sstring> data_centers;
    std::vector<sstring> hosts;
    std::unordered_set<gms::inet_address> ignore_nodes;

    streaming::stream_reason reason;

    // Neighbors per token range.
    std::unordered_map<dht::token_range, repair_neighbors> neighbors;

    // Progress and bookkeeping counters. Every counter has a default value so
    // that none of them is ever read uninitialized; the constructor may
    // still override them.
    size_t total_rf = 0;
    uint64_t nr_ranges_finished = 0;
    uint64_t nr_ranges_total = 0;
    size_t nr_failed_ranges = 0;
    bool aborted = false;
    int ranges_index = 0;
    repair_stats _stats;
    std::unordered_set<sstring> dropped_tables;
    std::optional<utils::UUID> _ops_uuid;
    bool _hints_batchlog_flushed = false;
public:
    // Defined elsewhere.
    repair_info(repair_service& repair,
            const sstring& keyspace_,
            const dht::token_range_vector& ranges_,
            std::vector<utils::UUID> table_ids_,
            repair_uniq_id id_,
            const std::vector<sstring>& data_centers_,
            const std::vector<sstring>& hosts_,
            const std::unordered_set<gms::inet_address>& ingore_nodes_,
            streaming::stream_reason reason_,
            std::optional<utils::UUID> ops_uuid,
            bool hints_batchlog_flushed);

    // The following are declared here and defined elsewhere.
    void check_failed_ranges();
    void abort();
    void check_in_abort();
    repair_neighbors get_repair_neighbors(const dht::token_range& range);

    // Merges `stats` into the statistics kept for this repair.
    void update_statistics(const repair_stats& stats) {
        _stats.add(stats);
    }

    // Names of the tables being repaired. Read-only accessor, hence const.
    const std::vector<sstring>& table_names() const {
        return cfs;
    }

    // UUID of the node operation this repair belongs to, if any.
    const std::optional<utils::UUID>& ops_uuid() const {
        return _ops_uuid;
    }

    // Value of the hints_batchlog_flushed flag stored in this object.
    bool hints_batchlog_flushed() const {
        return _hints_batchlog_flushed;
    }

    // Repairs a single token range; defined elsewhere.
    future<> repair_range(const dht::token_range& range);
};
// The tracker tracks ongoing repair operations and their progress.
// A repair which has already finished successfully is dropped from this
// table, but a failed repair will remain in the table forever so it can
// be queried about more than once (FIXME: reconsider this. But note that
// failed repairs should be rare anyway).
// This object is not thread safe, and must be used by only one cpu.
class tracker {
private:
// Each repair_start() call returns a unique int which the user can later
// use to follow the status of this repair with repair_get_status().
// We can't use the number 0 - if repair_start() returns 0, it means it
// decided quickly that there is nothing to repair.
int _next_repair_command = 1;
// Note that there are no "SUCCESSFUL" entries in the "status" map:
// Successfully-finished repairs are those with id < _next_repair_command
// but aren't listed as running or failed in the status map.
std::unordered_map<int, repair_status> _status;
// Used to allow shutting down repairs in progress, and waiting for them.
seastar::gate _gate;
// Set when the repair service is being shutdown
std::atomic_bool _shutdown alignas(seastar::cache_line_size);
// Triggered when service is being shutdown
seastar::abort_source _shutdown_as;
// Triggered when all repairs are requested to be aborted.
// It is immediately initialized again after an abort.
seastar::abort_source _abort_all_as;
// Map repair id into repair_info. The vector has smp::count elements, each
// element will be accessed by only one shard.
std::vector<std::unordered_map<int, lw_shared_ptr<repair_info>>> _repairs;
// Each element in the vector is the semaphore used to control the maximum
// ranges that can be repaired in parallel. Each element will be accessed
// by one shard.
std::vector<named_semaphore> _range_parallelism_semaphores;
// Memory budget per repaired range: 32 MiB.
static constexpr size_t _max_repair_memory_per_range = 32 * 1024 * 1024;
// NOTE(review): presumably signalled when a repair finishes, to wake
// repair_await_completion() waiters — confirm against the definition.
seastar::condition_variable _done_cond;
// Record that the repair with the given id started / finished. Defined elsewhere.
void start(repair_uniq_id id);
void done(repair_uniq_id id, bool succeeded);
public:
// nr_shards: number of shards; max_repair_memory: total memory budget for repair.
// NOTE(review): how max_repair_memory is split is decided in the definition — confirm there.
explicit tracker(size_t nr_shards, size_t max_repair_memory);
~tracker();
// Returns the status of the repair with the given integer id.
repair_status get(int id);
// Allocates and returns the id for the next repair command.
repair_uniq_id next_repair_command();
future<> shutdown();
void check_in_shutdown();
seastar::abort_source& get_shutdown_abort_source();
// Register / unregister / look up the repair_info for a repair id.
void add_repair_info(int id, lw_shared_ptr<repair_info> ri);
void remove_repair_info(int id);
lw_shared_ptr<repair_info> get_repair_info(int id);
// Returns the integer ids of the active repairs.
std::vector<int> get_active() const;
size_t nr_running_repair_jobs();
void abort_all_repairs();
seastar::abort_source& get_abort_all_abort_source();
named_semaphore& range_parallelism_semaphore();
// Returns the per-range memory budget (_max_repair_memory_per_range).
static size_t max_repair_memory_per_range() { return _max_repair_memory_per_range; }
// Runs `func` as the body of the repair identified by `id`. Defined elsewhere.
future<> run(repair_uniq_id id, std::function<void ()> func);
future<repair_status> repair_await_completion(int id, std::chrono::steady_clock::time_point timeout);
float report_progress(streaming::stream_reason reason);
void abort_repair_node_ops(utils::UUID ops_uuid);
};
// Returns (as a future) an estimate of the number of partitions of table `cf`
// in `keyspace` that fall within the token range `range`. Defined elsewhere.
future<uint64_t> estimate_partitions(seastar::sharded<database>& db, const sstring& keyspace,
const sstring& cf, const dht::token_range& range);
// Represents a position of a mutation_fragment read from a flat mutation
// reader. Repair nodes negotiate a small range identified by two
// repair_sync_boundary to work on in each round.
struct repair_sync_boundary {
    dht::decorated_key pk;
    position_in_partition position;

    // Three-way comparator: boundaries are ordered by partition key first
    // and, when the partition keys compare equal, by position within the
    // partition.
    class tri_compare {
        dht::ring_position_comparator _pk_cmp;
        position_in_partition::tri_compare _position_cmp;
    public:
        tri_compare(const schema& s) : _pk_cmp(s), _position_cmp(s) { }

        std::strong_ordering operator()(const repair_sync_boundary& a, const repair_sync_boundary& b) const {
            const auto pk_order = _pk_cmp(a.pk, b.pk);
            if (pk_order != 0) {
                // Partition keys differ: they alone decide the order.
                return pk_order;
            }
            return _position_cmp(a.position, b.position);
        }
    };

    // Prints the boundary as "{ <pk>,<position> }".
    friend std::ostream& operator<<(std::ostream& os, const repair_sync_boundary& x) {
        return os << "{ "
                  << x.pk
                  << ","
                  << x.position
                  << " }";
    }
};
// Hash of a repair row.
//
// Thin wrapper around a 64-bit value. Hashes are combined with XOR (see
// add()), so the combined value does not depend on the order in which
// hashes are added, and adding the same hash twice cancels it out.
class repair_hash {
public:
    uint64_t hash = 0;

    // Zero hash.
    repair_hash() = default;

    explicit repair_hash(uint64_t h) : hash(h) {
    }

    // Resets to the zero hash.
    void clear() {
        hash = 0;
    }

    // Folds `other` into this hash using XOR.
    void add(const repair_hash& other) {
        hash ^= other.hash;
    }

    bool operator==(const repair_hash& x) const {
        return hash == x.hash;
    }

    bool operator!=(const repair_hash& x) const {
        return hash != x.hash;
    }

    // Ascending numeric order. The operands used to be swapped, which made
    // `a < b` true when a was numerically *greater* than b.
    bool operator<(const repair_hash& x) const {
        return hash < x.hash;
    }

    // Prints the numeric hash value.
    friend std::ostream& operator<<(std::ostream& os, const repair_hash& x) {
        return os << x.hash;
    }
};
// Ordered set of repair_hash values, backed by an Abseil B-tree set; element
// order is determined by repair_hash::operator<.
using repair_hash_set = absl::btree_set<repair_hash>;
// Status carried by repair_row_level_start_response.
// The explicit values equal the implicit numbering of the declaration order.
enum class repair_row_level_start_status : uint8_t {
    ok = 0,
    no_such_column_family = 1,
};
// Response type wrapping a repair_row_level_start_status.
struct repair_row_level_start_response {
repair_row_level_start_status status;
};
// Return value of the REPAIR_GET_SYNC_BOUNDARY RPC verb
struct get_sync_boundary_response {
// The sync boundary, if there is one (the optional may be empty).
std::optional<repair_sync_boundary> boundary;
// Combined hash of the rows in the row buf.
repair_hash row_buf_combined_csum;
// The current size of the row buf
uint64_t row_buf_size;
// The number of bytes this verb read from disk
uint64_t new_rows_size;
// The number of rows this verb read from disk
uint64_t new_rows_nr;
};
// Return value of the REPAIR_GET_COMBINED_ROW_HASH RPC verb: a single
// repair_hash.
using get_combined_row_hash_response = repair_hash;
// Pair of a node address and a numeric repair_meta id. Equality-comparable,
// and hashable through the std::hash specialization at the end of this file.
struct node_repair_meta_id {
    gms::inet_address ip;
    uint32_t repair_meta_id;

    // Two ids are equal when both the address and the numeric id match.
    bool operator==(const node_repair_meta_id& x) const {
        if (!(x.ip == ip)) {
            return false;
        }
        return x.repair_meta_id == repair_meta_id;
    }
};
// Represent a partition_key and frozen_mutation_fragments within the partition_key.
class partition_key_and_mutation_fragments {
partition_key _key;
std::list<frozen_mutation_fragment> _mfs;
public:
partition_key_and_mutation_fragments()
: _key(std::vector<bytes>() ) {
}
partition_key_and_mutation_fragments(partition_key key, std::list<frozen_mutation_fragment> mfs)
: _key(std::move(key))
, _mfs(std::move(mfs)) {
}
const partition_key& get_key() const { return _key; }
const std::list<frozen_mutation_fragment>& get_mutation_fragments() const { return _mfs; }
partition_key& get_key() { return _key; }
std::list<frozen_mutation_fragment>& get_mutation_fragments() { return _mfs; }
void push_mutation_fragment(frozen_mutation_fragment mf) { _mfs.push_back(std::move(mf)); }
};
// One partition's key and fragments, and a list of such entries.
using repair_row_on_wire = partition_key_and_mutation_fragments;
using repair_rows_on_wire = std::list<partition_key_and_mutation_fragments>;
// Command tag carried by repair_hash_with_cmd and repair_row_on_wire_with_cmd.
// The explicit values equal the implicit numbering of the declaration order.
// NOTE(review): presumably exchanged between nodes, in which case existing
// values must never change — confirm.
enum class repair_stream_cmd : uint8_t {
    error = 0,
    hash_data = 1,
    row_data = 2,
    end_of_current_hash_set = 3,
    needs_all_rows = 4,
    end_of_current_rows = 5,
    get_full_row_hashes = 6,
    put_rows_done = 7,
};
// A repair_hash tagged with a repair_stream_cmd.
struct repair_hash_with_cmd {
repair_stream_cmd cmd;
repair_hash hash;
};
// A repair_row_on_wire tagged with a repair_stream_cmd.
struct repair_row_on_wire_with_cmd {
repair_stream_cmd cmd;
repair_row_on_wire row;
};
// Algorithm used by row-level repair to detect differences.
// The explicit values equal the implicit numbering of the declaration order.
enum class row_level_diff_detect_algorithm : uint8_t {
    send_full_set = 0,
    send_full_set_rpc_stream = 1,
};

// Stream-output operator for the algorithm; declared here, defined elsewhere.
std::ostream& operator<<(std::ostream& out, row_level_diff_detect_algorithm algo);
// Command carried by node_ops_cmd_request, grouped by node operation
// (removenode, replace, decommission, bootstrap) plus two standalone commands.
// The explicit values equal the implicit numbering of the declaration order.
// NOTE(review): presumably exchanged between nodes, in which case existing
// values must never change and new commands go at the end — confirm.
enum class node_ops_cmd : uint32_t {
    // removenode
    removenode_prepare = 0,
    removenode_heartbeat = 1,
    removenode_sync_data = 2,
    removenode_abort = 3,
    removenode_done = 4,
    // replace
    replace_prepare = 5,
    replace_prepare_mark_alive = 6,
    replace_prepare_pending_ranges = 7,
    replace_heartbeat = 8,
    replace_abort = 9,
    replace_done = 10,
    // decommission
    decommission_prepare = 11,
    decommission_heartbeat = 12,
    decommission_abort = 13,
    decommission_done = 14,
    // bootstrap
    bootstrap_prepare = 15,
    bootstrap_heartbeat = 16,
    bootstrap_abort = 17,
    bootstrap_done = 18,
    // standalone commands
    query_pending_ops = 19,
    repair_updater = 20,
};

// Stream-output operator for node_ops_cmd; declared here, defined elsewhere.
std::ostream& operator<<(std::ostream& out, node_ops_cmd cmd);
// Request describing a node operation command.
// `cmd` and `ops_uuid` are mandatory for each request; every other field is
// optional and defaults to an empty container.
struct node_ops_cmd_request {
    // Mandatory field, set by all cmds
    node_ops_cmd cmd;
    // Mandatory field, set by all cmds
    utils::UUID ops_uuid;
    // Optional field, list nodes to ignore, set by all cmds
    std::list<gms::inet_address> ignore_nodes;
    // Optional field, list leaving nodes, set by decommission and removenode cmd
    std::list<gms::inet_address> leaving_nodes;
    // Optional field, map existing nodes to replacing nodes, set by replace cmd
    std::unordered_map<gms::inet_address, gms::inet_address> replace_nodes;
    // Optional field, map bootstrapping nodes to bootstrap tokens, set by bootstrap cmd
    std::unordered_map<gms::inet_address, std::list<dht::token>> bootstrap_nodes;
    // Optional field, list uuids of tables being repaired, set by repair cmd
    std::list<utils::UUID> repair_tables;

    // Each argument is moved into the field of the corresponding kind:
    // command -> cmd, uuid -> ops_uuid, ignore -> ignore_nodes,
    // leaving -> leaving_nodes, replace -> replace_nodes,
    // bootstrap -> bootstrap_nodes, tables -> repair_tables.
    node_ops_cmd_request(
            node_ops_cmd command,
            utils::UUID uuid,
            std::list<gms::inet_address> ignore = {},
            std::list<gms::inet_address> leaving = {},
            std::unordered_map<gms::inet_address, gms::inet_address> replace = {},
            std::unordered_map<gms::inet_address, std::list<dht::token>> bootstrap = {},
            std::list<utils::UUID> tables = {})
        : cmd(command),
          ops_uuid(std::move(uuid)),
          ignore_nodes(std::move(ignore)),
          leaving_nodes(std::move(leaving)),
          replace_nodes(std::move(replace)),
          bootstrap_nodes(std::move(bootstrap)),
          repair_tables(std::move(tables))
    { }
};
// Response to a node_ops_cmd_request.
struct node_ops_cmd_response {
    // Mandatory field, set by all cmds
    bool ok;
    // Optional field, set by query_pending_ops cmd
    std::list<utils::UUID> pending_ops;

    // `pending` defaults to an empty list, so a response can be built from
    // just the ok flag.
    node_ops_cmd_response(bool o, std::list<utils::UUID> pending = {})
        : ok(o), pending_ops(std::move(pending))
    { }
};
// Request to update a system table with the outcome of repairing one token
// range of one table.
// NOTE(review): which system table is updated is not visible here — confirm in the handler.
struct repair_update_system_table_request {
// UUID of the repair job.
utils::UUID repair_uuid;
// UUID of the table.
utils::UUID table_uuid;
sstring keyspace_name;
sstring table_name;
// The token range concerned.
dht::token_range range;
// Time associated with the repair (gc_clock time point).
gc_clock::time_point repair_time;
};
// Response to repair_update_system_table_request; carries no data.
struct repair_update_system_table_response {
};
// Request to flush hints and batchlog for a repair.
struct repair_flush_hints_batchlog_request {
// UUID of the repair job.
utils::UUID repair_uuid;
// Nodes targeted by the flush.
std::list<gms::inet_address> target_nodes;
// Timeouts, in seconds, for the hints and batchlog parts respectively.
std::chrono::seconds hints_timeout;
std::chrono::seconds batchlog_timeout;
};
// Response to repair_flush_hints_batchlog_request; carries no data.
struct repair_flush_hints_batchlog_response {
};
namespace std {

// Lets repair_hash be used as a key in unordered containers: the wrapped
// 64-bit value is returned as the hash, converted to size_t.
template <>
struct hash<repair_hash> {
    size_t operator()(repair_hash h) const {
        return h.hash;
    }
};

// Lets node_repair_meta_id be used as a key in unordered containers: the hash
// is utils::tuple_hash applied to (ip, repair_meta_id).
template <>
struct hash<node_repair_meta_id> {
    size_t operator()(node_repair_meta_id id) const {
        auto hasher = utils::tuple_hash();
        return hasher(id.ip, id.repair_meta_id);
    }
};

}