gossip: Added SNITCH_NAME to application_state
The snitch name needs to be exchanged within the cluster once, during the shadow round, so that joining nodes cannot use the wrong snitch. The snitch names are compared on bootstrap and on normal node start. If the cluster already uses mixed snitches, the upgrade to this version will fail. In that case the customer needs to add a node with the correct snitch for every node with the wrong snitch, then take down the nodes with the wrong snitch, and only then perform the upgrade. Fixes #6832 Closes #7739
This commit is contained in:
committed by
Avi Kivity
parent
781f9d9aca
commit
b150906d39
@@ -68,6 +68,7 @@ static const std::map<application_state, sstring> application_state_names = {
|
||||
{application_state::SHARD_COUNT, "SHARD_COUNT"},
|
||||
{application_state::IGNORE_MSB_BITS, "IGNOR_MSB_BITS"},
|
||||
{application_state::CDC_STREAMS_TIMESTAMP, "CDC_STREAMS_TIMESTAMP"},
|
||||
{application_state::SNITCH_NAME, "SNITCH_NAME"},
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream& os, const application_state& m) {
|
||||
|
||||
@@ -65,8 +65,8 @@ enum class application_state {
|
||||
SHARD_COUNT,
|
||||
IGNORE_MSB_BITS,
|
||||
CDC_STREAMS_TIMESTAMP,
|
||||
SNITCH_NAME,
|
||||
// pad to allow adding new states to existing cluster
|
||||
X9,
|
||||
X10,
|
||||
};
|
||||
|
||||
|
||||
@@ -1773,7 +1773,8 @@ future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes,
|
||||
gms::application_state::STATUS,
|
||||
gms::application_state::HOST_ID,
|
||||
gms::application_state::TOKENS,
|
||||
gms::application_state::SUPPORTED_FEATURES}};
|
||||
gms::application_state::SUPPORTED_FEATURES,
|
||||
gms::application_state::SNITCH_NAME}};
|
||||
logger.info("Gossip shadow round started with nodes={}", nodes);
|
||||
std::unordered_set<gms::inet_address> nodes_talked;
|
||||
size_t nodes_down = 0;
|
||||
@@ -2343,6 +2344,20 @@ void gossiper::check_knows_remote_features(std::set<std::string_view>& local_fea
|
||||
}
|
||||
}
|
||||
|
||||
void gossiper::check_snitch_name_matches() const {
|
||||
const auto& my_snitch_name = locator::i_endpoint_snitch::get_local_snitch_ptr()->get_name();
|
||||
for (const auto& [address, state] : endpoint_state_map) {
|
||||
const auto remote_snitch_name = state.get_application_state_ptr(application_state::SNITCH_NAME);
|
||||
if (!remote_snitch_name) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (remote_snitch_name->value != my_snitch_name) {
|
||||
throw std::runtime_error(format("Snitch check failed. This node cannot join the cluster because it uses {} and not {}", my_snitch_name, remote_snitch_name->value));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sstring gossiper::get_all_endpoint_states() {
|
||||
std::stringstream ss;
|
||||
for (auto& entry : endpoint_state_map) {
|
||||
|
||||
@@ -576,6 +576,7 @@ private:
|
||||
public:
|
||||
void append_endpoint_state(std::stringstream& ss, const endpoint_state& state);
|
||||
public:
|
||||
void check_snitch_name_matches() const;
|
||||
sstring get_all_endpoint_states();
|
||||
std::map<sstring, sstring> get_simple_states();
|
||||
int get_down_endpoint_count() const noexcept;
|
||||
|
||||
@@ -219,6 +219,10 @@ public:
|
||||
return versioned_value(rack_id);
|
||||
}
|
||||
|
||||
// Factory: wraps the given snitch name in a versioned_value, passing the
// string through unchanged.
static versioned_value snitch_name(const sstring& snitch_name) {
    versioned_value wrapped(snitch_name);
    return wrapped;
}
|
||||
|
||||
// Factory: renders the shard count as text with format("{}", ...) and wraps
// the resulting string in a versioned_value.
static versioned_value shard_count(int shard_count) {
    return versioned_value(
        format("{}", shard_count));
}
|
||||
|
||||
@@ -286,6 +286,7 @@ void storage_service::prepare_to_join(
|
||||
slogger.info("Checking remote features with gossip, initial_contact_nodes={}", initial_contact_nodes);
|
||||
_gossiper.do_shadow_round(initial_contact_nodes, gms::bind_messaging_port(bool(do_bind))).get();
|
||||
_gossiper.check_knows_remote_features(local_features, loaded_peer_features);
|
||||
_gossiper.check_snitch_name_matches();
|
||||
_gossiper.reset_endpoint_state_map().get();
|
||||
for (auto ep : loaded_endpoints) {
|
||||
_gossiper.add_saved_endpoint(ep);
|
||||
@@ -360,6 +361,9 @@ void storage_service::prepare_to_join(
|
||||
if (replacing_a_node_with_same_ip || replacing_a_node_with_diff_ip) {
|
||||
app_states.emplace(gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens));
|
||||
}
|
||||
const auto& snitch_name = locator::i_endpoint_snitch::get_local_snitch_ptr()->get_name();
|
||||
app_states.emplace(gms::application_state::SNITCH_NAME, versioned_value::snitch_name(snitch_name));
|
||||
|
||||
slogger.info("Starting up server gossip");
|
||||
|
||||
auto generation_number = db::system_keyspace::increment_and_get_generation().get0();
|
||||
@@ -1755,6 +1759,7 @@ future<> storage_service::check_for_endpoint_collision(std::unordered_set<gms::i
|
||||
slogger.info("Checking remote features with gossip");
|
||||
_gossiper.do_shadow_round(initial_contact_nodes, gms::bind_messaging_port(bool(do_bind))).get();
|
||||
_gossiper.check_knows_remote_features(local_features, loaded_peer_features);
|
||||
_gossiper.check_snitch_name_matches();
|
||||
auto addr = get_broadcast_address();
|
||||
if (!_gossiper.is_safe_for_bootstrap(addr)) {
|
||||
throw std::runtime_error(sprint("A node with address %s already exists, cancelling join. "
|
||||
|
||||
Reference in New Issue
Block a user