gossip: Added SNITCH_NAME to application_state

The snitch name needs to be exchanged within the cluster once, during
the shadow round, so that joining nodes cannot use the wrong snitch.
The snitch names are compared on bootstrap and on normal node start.

If the cluster was already using mixed snitches, the upgrade to this
version will fail. In this case the customer needs to add a node with
the correct snitch for every node with the wrong snitch, then take
down the nodes with the wrong snitch, and only then do the upgrade.

Fixes #6832

Closes #7739
This commit is contained in:
Juliusz Stasiewicz
2020-11-24 09:26:55 +01:00
committed by Avi Kivity
parent 781f9d9aca
commit b150906d39
6 changed files with 28 additions and 2 deletions

View File

@@ -68,6 +68,7 @@ static const std::map<application_state, sstring> application_state_names = {
{application_state::SHARD_COUNT, "SHARD_COUNT"},
{application_state::IGNORE_MSB_BITS, "IGNOR_MSB_BITS"},
{application_state::CDC_STREAMS_TIMESTAMP, "CDC_STREAMS_TIMESTAMP"},
{application_state::SNITCH_NAME, "SNITCH_NAME"},
};
std::ostream& operator<<(std::ostream& os, const application_state& m) {

View File

@@ -65,8 +65,8 @@ enum class application_state {
SHARD_COUNT,
IGNORE_MSB_BITS,
CDC_STREAMS_TIMESTAMP,
SNITCH_NAME,
// pad to allow adding new states to existing cluster
X9,
X10,
};

View File

@@ -1773,7 +1773,8 @@ future<> gossiper::do_shadow_round(std::unordered_set<gms::inet_address> nodes,
gms::application_state::STATUS,
gms::application_state::HOST_ID,
gms::application_state::TOKENS,
gms::application_state::SUPPORTED_FEATURES}};
gms::application_state::SUPPORTED_FEATURES,
gms::application_state::SNITCH_NAME}};
logger.info("Gossip shadow round started with nodes={}", nodes);
std::unordered_set<gms::inet_address> nodes_talked;
size_t nodes_down = 0;
@@ -2343,6 +2344,20 @@ void gossiper::check_knows_remote_features(std::set<std::string_view>& local_fea
}
}
// Verifies that the local snitch agrees with every snitch name present in
// endpoint_state_map.
//
// Each endpoint's SNITCH_NAME application state is compared against the name
// reported by the local snitch. Endpoints that carry no SNITCH_NAME state are
// skipped. The first mismatch aborts the check.
//
// Throws std::runtime_error when an endpoint's SNITCH_NAME value differs from
// the local snitch name.
void gossiper::check_snitch_name_matches() const {
    const auto& local_snitch = locator::i_endpoint_snitch::get_local_snitch_ptr()->get_name();
    for (const auto& [endpoint, ep_state] : endpoint_state_map) {
        const auto advertised = ep_state.get_application_state_ptr(application_state::SNITCH_NAME);
        // A missing SNITCH_NAME state is not treated as a mismatch.
        if (advertised && advertised->value != local_snitch) {
            throw std::runtime_error(format("Snitch check failed. This node cannot join the cluster because it uses {} and not {}", local_snitch, advertised->value));
        }
    }
}
sstring gossiper::get_all_endpoint_states() {
std::stringstream ss;
for (auto& entry : endpoint_state_map) {

View File

@@ -576,6 +576,7 @@ private:
public:
void append_endpoint_state(std::stringstream& ss, const endpoint_state& state);
public:
// Compares the local snitch name with the SNITCH_NAME application state of
// every endpoint in endpoint_state_map; endpoints without that state are
// ignored. Throws std::runtime_error on the first mismatch.
void check_snitch_name_matches() const;
sstring get_all_endpoint_states();
std::map<sstring, sstring> get_simple_states();
int get_down_endpoint_count() const noexcept;

View File

@@ -219,6 +219,10 @@ public:
return versioned_value(rack_id);
}
// Builds a versioned_value whose payload is the given snitch name, passed
// through unchanged.
static versioned_value snitch_name(const sstring& snitch_name) {
return versioned_value(snitch_name);
}
// Builds a versioned_value whose payload is the shard count rendered as text
// via format("{}", ...).
static versioned_value shard_count(int shard_count) {
return versioned_value(format("{}", shard_count));
}

View File

@@ -286,6 +286,7 @@ void storage_service::prepare_to_join(
slogger.info("Checking remote features with gossip, initial_contact_nodes={}", initial_contact_nodes);
_gossiper.do_shadow_round(initial_contact_nodes, gms::bind_messaging_port(bool(do_bind))).get();
_gossiper.check_knows_remote_features(local_features, loaded_peer_features);
_gossiper.check_snitch_name_matches();
_gossiper.reset_endpoint_state_map().get();
for (auto ep : loaded_endpoints) {
_gossiper.add_saved_endpoint(ep);
@@ -360,6 +361,9 @@ void storage_service::prepare_to_join(
if (replacing_a_node_with_same_ip || replacing_a_node_with_diff_ip) {
app_states.emplace(gms::application_state::TOKENS, versioned_value::tokens(_bootstrap_tokens));
}
const auto& snitch_name = locator::i_endpoint_snitch::get_local_snitch_ptr()->get_name();
app_states.emplace(gms::application_state::SNITCH_NAME, versioned_value::snitch_name(snitch_name));
slogger.info("Starting up server gossip");
auto generation_number = db::system_keyspace::increment_and_get_generation().get0();
@@ -1755,6 +1759,7 @@ future<> storage_service::check_for_endpoint_collision(std::unordered_set<gms::i
slogger.info("Checking remote features with gossip");
_gossiper.do_shadow_round(initial_contact_nodes, gms::bind_messaging_port(bool(do_bind))).get();
_gossiper.check_knows_remote_features(local_features, loaded_peer_features);
_gossiper.check_snitch_name_matches();
auto addr = get_broadcast_address();
if (!_gossiper.is_safe_for_bootstrap(addr)) {
throw std::runtime_error(sprint("A node with address %s already exists, cancelling join. "