topology: propagate error messages through raft_topology_cmd_result
When a topology command (e.g., rebuild) fails on a target node, the
exception message was being swallowed at multiple levels:
1. raft_topology_cmd_handler caught exceptions and returned a bare
fail status with no error details.
2. exec_direct_command_helper saw the fail status and threw a generic
"failed status returned from {id}" message.
3. The rebuilding handler caught that and stored a hardcoded
"streaming failed" message.
This meant users only saw "rebuild failed: streaming failed" instead
of the actionable error from the safety check (e.g., "it is unsafe
to use source_dc=dc2 to rebuild keyspace=...").
Fix by:
- Adding an error_message field to raft_topology_cmd_result (with
[[version 2026.2]] for wire compatibility).
- Populating error_message with the exception text in the handler's
catch blocks.
- Including error_message in the exception thrown by
exec_direct_command_helper.
- Passing the actual error through to rtbuilder.done() instead of
the hardcoded "streaming failed".
A follow-up test is in https://github.com/scylladb/scylladb/pull/29363
Fixes: SCYLLADB-1404
This commit is contained in:
@@ -72,6 +72,7 @@ struct raft_topology_cmd_result {
|
||||
success
|
||||
};
|
||||
service::raft_topology_cmd_result::command_status status;
|
||||
sstring error_message [[version 2026.2]];
|
||||
};
|
||||
|
||||
struct raft_snapshot {
|
||||
|
||||
@@ -4792,8 +4792,13 @@ future<raft_topology_cmd_result> storage_service::raft_topology_cmd_handler(raft
|
||||
}
|
||||
} catch (const raft::request_aborted& e) {
|
||||
rtlogger.warn("raft_topology_cmd {} failed with: {}", cmd.cmd, e);
|
||||
result.error_message = e.what();
|
||||
} catch (const std::exception& e) {
|
||||
rtlogger.error("raft_topology_cmd {} failed with: {}", cmd.cmd, e);
|
||||
result.error_message = e.what();
|
||||
} catch (...) {
|
||||
rtlogger.error("raft_topology_cmd {} failed with: {}", cmd.cmd, std::current_exception());
|
||||
result.error_message = "unknown error";
|
||||
}
|
||||
|
||||
rtlogger.info("topology cmd rpc {} completed with status={} index={}",
|
||||
|
||||
@@ -443,8 +443,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
co_await ser::storage_service_rpc_verbs::send_raft_topology_cmd(
|
||||
&_messaging, to_host_id(id), id, _term, cmd_index, cmd);
|
||||
if (result.status == raft_topology_cmd_result::command_status::fail) {
|
||||
auto msg = result.error_message.empty()
|
||||
? ::format("failed status returned from {}", id)
|
||||
: ::format("failed status returned from {}: {}", id, result.error_message);
|
||||
co_await coroutine::exception(std::make_exception_ptr(
|
||||
std::runtime_error(::format("failed status returned from {}", id))));
|
||||
std::runtime_error(std::move(msg))));
|
||||
}
|
||||
};
|
||||
|
||||
@@ -3909,10 +3912,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber
|
||||
throw;
|
||||
} catch (seastar::abort_requested_exception&) {
|
||||
throw;
|
||||
} catch (const std::exception& e) {
|
||||
rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
|
||||
" (node state is rebuilding): {}", e);
|
||||
rtbuilder.done(e.what());
|
||||
retake = true;
|
||||
} catch (...) {
|
||||
rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception"
|
||||
" (node state is rebuilding): {}", std::current_exception());
|
||||
rtbuilder.done("streaming failed");
|
||||
rtbuilder.done("unknown error");
|
||||
retake = true;
|
||||
}
|
||||
if (retake) {
|
||||
|
||||
@@ -318,6 +318,9 @@ struct raft_topology_cmd_result {
|
||||
success
|
||||
};
|
||||
command_status status = command_status::fail;
|
||||
// Carries the error description back to the topology coordinator
|
||||
// when the command fails.
|
||||
sstring error_message;
|
||||
};
|
||||
|
||||
// This class is used in RPC's signatures to hold the topology_version of the caller.
|
||||
|
||||
Reference in New Issue
Block a user