diff --git a/idl/storage_service.idl.hh b/idl/storage_service.idl.hh index 8c5a1c9b7a..79766f65c0 100644 --- a/idl/storage_service.idl.hh +++ b/idl/storage_service.idl.hh @@ -72,6 +72,7 @@ struct raft_topology_cmd_result { success }; service::raft_topology_cmd_result::command_status status; + sstring error_message [[version 2026.2]]; }; struct raft_snapshot { diff --git a/service/storage_service.cc b/service/storage_service.cc index 6b264b8e04..cd91d82287 100644 --- a/service/storage_service.cc +++ b/service/storage_service.cc @@ -4792,8 +4792,13 @@ future storage_service::raft_topology_cmd_handler(raft } } catch (const raft::request_aborted& e) { rtlogger.warn("raft_topology_cmd {} failed with: {}", cmd.cmd, e); + result.error_message = e.what(); + } catch (const std::exception& e) { + rtlogger.error("raft_topology_cmd {} failed with: {}", cmd.cmd, e); + result.error_message = e.what(); } catch (...) { rtlogger.error("raft_topology_cmd {} failed with: {}", cmd.cmd, std::current_exception()); + result.error_message = "unknown error"; } rtlogger.info("topology cmd rpc {} completed with status={} index={}", diff --git a/service/topology_coordinator.cc b/service/topology_coordinator.cc index 05d987b98f..3b19328c35 100644 --- a/service/topology_coordinator.cc +++ b/service/topology_coordinator.cc @@ -443,8 +443,11 @@ class topology_coordinator : public endpoint_lifecycle_subscriber co_await ser::storage_service_rpc_verbs::send_raft_topology_cmd( &_messaging, to_host_id(id), id, _term, cmd_index, cmd); if (result.status == raft_topology_cmd_result::command_status::fail) { + auto msg = result.error_message.empty() + ? ::format("failed status returned from {}", id) + : ::format("failed status returned from {}: {}", id, result.error_message); co_await coroutine::exception(std::make_exception_ptr( - std::runtime_error(::format("failed status returned from {}", id)))); + std::runtime_error(std::move(msg)))); } }; @@ -3909,10 +3912,15 @@ class topology_coordinator : public endpoint_lifecycle_subscriber throw; } catch (seastar::abort_requested_exception&) { throw; + } catch (const std::exception& e) { + rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception" + " (node state is rebuilding): {}", e); + rtbuilder.done(e.what()); + retake = true; } catch (...) { rtlogger.error("send_raft_topology_cmd(stream_ranges) failed with exception" " (node state is rebuilding): {}", std::current_exception()); - rtbuilder.done("streaming failed"); + rtbuilder.done("unknown error"); retake = true; } if (retake) { diff --git a/service/topology_state_machine.hh b/service/topology_state_machine.hh index 1379d728b8..b5344b300e 100644 --- a/service/topology_state_machine.hh +++ b/service/topology_state_machine.hh @@ -318,6 +318,9 @@ struct raft_topology_cmd_result { success }; command_status status = command_status::fail; + // Carries the error description back to the topology coordinator + // when the command fails. + sstring error_message; }; // This class is used in RPC's signatures to hold the topology_version of the caller.