storage_proxy: Make split_stats resilient to being called from different scheduling group

Fixes #11017 When doing writes, storage proxy creates types deriving from abstract_write_response_handler. These are created in the various scheduling groups executing the write inducing code. They pick up a group-local reference to the various metrics used by SP. Normally all code using (and esp. modifying) these metrics are executed in the same scheduling group. However, if gossip sees a node go down, it will notify listeners, which eventually calls get_ep_stat and register_metrics. This code (before this patch) uses _active_ scheduling group to eventually add metrics, using a local dict as guard against double regs. If, as described above, we're called in a different sched group than the original one however, this can cause double registrations. Fixed here by keeping a reference to creating scheduling group and using this, not active one, when/if creating new metrics. Closes #14294
2023-06-19 11:09:53 +00:00
parent 643e69af89
commit f18e967939
3 changed files with 40 additions and 4 deletions
--- a/service/storage_proxy.cc
+++ b/service/storage_proxy.cc
@@ -116,8 +116,12 @@ static const seastar::metrics::label op_type_label("op_type");
 static const seastar::metrics::label scheduling_group_label("scheduling_group_name");
 static const seastar::metrics::label rejected_by_coordinator_label("rejected_by_coordinator");

+seastar::metrics::label_instance make_scheduling_group_label(const scheduling_group& sg) {
+    return scheduling_group_label(sg.name());
+}
+
 seastar::metrics::label_instance current_scheduling_group_label() {
-    return scheduling_group_label(current_scheduling_group().name());
+    return make_scheduling_group_label(current_scheduling_group());
 }

 }
@@ -2376,7 +2380,8 @@ storage_proxy_stats::split_stats::split_stats(const sstring& category, const sst
        , _long_description_prefix(long_description_prefix)
        , _category(category)
        , _op_type(op_type)
-        , _auto_register_metrics(auto_register_metrics) { }
+        , _auto_register_metrics(auto_register_metrics) 
+        , _sg(current_scheduling_group()) { }

 storage_proxy_stats::write_stats::write_stats()
 : writes_attempts(COORDINATOR_STATS_CATEGORY, "total_write_attempts", "total number of write requests", "mutation_data")
@@ -2693,7 +2698,7 @@ void storage_proxy_stats::split_stats::register_metrics_local() {
    auto new_metrics = sm::metric_groups();
    new_metrics.add_group(_category, {
        sm::make_counter(_short_description_prefix + sstring("_local_node"), [this] { return _local.val; },
-                       sm::description(_long_description_prefix + "on a local Node"), {storage_proxy_stats::current_scheduling_group_label(), op_type_label(_op_type)})
+                       sm::description(_long_description_prefix + "on a local Node"), {storage_proxy_stats::make_scheduling_group_label(_sg), op_type_label(_op_type)})
    });
    _metrics = std::exchange(new_metrics, {});
 }
@@ -2706,7 +2711,7 @@ void storage_proxy_stats::split_stats::register_metrics_for(sstring dc, gms::ine
    if (auto [ignored, added] = _dc_stats.try_emplace(dc); added) {
        _metrics.add_group(_category, {
            sm::make_counter(_short_description_prefix + sstring("_remote_node"), [this, dc] { return _dc_stats[dc].val; },
-                            sm::description(seastar::format("{} when communicating with external Nodes in another DC", _long_description_prefix)), {storage_proxy_stats::current_scheduling_group_label(), datacenter_label(dc), op_type_label(_op_type)})
+                            sm::description(seastar::format("{} when communicating with external Nodes in another DC", _long_description_prefix)), {storage_proxy_stats::make_scheduling_group_label(_sg), datacenter_label(dc), op_type_label(_op_type)})
                                .set_skip_when_empty()
        });
    }
--- a/service/storage_proxy_stats.hh
+++ b/service/storage_proxy_stats.hh
@@ -44,6 +44,7 @@ private:
    // whether to register per-endpoint metrics automatically
    bool _auto_register_metrics;

+    scheduling_group _sg;
 public:
    /**
     * @param category a statistics category, e.g. "client" or "replica"
--- a/test/boost/storage_proxy_test.cc
+++ b/test/boost/storage_proxy_test.cc
@@ -103,3 +103,33 @@ SEASTAR_TEST_CASE(test_get_restricted_ranges) {
        });
    });
 }
+
+SEASTAR_THREAD_TEST_CASE(test_split_stats) {
+    auto ep1 = gms::inet_address("127.0.0.1");
+    auto sg1 = create_scheduling_group("apa1", 100).get();
+    auto sg2 = create_scheduling_group("apa2", 100).get();
+
+    std::optional<service::storage_proxy_stats::split_stats> stats1, stats2;
+
+    // pretending to be abstract_write_response_handler type. 
+    // created in various scheduling groups, in which they 
+    // instantiate group-local split_stats.
+    with_scheduling_group(sg1, [&] {
+        stats1.emplace("tuta", "nils", "en nils", "nilsa", true);
+    }).get0();
+
+    with_scheduling_group(sg2, [&] {
+        stats2.emplace("tuta", "nils", "en nils", "nilsa", true);
+    }).get0();
+
+    // simulating the calling of storage_proxy::on_down, from gossip
+    // on node dropping out. If inside a write operation, we'll pick up
+    // write handlers and to "timeout_cb" on them, which in turn might
+    // call get_ep_stat, which evenually calls register_metrics for 
+    // the DC written to.
+    // Point being is that either the above should not happen, or 
+    // split_stats should be resilient to being called from different
+    // scheduling group.
+    stats1->register_metrics_for("DC1", ep1);
+    stats2->register_metrics_for("DC1", ep1);
+}