Merge 'test: perf: report instructions retired per operations' from Avi Kivity

Instructions retired per op is a much more stable metric than time per op (inverse throughput), since it isn't much affected by changes in CPU frequency or other load on the test system (it's still somewhat affected, since a slower system will run more reactor polls per op). It's also less indicative of real performance, since it's possible for fewer instructions to execute in more time than more instructions (but that isn't an issue for comparative tests). This allows incremental changes to the code base to be compared with more confidence. Current results are around 55k instructions per read, and 52k for writes. Closes #8563 * github.com:scylladb/scylla: test: perf: tidy up executor_stats snapshot computation test: perf: report instructions retired per operations test: perf: add RAII wrapper around Linux perf_event_open() test: perf: make executor_stats_snapshot() a member function of executor
2021-05-05 00:54:08 +02:00
parent b8665c459d 2b252ef9b7
commit 121eb32679
5 changed files with 172 additions and 17 deletions
--- a/configure.py
+++ b/configure.py
@@ -1170,7 +1170,7 @@ perf_tests_seastar_deps = [

 for t in perf_tests:
    deps[t] = [t + '.cc'] + scylla_tests_dependencies + perf_tests_seastar_deps
-    deps[t] += ['test/perf/perf.cc']
+    deps[t] += ['test/perf/perf.cc', 'test/perf/linux-perf-event.cc']

 deps['test/boost/sstable_test'] += ['test/lib/normalizing_reader.cc']
 deps['test/boost/sstable_datafile_test'] += ['test/lib/normalizing_reader.cc']
@@ -1193,9 +1193,9 @@ deps['test/boost/log_heap_test'] = ['test/boost/log_heap_test.cc']
 deps['test/boost/estimated_histogram_test'] = ['test/boost/estimated_histogram_test.cc']
 deps['test/boost/anchorless_list_test'] = ['test/boost/anchorless_list_test.cc']
 deps['test/perf/perf_fast_forward'] += ['release.cc']
-deps['test/perf/perf_simple_query'] += ['release.cc', 'test/perf/perf.cc']
-deps['test/perf/perf_row_cache_reads'] += ['test/perf/perf.cc']
-deps['test/perf/perf_row_cache_update'] += ['test/perf/perf.cc']
+deps['test/perf/perf_simple_query'] += ['release.cc', 'test/perf/perf.cc', 'test/perf/linux-perf-event.cc']
+deps['test/perf/perf_row_cache_reads'] += ['test/perf/perf.cc', 'test/perf/linux-perf-event.cc']
+deps['test/perf/perf_row_cache_update'] += ['test/perf/perf.cc', 'test/perf/linux-perf-event.cc']
 deps['test/boost/reusable_buffer_test'] = [
    "test/boost/reusable_buffer_test.cc",
    "test/lib/log.cc",
--- a/test/perf/linux-perf-event.cc
+++ b/test/perf/linux-perf-event.cc
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2021 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+#include "linux-perf-event.hh"
+
+#include <linux/perf_event.h>
+#include <linux/hw_breakpoint.h>
+#include <sys/ioctl.h>
+#include <asm/unistd.h>
+
+linux_perf_event::linux_perf_event(const struct ::perf_event_attr& attr, pid_t pid, int cpu, int group_fd, unsigned long flags) {
+    int ret = syscall(__NR_perf_event_open, &attr, pid, cpu, group_fd, flags);
+    if (ret != -1) {
+        _fd = ret; // ignore failures, can happen in constrained environments such as containers
+    }
+}
+
+linux_perf_event::~linux_perf_event() {
+    if (_fd != -1) {
+        ::close(_fd);
+    }
+}
+
+linux_perf_event&
+linux_perf_event::operator=(linux_perf_event&& x) noexcept {
+    if (this != &x) {
+        if (_fd != -1) {
+            ::close(_fd);
+        }
+        _fd = std::exchange(x._fd, -1);
+    }
+    return *this;
+}
+
+uint64_t
+linux_perf_event::read() {
+    if (_fd == -1) {
+        return 0;
+    }
+    uint64_t ret;
+    ::read(_fd, &ret, sizeof(ret));
+    return ret;
+}
+
+void
+linux_perf_event::enable() {
+    if (_fd == -1) {
+        return;
+    }
+    ::ioctl(_fd, PERF_EVENT_IOC_ENABLE, 0);
+}
+
+void
+linux_perf_event::disable() {
+    if (_fd == -1) {
+        return;
+    }
+    ::ioctl(_fd, PERF_EVENT_IOC_DISABLE, 0);
+}
+
+linux_perf_event
+linux_perf_event::user_instructions_retired() {
+    return linux_perf_event(perf_event_attr{
+            .type = PERF_TYPE_HARDWARE,
+            .size = sizeof(struct perf_event_attr),
+            .config = PERF_COUNT_HW_INSTRUCTIONS,
+            .disabled = 1,
+            .exclude_kernel = 1,
+            .exclude_hv = 1,
+            }, 0, -1, -1, 0);
+}
--- a/test/perf/linux-perf-event.hh
+++ b/test/perf/linux-perf-event.hh
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2021 ScyllaDB
+ */
+
+/*
+ * This file is part of Scylla.
+ *
+ * Scylla is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * Scylla is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Scylla.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+
+#include <cstdint>
+#include <utility>
+#include <unistd.h>
+
+struct perf_event_attr; // from <linux/perf_event.h>
+
+class linux_perf_event {
+    int _fd = -1;
+public:
+    linux_perf_event(const struct ::perf_event_attr& attr, pid_t pid, int cpu, int group_fd, unsigned long flags);
+    linux_perf_event(linux_perf_event&& x) noexcept : _fd(std::exchange(x._fd, -1)) {}
+    linux_perf_event& operator=(linux_perf_event&& x) noexcept;
+    ~linux_perf_event();
+    uint64_t read();
+    void enable();
+    void disable();
+public:
+    static linux_perf_event user_instructions_retired();
+};
+
--- a/test/perf/perf.cc
+++ b/test/perf/perf.cc
@@ -24,6 +24,15 @@
 #include <seastar/core/memory.hh>
 #include "seastarx.hh"

+
+uint64_t perf_mallocs() {
+    return memory::stats().mallocs();
+}
+
+uint64_t perf_tasks_processed() {
+    return engine().get_sched_stats().tasks_processed;
+}
+
 void scheduling_latency_measurer::schedule_tick() {
    seastar::schedule(make_task(default_scheduling_group(), [self = weak_from_this()] () mutable {
        if (self) {
@@ -50,18 +59,9 @@ std::ostream& operator<<(std::ostream& out, const scheduling_latency_measurer& s
        to_ms(slm.max().count()));
 }

-
-executor_shard_stats
-executor_shard_stats_snapshot() {
-    return executor_shard_stats{
-        .allocations = memory::stats().mallocs(),
-        .tasks_executed = engine().get_sched_stats().tasks_processed,
-    };
-}
-
 std::ostream&
 operator<<(std::ostream& os, const perf_result& result) {
-    fmt::print(os, "{:.2f} tps ({:5.1f} allocs/op, {:5.1f} tasks/op)",
-            result.throughput, result.mallocs_per_op, result.tasks_per_op);
+    fmt::print(os, "{:.2f} tps ({:5.1f} allocs/op, {:5.1f} tasks/op, {:7.0f} insns/op)",
+            result.throughput, result.mallocs_per_op, result.tasks_per_op, result.instructions_per_op);
    return os;
 }
--- a/test/perf/perf.hh
+++ b/test/perf/perf.hh
@@ -28,6 +28,7 @@
 #include "seastarx.hh"
 #include "utils/extremum_tracking.hh"
 #include "utils/estimated_histogram.hh"
+#include "linux-perf-event.hh"

 #include <chrono>
 #include <iosfwd>
@@ -60,6 +61,7 @@ struct executor_shard_stats {
    uint64_t invocations = 0;
    uint64_t allocations = 0;
    uint64_t tasks_executed = 0;
+    uint64_t instructions_retired = 0;
 };

 inline
@@ -68,6 +70,7 @@ operator+(executor_shard_stats a, executor_shard_stats b) {
    a.invocations += b.invocations;
    a.allocations += b.allocations;
    a.tasks_executed += b.tasks_executed;
+    a.instructions_retired += b.instructions_retired;
    return a;
 }

@@ -77,10 +80,12 @@ operator-(executor_shard_stats a, executor_shard_stats b) {
    a.invocations -= b.invocations;
    a.allocations -= b.allocations;
    a.tasks_executed -= b.tasks_executed;
+    a.instructions_retired -= b.instructions_retired;
    return a;
 }

-executor_shard_stats executor_shard_stats_snapshot();
+uint64_t perf_tasks_processed();
+uint64_t perf_mallocs();


 // Drives concurrent and continuous execution of given asynchronous action
@@ -92,9 +97,12 @@ class executor {
    const uint64_t _end_at_count;
    const unsigned _n_workers;
    uint64_t _count;
+    linux_perf_event _instructions_retired_counter = linux_perf_event::user_instructions_retired();
 private:
+    executor_shard_stats executor_shard_stats_snapshot();
    future<> run_worker() {
        auto stats_begin = executor_shard_stats_snapshot();
+        _instructions_retired_counter.enable();
        return do_until([this] {
            return _end_at_count ? _count == _end_at_count : lowres_clock::now() >= _end_at;
        }, [this] () mutable {
@@ -118,8 +126,8 @@ public:
        return parallel_for_each(idx.begin(), idx.end(), [this] (auto idx) mutable {
            return this->run_worker();
        }).then([this, stats_start] {
+            _instructions_retired_counter.disable();
            auto stats_end = executor_shard_stats_snapshot();
-            stats_end.invocations = _count;
            return stats_end - stats_start;
        });
    }
@@ -129,10 +137,22 @@ public:
    }
 };

+template <typename Func>
+executor_shard_stats
+executor<Func>::executor_shard_stats_snapshot() {
+    return executor_shard_stats{
+        .invocations = _count,
+        .allocations = perf_mallocs(),
+        .tasks_executed = perf_tasks_processed(),
+        .instructions_retired = _instructions_retired_counter.read(),
+    };
+}
+
 struct perf_result {
    double throughput;
    double mallocs_per_op;
    double tasks_per_op;
+    double instructions_per_op;
 };

 std::ostream& operator<<(std::ostream& os, const perf_result& result);
@@ -166,6 +186,7 @@ std::vector<perf_result> time_parallel(Func func, unsigned concurrency_per_core,
            .throughput = static_cast<double>(stats.invocations) / duration,
            .mallocs_per_op = double(stats.allocations) / stats.invocations,
            .tasks_per_op = double(stats.tasks_executed) / stats.invocations,
+            .instructions_per_op = double(stats.instructions_retired) / stats.invocations,
        };
        std::cout << result << "\n";
        results.emplace_back(result);