tracing: rearrange shut down

The local tracing::tracing instance is dereferenced from cql_server::connection::process_request(), therefore the tracing::tracing service may be stop()ed only after the CQL server service is down. On the other hand, it must not be stopped before the RPC service is down, because a remote side may request tracing for a specific command too. This patch splits the tracing::tracing stop() into two phases: 1) Flush all pending tracing records and stop the backend (shutdown()). 2) Stop the service (stop()). The first phase is called after the CQL server is down and before RPC is down. The second phase is called after RPC is down. Fixes #1339 Signed-off-by: Vlad Zolotarov <vladz@cloudius-systems.com> Message-Id: <1465840496-19990-1-git-send-email-vladz@cloudius-systems.com>
2016-06-13 20:54:56 +03:00
parent 49449fc30c
commit d3960f0bbb
4 changed files with 30 additions and 10 deletions
--- a/main.cc
+++ b/main.cc
@@ -604,10 +604,6 @@ int main(int ac, char** av) {
                return service::get_local_storage_service().drain_on_shutdown();
            });

-            engine().at_exit([&db] {
-                return tracing::tracing::tracing_instance().stop();
-            });
-
            engine().at_exit([&db] {
                return db.invoke_on_all([](auto& db) {
                    return db.get_compaction_manager().stop();
--- a/service/storage_service.cc
+++ b/service/storage_service.cc
@@ -988,9 +988,16 @@ future<> storage_service::drain_on_shutdown() {
            ss.shutdown_client_servers().get();
            logger.info("Drain on shutdown: shutdown rpc and cql server done");

+            tracing::tracing::tracing_instance().invoke_on_all([] (auto& tr) {
+                return tr.shutdown();
+            }).get();
+
            ss.do_stop_ms().get();
            logger.info("Drain on shutdown: shutdown messaging_service done");

+            tracing::tracing::tracing_instance().stop().get();
+            logger.info("Drain on shutdown: tracing is stopped");
+
            auth::auth::shutdown().get();
            logger.info("Drain on shutdown: auth shutdown");

--- a/tracing/tracing.cc
+++ b/tracing/tracing.cc
@@ -134,7 +134,7 @@ future<> tracing::start() {
 }

 void tracing::flush_timer_callback() {
-    if (_stopped) {
+    if (_down) {
        return;
    }

@@ -143,15 +143,23 @@ void tracing::flush_timer_callback() {
    _flush_timer.arm(flush_period);
 }

-future<> tracing::stop() {
-    logger.info("Asked to stop");
-    _stopped = true;
+future<> tracing::shutdown() {
+    logger.info("Asked to shut down");
+    _down = true;
    _flush_timer.cancel();
    return _tracing_backend_helper_ptr->stop().then([] {
        logger.info("Tracing is down");
    });
 }

+future<> tracing::stop() {
+    if (!_down) {
+        throw std::logic_error("tracing: stop() called before shutdown()");
+    }
+
+    return make_ready_future<>();
+}
+
 void tracing::set_trace_probability(double p) {
    if (p < 0 || p > 1) {
        throw std::invalid_argument("trace probability must be in a [0,1] range");
--- a/tracing/tracing.hh
+++ b/tracing/tracing.hh
@@ -171,7 +171,7 @@ private:
    uint64_t _pending_for_flush_sessions = 0;
    uint64_t _flushing_sessions = 0;
    timer<lowres_clock> _flush_timer;
-    bool _stopped = false;
+    bool _down = false;
    std::unique_ptr<i_tracing_backend_helper> _tracing_backend_helper_ptr;
    sstring _thread_name;
    scollectd::registrations _registrations;
@@ -205,9 +205,18 @@ public:
    // Initialize a tracing backend (e.g. tracing_keyspace or logstash)
    future<> start();

-    // waits until all active tracing sessions are over.
    future<> stop();

+    /**
+     * Waits until all pending tracing records are flushed to the backend and
+     * shuts down the backend. The following calls to
+     * store_session_record()/store_event_record() methods of a backend instance
+     * should be a NOOP.
+     *
+     * @return a ready future when the shutdown is complete
+     */
+    future<> shutdown();
+
    void flush_pending_records() {
        _flushing_sessions += _pending_for_flush_sessions;
        _pending_for_flush_sessions = 0;