From 3068c4290166076093119d1dc08a2c0fada8235f Mon Sep 17 00:00:00 2001 From: Noah Watkins Date: Thu, 26 Jul 2018 10:52:34 -0700 Subject: [PATCH] mgr: report health check changes immediately Currently beacons and reports are sent periodically, which means that health check changes made by modules are only visible to the monitor after a delay. This patch sends a health report immediately after a change occurs. 1. decouple scheduling of reports from beacons 2. moves report scheduling into DaemonServer 3. exposes a "send now" option 4. health check change detection triggers immediate report Signed-off-by: Noah Watkins --- src/mgr/ActivePyModule.h | 7 ++++- src/mgr/ActivePyModules.cc | 27 ++++++++++++++++--- src/mgr/ActivePyModules.h | 4 ++- src/mgr/DaemonServer.cc | 52 ++++++++++++++++++++++++++++++++++++- src/mgr/DaemonServer.h | 9 +++++++ src/mgr/Mgr.cc | 8 +----- src/mgr/Mgr.h | 2 -- src/mgr/MgrStandby.cc | 4 --- src/mgr/PyModuleRegistry.cc | 4 +-- src/mgr/PyModuleRegistry.h | 2 +- 10 files changed, 96 insertions(+), 23 deletions(-) diff --git a/src/mgr/ActivePyModule.h b/src/mgr/ActivePyModule.h index 6b5eeda616c27..149d297faf13e 100644 --- a/src/mgr/ActivePyModule.h +++ b/src/mgr/ActivePyModule.h @@ -67,8 +67,13 @@ public: std::stringstream *ss); - void set_health_checks(health_check_map_t&& c) { + bool set_health_checks(health_check_map_t&& c) { + // when health checks change a report is immediately sent to the monitors. + // currently modules have static health check details, but this equality + // test could be made smarter if too much noise shows up in the future. 
+ bool changed = health_checks != c; health_checks = std::move(c); + return changed; } void get_health_checks(health_check_map_t *checks); diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index 46a0f0a6c7537..399fff20cfd26 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -28,6 +28,7 @@ #include "PyModule.h" #include "ActivePyModules.h" +#include "DaemonServer.h" #define dout_context g_ceph_context #define dout_subsys ceph_subsys_mgr @@ -38,10 +39,10 @@ ActivePyModules::ActivePyModules(PyModuleConfig &module_config_, std::map store_data, DaemonStateIndex &ds, ClusterState &cs, MonClient &mc, LogChannelRef clog_, Objecter &objecter_, - Client &client_, Finisher &f) + Client &client_, Finisher &f, DaemonServer &server) : module_config(module_config_), daemon_state(ds), cluster_state(cs), monc(mc), clog(clog_), objecter(objecter_), client(client_), finisher(f), - lock("ActivePyModules") + server(server), lock("ActivePyModules") { store_cache = std::move(store_data); } @@ -790,11 +791,29 @@ PyObject *ActivePyModules::get_osdmap() void ActivePyModules::set_health_checks(const std::string& module_name, health_check_map_t&& checks) { - Mutex::Locker l(lock); + bool changed = false; + + lock.Lock(); auto p = modules.find(module_name); if (p != modules.end()) { - p->second->set_health_checks(std::move(checks)); + changed = p->second->set_health_checks(std::move(checks)); } + lock.Unlock(); + + // immediately schedule a report to be sent to the monitors with the new + // health checks that have changed. This is done asynchronously to avoid + // blocking python land. 
ActivePyModules::lock needs to be dropped to make + lockdep happy: + // + // send_report callers: DaemonServer::lock -> PyModuleRegistry::lock + // active_start: PyModuleRegistry::lock -> ActivePyModules::lock + // + // If we don't release this->lock before calling schedule_tick a cycle is + // formed with the addition of ActivePyModules::lock -> DaemonServer::lock. + // This is still correct as send_report is run asynchronously under + // DaemonServer::lock. + if (changed) + server.schedule_tick(0); } int ActivePyModules::handle_command( diff --git a/src/mgr/ActivePyModules.h b/src/mgr/ActivePyModules.h index d1f51c164e204..0b2ccd7ae8864 100644 --- a/src/mgr/ActivePyModules.h +++ b/src/mgr/ActivePyModules.h @@ -28,6 +28,7 @@ #include "ClusterState.h" class health_check_map_t; +class DaemonServer; class ActivePyModules { @@ -41,6 +42,7 @@ class ActivePyModules Objecter &objecter; Client &client; Finisher &finisher; + DaemonServer &server; mutable Mutex lock{"ActivePyModules::lock"}; @@ -50,7 +52,7 @@ public: std::map store_data, DaemonStateIndex &ds, ClusterState &cs, MonClient &mc, LogChannelRef clog_, Objecter &objecter_, Client &client_, - Finisher &f); + Finisher &f, DaemonServer &server); ~ActivePyModules(); diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc index d94e663ae1e0e..4ec42dc694209 100644 --- a/src/mgr/DaemonServer.cc +++ b/src/mgr/DaemonServer.cc @@ -84,7 +84,10 @@ DaemonServer::DaemonServer(MonClient *monc_, g_conf()->auth_service_required : g_conf()->auth_supported), lock("DaemonServer"), - pgmap_ready(false) + pgmap_ready(false), + timer(g_ceph_context, lock), + shutting_down(false), + tick_event(nullptr) { g_conf().add_observer(this); } @@ -141,6 +144,12 @@ int DaemonServer::init(uint64_t gid, entity_addrvec_t client_addrs) started_at = ceph_clock_now(); + Mutex::Locker l(lock); + timer.init(); + + schedule_tick_locked( + g_conf().get_val("mgr_tick_period").count()); + return 0; } @@ -341,12 +350,53 @@ void 
DaemonServer::maybe_ready(int32_t osd_id) } } +void DaemonServer::tick() +{ + dout(10) << dendl; + send_report(); + + schedule_tick_locked( + g_conf().get_val("mgr_tick_period").count()); +} + +// Currently modules do not set health checks in response to events delivered to +// all modules (e.g. notify) so we do not risk a thundering herd situation here. +// If this pattern emerges in the future, this scheduler could be modified to +// fire after all modules have had a chance to set their health checks. +void DaemonServer::schedule_tick_locked(double delay_sec) +{ + if (tick_event) { + timer.cancel_event(tick_event); + tick_event = nullptr; + } + + // on shutdown start rejecting explicit requests to send reports that may + // originate from python land which may still be running. + if (shutting_down) + return; + + tick_event = timer.add_event_after(delay_sec, + new FunctionContext([this](int r) { + tick(); + })); +} + +void DaemonServer::schedule_tick(double delay_sec) +{ + Mutex::Locker l(lock); + schedule_tick_locked(delay_sec); +} + void DaemonServer::shutdown() { dout(10) << "begin" << dendl; msgr->shutdown(); msgr->wait(); dout(10) << "done" << dendl; + + Mutex::Locker l(lock); + shutting_down = true; + timer.shutdown(); } static DaemonKey key_from_service( diff --git a/src/mgr/DaemonServer.h b/src/mgr/DaemonServer.h index 26554a47283c3..e7a7658cb714f 100644 --- a/src/mgr/DaemonServer.h +++ b/src/mgr/DaemonServer.h @@ -21,6 +21,7 @@ #include "common/Mutex.h" #include "common/LogClient.h" +#include "common/Timer.h" #include #include @@ -103,6 +104,12 @@ private: std::set reported_osds; void maybe_ready(int32_t osd_id); + SafeTimer timer; + bool shutting_down; + Context *tick_event; + void tick(); + void schedule_tick_locked(double delay_sec); + public: int init(uint64_t gid, entity_addrvec_t client_addrs); void shutdown(); @@ -147,6 +154,8 @@ public: virtual const char** get_tracked_conf_keys() const override; virtual void handle_conf_change(const ConfigProxy& 
conf, const std::set &changed) override; + + void schedule_tick(double delay_sec); }; #endif diff --git a/src/mgr/Mgr.cc b/src/mgr/Mgr.cc index 40200605dc775..ca784ebcab79a 100644 --- a/src/mgr/Mgr.cc +++ b/src/mgr/Mgr.cc @@ -286,7 +286,7 @@ void Mgr::init() // assume finisher already initialized in background_init dout(4) << "starting python modules..." << dendl; py_module_registry->active_start(daemon_state, cluster_state, - kv_store, *monc, clog, *objecter, *client, finisher); + kv_store, *monc, clog, *objecter, *client, finisher, server); dout(4) << "Complete." << dendl; initializing = false; @@ -650,12 +650,6 @@ void Mgr::handle_mgr_digest(MMgrDigest* m) } } -void Mgr::tick() -{ - dout(10) << dendl; - server.send_report(); -} - std::map Mgr::get_services() const { Mutex::Locker l(lock); diff --git a/src/mgr/Mgr.h b/src/mgr/Mgr.h index d684662a0c97f..e5739c8c9a7ea 100644 --- a/src/mgr/Mgr.h +++ b/src/mgr/Mgr.h @@ -90,8 +90,6 @@ public: bool ms_dispatch(Message *m); - void tick(); - void background_init(Context *completion); void shutdown(); diff --git a/src/mgr/MgrStandby.cc b/src/mgr/MgrStandby.cc index 57ceaeeb8da3d..6e305b7fb3750 100644 --- a/src/mgr/MgrStandby.cc +++ b/src/mgr/MgrStandby.cc @@ -238,10 +238,6 @@ void MgrStandby::tick() dout(10) << __func__ << dendl; send_beacon(); - if (active_mgr && active_mgr->is_initialized()) { - active_mgr->tick(); - } - timer.add_event_after( g_conf().get_val("mgr_tick_period").count(), new FunctionContext([this](int r){ diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc index ebb01a5c5b269..33816640535ee 100644 --- a/src/mgr/PyModuleRegistry.cc +++ b/src/mgr/PyModuleRegistry.cc @@ -171,7 +171,7 @@ void PyModuleRegistry::active_start( DaemonStateIndex &ds, ClusterState &cs, const std::map &kv_store, MonClient &mc, LogChannelRef clog_, Objecter &objecter_, - Client &client_, Finisher &f) + Client &client_, Finisher &f, DaemonServer &server) { Mutex::Locker locker(lock); @@ -190,7 +190,7 @@ void 
PyModuleRegistry::active_start( active_modules.reset(new ActivePyModules( module_config, kv_store, ds, cs, mc, - clog_, objecter_, client_, f)); + clog_, objecter_, client_, f, server)); for (const auto &i : modules) { // Anything we're skipping because of !can_run will be flagged diff --git a/src/mgr/PyModuleRegistry.h b/src/mgr/PyModuleRegistry.h index 241ba55ad11c3..568e74099607d 100644 --- a/src/mgr/PyModuleRegistry.h +++ b/src/mgr/PyModuleRegistry.h @@ -95,7 +95,7 @@ public: DaemonStateIndex &ds, ClusterState &cs, const std::map &kv_store, MonClient &mc, LogChannelRef clog_, Objecter &objecter_, - Client &client_, Finisher &f); + Client &client_, Finisher &f, DaemonServer &server); void standby_start(MonClient &mc); bool is_standby_running() const -- 2.39.5