From: Kefu Chai
Date: Wed, 1 Nov 2017 15:17:57 +0000 (+0800)
Subject: mgr: summarize osd metrics in MMgrReport and sent it to mon
X-Git-Tag: v13.0.1~103^2~1
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=7e7978732d20c506eca581f2b153ede70ceefa3d;p=ceph.git

mgr: summarize osd metrics in MMgrReport and sent it to mon

Signed-off-by: Kefu Chai
---

Review notes (commentary only, not part of the commit message):

- OSDHealthMetricCollector::update() assigned the result of _update() to
  `reported` on every call, so an OSD with nothing to report that happened
  to be processed after one that had something suppressed the whole health
  check. The flag is now sticky.
- send_report() streamed osd.second -- the daemon state pointer whose lock
  is taken a few lines above -- into the "unknown health metric" message.
  It now prints the two halves of the daemon key instead.
- OSDHealthMetricCollector.h gains "#pragma once"; the empty virtual
  destructor body becomes "= default".
- Comments were added to the new code and the hunk line counts adjusted to
  match. The blob ids on the "index" lines are those of the original patch
  and no longer describe the modified files.
- NOTE(review): the copy under review had lost every "<...>" span. Template
  arguments were restored from how the values are used inside this patch;
  please confirm against the tree:
    * the system include names (<boost/format.hpp>, <memory>, <string>),
    * DaemonKey = std::pair<std::string, std::string>,
    * the std::map<std::string, std::string> metadata context line,
    * static_cast<int> in the derr message (int was picked so that the
      value is printed as a number whatever osd_metric's underlying type is).
  Leading whitespace on context lines was likewise reconstructed.

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6bb1f5be316ce..3a1270738fa57 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -725,6 +725,7 @@ if (WITH_MGR)
     mgr/DaemonServer.cc
     mgr/ClusterState.cc
     mgr/ActivePyModules.cc
+    mgr/OSDHealthMetricCollector.cc
     mgr/StandbyPyModules.cc
     mgr/PyModuleRegistry.cc
     mgr/PyModuleRunner.cc
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
index 4fb03f0ea44a3..2fe220b50b38a 100644
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -20,6 +20,7 @@
 #include "json_spirit/json_spirit_writer.h"
 
 #include "mgr/mgr_commands.h"
+#include "mgr/OSDHealthMetricCollector.h"
 #include "mon/MonCommand.h"
 
 #include "messages/MMgrOpen.h"
@@ -488,6 +489,10 @@ bool DaemonServer::handle_report(MMgrReport *m)
     } else if (m->daemon_status) {
       derr << "got status from non-daemon " << key << dendl;
     }
+    if (m->get_connection()->peer_is_osd()) {
+      // only OSD sends health_checks to me now
+      daemon->osd_health_metrics = std::move(m->osd_health_metrics);
+    }
   }
 
   // if there are any schema updates, notify the python modules
@@ -1449,6 +1454,32 @@ void DaemonServer::send_report()
         *_dout << dendl;
       });
     });
+
+  // fold the health metrics last reported by each OSD into one collector
+  // per metric type, then let every collector add its summary to the report
+  auto osds = daemon_state.get_by_service("osd");
+  map<osd_metric, unique_ptr<OSDHealthMetricCollector>> accumulated;
+  for (const auto& osd : osds) {
+    Mutex::Locker l(osd.second->lock);
+    for (const auto& metric : osd.second->osd_health_metrics) {
+      auto acc = accumulated.find(metric.get_type());
+      if (acc == accumulated.end()) {
+        auto collector = OSDHealthMetricCollector::create(metric.get_type());
+        if (!collector) {
+          derr << __func__ << " " << osd.first.first << "."
+               << osd.first.second << " sent me an unknown health metric: "
+               << static_cast<int>(metric.get_type()) << dendl;
+          continue;
+        }
+        tie(acc, std::ignore) = accumulated.emplace(metric.get_type(),
+                                                    std::move(collector));
+      }
+      acc->second->update(osd.first, metric);
+    }
+  }
+  for (const auto& acc : accumulated) {
+    acc.second->summarize(m->health_checks);
+  }
   // TODO? We currently do not notify the PyModules
   // TODO: respect needs_send, so we send the report only if we are asked to do
   //       so, or the state is updated.
diff --git a/src/mgr/DaemonState.h b/src/mgr/DaemonState.h
index 846ce5dd8d9fc..9a12b1187c6aa 100644
--- a/src/mgr/DaemonState.h
+++ b/src/mgr/DaemonState.h
@@ -97,6 +97,9 @@ class DaemonState
   // The metadata (hostname, version, etc) sent from the daemon
   std::map<std::string, std::string> metadata;
 
+  // TODO: this can be generalized to other daemons
+  std::vector<OSDHealthMetric> osd_health_metrics;
+
   // Ephemeral state
   bool service_daemon = false;
   utime_t service_status_stamp;
diff --git a/src/mgr/OSDHealthMetricCollector.cc b/src/mgr/OSDHealthMetricCollector.cc
new file mode 100644
index 0000000000000..0b3d4180d7f48
--- /dev/null
+++ b/src/mgr/OSDHealthMetricCollector.cc
@@ -0,0 +1,108 @@
+#include <boost/format.hpp>
+
+#include "include/health.h"
+#include "include/types.h"
+#include "OSDHealthMetricCollector.h"
+
+
+using namespace std;
+
+// print a daemon key as "<first>.<second>"
+ostream& operator<<(ostream& os,
+                    const OSDHealthMetricCollector::DaemonKey& daemon) {
+  return os << daemon.first << "." << daemon.second;
+}
+
+namespace {
+
+// Adds up n1 (number of slow ops) and keeps the largest n2 (blocked time)
+// seen, remembering every OSD that reported a non-zero n1 or n2.
+class SlowOps final : public OSDHealthMetricCollector {
+  bool _is_relevant(osd_metric type) const override {
+    return type == osd_metric::SLOW_OPS;
+  }
+  health_check_t& _get_check(health_check_map_t& cm) const override {
+    return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "");
+  }
+  bool _update(const DaemonKey& osd,
+               const OSDHealthMetric& metric) override {
+    auto num_slow = metric.get_n1();
+    auto blocked_time = metric.get_n2();
+    value.n1 += num_slow;
+    value.n2 = std::max(value.n2, blocked_time);
+    if (num_slow || blocked_time) {
+      osds.push_back(osd);
+      return true;
+    } else {
+      return false;
+    }
+  }
+  void _summarize(health_check_t& check) const override {
+    if (osds.empty()) {
+      return;
+    }
+    static const char* fmt = "%1% slow ops, oldest one blocked for %2% sec";
+    check.summary = boost::str(boost::format(fmt) % value.n1 % value.n2);
+    ostringstream ss;
+    if (osds.size() > 1) {
+      ss << "osds " << osds << " have slow ops.";
+    } else {
+      ss << osds.front() << " has slow ops";
+    }
+    check.detail.push_back(ss.str());
+  }
+  vector<DaemonKey> osds;
+};
+
+
+// Adds up n (number of PGs pending on creation), remembering every OSD
+// that reported a non-zero value.
+class PendingPGs final : public OSDHealthMetricCollector {
+  bool _is_relevant(osd_metric type) const override {
+    return type == osd_metric::PENDING_CREATING_PGS;
+  }
+  health_check_t& _get_check(health_check_map_t& cm) const override {
+    return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "");
+  }
+  bool _update(const DaemonKey& osd,
+               const OSDHealthMetric& metric) override {
+    value.n += metric.get_n();
+    if (metric.get_n()) {
+      osds.push_back(osd);
+      return true;
+    } else {
+      return false;
+    }
+  }
+  void _summarize(health_check_t& check) const override {
+    if (osds.empty()) {
+      return;
+    }
+    static const char* fmt = "%1% PGs pending on creation";
+    check.summary = boost::str(boost::format(fmt) % value.n);
+    ostringstream ss;
+    if (osds.size() > 1) {
+      ss << "osds " << osds << " have pending PGs.";
+    } else {
+      ss << osds.front() << " has pending PGs";
+    }
+    check.detail.push_back(ss.str());
+  }
+  vector<DaemonKey> osds;
+};
+
+} // anonymous namespace
+
+// map a metric type to its collector; unknown types yield an empty pointer
+unique_ptr<OSDHealthMetricCollector>
+OSDHealthMetricCollector::create(osd_metric m)
+{
+  switch (m) {
+  case osd_metric::SLOW_OPS:
+    return unique_ptr<OSDHealthMetricCollector>{new SlowOps};
+  case osd_metric::PENDING_CREATING_PGS:
+    return unique_ptr<OSDHealthMetricCollector>{new PendingPGs};
+  default:
+    return unique_ptr<OSDHealthMetricCollector>{};
+  }
+}
diff --git a/src/mgr/OSDHealthMetricCollector.h b/src/mgr/OSDHealthMetricCollector.h
new file mode 100644
index 0000000000000..c28872ef13b0b
--- /dev/null
+++ b/src/mgr/OSDHealthMetricCollector.h
@@ -0,0 +1,41 @@
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "osd/OSDHealthMetric.h"
+#include "mon/health_check.h"
+
+/// Accumulates one type of OSDHealthMetric across OSDs and turns the
+/// result into a single entry of a health_check_map_t.
+class OSDHealthMetricCollector {
+public:
+  using DaemonKey = std::pair<std::string, std::string>;
+  /// @return a collector for @p m, or an empty pointer for an unhandled type
+  static std::unique_ptr<OSDHealthMetricCollector> create(osd_metric m);
+  /// Fold @p metric, as reported by @p osd, into the accumulated value.
+  /// Metrics the concrete collector deems irrelevant are ignored.
+  void update(const DaemonKey& osd, const OSDHealthMetric& metric) {
+    if (_is_relevant(metric.get_type())) {
+      // sticky: an OSD with nothing to report must not hide what an
+      // earlier OSD has already contributed
+      reported = _update(osd, metric) || reported;
+    }
+  }
+  /// Write the health check into @p cm, but only if something was reported.
+  void summarize(health_check_map_t& cm) {
+    if (reported) {
+      _summarize(_get_check(cm));
+    }
+  }
+  virtual ~OSDHealthMetricCollector() = default;
+private:
+  virtual bool _is_relevant(osd_metric type) const = 0;
+  virtual health_check_t& _get_check(health_check_map_t& cm) const = 0;
+  /// @return true if @p metric contributed something worth reporting
+  virtual bool _update(const DaemonKey& osd, const OSDHealthMetric& metric) = 0;
+  virtual void _summarize(health_check_t& check) const = 0;
+protected:
+  osd_metric_t value;
+  bool reported = false;
+};