mgr/DaemonServer.cc
mgr/ClusterState.cc
mgr/ActivePyModules.cc
+ mgr/OSDHealthMetricCollector.cc
mgr/StandbyPyModules.cc
mgr/PyModuleRegistry.cc
mgr/PyModuleRunner.cc
#include "json_spirit/json_spirit_writer.h"
#include "mgr/mgr_commands.h"
+#include "mgr/OSDHealthMetricCollector.h"
#include "mon/MonCommand.h"
#include "messages/MMgrOpen.h"
} else if (m->daemon_status) {
derr << "got status from non-daemon " << key << dendl;
}
+ if (m->get_connection()->peer_is_osd()) {
+ // only OSD sends health_checks to me now
+ daemon->osd_health_metrics = std::move(m->osd_health_metrics);
+ }
}
// if there are any schema updates, notify the python modules
*_dout << dendl;
});
});
+
+  // Fold the health metrics stored for every OSD into one collector per
+  // metric type, then let each collector add its summary to the health
+  // checks of the outgoing message.
+  auto osds = daemon_state.get_by_service("osd");
+  map<osd_metric, unique_ptr<OSDHealthMetricCollector>> accumulated;
+  for (const auto& osd : osds) {
+    // osd.first is the daemon key; osd.second points at the daemon state,
+    // whose lock is held while its metrics are read.
+    Mutex::Locker l(osd.second->lock);
+    for (const auto& metric : osd.second->osd_health_metrics) {
+      auto acc = accumulated.find(metric.get_type());
+      if (acc == accumulated.end()) {
+        auto collector = OSDHealthMetricCollector::create(metric.get_type());
+        if (!collector) {
+          // Name the daemon by the two strings of its key instead of
+          // streaming the state pointer, and widen the metric type so it
+          // is logged as a number rather than as a raw character.
+          derr << __func__ << " " << osd.first.first << "." << osd.first.second
+               << " sent me an unknown health metric: "
+               << static_cast<unsigned>(metric.get_type()) << dendl;
+          continue;
+        }
+        tie(acc, std::ignore) = accumulated.emplace(metric.get_type(),
+                                                    std::move(collector));
+      }
+      acc->second->update(osd.first, metric);
+    }
+  }
+  for (const auto& acc : accumulated) {
+    acc.second->summarize(m->health_checks);
+  }
// TODO? We currently do not notify the PyModules
// TODO: respect needs_send, so we send the report only if we are asked to do
// so, or the state is updated.
// The metadata (hostname, version, etc) sent from the daemon
std::map<std::string, std::string> metadata;
+ // TODO: this can be generalized to other daemons
+ std::vector<OSDHealthMetric> osd_health_metrics;
+
// Ephemeral state
bool service_daemon = false;
utime_t service_status_stamp;
--- /dev/null
+#include <boost/format.hpp>
+
+#include "include/health.h"
+#include "include/types.h"
+#include "OSDHealthMetricCollector.h"
+
+
+using namespace std;
+
+// Prints a daemon key as "<first>.<second>".
+ostream& operator<<(ostream& os,
+                    const OSDHealthMetricCollector::DaemonKey& daemon) {
+  os << daemon.first << "." << daemon.second;
+  return os;
+}
+
+namespace {
+
+// Collector for osd_metric::SLOW_OPS. Sums the slow-op counts (n1) of all
+// OSDs and remembers the largest blocked time (n2) seen.
+class SlowOps final : public OSDHealthMetricCollector {
+  bool _is_relevant(osd_metric type) const override {
+    return osd_metric::SLOW_OPS == type;
+  }
+  health_check_t& _get_check(health_check_map_t& cm) const override {
+    return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "");
+  }
+  // Accumulates one OSD's metric; returns true when that OSD has anything
+  // to report, in which case it is also remembered for the detail line.
+  bool _update(const DaemonKey& osd,
+               const OSDHealthMetric& metric) override {
+    const auto slow_count = metric.get_n1();
+    const auto oldest_blocked = metric.get_n2();
+    value.n1 += slow_count;
+    if (value.n2 < oldest_blocked) {
+      value.n2 = oldest_blocked;
+    }
+    const bool has_slow_ops = slow_count || oldest_blocked;
+    if (has_slow_ops) {
+      osds.push_back(osd);
+    }
+    return has_slow_ops;
+  }
+  // Writes the summary and a single detail line naming the affected OSDs;
+  // leaves the check untouched when no OSD reported slow ops.
+  void _summarize(health_check_t& check) const override {
+    if (osds.empty()) {
+      return;
+    }
+    static const char* fmt = "%1% slow ops, oldest one blocked for %2% sec";
+    boost::format summary(fmt);
+    summary % value.n1 % value.n2;
+    check.summary = boost::str(summary);
+    ostringstream detail;
+    if (osds.size() == 1) {
+      detail << osds.front() << " has slow ops";
+    } else {
+      detail << "osds " << osds << " have slow ops.";
+    }
+    check.detail.push_back(detail.str());
+  }
+  // OSDs that reported a non-zero metric, in reporting order.
+  vector<DaemonKey> osds;
+};
+
+
+// Collector for osd_metric::PENDING_CREATING_PGS. Sums the per-OSD counts
+// (n) of PGs still pending on creation.
+class PendingPGs final : public OSDHealthMetricCollector {
+  bool _is_relevant(osd_metric type) const override {
+    return osd_metric::PENDING_CREATING_PGS == type;
+  }
+  health_check_t& _get_check(health_check_map_t& cm) const override {
+    return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "");
+  }
+  // Accumulates one OSD's metric; returns true when that OSD has pending
+  // PGs, in which case it is also remembered for the detail line.
+  bool _update(const DaemonKey& osd,
+               const OSDHealthMetric& metric) override {
+    value.n += metric.get_n();
+    if (!metric.get_n()) {
+      return false;
+    }
+    osds.push_back(osd);
+    return true;
+  }
+  // Writes the summary and a single detail line naming the affected OSDs;
+  // leaves the check untouched when no OSD reported pending PGs.
+  void _summarize(health_check_t& check) const override {
+    if (osds.empty()) {
+      return;
+    }
+    static const char* fmt = "%1% PGs pending on creation";
+    boost::format summary(fmt);
+    summary % value.n;
+    check.summary = boost::str(summary);
+    ostringstream detail;
+    if (osds.size() == 1) {
+      detail << osds.front() << " has pending PGs";
+    } else {
+      detail << "osds " << osds << " have pending PGs.";
+    }
+    check.detail.push_back(detail.str());
+  }
+  // OSDs that reported a non-zero metric, in reporting order.
+  vector<DaemonKey> osds;
+};
+
+} // anonymous namespace
+
+// Factory: returns the collector matching metric type `m`, or an empty
+// pointer when no collector exists for that type.
+unique_ptr<OSDHealthMetricCollector>
+OSDHealthMetricCollector::create(osd_metric m)
+{
+  unique_ptr<OSDHealthMetricCollector> collector;
+  switch (m) {
+  case osd_metric::SLOW_OPS:
+    collector.reset(new SlowOps);
+    break;
+  case osd_metric::PENDING_CREATING_PGS:
+    collector.reset(new PendingPGs);
+    break;
+  default:
+    // unknown type: hand back the empty pointer
+    break;
+  }
+  return collector;
+}
--- /dev/null
+#include <memory>
+#include <string>
+
+#include "osd/OSDHealthMetric.h"
+#include "mon/health_check.h"
+
+/// Accumulates one kind of OSD health metric across the reporting OSDs and
+/// turns the accumulated value into a health check.
+///
+/// A concrete collector is obtained through create(); the caller feeds
+/// metrics through update() and finally calls summarize().
+class OSDHealthMetricCollector {
+public:
+  /// Key of the reporting daemon: a pair of strings.
+  using DaemonKey = std::pair<std::string, std::string>;
+  /// Returns the collector for metric type @p m.
+  static std::unique_ptr<OSDHealthMetricCollector> create(osd_metric m);
+  /// Accumulates @p metric reported by @p osd. Metrics whose type this
+  /// collector does not handle are ignored.
+  void update(const DaemonKey& osd, const OSDHealthMetric& metric) {
+    if (_is_relevant(metric.get_type())) {
+      // _update() has to run for every relevant metric, so it is evaluated
+      // first. OR-ing the result keeps a positive report from an earlier
+      // OSD from being cleared by a later OSD with nothing to report.
+      reported = _update(osd, metric) || reported;
+    }
+  }
+  /// Adds this collector's health check to @p cm, but only if at least one
+  /// update() call found something worth reporting.
+  void summarize(health_check_map_t& cm) {
+    if (reported) {
+      _summarize(_get_check(cm));
+    }
+  }
+  virtual ~OSDHealthMetricCollector() = default;
+private:
+  /// True if this collector handles metrics of the given type.
+  virtual bool _is_relevant(osd_metric type) const = 0;
+  /// Returns the health check in @p cm that this collector fills in.
+  virtual health_check_t& _get_check(health_check_map_t& cm) const = 0;
+  /// Folds one metric into the accumulated state; returns true if the
+  /// metric is worth reporting.
+  virtual bool _update(const DaemonKey& osd, const OSDHealthMetric& metric) = 0;
+  /// Writes the accumulated state into @p check.
+  virtual void _summarize(health_check_t& check) const = 0;
+protected:
+  /// Value accumulated over all update() calls.
+  osd_metric_t value;
+  /// Set once any update() call reported something; never cleared.
+  bool reported = false;
+};