From: Venky Shankar Date: Tue, 2 Jul 2019 08:11:35 +0000 (-0400) Subject: mgr, mon: allow normal ceph services to register with manager X-Git-Tag: v14.2.10~168^2~19 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=5a1f09529902e7fd3591acb9be3d4f3fd44fad19;p=ceph.git mgr, mon: allow normal ceph services to register with manager Additionally, introduce a `task status` field in manager report messages to forward status of executing tasks in daemons (e.g., status of executing scrubs in ceph metadata servers). `task status` makes its way up to the service map which is then used to display the relevant information in ceph status. Signed-off-by: Venky Shankar (cherry picked from commit 5c25a018643b10aa78db8270cae1476f71d8f4f4) Conflicts: src/messages/MMgrReport.h src/mgr/DaemonServer.cc src/mgr/ServiceMap.h --- diff --git a/src/messages/MMgrReport.h b/src/messages/MMgrReport.h index dcfbf658546a..84c96e7f200f 100644 --- a/src/messages/MMgrReport.h +++ b/src/messages/MMgrReport.h @@ -75,8 +75,7 @@ class MMgrReport : public MessageInstance { public: friend factory; private: - - static constexpr int HEAD_VERSION = 7; + static constexpr int HEAD_VERSION = 8; static constexpr int COMPAT_VERSION = 1; public: @@ -101,6 +100,7 @@ public: // for service registration boost::optional> daemon_status; + boost::optional> task_status; std::vector daemon_health_metrics; @@ -130,6 +130,9 @@ public: if (header.version >= 7) { decode(osd_perf_metric_reports, p); } + if (header.version >= 8) { + decode(task_status, p); + } } void encode_payload(uint64_t features) override { @@ -143,6 +146,7 @@ public: encode(daemon_health_metrics, payload); encode(config_bl, payload); encode(osd_perf_metric_reports, payload); + encode(task_status, payload); } std::string_view get_type_name() const override { return "mgrreport"; } @@ -163,6 +167,9 @@ public: if (!daemon_health_metrics.empty()) { out << " daemon_metrics=" << daemon_health_metrics.size(); } + if (task_status) { + out << 
" task_status=" << task_status->size(); + } out << ")"; } diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc index e0da9d03bd54..61a0950391be 100644 --- a/src/mgr/DaemonServer.cc +++ b/src/mgr/DaemonServer.cc @@ -42,7 +42,14 @@ #undef dout_prefix #define dout_prefix *_dout << "mgr.server " << __func__ << " " - +namespace { + template + bool map_compare(Map const &lhs, Map const &rhs) { + return lhs.size() == rhs.size() + && std::equal(lhs.begin(), lhs.end(), rhs.begin(), + [] (auto a, auto b) { return a.first == b.first && a.second == b.second; }); + } +} DaemonServer::DaemonServer(MonClient *monc_, Finisher &finisher_, @@ -457,13 +464,14 @@ bool DaemonServer::handle_open(MMgrOpen *m) std::lock_guard l(daemon->lock); daemon->perf_counters.clear(); + daemon->service_daemon = m->service_daemon; if (m->service_daemon) { daemon->service_status = m->daemon_status; utime_t now = ceph_clock_now(); auto d = pending_service_map.get_daemon(m->service_name, m->daemon_name); - if (d->gid != (uint64_t)m->get_source().num()) { + if (!d->gid || d->gid != (uint64_t)m->get_source().num()) { dout(10) << "registering " << key << " in pending_service_map" << dendl; d->gid = m->get_source().num(); d->addr = m->get_source_addr(); @@ -549,98 +557,114 @@ bool DaemonServer::handle_report(MMgrReport *m) return true; } - // Look up the DaemonState - DaemonStatePtr daemon; - if (daemon_state.exists(key)) { - dout(20) << "updating existing DaemonState for " << key << dendl; - daemon = daemon_state.get(key); - } else { - // we don't know the hostname at this stage, reject MMgrReport here. - dout(5) << "rejecting report from " << key << ", since we do not have its metadata now." 
- << dendl; - // issue metadata request in background - if (!daemon_state.is_updating(key) && - (key.first == "osd" || key.first == "mds" || key.first == "mon")) { + { + lock.lock(); + + DaemonStatePtr daemon; + // Look up the DaemonState + if (daemon_state.exists(key)) { + dout(20) << "updating existing DaemonState for " << key << dendl; + daemon = daemon_state.get(key); + } else { + lock.unlock(); - std::ostringstream oss; - auto c = new MetadataUpdate(daemon_state, key); - if (key.first == "osd") { - oss << "{\"prefix\": \"osd metadata\", \"id\": " - << key.second<< "}"; + // we don't know the hostname at this stage, reject MMgrReport here. + dout(5) << "rejecting report from " << key << ", since we do not have its metadata now." + << dendl; + // issue metadata request in background + if (!daemon_state.is_updating(key) && + (key.first == "osd" || key.first == "mds" || key.first == "mon")) { - } else if (key.first == "mds") { - c->set_default("addr", stringify(m->get_source_addr())); - oss << "{\"prefix\": \"mds metadata\", \"who\": \"" - << key.second << "\"}"; + std::ostringstream oss; + auto c = new MetadataUpdate(daemon_state, key); + if (key.first == "osd") { + oss << "{\"prefix\": \"osd metadata\", \"id\": " + << key.second<< "}"; + + } else if (key.first == "mds") { + c->set_default("addr", stringify(m->get_source_addr())); + oss << "{\"prefix\": \"mds metadata\", \"who\": \"" + << key.second << "\"}"; - } else if (key.first == "mon") { - oss << "{\"prefix\": \"mon metadata\", \"id\": \"" - << key.second << "\"}"; - } else { - ceph_abort(); + } else if (key.first == "mon") { + oss << "{\"prefix\": \"mon metadata\", \"id\": \"" + << key.second << "\"}"; + } else { + ceph_abort(); + } + + monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); } - monc->start_mon_command({oss.str()}, {}, &c->outbl, &c->outs, c); - } - - { - std::lock_guard l(lock); + lock.lock(); + // kill session auto priv = m->get_connection()->get_priv(); auto session = 
static_cast(priv.get()); if (!session) { - return false; + return false; } m->get_connection()->mark_down(); dout(10) << "unregistering osd." << session->osd_id - << " session " << session << " con " << m->get_connection() << dendl; + << " session " << session << " con " << m->get_connection() << dendl; if (osd_cons.find(session->osd_id) != osd_cons.end()) { - osd_cons[session->osd_id].erase(m->get_connection()); - } + osd_cons[session->osd_id].erase(m->get_connection()); + } auto iter = daemon_connections.find(m->get_connection()); if (iter != daemon_connections.end()) { - daemon_connections.erase(iter); + daemon_connections.erase(iter); } - } - - return false; - } - - // Update the DaemonState - ceph_assert(daemon != nullptr); - { - std::lock_guard l(daemon->lock); - auto &daemon_counters = daemon->perf_counters; - daemon_counters.update(m); - auto p = m->config_bl.cbegin(); - if (p != m->config_bl.end()) { - decode(daemon->config, p); - decode(daemon->ignored_mon_config, p); - dout(20) << " got config " << daemon->config - << " ignored " << daemon->ignored_mon_config << dendl; + lock.unlock(); + return false; } - if (daemon->service_daemon) { - utime_t now = ceph_clock_now(); - if (m->daemon_status) { - daemon->service_status = *m->daemon_status; - daemon->service_status_stamp = now; + // Update the DaemonState + ceph_assert(daemon != nullptr); + { + std::lock_guard l(daemon->lock); + auto &daemon_counters = daemon->perf_counters; + daemon_counters.update(m); + + auto p = m->config_bl.cbegin(); + if (p != m->config_bl.end()) { + decode(daemon->config, p); + decode(daemon->ignored_mon_config, p); + dout(20) << " got config " << daemon->config + << " ignored " << daemon->ignored_mon_config << dendl; + } + + if (daemon->service_daemon) { + utime_t now = ceph_clock_now(); + if (m->daemon_status) { + daemon->service_status_stamp = now; + daemon->service_status = *m->daemon_status; + } + if (m->task_status && !map_compare(daemon->task_status, *m->task_status)) { + 
auto d = pending_service_map.get_daemon(m->service_name, m->daemon_name); + if (d->gid) { + daemon->task_status = *m->task_status; + d->task_status = *m->task_status; + pending_service_map_dirty = pending_service_map.epoch; + } + } + daemon->last_service_beacon = now; + } else if (m->daemon_status) { + derr << "got status from non-daemon " << key << dendl; + } + if (m->get_connection()->peer_is_osd() || m->get_connection()->peer_is_mon()) { + // only OSD and MON send health_checks to me now + daemon->daemon_health_metrics = std::move(m->daemon_health_metrics); + dout(10) << "daemon_health_metrics " << daemon->daemon_health_metrics + << dendl; } - daemon->last_service_beacon = now; - } else if (m->daemon_status) { - derr << "got status from non-daemon " << key << dendl; - } - if (m->get_connection()->peer_is_osd() || m->get_connection()->peer_is_mon()) { - // only OSD and MON send health_checks to me now - daemon->daemon_health_metrics = std::move(m->daemon_health_metrics); - dout(10) << "daemon_health_metrics " << daemon->daemon_health_metrics - << dendl; } + + lock.unlock(); } // if there are any schema updates, notify the python modules diff --git a/src/mgr/DaemonState.h b/src/mgr/DaemonState.h index 0661f61a01ce..a8c878c73f4e 100644 --- a/src/mgr/DaemonState.h +++ b/src/mgr/DaemonState.h @@ -147,6 +147,7 @@ class DaemonState bool service_daemon = false; utime_t service_status_stamp; std::map service_status; + std::map task_status; utime_t last_service_beacon; // running config diff --git a/src/mgr/MgrClient.cc b/src/mgr/MgrClient.cc index 4737ea428d54..0d768dd21665 100644 --- a/src/mgr/MgrClient.cc +++ b/src/mgr/MgrClient.cc @@ -345,6 +345,11 @@ void MgrClient::_send_report() daemon_dirty_status = false; } + if (task_dirty_status) { + report->task_status = task_status; + task_dirty_status = false; + } + report->daemon_health_metrics = std::move(daemon_health_metrics); cct->_conf.get_config_bl(last_config_bl_version, &report->config_bl, @@ -480,14 +485,6 @@ int 
MgrClient::service_daemon_register( const std::map& metadata) { std::lock_guard l(lock); - if (service == "osd" || - service == "mds" || - service == "client" || - service == "mon" || - service == "mgr") { - // normal ceph entity types are not allowed! - return -EINVAL; - } if (service_daemon) { return -EEXIST; } @@ -516,6 +513,15 @@ int MgrClient::service_daemon_update_status( return 0; } +int MgrClient::service_daemon_update_task_status( + std::map &&status) { + std::lock_guard l(lock); + ldout(cct,10) << status << dendl; + task_status = std::move(status); + task_dirty_status = true; + return 0; +} + void MgrClient::update_daemon_health(std::vector&& metrics) { std::lock_guard l(lock); diff --git a/src/mgr/MgrClient.h b/src/mgr/MgrClient.h index 09764afaf8cc..35c36e23fd8c 100644 --- a/src/mgr/MgrClient.h +++ b/src/mgr/MgrClient.h @@ -87,9 +87,11 @@ protected: // for service registration and beacon bool service_daemon = false; bool daemon_dirty_status = false; + bool task_dirty_status = false; std::string service_name, daemon_name; std::map daemon_metadata; std::map daemon_status; + std::map task_status; std::vector daemon_health_metrics; void reconnect(); @@ -147,6 +149,8 @@ public: const std::map& metadata); int service_daemon_update_status( std::map&& status); + int service_daemon_update_task_status( + std::map &&task_status); void update_daemon_health(std::vector&& metrics); private: diff --git a/src/mgr/ServiceMap.cc b/src/mgr/ServiceMap.cc index ba7a43b24a8f..eca037009558 100644 --- a/src/mgr/ServiceMap.cc +++ b/src/mgr/ServiceMap.cc @@ -9,23 +9,27 @@ void ServiceMap::Daemon::encode(bufferlist& bl, uint64_t features) const { - ENCODE_START(1, 1, bl); + ENCODE_START(2, 1, bl); encode(gid, bl); encode(addr, bl, features); encode(start_epoch, bl); encode(start_stamp, bl); encode(metadata, bl); + encode(task_status, bl); ENCODE_FINISH(bl); } void ServiceMap::Daemon::decode(bufferlist::const_iterator& p) { - DECODE_START(1, p); + DECODE_START(2, p); decode(gid, 
p); decode(addr, p); decode(start_epoch, p); decode(start_stamp, p); decode(metadata, p); + if (struct_v >= 2) { + decode(task_status, p); + } DECODE_FINISH(p); } @@ -33,13 +37,18 @@ void ServiceMap::Daemon::dump(Formatter *f) const { f->dump_unsigned("start_epoch", start_epoch); f->dump_stream("start_stamp") << start_stamp; - f->dump_unsigned("gid", gid); + f->dump_unsigned("gid", *gid); f->dump_string("addr", addr.get_legacy_str()); f->open_object_section("metadata"); for (auto& p : metadata) { f->dump_string(p.first.c_str(), p.second); } f->close_section(); + f->open_object_section("task_status"); + for (auto& p : task_status) { + f->dump_string(p.first.c_str(), p.second); + } + f->close_section(); } void ServiceMap::Daemon::generate_test_instances(std::list& ls) @@ -48,6 +57,7 @@ void ServiceMap::Daemon::generate_test_instances(std::list& ls) ls.push_back(new Daemon); ls.back()->gid = 222; ls.back()->metadata["this"] = "that"; + ls.back()->task_status["task1"] = "running"; } // Service diff --git a/src/mgr/ServiceMap.h b/src/mgr/ServiceMap.h index d2a1c9b37c14..3d226dd5454b 100644 --- a/src/mgr/ServiceMap.h +++ b/src/mgr/ServiceMap.h @@ -12,17 +12,20 @@ #include "include/buffer.h" #include "msg/msg_types.h" +#include + namespace ceph { class Formatter; } struct ServiceMap { struct Daemon { - uint64_t gid = 0; + boost::optional gid; entity_addr_t addr; epoch_t start_epoch = 0; ///< epoch first registered utime_t start_stamp; ///< timestamp daemon started/registered std::map metadata; ///< static metadata + std::map task_status; ///< running task status void encode(bufferlist& bl, uint64_t features) const; void decode(bufferlist::const_iterator& p); @@ -64,7 +67,34 @@ struct ServiceMap { return ss.str(); } - void count_metadata(const string& field, + std::string get_task_summary(const std::string_view task_prefix) const { + // contruct a map similar to: + // {"service1 status" -> {"service1.0" -> "running"}} + // {"service2 status" -> {"service2.0" -> "idle"}, + 
// {"service2.1" -> "running"}} + std::map> by_task; + for (const auto &p : daemons) { + std::stringstream d; + d << task_prefix << "." << p.first; + for (const auto &q : p.second.task_status) { + auto p1 = by_task.emplace(q.first, std::map{}).first; + auto p2 = p1->second.emplace(d.str(), std::string()).first; + p2->second = q.second; + } + } + + std::stringstream ss; + for (const auto &p : by_task) { + ss << "\n " << p.first << ":"; + for (auto q : p.second) { + ss << "\n " << q.first << ": " << q.second; + } + } + + return ss.str(); + } + + void count_metadata(const std::string& field, std::map *out) const { for (auto& p : daemons) { auto q = p.second.metadata.find(field); diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index b90901d6f48e..14d9a9b88fbc 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -3010,11 +3010,32 @@ void Monitor::get_cluster_status(stringstream &ss, Formatter *f) osdmon()->osdmap.print_summary(NULL, ss, string(maxlen + 6, ' ')); ss << "\n"; for (auto& p : service_map.services) { + const std::string &service = p.first; + // filter out normal ceph entity types + if (service == "osd" || + service == "client" || + service == "mon" || + service == "mds" || + service == "mgr") { + continue; + } ss << " " << p.first << ": " << string(maxlen - p.first.size(), ' ') << p.second.get_summary() << "\n"; } } + { + auto& service_map = mgrstatmon()->get_service_map(); + if (!service_map.services.empty()) { + ss << "\n \n task status:\n"; + { + for (auto &p : service_map.services) { + ss << p.second.get_task_summary(p.first); + } + } + } + } + ss << "\n \n data:\n"; mgrstatmon()->print_summary(NULL, &ss);