From: Prashant D Date: Thu, 17 Mar 2022 14:29:40 +0000 (+0000) Subject: mgr, mgr/prometheus: Fix regression with prometheus metrics X-Git-Tag: v16.2.11~103^2~109^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=45a8604232bd772a8da88c45c4f9934e063db0e4;p=ceph.git mgr, mgr/prometheus: Fix regression with prometheus metrics The ceph daemons on a host are inheriting the ceph version from the host. This introduces a wrong interpretation in prometheus metrics as well as in dump_server. Each ceph daemon should represent its own ceph version, based on the ceph binary in use for that daemon. Consider a situation where a partial upgrade is done on a host: the daemons which are restarted should have the upgraded version as their ceph version tag and the rest should have the older ceph version, but presently all inherit the host version. In a containerized environment, all daemons are using the ceph version of the last daemon registered as a service on the host. Fixes: https://tracker.ceph.com/issues/54611 Signed-off-by: Prashant D (cherry picked from commit aeca2e41ef560cf51c1ad935cfb6470e782aa8d5) --- diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index beab72850048b..99af794235d6c 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -91,6 +91,7 @@ void ActivePyModules::dump_server(const std::string &hostname, f->open_object_section("service"); f->dump_string("type", key.type); f->dump_string("id", key.name); + f->dump_string("ceph_version", ceph_version); if (!id.empty()) { f->dump_string("name", id); } diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 7890b5eec9279..6516dc352516a 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -971,11 +971,11 @@ class Module(MgrModule): for mon in mon_status['monmap']['mons']: rank = mon['rank'] id_ = mon['name'] - host_version = servers.get((id_, 'mon'), ('', '', '')) + mon_version = servers.get((id_, 'mon'), ('', '', '')) 
self.metrics['mon_metadata'].set(1, ( - 'mon.{}'.format(id_), host_version[0], + 'mon.{}'.format(id_), mon_version[0], mon['public_addr'].rsplit(':', 1)[0], rank, - host_version[1] + mon_version[1] )) in_quorum = int(rank in mon_status['quorum']) self.metrics['mon_quorum_status'].set(in_quorum, ( @@ -1058,10 +1058,9 @@ class Module(MgrModule): def get_service_list(self) -> Dict[Tuple[str, str], Tuple[str, str, str]]: ret = {} for server in self.list_servers(): - version = cast(str, server.get('ceph_version', '')) host = cast(str, server.get('hostname', '')) for service in cast(List[ServiceInfoT], server.get('services', [])): - ret.update({(service['id'], service['type']): (host, version, service.get('name', ''))}) + ret.update({(service['id'], service['type']): (host, service['ceph_version'], service.get('name', ''))}) return ret @profile_method() @@ -1099,7 +1098,7 @@ class Module(MgrModule): "skipping output".format(id_)) continue - host_version = servers.get((str(id_), 'osd'), ('', '', '')) + osd_version = servers.get((str(id_), 'osd'), ('', '', '')) # collect disk occupation metadata osd_metadata = self.get_metadata("osd", str(id_)) @@ -1116,10 +1115,10 @@ class Module(MgrModule): c_addr, dev_class, f_iface, - host_version[0], + osd_version[0], obj_store, p_addr, - host_version[1] + osd_version[1] )) # collect osd status