From: Prashant D Date: Thu, 17 Mar 2022 14:29:40 +0000 (+0000) Subject: mgr, mgr/prometheus: Fix regression with prometheus metrics X-Git-Tag: v18.0.0~1090^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=aeca2e41ef560cf51c1ad935cfb6470e782aa8d5;p=ceph.git mgr, mgr/prometheus: Fix regression with prometheus metrics The ceph daemons on a host are inheriting the ceph version from the host. This introduces a wrong interpretation in prometheus metrics as well as in dump_server. Each ceph daemon should report its own ceph version, based on the ceph binary in use for that daemon. Consider a situation where a partial upgrade is done on a host: the daemons which have been restarted should carry the upgraded ceph version tag and the rest should carry the older ceph version, but presently all of them inherit the host version. In a containerized environment, all daemons are using the ceph version of the last daemon registered as a service on the host. Fixes: https://tracker.ceph.com/issues/54611 Signed-off-by: Prashant D --- diff --git a/src/mgr/ActivePyModules.cc b/src/mgr/ActivePyModules.cc index 18b988ef3e0..58c3d9ee4d6 100644 --- a/src/mgr/ActivePyModules.cc +++ b/src/mgr/ActivePyModules.cc @@ -97,6 +97,7 @@ void ActivePyModules::dump_server(const std::string &hostname, f->open_object_section("service"); f->dump_string("type", key.type); f->dump_string("id", key.name); + f->dump_string("ceph_version", ceph_version); if (!id.empty()) { f->dump_string("name", id); } diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 62c38079132..52a1b424576 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -947,11 +947,11 @@ class Module(MgrModule): for mon in mon_status['monmap']['mons']: rank = mon['rank'] id_ = mon['name'] - host_version = servers.get((id_, 'mon'), ('', '', '')) + mon_version = servers.get((id_, 'mon'), ('', '', '')) self.metrics['mon_metadata'].set(1, ( - 'mon.{}'.format(id_), 
host_version[0], + 'mon.{}'.format(id_), mon_version[0], mon['public_addr'].rsplit(':', 1)[0], rank, - host_version[1] + mon_version[1] )) in_quorum = int(rank in mon_status['quorum']) self.metrics['mon_quorum_status'].set(in_quorum, ( @@ -1034,10 +1034,9 @@ class Module(MgrModule): def get_service_list(self) -> Dict[Tuple[str, str], Tuple[str, str, str]]: ret = {} for server in self.list_servers(): - version = cast(str, server.get('ceph_version', '')) host = cast(str, server.get('hostname', '')) for service in cast(List[ServiceInfoT], server.get('services', [])): - ret.update({(service['id'], service['type']): (host, version, service.get('name', ''))}) + ret.update({(service['id'], service['type']): (host, service['ceph_version'], service.get('name', ''))}) return ret @profile_method() @@ -1075,7 +1074,7 @@ class Module(MgrModule): "skipping output".format(id_)) continue - host_version = servers.get((str(id_), 'osd'), ('', '', '')) + osd_version = servers.get((str(id_), 'osd'), ('', '', '')) # collect disk occupation metadata osd_metadata = self.get_metadata("osd", str(id_)) @@ -1092,10 +1091,10 @@ class Module(MgrModule): c_addr, dev_class, f_iface, - host_version[0], + osd_version[0], obj_store, p_addr, - host_version[1] + osd_version[1] )) # collect osd status