From 6a37fc870a7051317cc8d49ddae74fde63cce396 Mon Sep 17 00:00:00 2001 From: Nizamudeen A Date: Tue, 8 Jul 2025 19:07:35 +0530 Subject: [PATCH] mgr/prometheus: fix metrics service not coming up https://github.com/ceph/ceph/pull/61468 unintentionally broke the HTTP metrics service when it removed the code that starts the service. Add that code back, and add a basic test to catch this kind of regression. Regression from https://github.com/ceph/ceph/pull/61468/commits/64f590cc8f03c9aab909c3ea5b9f53271ba3c15b#diff-031e09c4297d84a407cf55f8981d38764efc3c37e9827e12e638521f69284e1f Fixes: https://tracker.ceph.com/issues/72012 Signed-off-by: Nizamudeen A --- .../cephadm/workunits/task/test_monitoring_stack_basic.yaml | 4 ++++ src/pybind/mgr/prometheus/module.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml index 515293ea83a71..99fe1e91365ad 100644 --- a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml +++ b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml @@ -64,3 +64,7 @@ tasks: curl -s http://${ALERTM_IP}:9093/api/v2/status curl -s http://${ALERTM_IP}:9093/api/v2/alerts curl -s http://${ALERTM_IP}:9093/api/v2/alerts | jq -e '.[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"' + # check prometheus metrics endpoint is not empty and make sure we can get metrics + METRICS_URL=$(ceph mgr services | jq -r .prometheus) + [ -n "$METRICS_URL" ] || exit 1 + curl -s "${METRICS_URL}metrics" | grep -q '^ceph_health_status' diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 8579895463255..5d7eec5d5167d 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -1773,6 +1773,9 @@ class Module(MgrModule, OrchestratorClientMixin): self.log.exception(f'Failed to setup cephadm based secure monitoring 
stack: {e}\n', 'Falling back to default configuration') + # In any error fallback to plain http mode + self.setup_default_config(server_addr, server_port) + def setup_default_config(self, server_addr: str, server_port: int) -> None: cherrypy.config.update({ 'server.socket_host': server_addr, -- 2.39.5