From: Nizamudeen A Date: Tue, 8 Jul 2025 13:37:35 +0000 (+0530) Subject: mgr/prometheus: fix metrics service not coming up X-Git-Tag: v20.1.0~51^2 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=248041e095627e5c7d76a2056e6fee5992181dee;p=ceph.git mgr/prometheus: fix metrics service not coming up https://github.com/ceph/ceph/pull/61468 unintentionally broke the HTTP metrics service when it removed the code that starts the metrics service. Add that code back, and add a basic test to catch such regressions. Regression from https://github.com/ceph/ceph/pull/61468/commits/64f590cc8f03c9aab909c3ea5b9f53271ba3c15b#diff-031e09c4297d84a407cf55f8981d38764efc3c37e9827e12e638521f69284e1f Signed-off-by: Nizamudeen A (cherry picked from commit 0f2dbbda42b571fee8c990befd2677f3250e19ce) --- diff --git a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml index 515293ea83a71..99fe1e91365ad 100644 --- a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml +++ b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml @@ -64,3 +64,7 @@ tasks: curl -s http://${ALERTM_IP}:9093/api/v2/status curl -s http://${ALERTM_IP}:9093/api/v2/alerts curl -s http://${ALERTM_IP}:9093/api/v2/alerts | jq -e '.[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"' + # check prometheus metrics endpoint is not empty and make sure we can get metrics + METRICS_URL=$(ceph mgr services | jq -r .prometheus) + [ -n "$METRICS_URL" ] || exit 1 + curl -s "${METRICS_URL}metrics" | grep -q '^ceph_health_status' diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 8579895463255..5d7eec5d5167d 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -1773,6 +1773,9 @@ class Module(MgrModule, OrchestratorClientMixin): self.log.exception(f'Failed to setup cephadm based secure monitoring 
stack: {e}\n', 'Falling back to default configuration') + # In any error fallback to plain http mode + self.setup_default_config(server_addr, server_port) + def setup_default_config(self, server_addr: str, server_port: int) -> None: cherrypy.config.update({ 'server.socket_host': server_addr,