From 248041e095627e5c7d76a2056e6fee5992181dee Mon Sep 17 00:00:00 2001 From: Nizamudeen A Date: Tue, 8 Jul 2025 19:07:35 +0530 Subject: [PATCH] mgr/prometheus: fix metrics service not coming up https://github.com/ceph/ceph/pull/61468 unintentionally broke the HTTP metrics service by removing the code that starts the metrics service. This change adds that code back and adds a basic test to catch such issues. Regression from https://github.com/ceph/ceph/pull/61468/commits/64f590cc8f03c9aab909c3ea5b9f53271ba3c15b#diff-031e09c4297d84a407cf55f8981d38764efc3c37e9827e12e638521f69284e1f Signed-off-by: Nizamudeen A (cherry picked from commit 0f2dbbda42b571fee8c990befd2677f3250e19ce) --- .../cephadm/workunits/task/test_monitoring_stack_basic.yaml | 4 ++++ src/pybind/mgr/prometheus/module.py | 3 +++ 2 files changed, 7 insertions(+) diff --git a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml index 515293ea83a..99fe1e91365 100644 --- a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml +++ b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml @@ -64,3 +64,7 @@ tasks: curl -s http://${ALERTM_IP}:9093/api/v2/status curl -s http://${ALERTM_IP}:9093/api/v2/alerts curl -s http://${ALERTM_IP}:9093/api/v2/alerts | jq -e '.[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"' + # check prometheus metrics endpoint is not empty and make sure we can get metrics + METRICS_URL=$(ceph mgr services | jq -r .prometheus) + [ -n "$METRICS_URL" ] || exit 1 + curl -s "${METRICS_URL}metrics" | grep -q '^ceph_health_status' diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py index 85798954632..5d7eec5d516 100644 --- a/src/pybind/mgr/prometheus/module.py +++ b/src/pybind/mgr/prometheus/module.py @@ -1773,6 +1773,9 @@ class Module(MgrModule, OrchestratorClientMixin): self.log.exception(f'Failed to setup cephadm based 
secure monitoring stack: {e}\n', 'Falling back to default configuration') + # In any error fallback to plain http mode + self.setup_default_config(server_addr, server_port) + def setup_default_config(self, server_addr: str, server_port: int) -> None: cherrypy.config.update({ 'server.socket_host': server_addr, -- 2.39.5