From: David Galloway Date: Fri, 27 Mar 2026 13:56:48 +0000 (-0400) Subject: mgr/rook: Gracefully wait for prometheus metrics. No guessing. X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=fa41ae5caefed3e14ed3223264e397fde7520d59;p=ceph.git mgr/rook: Gracefully wait for prometheus metrics. No guessing. Signed-off-by: David Galloway --- diff --git a/src/pybind/mgr/rook/ci/scripts/bootstrap-rook-cluster.sh b/src/pybind/mgr/rook/ci/scripts/bootstrap-rook-cluster.sh index b59c1a730e0c..a2fab9608e31 100755 --- a/src/pybind/mgr/rook/ci/scripts/bootstrap-rook-cluster.sh +++ b/src/pybind/mgr/rook/ci/scripts/bootstrap-rook-cluster.sh @@ -202,6 +202,41 @@ enable_monitoring() { $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/exporter-service-monitor.yaml $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/prometheus.yaml $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/prometheus-service.yaml + + # Wait for the prometheus pod itself to be ready before checking metrics + $KUBECTL wait --for=condition=ready pod -l app.kubernetes.io/name=prometheus \ + -n rook-ceph --timeout=120s + + # Verify ceph mgr prometheus module is actually serving on port 9283 + local mgr_pod + mgr_pod=$($KUBECTL -n rook-ceph get pods -l app=rook-ceph-mgr \ + -o jsonpath='{.items[0].metadata.name}') + local attempts=0 + until $KUBECTL -n rook-ceph exec "$mgr_pod" -- \ + curl -sf http://localhost:9283/metrics | grep -q 'ceph_health_status'; do + echo "Waiting for ceph mgr prometheus module on port 9283..." + sleep 10 + attempts=$((attempts + 1)) + if [ $attempts -ge 18 ]; then + echo "ceph mgr prometheus module not available after max attempts" + exit 1 + fi + done + echo "ceph mgr prometheus module is serving metrics" + + # Wait for prometheus to scrape ceph metrics + attempts=0 + until $KUBECTL -n rook-ceph exec deploy/rook-ceph-tools -- \ + curl -s 'http://rook-prometheus.rook-ceph:9090/api/v1/query?query=ceph_osd_in' \ + | grep -q '"result":\[{'; do + echo "Waiting for prometheus to scrape ceph metrics..." + sleep 20 + attempts=$((attempts + 1)) + if [ $attempts -ge 10 ]; then + echo "Prometheus metrics not available after max attempts" + exit 1 + fi + done } #################################################################### @@ -219,7 +254,6 @@ wait_for_rook_operator wait_for_ceph_cluster enable_rook_orchestrator enable_monitoring -sleep 30 # wait for the metrics cache warmup #################################################################### ####################################################################