From: Redouane Kachach
Date: Thu, 10 Apr 2025 08:00:26 +0000 (+0200)
Subject: mgr/cephadm: Improving error handling and logging for mgmt-gw test
X-Git-Tag: v20.3.0~13^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=ded9b41ce70778bff00d2da43dc35cd00b4c4d89;p=ceph.git

mgr/cephadm: Improving error handling and logging for mgmt-gw test

Signed-off-by: Redouane Kachach
---

Reviewer notes on this copy:
 * Line breaks and indentation were restored from a whitespace-collapsed
   copy, so YAML/shell indentation is a best guess. In the first hunk the
   -/+ lines differed only in whitespace that could not be recovered.
 * wait_for_service() is revised relative to the committed version: each
   curl probe is time-bounded, attempts/interval are optional arguments
   (defaults unchanged: 30 and 10), all working variables are local, and
   timeout diagnostics go to stderr. The second hunk's new-side line count
   is adjusted for the added lines; the index hashes below still refer to
   the original commit.

diff --git a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
index bf66fd8b2fa8d..9855c56dfbae8 100644
--- a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
+++ b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
@@ -48,7 +48,7 @@ tasks:
           enable_health_check_endpoint: True
         EOT
         # Add generated certificates to spec file
-        echo "  ssl_cert: |" >> /tmp/mgmt.spec
+        echo "  ssl_cert: |" >> /tmp/mgmt.spec
         while read LINE; do echo $LINE | sed -e "s/^/    /"; done < /tmp/cert.pem >> /tmp/mgmt.spec
         echo "  ssl_key: |" >> /tmp/mgmt.spec
         while read LINE; do echo $LINE | sed -e "s/^/    /"; done < /tmp/key.pem >> /tmp/mgmt.spec
@@ -60,18 +60,60 @@ tasks:
     host.a:
       - |
         set -ex
+
+        # Poll an HTTP endpoint until its JSON response satisfies a jq filter.
+        #
+        # Arguments:
+        #   $1 - service name, used in log messages
+        #   $2 - URL to query (sent with -k and basic auth admin:admin)
+        #   $3 - jq filter; the service counts as healthy once jq -e accepts it
+        #   $4 - number of attempts (optional, default 30)
+        #   $5 - seconds to sleep between attempts (optional, default 10)
+        # Returns:
+        #   0 as soon as the filter matches; 1 once all attempts are used up,
+        #   after logging the last response and what jq made of it.
+        wait_for_service() {
+          local name="$1"
+          local url="$2"
+          local jq_filter="$3"
+          local attempts="${4:-30}"
+          local interval="${5:-10}"
+          local response=""
+          local i
+
+          echo "Waiting for service $name to be healthy at $url..."
+          for ((i = 1; i <= attempts; i++)); do
+            # Cap each probe so a stalled connection cannot hold up the loop;
+            # a failed curl just means "not ready yet", never abort here.
+            response=$(curl -k -s --max-time 10 -u admin:admin "$url") || true
+            if printf '%s\n' "$response" | jq -e "$jq_filter" > /dev/null; then
+              echo "Service $name is healthy."
+              return 0
+            fi
+            echo "Attempt $i: service $name not ready yet"
+            # Skip the pause after the final attempt.
+            if ((i < attempts)); then
+              sleep "$interval"
+            fi
+          done
+
+          echo "Timeout waiting for $name at $url" >&2
+          echo "Last HTTP response:" >&2
+          printf '%s\n' "$response" >&2
+          echo "jq output:" >&2
+          printf '%s\n' "$response" | jq "$jq_filter" >&2 || echo "(jq parse error or no match)" >&2
+          return 1
+        }
+
         # retrieve mgmt hostname and ip
         MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname')
         MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr')
+
         # check mgmt-gateway health
         curl -k -s https://${MGMT_GTW_IP}/health
         curl -k -s https://${MGMT_GTW_IP}:29443/health
-        # wait for background services to be reconfigured following mgmt-gateway installation
-        sleep 180
-        # check grafana endpoints are responsive and database health is okay
-        curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"'
-        # check prometheus endpoints are responsive
-        curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status == "success"'
-        # check alertmanager endpoints are responsive
-        curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status
+        # wait for monitoring services
+        wait_for_service "Grafana" "https://${MGMT_GTW_IP}/grafana/api/health" '.database == "ok"' || exit 1
+        wait_for_service "Prometheus" "https://${MGMT_GTW_IP}/prometheus/api/v1/status/config" '.status == "success"' || exit 1
+        wait_for_service "Alertmanager" "https://${MGMT_GTW_IP}/alertmanager/api/v2/status" '.cluster.status == "ready"' || exit 1