mgr/cephadm: Improving error handling and logging for mgmt-gw test

author Redouane Kachach <rkachach@ibm.com>
Thu, 10 Apr 2025 08:00:26 +0000 (10:00 +0200)
committer Redouane Kachach <rkachach@ibm.com>
Thu, 10 Apr 2025 08:00:26 +0000 (10:00 +0200)

diff --git a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
index bf66fd8b2fa8dcdaf91ff7dde7ad63c712c853fe..9855c56dfbae82e14ee41dedd28820e017a97876 100644
--- a/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
+++ b/qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml
@@ -48,7 +48,7 @@ tasks:
            enable_health_check_endpoint: True
          EOT
          # Add generated certificates to spec file
-        echo "  ssl_cert: |" >> /tmp/mgmt.spec 
+        echo "  ssl_cert: |" >> /tmp/mgmt.spec
          while read LINE; do echo $LINE | sed -e "s/^/    /"; done < /tmp/cert.pem >> /tmp/mgmt.spec
          echo "  ssl_key: |" >> /tmp/mgmt.spec
          while read LINE; do echo $LINE | sed -e "s/^/    /"; done < /tmp/key.pem >> /tmp/mgmt.spec
@@ -60,18 +60,42 @@ tasks:
      host.a:
        - |
          set -ex
+
+        # Poll a service URL until its JSON response satisfies the jq filter; log the last response on timeout
+        wait_for_service() {
+          local name="$1"
+          local url="$2"
+          local jq_filter="$3"
+
+          echo "Waiting for service $name to be healthy at $url..."
+          for i in {1..30}; do
+            local response
+            response=$(curl -k -s -u admin:admin "$url")
+            if echo "$response" | jq -e "$jq_filter" > /dev/null; then
+              echo "Service $name is healthy."
+              return 0
+            fi
+            echo "Attempt $i: service $name not ready yet"
+            sleep 10
+          done
+
+          echo "Timeout waiting for $name at $url"
+          echo "Last HTTP response:"
+          echo "$response"
+          echo "jq output:"
+          echo "$response" | jq "$jq_filter" || echo "(jq parse error or no match)"
+          return 1
+        }
+
          # retrieve mgmt hostname and ip
          MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname')
          MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr')
+
          # check mgmt-gateway health
          curl -k -s https://${MGMT_GTW_IP}/health
          curl -k -s https://${MGMT_GTW_IP}:29443/health
-        # wait for background services to be reconfigured following mgmt-gateway installation
-        sleep 180
-        # check grafana endpoints are responsive and database health is okay
-        curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"'
-        # check prometheus endpoints are responsive
-        curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status == "success"'
-        # check alertmanager endpoints are responsive
-        curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status
  
+        # wait for monitoring services
+        wait_for_service "Grafana" "https://${MGMT_GTW_IP}/grafana/api/health" '.database == "ok"' || exit 1
+        wait_for_service "Prometheus" "https://${MGMT_GTW_IP}/prometheus/api/v1/status/config" '.status == "success"' || exit 1
+        wait_for_service "Alertmanager" "https://${MGMT_GTW_IP}/alertmanager/api/v2/status" '.cluster.status == "ready"' || exit 1
author	Redouane Kachach <rkachach@ibm.com>
	Thu, 10 Apr 2025 08:00:26 +0000 (10:00 +0200)
committer	Redouane Kachach <rkachach@ibm.com>
	Thu, 10 Apr 2025 08:00:26 +0000 (10:00 +0200)