git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: Improving error handling and logging for mgmt-gw test 62561/head
author: Redouane Kachach <rkachach@ibm.com>
Thu, 10 Apr 2025 08:00:26 +0000 (10:00 +0200)
committer: Redouane Kachach <rkachach@ibm.com>
Thu, 10 Apr 2025 08:00:26 +0000 (10:00 +0200)
Signed-off-by: Redouane Kachach <rkachach@ibm.com>
qa/suites/orch/cephadm/workunits/task/test_mgmt_gateway.yaml

index bf66fd8b2fa8dcdaf91ff7dde7ad63c712c853fe..9855c56dfbae82e14ee41dedd28820e017a97876 100644 (file)
@@ -48,7 +48,7 @@ tasks:
           enable_health_check_endpoint: True
         EOT
         # Add generated certificates to spec file
-        echo "  ssl_cert: |" >> /tmp/mgmt.spec 
+        echo "  ssl_cert: |" >> /tmp/mgmt.spec
         while read LINE; do echo $LINE | sed -e "s/^/    /"; done < /tmp/cert.pem >> /tmp/mgmt.spec
         echo "  ssl_key: |" >> /tmp/mgmt.spec
         while read LINE; do echo $LINE | sed -e "s/^/    /"; done < /tmp/key.pem >> /tmp/mgmt.spec
@@ -60,18 +60,42 @@ tasks:
     host.a:
       - |
         set -ex
+
+        # wait_for_service NAME URL JQ_FILTER
+        #
+        # Poll URL (basic auth admin:admin, TLS verification disabled) until
+        # JQ_FILTER evaluates to a truthy value on the response body.
+        #   $1 - human-readable service name, used only in log messages
+        #   $2 - URL to query
+        #   $3 - jq filter; the service counts as healthy when `jq -e` succeeds
+        # Returns 0 as soon as the check passes. After 30 attempts, each followed
+        # by a 10s sleep, it prints the last response plus the jq result and
+        # returns 1.
+        wait_for_service() {
+          local name="$1"
+          local url="$2"
+          local jq_filter="$3"
+          local response=""
+
+          echo "Waiting for service $name to be healthy at $url..."
+          for i in {1..30}; do
+            # --max-time keeps a hung connection from stalling the retry loop;
+            # `|| true` keeps a refused connection from aborting the script under
+            # `set -e` should the function ever be called outside a `||` list.
+            response=$(curl -k -s --max-time 30 -u admin:admin "$url") || true
+            if echo "$response" | jq -e "$jq_filter" > /dev/null; then
+              echo "Service $name is healthy."
+              return 0
+            fi
+            echo "Attempt $i: service $name not ready yet"
+            sleep 10
+          done
+
+          echo "Timeout waiting for $name at $url"
+          echo "Last HTTP response:"
+          echo "$response"
+          echo "jq output:"
+          # -e makes a false/null result exit non-zero as well, so the fallback
+          # message is printed for "no match" and not only for parse errors.
+          echo "$response" | jq -e "$jq_filter" || echo "(jq parse error or no match)"
+          return 1
+        }
+
         # retrieve mgmt hostname and ip
         MGMT_GTW_HOST=$(ceph orch ps --daemon-type mgmt-gateway -f json | jq -e '.[]' | jq -r '.hostname')
         MGMT_GTW_IP=$(ceph orch host ls -f json | jq -r --arg MGMT_GTW_HOST "$MGMT_GTW_HOST" '.[] | select(.hostname==$MGMT_GTW_HOST) | .addr')
+
         # check mgmt-gateway health
         curl -k -s https://${MGMT_GTW_IP}/health
         curl -k -s https://${MGMT_GTW_IP}:29443/health
-        # wait for background services to be reconfigured following mgmt-gateway installation
-        sleep 180
-        # check grafana endpoints are responsive and database health is okay
-        curl -k -s https://${MGMT_GTW_IP}/grafana/api/health | jq -e '.database == "ok"'
-        # check prometheus endpoints are responsive
-        curl -k -s -u admin:admin https://${MGMT_GTW_IP}/prometheus/api/v1/status/config | jq -e '.status == "success"'
-        # check alertmanager endpoints are responsive
-        curl -k -s -u admin:admin https://${MGMT_GTW_IP}/alertmanager/api/v2/status
 
+        # wait for monitoring services
+        wait_for_service "Grafana" "https://${MGMT_GTW_IP}/grafana/api/health" '.database == "ok"' || exit 1
+        wait_for_service "Prometheus" "https://${MGMT_GTW_IP}/prometheus/api/v1/status/config" '.status == "success"' || exit 1
+        wait_for_service "Alertmanager" "https://${MGMT_GTW_IP}/alertmanager/api/v2/status" '.cluster.status == "ready"' || exit 1