From: Nizamudeen A <nia@redhat.com>
Date: Tue, 10 Sep 2024 05:44:46 +0000 (+0530)
Subject: mgr/dashboard: fix indefinite loop in cephadm dashboard e2e
X-Git-Tag: testing/wip-rishabh-testing-20240930.143059~57^2
X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=17cc32377154ffe59354a2f64f6cd706e078aeef;p=ceph-ci.git

mgr/dashboard: fix indefinite loop in cephadm dashboard e2e

the tests seem to wait indefinitely to fetch the prometheus details in case
cephadm ran into an error; the job just waits there for more than an hour
without any progress. fix that and make some minor improvements.

an example log: https://jenkins.ceph.com/job/ceph-dashboard-cephadm-e2e/12287/consoleFull

Signed-off-by: Nizamudeen A <nia@redhat.com>
---

diff --git a/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh b/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
index 7c42800fd0c..ae720e6d49b 100755
--- a/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
+++ b/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
@@ -32,7 +32,7 @@ cephadm_shell="$CEPHADM shell --fsid ${fsid} -c /etc/ceph/ceph.conf -k /etc/ceph
 {% for number in range(1, nodes) %}
   ssh-copy-id -f -i /etc/ceph/ceph.pub  -o StrictHostKeyChecking=no root@192.168.100.10{{ number }}
   {% if expanded_cluster is defined %}
-    ${cephadm_shell} ceph orch host add {{ prefix }}-node-0{{ number }}
+    ${cephadm_shell} ceph orch host add {{ prefix }}-node-0{{ number }} 192.168.100.10{{ number }}
   {% endif %}
 {% endfor %}
 
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh b/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh
index a48f759f5e7..b3ae3e2e7ad 100755
--- a/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh
+++ b/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh
@@ -38,22 +38,4 @@ cypress_run () {
 
 cd ${CEPH_DEV_FOLDER}/src/pybind/mgr/dashboard/frontend
 
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph config set mgr mgr/prometheus/exclude_perf_counters false"'
-
-# check if the prometheus daemon is running
-# before starting the e2e tests
-
-PROMETHEUS_RUNNING_COUNT=$(kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch ls --service_name=prometheus --format=json"' | jq -r '.[] | .status.running')
-while [[ $PROMETHEUS_RUNNING_COUNT -lt 1 ]]; do
-    PROMETHEUS_RUNNING_COUNT=$(kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch ls --service_name=prometheus --format=json"' | jq -r '.[] | .status.running')
-done
-
-# grafana ip address is set to the fqdn by default.
-# kcli is not working with that, so setting the IP manually.
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-alertmanager-api-host http://192.168.100.100:9093"'
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-prometheus-api-host http://192.168.100.100:9095"'
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-grafana-api-url https://192.168.100.100:3000"'
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch apply node-exporter --placement 'count:2'"'
-
-cypress_run ["cypress/e2e/orchestrator/workflow/*.feature","cypress/e2e/orchestrator/workflow/*-spec.ts"]
-cypress_run "cypress/e2e/orchestrator/grafana/*.feature"
+cypress_run ["cypress/e2e/orchestrator/workflow/*.feature","cypress/e2e/orchestrator/workflow/*-spec.ts","cypress/e2e/orchestrator/grafana/*.feature"]
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh b/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
index d5aa56efc9f..cda0635bc08 100755
--- a/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
+++ b/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
@@ -82,3 +82,37 @@ while [[ -z $(kcli ssh -u root -- ceph-node-00 'journalctl --no-tail --no-pager
     fi
     kcli ssh -u root -- ceph-node-00 'journalctl -n 100 --no-pager -t cloud-init'
 done
+
+kcli ssh -u root ceph-node-00 'cephadm shell "ceph config set mgr mgr/prometheus/exclude_perf_counters false"'
+
+# Print prometheus' running count from "ceph orch ls"; the echo wrapper makes this return 0 even if the pipeline fails.
+get_prometheus_running_count() {
+    echo $(kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch ls --service_name=prometheus --format=json"' | jq -r '.[] | .status.running')
+}
+
+# When JENKINS_HOME is set, wait for the prometheus daemon to be running before the e2e tests start.
+if [[ -n "${JENKINS_HOME}" ]]; then
+    retry=0
+    PROMETHEUS_RUNNING_COUNT=$(get_prometheus_running_count)
+    # Re-check at most 10 times, 10 seconds apart, so the wait cannot
+    # run indefinitely and bloat up the machine.
+    while [[ $retry -lt 10 && $PROMETHEUS_RUNNING_COUNT -lt 1 ]]; do
+        sleep 10
+        retry=$((retry + 1))
+        echo "Retry attempt to get the prometheus count... ${retry}"
+        PROMETHEUS_RUNNING_COUNT=$(get_prometheus_running_count)
+    done
+
+    # Test the count, not the retry counter: the last re-check may have succeeded.
+    if [[ $PROMETHEUS_RUNNING_COUNT -lt 1 ]]; then
+        echo "prometheus is not running after ${retry} retries" >&2
+        exit 1
+    fi
+
+    # grafana ip address is set to the fqdn by default.
+    # kcli is not working with that, so setting the IP manually.
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-alertmanager-api-host http://192.168.100.100:9093"'
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-prometheus-api-host http://192.168.100.100:9095"'
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-grafana-api-url https://192.168.100.100:3000"'
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch apply node-exporter --placement 'count:2'"'
+fi