From 17cc32377154ffe59354a2f64f6cd706e078aeef Mon Sep 17 00:00:00 2001
From: Nizamudeen A <nia@redhat.com>
Date: Tue, 10 Sep 2024 11:14:46 +0530
Subject: [PATCH] mgr/dashboard: fix indefinite loop in cephadm dashboard e2e

the tests seem to wait indefinitely to fetch the prometheus details in
case cephadm ran into an error; the job just sits there for more than an
hour without any progress. fix that and make some minor improvements.

an example log: https://jenkins.ceph.com/job/ceph-dashboard-cephadm-e2e/12287/consoleFull

Signed-off-by: Nizamudeen A <nia@redhat.com>
---
 .../dashboard/ci/cephadm/bootstrap-cluster.sh |  2 +-
 .../ci/cephadm/run-cephadm-e2e-tests.sh       | 20 +----------
 .../mgr/dashboard/ci/cephadm/start-cluster.sh | 34 +++++++++++++++++++
 3 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh b/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
index 7c42800fd0c..ae720e6d49b 100755
--- a/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
+++ b/src/pybind/mgr/dashboard/ci/cephadm/bootstrap-cluster.sh
@@ -32,7 +32,7 @@ cephadm_shell="$CEPHADM shell --fsid ${fsid} -c /etc/ceph/ceph.conf -k /etc/ceph
 {% for number in range(1, nodes) %}
   ssh-copy-id -f -i /etc/ceph/ceph.pub  -o StrictHostKeyChecking=no root@192.168.100.10{{ number }}
   {% if expanded_cluster is defined %}
-    ${cephadm_shell} ceph orch host add {{ prefix }}-node-0{{ number }}
+    ${cephadm_shell} ceph orch host add {{ prefix }}-node-0{{ number }} 192.168.100.10{{ number }}
   {% endif %}
 {% endfor %}
 
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh b/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh
index a48f759f5e7..b3ae3e2e7ad 100755
--- a/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh
+++ b/src/pybind/mgr/dashboard/ci/cephadm/run-cephadm-e2e-tests.sh
@@ -38,22 +38,4 @@ cypress_run () {
 
 cd ${CEPH_DEV_FOLDER}/src/pybind/mgr/dashboard/frontend
 
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph config set mgr mgr/prometheus/exclude_perf_counters false"'
-
-# check if the prometheus daemon is running
-# before starting the e2e tests
-
-PROMETHEUS_RUNNING_COUNT=$(kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch ls --service_name=prometheus --format=json"' | jq -r '.[] | .status.running')
-while [[ $PROMETHEUS_RUNNING_COUNT -lt 1 ]]; do
-    PROMETHEUS_RUNNING_COUNT=$(kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch ls --service_name=prometheus --format=json"' | jq -r '.[] | .status.running')
-done
-
-# grafana ip address is set to the fqdn by default.
-# kcli is not working with that, so setting the IP manually.
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-alertmanager-api-host http://192.168.100.100:9093"'
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-prometheus-api-host http://192.168.100.100:9095"'
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-grafana-api-url https://192.168.100.100:3000"'
-kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch apply node-exporter --placement 'count:2'"'
-
-cypress_run ["cypress/e2e/orchestrator/workflow/*.feature","cypress/e2e/orchestrator/workflow/*-spec.ts"]
-cypress_run "cypress/e2e/orchestrator/grafana/*.feature"
+cypress_run ["cypress/e2e/orchestrator/workflow/*.feature","cypress/e2e/orchestrator/workflow/*-spec.ts","cypress/e2e/orchestrator/grafana/*.feature"]
diff --git a/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh b/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
index d5aa56efc9f..cda0635bc08 100755
--- a/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
+++ b/src/pybind/mgr/dashboard/ci/cephadm/start-cluster.sh
@@ -82,3 +82,37 @@ while [[ -z $(kcli ssh -u root -- ceph-node-00 'journalctl --no-tail --no-pager
     fi
     kcli ssh -u root -- ceph-node-00 'journalctl -n 100 --no-pager -t cloud-init'
 done
+
+kcli ssh -u root ceph-node-00 'cephadm shell "ceph config set mgr mgr/prometheus/exclude_perf_counters false"'
+
+get_prometheus_running_count() {
+    echo $(kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch ls --service_name=prometheus --format=json"' | jq -r '.[] | .status.running')
+}
+
+# check if the prometheus daemon is running on jenkins node
+# before starting the e2e tests
+if [[ -n "${JENKINS_HOME}" ]]; then
+    retry=0
+    PROMETHEUS_RUNNING_COUNT=$(get_prometheus_running_count)
+    # poll up to 10 more times, 10s apart, for a running prometheus daemon;
+    # otherwise this would run indefinitely and bloat up the machine
+    while [[ $retry -lt 10 && $PROMETHEUS_RUNNING_COUNT -lt 1 ]]; do
+        retry=$((retry + 1))
+        echo "Retry attempt to get the prometheus count... ${retry}"
+        sleep 10
+        PROMETHEUS_RUNNING_COUNT=$(get_prometheus_running_count)
+    done
+
+    # test the count, not the counter: success on the 10th retry also leaves retry at 10
+    if [[ $PROMETHEUS_RUNNING_COUNT -lt 1 ]]; then
+        echo "prometheus is not running after ${retry} retries, giving up" >&2
+        exit 1
+    fi
+
+    # grafana ip address is set to the fqdn by default.
+    # kcli is not working with that, so setting the IP manually.
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-alertmanager-api-host http://192.168.100.100:9093"'
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-prometheus-api-host http://192.168.100.100:9095"'
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph dashboard set-grafana-api-url https://192.168.100.100:3000"'
+    kcli ssh -u root ceph-node-00 'cephadm shell "ceph orch apply node-exporter --placement count:2"'
+fi
-- 
2.39.5