From bf52c18043117cccb68c5d9dabe8672dd2b943b3 Mon Sep 17 00:00:00 2001
From: Adam King
Date: Mon, 30 Jan 2023 14:38:41 -0500
Subject: [PATCH] qa/cephadm: basic test for monitoring stack

Testing that the monitoring stack daemons
are all basically functioning by checking their
HTTP APIs are responsive and that putting
down a mon daemon, which should cause an alert,
actually triggers an alert that is viewable
in the prometheus and alertmanager API

Signed-off-by: Adam King
---
 .../task/test_monitoring_stack_basic.yaml     | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml

diff --git a/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml
new file mode 100644
index 00000000000..62947ef65d9
--- /dev/null
+++ b/qa/suites/orch/cephadm/workunits/task/test_monitoring_stack_basic.yaml
@@ -0,0 +1,59 @@
+roles:
+- - host.a
+  - mon.a
+  - mgr.a
+  - osd.0
+- - host.b
+  - mon.b
+  - mgr.b
+  - osd.1
+- - host.c
+  - mon.c
+  - osd.2
+tasks:
+- install:
+- cephadm:
+- cephadm.shell:
+    host.a:
+      - |
+        set -e
+        set -x
+        # deploy the monitoring stack services, then wait before inspecting them
+        ceph orch apply node-exporter
+        ceph orch apply grafana
+        ceph orch apply alertmanager
+        ceph orch apply prometheus
+        sleep 240
+        ceph orch ls
+        ceph orch ps
+        ceph orch host ls
+        # pick the mon daemon that is stopped later and look up the address of each service's host
+        MON_DAEMON=$(ceph orch ps --daemon-type mon -f json | jq -r 'last | .daemon_name')
+        GRAFANA_HOST=$(ceph orch ps --daemon-type grafana -f json | jq -e '.[]' | jq -r '.hostname')
+        PROM_HOST=$(ceph orch ps --daemon-type prometheus -f json | jq -e '.[]' | jq -r '.hostname')
+        ALERTM_HOST=$(ceph orch ps --daemon-type alertmanager -f json | jq -e '.[]' | jq -r '.hostname')
+        GRAFANA_IP=$(ceph orch host ls -f json | jq -r --arg GRAFANA_HOST "$GRAFANA_HOST" '.[] | select(.hostname==$GRAFANA_HOST) | .addr')
+        PROM_IP=$(ceph orch host ls -f json | jq -r --arg PROM_HOST "$PROM_HOST" '.[] | select(.hostname==$PROM_HOST) | .addr')
+        ALERTM_IP=$(ceph orch host ls -f json | jq -r --arg ALERTM_HOST "$ALERTM_HOST" '.[] | select(.hostname==$ALERTM_HOST) | .addr')
+        # check each host node-exporter metrics endpoint is responsive
+        # (curl -f exits non-zero on an HTTP error status, so under set -e an error reply to a standalone curl fails the test)
+        ALL_HOST_IPS=$(ceph orch host ls -f json | jq -r '.[] | .addr')
+        for ip in $ALL_HOST_IPS; do
+          curl -fsS "http://${ip}:9100/metrics"
+        done
+        # check grafana endpoints are responsive and database health is okay
+        curl -k -fsS "https://${GRAFANA_IP}:3000/api/health"
+        curl -k -fsS "https://${GRAFANA_IP}:3000/api/health" | jq -e '.database == "ok"'
+        # stop mon daemon in order to trigger an alert
+        ceph orch daemon stop "$MON_DAEMON"
+        sleep 120
+        # check prometheus endpoints are responsive and mon down alert is firing
+        curl -fsS "http://${PROM_IP}:9095/api/v1/status/config"
+        curl -fsS "http://${PROM_IP}:9095/api/v1/status/config" | jq -e '.status == "success"'
+        curl -fsS "http://${PROM_IP}:9095/api/v1/alerts"
+        curl -fsS "http://${PROM_IP}:9095/api/v1/alerts" | jq -e '.data | .alerts | .[] | select(.labels | .alertname == "CephMonDown") | .state == "firing"'
+        # check alertmanager endpoints are responsive and mon down alert is active
+        # NOTE(review): these use the alertmanager /api/v1 endpoints - confirm the deployed alertmanager version still serves them
+        curl -fsS "http://${ALERTM_IP}:9093/api/v1/status"
+        curl -fsS "http://${ALERTM_IP}:9093/api/v1/alerts"
+        curl -fsS "http://${ALERTM_IP}:9093/api/v1/alerts" | jq -e '.data | .[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"'
-- 
2.47.3