From 21195114d4852ce4fba3b2f1bcc1985ddf9a6ac6 Mon Sep 17 00:00:00 2001
From: Redouane Kachach
Date: Thu, 25 Jan 2024 13:57:09 +0100
Subject: [PATCH] mgr/rook: adding metrics monitoring e2e testing

Fixes: https://tracker.ceph.com/issues/64183
Signed-off-by: Redouane Kachach
---
Review notes (this free-text area, located before the first diff header, is
not part of the commit message):

* parse_metric_value(): sample values are no longer restricted to integers
  (floats, exponents, NaN and +/-Inf are accepted), the metric name and the
  label filter are escaped before being placed in the regex, the label filter
  must match a whole label entry, and unlabelled samples are supported.
* The "I can get prometheus server configuration" step fails immediately when
  the pod host IP is unknown instead of building "http://None:30900".
* Every HTTP request carries a timeout; the duplicated /federate query lives
  in a single helper.
* The step functions have unique names (the step patterns are unchanged).
* minikube is invoked with an argument list instead of a shell string.
* Hunk line counts and the diffstat were updated for the new content; the
  post-image hash on the implementation.py "index" line was not recomputed.

 .../rook/ci/scripts/bootstrap-rook-cluster.sh |  17 +-
 .../cluster-prometheus-monitoring.feature     |  14 ++
 .../ci/tests/features/steps/implementation.py | 159 ++++++++++++++++++
 3 files changed, 188 insertions(+), 2 deletions(-)
 create mode 100644 src/pybind/mgr/rook/ci/tests/features/cluster-prometheus-monitoring.feature

diff --git a/src/pybind/mgr/rook/ci/scripts/bootstrap-rook-cluster.sh b/src/pybind/mgr/rook/ci/scripts/bootstrap-rook-cluster.sh
index d3c35544131e8..dcd22fce9a75b 100755
--- a/src/pybind/mgr/rook/ci/scripts/bootstrap-rook-cluster.sh
+++ b/src/pybind/mgr/rook/ci/scripts/bootstrap-rook-cluster.sh
@@ -120,7 +120,7 @@ configure_libvirt(){
         echo "User added to libvirt group successfully."
         sudo systemctl enable --now libvirtd
         sudo systemctl restart libvirtd
-        sleep 10 # wait some time for libvirtd service to restart
+        sleep 30 # wait some time for libvirtd service to restart
         newgrp libvirt
     else
         echo "Error adding user to libvirt group."
@@ -146,7 +146,7 @@ recreate_default_network(){
 
     # restart libvirtd service and wait a little bit for the service
     sudo systemctl restart libvirtd
-    sleep 10
+    sleep 30
 
     # Just some debugging information
     all_networks=$(virsh net-list --all)
@@ -161,6 +161,17 @@ enable_rook_orchestrator() {
     $KUBECTL -n "$ROOK_CLUSTER_NS" exec -it deploy/rook-ceph-tools -- ceph orch status
 }
 
+enable_monitoring() {
+    echo "Enabling monitoring"
+    $KUBECTL apply -f https://raw.githubusercontent.com/coreos/prometheus-operator/v0.40.0/bundle.yaml
+    $KUBECTL wait --for=condition=ready pod -l app.kubernetes.io/name=prometheus-operator --timeout=90s
+    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/rbac.yaml
+    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/service-monitor.yaml
+    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/exporter-service-monitor.yaml
+    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/prometheus.yaml
+    $KUBECTL apply -f https://raw.githubusercontent.com/rook/rook/master/deploy/examples/monitoring/prometheus-service.yaml
+}
+
 ####################################################################
 ####################################################################
 
@@ -174,6 +185,8 @@ create_rook_cluster
 wait_for_rook_operator
 wait_for_ceph_cluster
 enable_rook_orchestrator
+enable_monitoring
+sleep 30 # wait for the metrics cache warmup
 
 ####################################################################
 ####################################################################
diff --git a/src/pybind/mgr/rook/ci/tests/features/cluster-prometheus-monitoring.feature b/src/pybind/mgr/rook/ci/tests/features/cluster-prometheus-monitoring.feature
new file mode 100644
index 0000000000000..5180c72939de2
--- /dev/null
+++ b/src/pybind/mgr/rook/ci/tests/features/cluster-prometheus-monitoring.feature
@@ -0,0 +1,14 @@
+Feature: Testing Ceph metrics exposed through Prometheus
+    Ceph has been installed using the cluster CRD available in deploy/examples/cluster-test.yaml
+
+    Scenario: Verify Prometheus metrics endpoint is working properly
+      Given I can get prometheus server configuration
+      And the prometheus server is serving metrics
+
+    Scenario: Verify some basic metrics are working properly
+      Given I can get prometheus server configuration
+      And the prometheus server is serving metrics
+      Then the response contains the metric "ceph_osd_in" where "ceph_daemon" is "osd.0" and value equal to 1
+      And the response contains the metric "ceph_osd_in" where "ceph_daemon" is "osd.1" and value equal to 1
+      And the response contains the metric "ceph_osd_in" where "ceph_daemon" is "osd.2" and value equal to 1
+      And the response contains the metric "ceph_mon_quorum_status" where "ceph_daemon" is "mon.a" and value equal to 1
diff --git a/src/pybind/mgr/rook/ci/tests/features/steps/implementation.py b/src/pybind/mgr/rook/ci/tests/features/steps/implementation.py
index 69dcde458855d..59cb117c8b1e0 100644
--- a/src/pybind/mgr/rook/ci/tests/features/steps/implementation.py
+++ b/src/pybind/mgr/rook/ci/tests/features/steps/implementation.py
@@ -1,7 +1,41 @@
+import requests
+from behave import given, when, then
 from behave import *
 from utils import *
+import subprocess
 import re
 
+PROMETHEUS_SERVER_URL = None
+# Port on the pod's host where the Prometheus server is expected to answer.
+PROMETHEUS_NODE_PORT = 30900
+# Seconds before an HTTP request is abandoned, so that an unreachable server
+# fails the step instead of hanging the whole test run.
+REQUEST_TIMEOUT_SECONDS = 30
+
+
+def get_prometheus_pod_host_ip():
+    """Return the host IP of the Prometheus pod, or None when it is unavailable.
+
+    The IP is read from the ``prometheus-rook-prometheus-0`` pod of the
+    ``rook-ceph`` namespace through ``minikube kubectl``.
+    """
+    # An argument list (no shell) keeps the jsonpath braces away from any shell
+    # quoting or expansion.
+    command = [
+        "minikube", "--profile", "minikube", "kubectl", "--",
+        "-n", "rook-ceph", "-o", "jsonpath={.status.hostIP}",
+        "get", "pod", "prometheus-rook-prometheus-0",
+    ]
+    try:
+        result = subprocess.run(command, capture_output=True, text=True, check=True)
+    except (subprocess.CalledProcessError, OSError) as e:
+        # OSError: the minikube binary itself could not be executed.
+        print(f"Error running command: {e}")
+        return None
+    # An empty answer is as unusable as a failed command.
+    return result.stdout.strip() or None
+
+
 @when("I run ceph command")
 def run_step(context):
     context.output = run_ceph_commands(context.text)
@@ -25,3 +59,128 @@ def verify_fuzzy_result_step(context):
         if not re.match(expected_lines[n], output_lines[n]):
             display_side_by_side(expected_lines[n], output_lines[n])
             assert False, ""
+
+
+def _fetch_federated_metrics():
+    """Fetch every series from the Prometheus ``/federate`` endpoint.
+
+    Returns the ``requests`` response object. Raises
+    ``requests.exceptions.RequestException`` on connection errors, timeouts
+    or a non-2xx status code.
+    """
+    # The selector matches every series that has a non-empty name.
+    params = {'match[]': '{__name__!=""}'}
+    response = requests.get(f"{PROMETHEUS_SERVER_URL}/federate", params=params,
+                            timeout=REQUEST_TIMEOUT_SECONDS)
+    # Check if the response status code is successful (2xx)
+    response.raise_for_status()
+    return response
+
+
+def _get_metric_value(context, metric_name, filter_by_field=None, field_value=None):
+    """Return a metric value from ``context.response``; fail the step if it is absent."""
+    metric_value = parse_metric_value(context.response.text, metric_name, filter_by_field, field_value)
+    assert metric_value is not None, f"Metric '{metric_name}' not found in the response"
+    return metric_value
+
+
+@given('I can get prometheus server configuration')
+def step_get_prometheus_server_ip(context):
+    """Resolve the Prometheus server URL and store it in PROMETHEUS_SERVER_URL."""
+    global PROMETHEUS_SERVER_URL
+    host_ip = get_prometheus_pod_host_ip()
+    # Fail right here instead of building "http://None:30900" and reporting a
+    # misleading connection error in a later step.
+    assert host_ip, "Unable to get the host IP of the Prometheus pod"
+    PROMETHEUS_SERVER_URL = f"http://{host_ip}:{PROMETHEUS_NODE_PORT}"
+
+
+@given('the prometheus server is serving metrics')
+def step_given_server_running(context):
+    """Check that ``/federate`` answers and keep the response in the context."""
+    try:
+        # Store the response object in the context for later use
+        context.response = _fetch_federated_metrics()
+    except requests.exceptions.RequestException as e:
+        print(f"Error connecting to Prometheus server: {e}")
+        assert False, f"Error connecting to Prometheus server: {e}"
+    print(f"Prometheus server is running. Status code: {context.response.status_code}")
+
+
+@when('I query the Prometheus metrics endpoint')
+def step_when_query_metrics_endpoint(context):
+    """Refresh ``context.response`` with the current ``/federate`` output."""
+    context.response = _fetch_federated_metrics()
+
+
+@then('the response contains the metric "{metric_name}"')
+def step_then_metric_exists(context, metric_name):
+    """Pass when the metric is present, whatever its value."""
+    _get_metric_value(context, metric_name)
+
+
+@then('the response contains the metric "{metric_name}" with value equal to {expected_value}')
+def step_then_metric_equal_to(context, metric_name, expected_value):
+    """Pass when the metric is present and equals expected_value."""
+    metric_value = _get_metric_value(context, metric_name)
+    assert metric_value == float(expected_value), f"Metric '{metric_name}' value {metric_value} is not equal to {expected_value}"
+
+
+@then('the response contains the metric "{metric_name}" with value greater than {expected_value}')
+def step_then_metric_greater_than(context, metric_name, expected_value):
+    """Pass when the metric is present and strictly greater than expected_value."""
+    metric_value = _get_metric_value(context, metric_name)
+    assert metric_value > float(expected_value), f"Metric '{metric_name}' value {metric_value} is not greater than {expected_value}"
+
+
+@then('the response contains the metric "{metric_name}" with value less than {expected_value}')
+def step_then_metric_less_than(context, metric_name, expected_value):
+    """Pass when the metric is present and strictly less than expected_value."""
+    metric_value = _get_metric_value(context, metric_name)
+    assert metric_value < float(expected_value), f"Metric '{metric_name}' value {metric_value} is not less than {expected_value}"
+
+
+@then('the response contains the metric "{metric_name}" with value in the range {min_value}-{max_value}')
+def step_then_metric_in_range(context, metric_name, min_value, max_value):
+    """Pass when the metric is present and within [min_value, max_value]."""
+    metric_value = _get_metric_value(context, metric_name)
+    assert float(min_value) <= metric_value <= float(max_value), f"Metric '{metric_name}' value {metric_value} is not in the range {min_value}-{max_value}"
+
+
+@then('the response contains the metric "{metric_name}" where "{filter_by_field}" is "{field_value}" and value equal to {expected_value}')
+def step_then_filtered_metric_equal_to(context, metric_name, expected_value, filter_by_field, field_value):
+    """Pass when the sample carrying the given label equals expected_value."""
+    metric_value = _get_metric_value(context, metric_name, filter_by_field, field_value)
+    assert metric_value == float(expected_value), f"Metric '{metric_name}' value {metric_value} is not equal to {expected_value}"
+
+
+# Sample value of the Prometheus text format: integer, float, scientific
+# notation, or one of the special values NaN / +Inf / -Inf.
+METRIC_VALUE_PATTERN = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?|[-+]?Inf|NaN'
+
+
+def parse_metric_value(metrics_text, metric_name, filter_by_field=None, field_value=None):
+    """Return the value of the first matching sample of *metrics_text* as a float.
+
+    metrics_text holds one sample per line in the Prometheus text format:
+    ``name{label="value",...} value [timestamp]``. When both filter_by_field
+    and field_value are given, only samples carrying the label
+    ``filter_by_field="field_value"`` are considered. Label values containing
+    ``}`` are not supported. Returns None when no sample matches.
+    """
+    # Metric and label names/values are literal text, not regex fragments.
+    name = re.escape(metric_name)
+    if filter_by_field and field_value:
+        label = re.escape(f'{filter_by_field}="{field_value}"')
+        # The label has to be a whole entry of the label set, not the tail of
+        # another label's name.
+        labels = rf'\{{(?:[^}}\n]*, *)?{label}(?: *,[^}}\n]*)?\}}'
+    else:
+        # Samples without labels have no braces at all.
+        labels = r'(?:\{[^}\n]*\})?'
+    pattern = re.compile(rf'^{name}{labels} ({METRIC_VALUE_PATTERN})(?: |$)', re.MULTILINE)
+    match = pattern.search(metrics_text)
+    if match is None:
+        # Metric not found
+        return None
+    return float(match.group(1))
-- 
2.39.5