From 93ec6284fb3002b4778c4e54972ff1d864060922 Mon Sep 17 00:00:00 2001 From: Avan Thakkar Date: Thu, 22 Feb 2024 16:30:06 +0530 Subject: [PATCH] cephadm/nvmeof: scrape nvmeof prometheus endpoint Fixes: https://tracker.ceph.com/issues/64536 Signed-off-by: Avan Thakkar --- src/cephadm/cephadmlib/constants.py | 2 +- src/pybind/mgr/cephadm/module.py | 2 +- src/pybind/mgr/cephadm/service_discovery.py | 17 ++++++++++++++++ src/pybind/mgr/cephadm/services/monitoring.py | 4 +++- src/pybind/mgr/cephadm/services/nvmeof.py | 1 + .../services/nvmeof/ceph-nvmeof.conf.j2 | 3 +++ .../services/prometheus/prometheus.yml.j2 | 20 +++++++++++++++++++ .../cephadm/tests/test_service_discovery.py | 17 ++++++++++++++++ src/pybind/mgr/cephadm/tests/test_services.py | 20 +++++++++++++++++++ 9 files changed, 83 insertions(+), 3 deletions(-) diff --git a/src/cephadm/cephadmlib/constants.py b/src/cephadm/cephadmlib/constants.py index dfa660f4898..119f43b459b 100644 --- a/src/cephadm/cephadmlib/constants.py +++ b/src/cephadm/cephadmlib/constants.py @@ -12,7 +12,7 @@ DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.25.0' DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/ceph-grafana:9.4.12' DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3' DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.2.4' -DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:latest' +DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.0.0' DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1' DEFAULT_ELASTICSEARCH_IMAGE = 'quay.io/omrizeneva/elasticsearch:6.8.23' DEFAULT_JAEGER_COLLECTOR_IMAGE = 'quay.io/jaegertracing/jaeger-collector:1.29' diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 87f7024bb25..c43152856a6 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -117,7 +117,7 @@ os._exit = os_exit_noop # type: ignore DEFAULT_IMAGE = 'quay.io/ceph/ceph' DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.43.0' 
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.5.0' -DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:latest' +DEFAULT_NVMEOF_IMAGE = 'quay.io/ceph/nvmeof:1.0.0' DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0' DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0' DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.25.0' diff --git a/src/pybind/mgr/cephadm/service_discovery.py b/src/pybind/mgr/cephadm/service_discovery.py index b681cc8e7ff..2b82f87493f 100644 --- a/src/pybind/mgr/cephadm/service_discovery.py +++ b/src/pybind/mgr/cephadm/service_discovery.py @@ -19,6 +19,7 @@ import secrets from cephadm.services.ingress import IngressSpec from cephadm.ssl_cert_utils import SSLCerts from cephadm.services.cephadmservice import CephExporterService +from cephadm.services.nvmeof import NvmeofService if TYPE_CHECKING: from cephadm.module import CephadmOrchestrator @@ -145,6 +146,7 @@ class Root(Server):

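 <p><a href='prometheus/sd-config?service=node-exporter'>Node exporter http sd-config</a></p>
 <p><a href='prometheus/sd-config?service=haproxy'>HAProxy http sd-config</a></p>
 <p><a href='prometheus/sd-config?service=ceph-exporter'>Ceph exporter http sd-config</a></p>
+<p><a href='prometheus/sd-config?service=nvmeof'>NVMeoF http sd-config</a></p>
 <p><a href='prometheus/rules'>Prometheus rules</a></p>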

''' @@ -163,6 +165,8 @@ class Root(Server): return self.haproxy_sd_config() elif service == 'ceph-exporter': return self.ceph_exporter_sd_config() + elif service == 'nvmeof': + return self.nvmeof_sd_config() else: return [] @@ -231,6 +235,19 @@ class Root(Server): }) return srv_entries + def nvmeof_sd_config(self) -> List[Dict[str, Collection[str]]]: + """Return compatible prometheus config for nvmeof service.""" + srv_entries = [] + for dd in self.mgr.cache.get_daemons_by_type('nvmeof'): + assert dd.hostname is not None + addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname) + port = NvmeofService.PROMETHEUS_PORT + srv_entries.append({ + 'targets': [build_url(host=addr, port=port).lstrip('/')], + 'labels': {'instance': dd.hostname} + }) + return srv_entries + @cherrypy.expose(alias='prometheus/rules') def get_prometheus_rules(self) -> str: """Return currently configured prometheus rules as Yaml.""" diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 114c848608a..d3439c04d04 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -402,6 +402,7 @@ class PrometheusService(CephadmService): haproxy_sd_url = f'{srv_end_point}service=haproxy' if haproxy_cnt > 0 else None mgr_prometheus_sd_url = f'{srv_end_point}service=mgr-prometheus' # always included ceph_exporter_sd_url = f'{srv_end_point}service=ceph-exporter' # always included + nvmeof_sd_url = f'{srv_end_point}service=nvmeof' # always included alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials() @@ -417,7 +418,8 @@ class PrometheusService(CephadmService): 'node_exporter_sd_url': node_exporter_sd_url, 'alertmanager_sd_url': alertmanager_sd_url, 'haproxy_sd_url': haproxy_sd_url, - 'ceph_exporter_sd_url': ceph_exporter_sd_url + 'ceph_exporter_sd_url': ceph_exporter_sd_url, + 
'nvmeof_sd_url': nvmeof_sd_url, } web_context = { diff --git a/src/pybind/mgr/cephadm/services/nvmeof.py b/src/pybind/mgr/cephadm/services/nvmeof.py index 7d2de75f67f..6cd1f4604d3 100644 --- a/src/pybind/mgr/cephadm/services/nvmeof.py +++ b/src/pybind/mgr/cephadm/services/nvmeof.py @@ -16,6 +16,7 @@ logger = logging.getLogger(__name__) class NvmeofService(CephService): TYPE = 'nvmeof' + PROMETHEUS_PORT = 10008 def config(self, spec: NvmeofServiceSpec) -> None: # type: ignore assert self.TYPE == spec.service_type diff --git a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 index 4aa0b909357..711af0ee724 100644 --- a/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 +++ b/src/pybind/mgr/cephadm/templates/services/nvmeof/ceph-nvmeof.conf.j2 @@ -10,6 +10,9 @@ state_update_interval_sec = 5 min_controller_id = {{ spec.min_controller_id }} max_controller_id = {{ spec.max_controller_id }} enable_spdk_discovery_controller = {{ spec.enable_spdk_discovery_controller }} +enable_prometheus_exporter = True +prometheus_exporter_ssl = False +prometheus_port = 10008 [ceph] pool = {{ spec.pool }} diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index b5684399455..57d2f8a3f4b 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 @@ -107,3 +107,23 @@ scrape_configs: - url: {{ ceph_exporter_sd_url }} {% endif %} {% endif %} + +{% if nvmeof_sd_url %} + - job_name: 'nvmeof' +{% if secure_monitoring_stack %} + honor_labels: true + scheme: https + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: {{ nvmeof_sd_url }} + basic_auth: + username: {{ service_discovery_username }} + password: {{ service_discovery_password }} + tls_config: + ca_file: 
root_cert.pem +{% else %} + http_sd_configs: + - url: {{ nvmeof_sd_url }} +{% endif %} +{% endif %} diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py index ff98a13885f..687b64553ea 100644 --- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py +++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py @@ -19,6 +19,9 @@ class FakeCache: if service_type == 'ceph-exporter': return [FakeDaemonDescription('1.2.3.4', [9926], 'node0'), FakeDaemonDescription('1.2.3.5', [9926], 'node1')] + if service_type == 'nvmeof': + return [FakeDaemonDescription('1.2.3.4', [10008], 'node0'), + FakeDaemonDescription('1.2.3.5', [10008], 'node1')] return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'), FakeDaemonDescription('1.2.3.5', [9200], 'node1')] @@ -171,6 +174,20 @@ class TestServiceDiscovery: # check content assert cfg[0]['targets'] == ['1.2.3.4:9926'] + def test_get_sd_config_nvmeof(self): + mgr = FakeMgr() + root = Root(mgr, 5000, '0.0.0.0') + cfg = root.get_sd_config('nvmeof') + + # check response structure + assert cfg + for entry in cfg: + assert 'labels' in entry + assert 'targets' in entry + + # check content + assert cfg[0]['targets'] == ['1.2.3.4:10008'] + def test_get_sd_config_invalid_service(self): mgr = FakeMgr() root = Root(mgr, 5000, '0.0.0.0') diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index dbab022058d..6c157ea4336 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -393,6 +393,9 @@ state_update_interval_sec = 5 min_controller_id = 1 max_controller_id = 65519 enable_spdk_discovery_controller = False +enable_prometheus_exporter = True +prometheus_exporter_ssl = False +prometheus_port = 10008 [ceph] pool = {pool} @@ -716,6 +719,10 @@ class TestMonitoring: honor_labels: true http_sd_configs: - url: http://[::1]:8765/sd/prometheus/sd-config?service=ceph-exporter 
+ + - job_name: 'nvmeof' + http_sd_configs: + - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof """).lstrip() _run_cephadm.assert_called_with( @@ -872,6 +879,19 @@ class TestMonitoring: password: sd_password tls_config: ca_file: root_cert.pem + + - job_name: 'nvmeof' + honor_labels: true + scheme: https + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: https://[::1]:8765/sd/prometheus/sd-config?service=nvmeof + basic_auth: + username: sd_user + password: sd_password + tls_config: + ca_file: root_cert.pem """).lstrip() _run_cephadm.assert_called_with( -- 2.39.5