From 5fb45e5fb8330721e03b04d493202c9c845e33b1 Mon Sep 17 00:00:00 2001 From: avanthakkar Date: Wed, 3 Jan 2024 17:22:33 +0530 Subject: [PATCH] mgr/nfs: scrape nfs monitoring endpoint Fixes: https://tracker.ceph.com/issues/62558 Signed-off-by: avanthakkar --- src/pybind/mgr/cephadm/service_discovery.py | 17 +++++++++++++++ src/pybind/mgr/cephadm/services/monitoring.py | 4 +++- src/pybind/mgr/cephadm/services/nfs.py | 1 + .../services/prometheus/prometheus.yml.j2 | 21 ++++++++++++++++++- .../cephadm/tests/test_service_discovery.py | 18 ++++++++++++++++ src/pybind/mgr/cephadm/tests/test_services.py | 17 +++++++++++++++ 6 files changed, 76 insertions(+), 2 deletions(-) diff --git a/src/pybind/mgr/cephadm/service_discovery.py b/src/pybind/mgr/cephadm/service_discovery.py index 2095eee537c27..e9b072a9a50ae 100644 --- a/src/pybind/mgr/cephadm/service_discovery.py +++ b/src/pybind/mgr/cephadm/service_discovery.py @@ -12,6 +12,7 @@ import orchestrator # noqa from mgr_module import ServiceInfoT from mgr_util import build_url from typing import Dict, List, TYPE_CHECKING, cast, Collection, Callable, NamedTuple, Optional, IO +from cephadm.services.nfs import NFSService from cephadm.services.monitoring import AlertmanagerService, NodeExporterService, PrometheusService import secrets from mgr_util import verify_tls_files @@ -147,6 +148,7 @@ class Root(Server):

HAProxy http sd-config

Ceph exporter http sd-config

NVMeoF http sd-config

+

NFS http sd-config

Prometheus rules

''' @@ -167,6 +169,8 @@ class Root(Server): return self.ceph_exporter_sd_config() elif service == 'nvmeof': return self.nvmeof_sd_config() + elif service == 'nfs': + return self.nfs_sd_config() else: return [] @@ -248,6 +252,19 @@ class Root(Server): }) return srv_entries + def nfs_sd_config(self) -> List[Dict[str, Collection[str]]]: + """Return compatible prometheus config for nfs service.""" + srv_entries = [] + for dd in self.mgr.cache.get_daemons_by_type('nfs'): + assert dd.hostname is not None + addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname) + port = NFSService.DEFAULT_EXPORTER_PORT + srv_entries.append({ + 'targets': [build_url(host=addr, port=port).lstrip('/')], + 'labels': {'instance': dd.hostname} + }) + return srv_entries + @cherrypy.expose(alias='prometheus/rules') def get_prometheus_rules(self) -> str: """Return currently configured prometheus rules as Yaml.""" diff --git a/src/pybind/mgr/cephadm/services/monitoring.py b/src/pybind/mgr/cephadm/services/monitoring.py index 33a986fed7a3c..2cb02f4e21983 100644 --- a/src/pybind/mgr/cephadm/services/monitoring.py +++ b/src/pybind/mgr/cephadm/services/monitoring.py @@ -470,6 +470,7 @@ class PrometheusService(CephadmService): ceph_exporter_sd_url = f'{srv_end_point}service=ceph-exporter' # always included nvmeof_sd_url = f'{srv_end_point}service=nvmeof' # always included mgmt_gw_enabled = len(self.mgr.cache.get_daemons_by_service('mgmt-gateway')) > 0 + nfs_sd_url = f'{srv_end_point}service=nfs' # always included alertmanager_user, alertmanager_password = self.mgr._get_alertmanager_credentials() prometheus_user, prometheus_password = self.mgr._get_prometheus_credentials() @@ -490,7 +491,8 @@ class PrometheusService(CephadmService): 'ceph_exporter_sd_url': ceph_exporter_sd_url, 'nvmeof_sd_url': nvmeof_sd_url, 'external_prometheus_targets': targets, - 'cluster_fsid': FSID + 'cluster_fsid': FSID, + 'nfs_sd_url': nfs_sd_url } ip_to_bind_to = '' diff --git a/src/pybind/mgr/cephadm/services/nfs.py b/src/pybind/mgr/cephadm/services/nfs.py index a0d7da9bb7e46..89a977c4624df 100644 --- a/src/pybind/mgr/cephadm/services/nfs.py +++ b/src/pybind/mgr/cephadm/services/nfs.py @@ -22,6 +22,7 @@ logger = logging.getLogger(__name__) class NFSService(CephService): TYPE = 'nfs' + DEFAULT_EXPORTER_PORT = 9587 def ranked(self, spec: ServiceSpec) -> bool: return True diff --git a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 index ac1ceb54f21ae..3707f47f9770f 100644 --- a/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 +++ b/src/pybind/mgr/cephadm/templates/services/prometheus/prometheus.yml.j2 @@ -161,6 +161,26 @@ scrape_configs: {% endif %} {% endif %} +{% if nfs_sd_url %} + - job_name: 'nfs' +{% if security_enabled %} + honor_labels: true + scheme: https + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: {{ nfs_sd_url }} + basic_auth: + username: {{ service_discovery_username }} + password: {{ service_discovery_password }} + tls_config: + ca_file: root_cert.pem +{% else %} + http_sd_configs: + - url: {{ nfs_sd_url }} +{% endif %} +{% endif %} + {% if not security_enabled %} - job_name: 'federate' scrape_interval: 15s @@ -175,4 +195,3 @@ scrape_configs: static_configs: - targets: {{ external_prometheus_targets }} {% endif %} - diff --git a/src/pybind/mgr/cephadm/tests/test_service_discovery.py b/src/pybind/mgr/cephadm/tests/test_service_discovery.py index 687b64553eaad..159431b3b889a 100644 --- a/src/pybind/mgr/cephadm/tests/test_service_discovery.py +++ b/src/pybind/mgr/cephadm/tests/test_service_discovery.py @@ -23,6 +23,10 @@ class FakeCache: return [FakeDaemonDescription('1.2.3.4', [10008], 'node0'), FakeDaemonDescription('1.2.3.5', [10008], 'node1')] + if service_type == 'nfs': + return [FakeDaemonDescription('1.2.3.4', [9587], 'node0'), + FakeDaemonDescription('1.2.3.5', [9587], 'node1')] + return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'), FakeDaemonDescription('1.2.3.5', [9200], 'node1')] @@ -188,6 +192,20 @@ class TestServiceDiscovery: # check content assert cfg[0]['targets'] == ['1.2.3.4:10008'] + def test_get_sd_config_nfs(self): + mgr = FakeMgr() + root = Root(mgr, 5000, '0.0.0.0') + cfg = root.get_sd_config('nfs') + + # check response structure + assert cfg + for entry in cfg: + assert 'labels' in entry + assert 'targets' in entry + + # check content + assert cfg[0]['targets'] == ['1.2.3.4:9587'] + def test_get_sd_config_invalid_service(self): mgr = FakeMgr() root = Root(mgr, 5000, '0.0.0.0') diff --git a/src/pybind/mgr/cephadm/tests/test_services.py b/src/pybind/mgr/cephadm/tests/test_services.py index 226d744a7cd1e..87ba8eb1344fe 100644 --- a/src/pybind/mgr/cephadm/tests/test_services.py +++ b/src/pybind/mgr/cephadm/tests/test_services.py @@ -824,6 +824,10 @@ class TestMonitoring: http_sd_configs: - url: http://[::1]:8765/sd/prometheus/sd-config?service=nvmeof + - job_name: 'nfs' + http_sd_configs: + - url: http://[::1]:8765/sd/prometheus/sd-config?service=nfs + - job_name: 'federate' scrape_interval: 15s honor_labels: true @@ -1021,6 +1025,19 @@ class TestMonitoring: tls_config: ca_file: root_cert.pem + - job_name: 'nfs' + honor_labels: true + scheme: https + tls_config: + ca_file: root_cert.pem + http_sd_configs: + - url: https://[::1]:8765/sd/prometheus/sd-config?service=nfs + basic_auth: + username: sd_user + password: sd_password + tls_config: + ca_file: root_cert.pem + """).lstrip() _run_cephadm.assert_called_with( -- 2.39.5