From: Redouane Kachach Date: Tue, 15 Mar 2022 16:02:41 +0000 (+0100) Subject: mgr/cephadm: Adding prometheus service discovery endpoints X-Git-Tag: v17.2.1~48^2~27 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=6b1a8397345fcb2baabee96364cf4dc8f424f885;p=ceph.git mgr/cephadm: Adding prometheus service discovery endpoints Signed-off-by: Redouane Kachach Fixes: https://tracker.ceph.com/issues/54309 (cherry picked from commit 0e0135a1b640dc06e9c2295f3fe172b0914bae2c) --- diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index f6672be0bbd..fa75a8759bb 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -8,13 +8,15 @@ import tempfile import threading import time -from mgr_util import verify_tls_files +from mgr_module import ServiceInfoT +from mgr_util import verify_tls_files, build_url from orchestrator import DaemonDescriptionStatus, OrchestratorError from orchestrator._interface import daemon_type_to_service from ceph.utils import datetime_now from ceph.deployment.inventory import Devices from ceph.deployment.service_spec import ServiceSpec, PlacementSpec from cephadm.services.cephadmservice import CephadmDaemonDeploySpec +from cephadm.services.ingress import IngressSpec from datetime import datetime, timedelta from cryptography import x509 @@ -24,7 +26,7 @@ from cryptography.hazmat.primitives import hashes, serialization from cryptography.hazmat.backends import default_backend from typing import Any, Dict, List, Set, Tuple, \ - TYPE_CHECKING, Optional + TYPE_CHECKING, Optional, cast, Collection if TYPE_CHECKING: from cephadm.module import CephadmOrchestrator @@ -51,6 +53,30 @@ class CherryPyThread(threading.Thread): self.server_addr = self.mgr.get_mgr_ip() super(CherryPyThread, self).__init__(target=self.run) + def configure_cherrypy(self) -> None: + cherrypy.config.update({ + 'environment': 'production', + 'server.socket_host': self.server_addr, + 'server.socket_port': self.server_port, + 'engine.autoreload.on': False, + 'server.ssl_module': 'builtin', + 'server.ssl_certificate': self.cert_tmp.name, + 'server.ssl_private_key': self.key_tmp.name, + }) + + # configure routes + root = Root(self.mgr) + host_data = HostData(self.mgr) + d = cherrypy.dispatch.RoutesDispatcher() + d.connect(name='index', route='/', controller=root.index) + d.connect(name='sd-config', route='/prometheus/sd-config', controller=root.get_sd_config) + d.connect(name='rules', route='/prometheus/rules', controller=root.get_prometheus_rules) + d.connect(name='host-data', route='/data', controller=host_data.POST, + conditions=dict(method=['POST'])) + + conf = {'/': {'request.dispatch': d}} + cherrypy.tree.mount(None, "/", config=conf) + def run(self) -> None: try: try: @@ -77,18 +103,8 @@ class CherryPyThread(threading.Thread): cert_fname = self.cert_tmp.name verify_tls_files(cert_fname, key_fname) + self.configure_cherrypy() - cherrypy.config.update({ - 'server.socket_host': self.server_addr, - 'server.socket_port': self.server_port, - 'engine.autoreload.on': False, - 'server.ssl_module': 'builtin', - 'server.ssl_certificate': cert_fname, - 'server.ssl_private_key': key_fname, - }) - root_conf = {'/': {'request.dispatch': cherrypy.dispatch.MethodDispatcher(), - 'tools.response_headers.on': True}} - cherrypy.tree.mount(Root(self.mgr), '/', root_conf) self.mgr.log.debug('Starting cherrypy engine...') self.start_engine() self.mgr.log.debug('Cherrypy engine started.') @@ -130,22 +146,104 @@ class CherryPyThread(threading.Thread): self.cherrypy_shutdown_event.set() -class Root: - exposed = True +class Root(object): + + # collapse everything to '/' + def _cp_dispatch(self, vpath: str) -> 'Root': + cherrypy.request.path = '' + return self def __init__(self, mgr: "CephadmOrchestrator"): self.mgr = mgr - self.data = HostData(self.mgr) - def GET(self) -> str: + @cherrypy.expose + def index(self) -> str: return ''' Cephadm HTTP Endpoint -

Cephadm HTTP Endpoint is up and running

+

Cephadm Service Discovery Endpoints

+

mgr/Prometheus http sd-config

+

Alertmanager http sd-config

+

Node exporter http sd-config

+

HAProxy http sd-config

+

Prometheus rules

''' + @cherrypy.expose + @cherrypy.tools.json_out() + def get_sd_config(self, service: str) -> List[Dict[str, Collection[str]]]: + """Return compatible prometheus config for the specified service.""" + if service == 'mgr-prometheus': + return self.prometheus_sd_config() + elif service == 'alertmanager': + return self.alertmgr_sd_config() + elif service == 'node-exporter': + return self.node_exporter_sd_config() + elif service == 'haproxy': + return self.haproxy_sd_config() + else: + return [] + + def prometheus_sd_config(self) -> List[Dict[str, Collection[str]]]: + """Return compatible prometheus config for prometheus service.""" + servers = self.mgr.list_servers() + targets = [] + for server in servers: + hostname = server.get('hostname', '') + for service in cast(List[ServiceInfoT], server.get('services', [])): + if service['type'] != 'mgr': + continue + port = self.mgr.get_module_option_ex('prometheus', 'server_port', 9283) + targets.append(f'{hostname}:{port}') + return [{"targets": targets, "labels": {}}] + + def alertmgr_sd_config(self) -> List[Dict[str, Collection[str]]]: + """Return compatible prometheus config for mgr alertmanager service.""" + srv_entries = [] + for dd in self.mgr.cache.get_daemons_by_service('alertmanager'): + assert dd.hostname is not None + addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname) + port = dd.ports[0] if dd.ports else 9093 + srv_entries.append('{}'.format(build_url(host=addr, port=port).lstrip('/'))) + return [{"targets": srv_entries, "labels": {}}] + + def node_exporter_sd_config(self) -> List[Dict[str, Collection[str]]]: + """Return compatible prometheus config for node-exporter service.""" + srv_entries = [] + for dd in self.mgr.cache.get_daemons_by_service('node-exporter'): + assert dd.hostname is not None + addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname) + port = dd.ports[0] if dd.ports else 9100 + srv_entries.append({ + 'targets': [build_url(host=addr, port=port).lstrip('/')], + 'labels': {'instance': dd.hostname} + }) + return srv_entries + + def haproxy_sd_config(self) -> List[Dict[str, Collection[str]]]: + """Return compatible prometheus config for haproxy service.""" + srv_entries = [] + for dd in self.mgr.cache.get_daemons_by_type('ingress'): + if dd.service_name() in self.mgr.spec_store: + spec = cast(IngressSpec, self.mgr.spec_store[dd.service_name()].spec) + assert dd.hostname is not None + if dd.daemon_type == 'haproxy': + addr = self.mgr.inventory.get_addr(dd.hostname) + srv_entries.append({ + 'targets': [f"{build_url(host=addr, port=spec.monitor_port).lstrip('/')}"], + 'labels': {'instance': dd.service_name()} + }) + return srv_entries + + @cherrypy.expose(alias='prometheus/rules') + def get_prometheus_rules(self) -> str: + """Return currently configured prometheus rules as Yaml.""" + cherrypy.response.headers['Content-Type'] = 'text/plain' + with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f: + return f.read() + class HostData: exposed = True diff --git a/src/pybind/mgr/cephadm/tests/test_agent.py b/src/pybind/mgr/cephadm/tests/test_agent.py new file mode 100644 index 00000000000..a4b1dc1b243 --- /dev/null +++ b/src/pybind/mgr/cephadm/tests/test_agent.py @@ -0,0 +1,157 @@ +from unittest.mock import MagicMock +from cephadm.agent import Root + + +class FakeDaemonDescription: + def __init__(self, ip, ports, hostname, service_name='', daemon_type=''): + self.ip = ip + self.ports = ports + self.hostname = hostname + self._service_name = service_name + self.daemon_type = daemon_type + + def service_name(self): + return self._service_name + + +class FakeCache: + def get_daemons_by_service(self, service_type): + return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'), + FakeDaemonDescription('1.2.3.5', [9200], 'node1')] + + def get_daemons_by_type(self, daemon_type): + return [FakeDaemonDescription('1.2.3.4', [9100], 'node0', 'ingress', 'haproxy'), + FakeDaemonDescription('1.2.3.5', [9200], 'node1', 'ingress', 'haproxy')] + + +class FakeInventory: + def get_addr(self, name: str): + return '1.2.3.4' + + +class FakeServiceSpec: + def __init__(self, port): + self.monitor_port = port + + +class FakeSpecDescription: + def __init__(self, port): + self.spec = FakeServiceSpec(port) + + +class FakeSpecStore(): + def __init__(self, mgr): + self.mgr = mgr + self._specs = {'ingress': FakeSpecDescription(9049)} + + def __contains__(self, name): + return name in self._specs + + def __getitem__(self, name): + return self._specs['ingress'] + + +class FakeMgr: + def __init__(self): + self.config = '' + self.check_mon_command = MagicMock(side_effect=self._check_mon_command) + self.mon_command = MagicMock(side_effect=self._check_mon_command) + self.template = MagicMock() + self.log = MagicMock() + self.inventory = FakeInventory() + self.cache = FakeCache() + self.spec_store = FakeSpecStore(self) + + def list_servers(self): + + servers = [ + {'hostname': 'node0', + 'ceph_version': '16.2', + 'services': [{'type': 'mgr'}, {'type': 'mon'}]}, + {'hostname': 'node1', + 'ceph_version': '16.2', + 'services': [{'type': 'mgr'}, {'type': 'mon'}]} + ] + + return servers + + def _check_mon_command(self, cmd_dict, inbuf=None): + prefix = cmd_dict.get('prefix') + if prefix == 'get-cmd': + return 0, self.config, '' + if prefix == 'set-cmd': + self.config = cmd_dict.get('value') + return 0, 'value set', '' + return -1, '', 'error' + + def get_module_option_ex(self, module, option, default_value): + return "9283" + + +class TestCephadmService: + + def test_get_sd_config_prometheus(self): + mgr = FakeMgr() + root = Root(mgr) + cfg = root.get_sd_config('mgr-prometheus') + + # check response structure + assert cfg + for entry in cfg: + assert 'labels' in entry + assert 'targets' in entry + + # check content + assert cfg[0]['targets'] == ['node0:9283', 'node1:9283'] + + def test_get_sd_config_node_exporter(self): + mgr = FakeMgr() + root = Root(mgr) + cfg = root.get_sd_config('node-exporter') + + # check response structure + assert cfg + for entry in cfg: + assert 'labels' in entry + assert 'targets' in entry + + # check content + assert cfg[0]['targets'] == ['1.2.3.4:9100'] + assert cfg[0]['labels'] == {'instance': 'node0'} + assert cfg[1]['targets'] == ['1.2.3.5:9200'] + assert cfg[1]['labels'] == {'instance': 'node1'} + + def test_get_sd_config_alertmgr(self): + mgr = FakeMgr() + root = Root(mgr) + cfg = root.get_sd_config('alertmanager') + + # check response structure + assert cfg + for entry in cfg: + assert 'labels' in entry + assert 'targets' in entry + + # check content + assert cfg[0]['targets'] == ['1.2.3.4:9100', '1.2.3.5:9200'] + + def test_get_sd_config_haproxy(self): + mgr = FakeMgr() + root = Root(mgr) + cfg = root.get_sd_config('haproxy') + + # check response structure + assert cfg + for entry in cfg: + assert 'labels' in entry + assert 'targets' in entry + + # check content + assert cfg[0]['targets'] == ['1.2.3.4:9049'] + assert cfg[0]['labels'] == {'instance': 'ingress'} + + def test_get_sd_config_invalid_service(self): + mgr = FakeMgr() + root = Root(mgr) + cfg = root.get_sd_config('invalid-service') + assert cfg == []