git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: Adding prometheus service discovery endpoints
author Redouane Kachach <rkachach@redhat.com>
Tue, 15 Mar 2022 16:02:41 +0000 (17:02 +0100)
committer Adam King <adking@redhat.com>
Sat, 21 May 2022 23:12:08 +0000 (19:12 -0400)
Signed-off-by: Redouane Kachach <rkachach@redhat.com>
Fixes: https://tracker.ceph.com/issues/54309
(cherry picked from commit 0e0135a1b640dc06e9c2295f3fe172b0914bae2c)

src/pybind/mgr/cephadm/agent.py
src/pybind/mgr/cephadm/tests/test_agent.py [new file with mode: 0644]

index f6672be0bbda5881542ba0bca0358557b9956122..fa75a8759bbcf3907ef8bd941b25b612106418f6 100644 (file)
@@ -8,13 +8,15 @@ import tempfile
 import threading
 import time
 
-from mgr_util import verify_tls_files
+from mgr_module import ServiceInfoT
+from mgr_util import verify_tls_files, build_url
 from orchestrator import DaemonDescriptionStatus, OrchestratorError
 from orchestrator._interface import daemon_type_to_service
 from ceph.utils import datetime_now
 from ceph.deployment.inventory import Devices
 from ceph.deployment.service_spec import ServiceSpec, PlacementSpec
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
+from cephadm.services.ingress import IngressSpec
 
 from datetime import datetime, timedelta
 from cryptography import x509
@@ -24,7 +26,7 @@ from cryptography.hazmat.primitives import hashes, serialization
 from cryptography.hazmat.backends import default_backend
 
 from typing import Any, Dict, List, Set, Tuple, \
-    TYPE_CHECKING, Optional
+    TYPE_CHECKING, Optional, cast, Collection
 
 if TYPE_CHECKING:
     from cephadm.module import CephadmOrchestrator
@@ -51,6 +53,30 @@ class CherryPyThread(threading.Thread):
         self.server_addr = self.mgr.get_mgr_ip()
         super(CherryPyThread, self).__init__(target=self.run)
 
+    def configure_cherrypy(self) -> None:
+        cherrypy.config.update({
+            'environment': 'production',
+            'server.socket_host': self.server_addr,
+            'server.socket_port': self.server_port,
+            'engine.autoreload.on': False,
+            'server.ssl_module': 'builtin',
+            'server.ssl_certificate': self.cert_tmp.name,
+            'server.ssl_private_key': self.key_tmp.name,
+        })
+
+        # configure routes
+        root = Root(self.mgr)
+        host_data = HostData(self.mgr)
+        d = cherrypy.dispatch.RoutesDispatcher()
+        d.connect(name='index', route='/', controller=root.index)
+        d.connect(name='sd-config', route='/prometheus/sd-config', controller=root.get_sd_config)
+        d.connect(name='rules', route='/prometheus/rules', controller=root.get_prometheus_rules)
+        d.connect(name='host-data', route='/data', controller=host_data.POST,
+                  conditions=dict(method=['POST']))
+
+        conf = {'/': {'request.dispatch': d}}
+        cherrypy.tree.mount(None, "/", config=conf)
+
     def run(self) -> None:
         try:
             try:
@@ -77,18 +103,8 @@ class CherryPyThread(threading.Thread):
             cert_fname = self.cert_tmp.name
 
             verify_tls_files(cert_fname, key_fname)
+            self.configure_cherrypy()
 
-            cherrypy.config.update({
-                'server.socket_host': self.server_addr,
-                'server.socket_port': self.server_port,
-                'engine.autoreload.on': False,
-                'server.ssl_module': 'builtin',
-                'server.ssl_certificate': cert_fname,
-                'server.ssl_private_key': key_fname,
-            })
-            root_conf = {'/': {'request.dispatch': cherrypy.dispatch.MethodDispatcher(),
-                               'tools.response_headers.on': True}}
-            cherrypy.tree.mount(Root(self.mgr), '/', root_conf)
             self.mgr.log.debug('Starting cherrypy engine...')
             self.start_engine()
             self.mgr.log.debug('Cherrypy engine started.')
@@ -130,22 +146,104 @@ class CherryPyThread(threading.Thread):
         self.cherrypy_shutdown_event.set()
 
 
-class Root:
-    exposed = True
+class Root(object):
+
+    def _cp_dispatch(self, vpath: List[str]) -> 'Root':
+        """Collapse everything to '/': clear the request path and return this object."""
+        cherrypy.request.path = ''  # vpath (presumably the remaining path segments) is ignored
+        return self
 
     def __init__(self, mgr: "CephadmOrchestrator"):
         self.mgr = mgr
-        self.data = HostData(self.mgr)
 
-    def GET(self) -> str:
+    @cherrypy.expose
+    def index(self) -> str:
         return '''<!DOCTYPE html>
 <html>
 <head><title>Cephadm HTTP Endpoint</title></head>
 <body>
-<p>Cephadm HTTP Endpoint is up and running</p>
+<h2>Cephadm Service Discovery Endpoints</h2>
+<p><a href='prometheus/sd-config?service=mgr-prometheus'>mgr/Prometheus http sd-config</a></p>
+<p><a href='prometheus/sd-config?service=alertmanager'>Alertmanager http sd-config</a></p>
+<p><a href='prometheus/sd-config?service=node-exporter'>Node exporter http sd-config</a></p>
+<p><a href='prometheus/sd-config?service=haproxy'>HAProxy http sd-config</a></p>
+<p><a href='prometheus/rules'>Prometheus rules</a></p>
 </body>
 </html>'''
 
+    @cherrypy.expose
+    @cherrypy.tools.json_out()
+    def get_sd_config(self, service: str) -> List[Dict[str, Collection[str]]]:
+        """Return <http_sd_config> compatible prometheus config for the specified service."""
+        if service == 'mgr-prometheus':
+            return self.prometheus_sd_config()
+        elif service == 'alertmanager':
+            return self.alertmgr_sd_config()
+        elif service == 'node-exporter':
+            return self.node_exporter_sd_config()
+        elif service == 'haproxy':
+            return self.haproxy_sd_config()
+        else:
+            return []
+
+    def prometheus_sd_config(self) -> List[Dict[str, Collection[str]]]:
+        """Return <http_sd_config> compatible prometheus config for prometheus service."""
+        servers = self.mgr.list_servers()
+        targets = []
+        for server in servers:
+            hostname = server.get('hostname', '')
+            for service in cast(List[ServiceInfoT], server.get('services', [])):
+                if service['type'] != 'mgr':
+                    continue
+                port = self.mgr.get_module_option_ex('prometheus', 'server_port', 9283)
+                targets.append(f'{hostname}:{port}')
+        return [{"targets": targets, "labels": {}}]
+
+    def alertmgr_sd_config(self) -> List[Dict[str, Collection[str]]]:
+        """Return <http_sd_config> compatible prometheus config for mgr alertmanager service."""
+        srv_entries = []
+        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
+            assert dd.hostname is not None
+            addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
+            port = dd.ports[0] if dd.ports else 9093
+            srv_entries.append('{}'.format(build_url(host=addr, port=port).lstrip('/')))
+        return [{"targets": srv_entries, "labels": {}}]
+
+    def node_exporter_sd_config(self) -> List[Dict[str, Collection[str]]]:
+        """Return <http_sd_config> compatible prometheus config for node-exporter service."""
+        srv_entries = []
+        for dd in self.mgr.cache.get_daemons_by_service('node-exporter'):
+            assert dd.hostname is not None
+            addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
+            port = dd.ports[0] if dd.ports else 9100
+            srv_entries.append({
+                'targets': [build_url(host=addr, port=port).lstrip('/')],
+                'labels': {'instance': dd.hostname}
+            })
+        return srv_entries
+
+    def haproxy_sd_config(self) -> List[Dict[str, Collection[str]]]:
+        """Return <http_sd_config> compatible prometheus config for haproxy service."""
+        srv_entries = []
+        for dd in self.mgr.cache.get_daemons_by_type('ingress'):
+            if dd.service_name() in self.mgr.spec_store:
+                spec = cast(IngressSpec, self.mgr.spec_store[dd.service_name()].spec)
+                assert dd.hostname is not None
+                if dd.daemon_type == 'haproxy':
+                    addr = self.mgr.inventory.get_addr(dd.hostname)
+                    srv_entries.append({
+                        'targets': [f"{build_url(host=addr, port=spec.monitor_port).lstrip('/')}"],
+                        'labels': {'instance': dd.service_name()}
+                    })
+        return srv_entries
+
+    @cherrypy.expose(alias='prometheus/rules')
+    def get_prometheus_rules(self) -> str:
+        """Return the text of the mgr's prometheus alerts file (the configured rules, as Yaml)."""
+        cherrypy.response.headers['Content-Type'] = 'text/plain'  # file is sent verbatim
+        with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f:
+            return f.read()
+
 
 class HostData:
     exposed = True
diff --git a/src/pybind/mgr/cephadm/tests/test_agent.py b/src/pybind/mgr/cephadm/tests/test_agent.py
new file mode 100644 (file)
index 0000000..a4b1dc1
--- /dev/null
@@ -0,0 +1,157 @@
+from unittest.mock import MagicMock
+from cephadm.agent import Root
+
+
+class FakeDaemonDescription:
+    """Daemon double holding ip, ports, hostname, service name and daemon type."""
+
+    def __init__(self, ip, ports, hostname, service_name='', daemon_type=''):
+        self.ip = ip
+        self.ports = ports
+        self.hostname = hostname
+        # stored privately and exposed through the service_name() method below
+        self._service_name = service_name
+        self.daemon_type = daemon_type
+
+    def service_name(self):
+        """Return the service name given at construction."""
+        return self._service_name
+
+
+class FakeCache:
+    """Daemon cache double: both lookups ignore their argument and return two daemons."""
+
+    def get_daemons_by_service(self, service_type):
+        # the same pair (ports 9100 / 9200) is returned whatever service_type is
+        return [FakeDaemonDescription('1.2.3.4', [9100], 'node0'),
+                FakeDaemonDescription('1.2.3.5', [9200], 'node1')]
+
+    def get_daemons_by_type(self, daemon_type):
+        # both daemons are 'haproxy' daemons of the service named 'ingress'
+        return [FakeDaemonDescription('1.2.3.4', [9100], 'node0', 'ingress', 'haproxy'),
+                FakeDaemonDescription('1.2.3.5', [9200], 'node1', 'ingress', 'haproxy')]
+
+
+class FakeInventory:
+    """Inventory double: every host name resolves to the same address."""
+
+    def get_addr(self, name: str):
+        return '1.2.3.4'
+
+
+class FakeServiceSpec:
+    """Service spec double carrying only the monitor_port attribute."""
+
+    def __init__(self, port):
+        self.monitor_port = port
+
+
+class FakeSpecDescription:
+    """Spec-store entry double: wraps a FakeServiceSpec in its .spec attribute."""
+
+    def __init__(self, port):
+        self.spec = FakeServiceSpec(port)
+
+
+class FakeSpecStore():
+    def __init__(self, mgr):
+        self.mgr = mgr
+        self._specs = {'ingress': FakeSpecDescription(9049)}
+
+    def __contains__(self, name):
+        return name in self._specs
+
+    def __getitem__(self, name):
+        return self._specs['ingress']
+
+
+class FakeMgr:
+    def __init__(self):
+        self.config = ''
+        self.check_mon_command = MagicMock(side_effect=self._check_mon_command)
+        self.mon_command = MagicMock(side_effect=self._check_mon_command)
+        self.template = MagicMock()
+        self.log = MagicMock()
+        self.inventory = FakeInventory()
+        self.cache = FakeCache()
+        self.spec_store = FakeSpecStore(self)
+
+    def list_servers(self):
+
+        servers = [
+            {'hostname': 'node0',
+             'ceph_version': '16.2',
+             'services': [{'type': 'mgr'}, {'type': 'mon'}]},
+            {'hostname': 'node1',
+             'ceph_version': '16.2',
+             'services': [{'type': 'mgr'}, {'type': 'mon'}]}
+        ]
+
+        return servers
+
+    def _check_mon_command(self, cmd_dict, inbuf=None):
+        prefix = cmd_dict.get('prefix')
+        if prefix == 'get-cmd':
+            return 0, self.config, ''
+        if prefix == 'set-cmd':
+            self.config = cmd_dict.get('value')
+            return 0, 'value set', ''
+        return -1, '', 'error'
+
+    def get_module_option_ex(self, module, option, default_value):
+        return "9283"
+
+
+class TestCephadmService:
+
+    def test_get_sd_config_prometheus(self):
+        mgr = FakeMgr()
+        root = Root(mgr)
+        cfg = root.get_sd_config('mgr-prometheus')
+
+        # check response structure
+        assert cfg
+        for entry in cfg:
+            assert 'labels' in entry
+            assert 'targets' in entry
+
+        # check content
+        assert cfg[0]['targets'] == ['node0:9283', 'node1:9283']
+
+    def test_get_sd_config_node_exporter(self):
+        mgr = FakeMgr()
+        root = Root(mgr)
+        cfg = root.get_sd_config('node-exporter')
+
+        # check response structure
+        assert cfg
+        for entry in cfg:
+            assert 'labels' in entry
+            assert 'targets' in entry
+
+        # check content
+        assert cfg[0]['targets'] == ['1.2.3.4:9100']
+        assert cfg[0]['labels'] == {'instance': 'node0'}
+        assert cfg[1]['targets'] == ['1.2.3.5:9200']
+        assert cfg[1]['labels'] == {'instance': 'node1'}
+
+    def test_get_sd_config_alertmgr(self):
+        mgr = FakeMgr()
+        root = Root(mgr)
+        cfg = root.get_sd_config('alertmanager')
+
+        # check response structure
+        assert cfg
+        for entry in cfg:
+            assert 'labels' in entry
+            assert 'targets' in entry
+
+        # check content
+        assert cfg[0]['targets'] == ['1.2.3.4:9100', '1.2.3.5:9200']
+
+    def test_get_sd_config_haproxy(self):
+        mgr = FakeMgr()
+        root = Root(mgr)
+        cfg = root.get_sd_config('haproxy')
+
+        # check response structure
+        assert cfg
+        for entry in cfg:
+            assert 'labels' in entry
+            assert 'targets' in entry
+
+        # check content
+        assert cfg[0]['targets'] == ['1.2.3.4:9049']
+        assert cfg[0]['labels'] == {'instance': 'ingress'}
+
+    def test_get_sd_config_invalid_service(self):
+        mgr = FakeMgr()
+        root = Root(mgr)
+        cfg = root.get_sd_config('invalid-service')
+        assert cfg == []