]> git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: fixing prometheus port handling
authorAdam King <adking@redhat.com>
Wed, 2 Mar 2022 05:23:52 +0000 (00:23 -0500)
committerAdam King <adking@redhat.com>
Tue, 5 Apr 2022 20:10:22 +0000 (16:10 -0400)
Fixes: https://tracker.ceph.com/issues/51072
Signed-off-by: Redouane Kachach <rkachach@redhat.com>
(cherry picked from commit 8eb1397d77dace25f387e88137a1807993a0796d)

Conflicts:
src/pybind/mgr/prometheus/module.py

src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/services/monitoring.py
src/pybind/mgr/cephadm/tests/fixtures.py
src/pybind/mgr/cephadm/tests/test_services.py
src/pybind/mgr/prometheus/module.py

index 5a633eae172f7e2c1c9b623c2bb6201fa2f58a61..44fe7f09d0718ea3da6b932607ab38b5ebb0ee5c 100644 (file)
@@ -2427,6 +2427,8 @@ Then run the following:
             for dep_type in need.get(daemon_type, []):
                 for dd in self.cache.get_daemons_by_type(dep_type):
                     deps.append(dd.name())
+            if daemon_type == 'prometheus':
+                deps.append(str(self.get_module_option_ex('prometheus', 'server_port', 9283)))
         return sorted(deps)
 
     @forall_hosts
index 34ae333d6678ed1d6b94a10a200ab0d734db7649..e0682b1a3263a0e587d0c0b9ad6607ccb3f231f6 100644 (file)
@@ -688,7 +688,7 @@ class CephadmServe:
                 slot = slot.assign_name(self.mgr.get_unique_name(
                     slot.daemon_type,
                     slot.hostname,
-                    daemons,
+                    [d for d in daemons if d not in daemons_to_remove],
                     prefix=spec.service_id,
                     forcename=slot.name,
                     rank=slot.rank,
@@ -718,18 +718,20 @@ class CephadmServe:
             # create daemons
             daemon_place_fails = []
             for slot in slots_to_add:
-                # first remove daemon on conflicting port?
-                if slot.ports:
+                # first remove daemon with conflicting port or name?
+                if slot.ports or slot.name in [d.name() for d in daemons_to_remove]:
                     for d in daemons_to_remove:
-                        if d.hostname != slot.hostname:
+                        if (
+                            d.hostname != slot.hostname
+                            or not (set(d.ports or []) & set(slot.ports))
+                            or (d.ip and slot.ip and d.ip != slot.ip)
+                            and d.name() != slot.name
+                        ):
                             continue
-                        if not (set(d.ports or []) & set(slot.ports)):
-                            continue
-                        if d.ip and slot.ip and d.ip != slot.ip:
-                            continue
-                        self.log.info(
-                            f'Removing {d.name()} before deploying to {slot} to avoid a port conflict'
-                        )
+                        if d.name() != slot.name:
+                            self.log.info(
+                                f'Removing {d.name()} before deploying to {slot} to avoid a port or conflict'
+                            )
                         # NOTE: we don't check ok-to-stop here to avoid starvation if
                         # there is only 1 gateway.
                         self._remove_daemon(d.name(), d.hostname)
index 3b738a330bec03c94772f03ae798a92bc815b91c..13d0ff497f7d5658e9b6cb4296bfcfac1d6093eb 100644 (file)
@@ -217,6 +217,7 @@ class AlertmanagerService(CephadmService):
 class PrometheusService(CephadmService):
     TYPE = 'prometheus'
     DEFAULT_SERVICE_PORT = 9095
+    DEFAULT_MGR_PROMETHEUS_PORT = 9283
 
     def config(self, spec: ServiceSpec) -> None:
         # make sure module is enabled
@@ -247,13 +248,19 @@ class PrometheusService(CephadmService):
         # scrape mgrs
         mgr_scrape_list = []
         mgr_map = self.mgr.get('mgr_map')
-        port = None
+        port = cast(int, self.mgr.get_module_option_ex(
+            'prometheus', 'server_port', self.DEFAULT_MGR_PROMETHEUS_PORT))
+        deps.append(str(port))
         t = mgr_map.get('services', {}).get('prometheus', None)
         if t:
             p_result = urlparse(t)
-            t = t.split('/')[2]
-            mgr_scrape_list.append(t)
-            port = p_result.port or 9283
+            # urlparse .hostname removes '[]' from the hostname in case
+            # of ipv6 addresses so if this is the case then we just
+            # append the brackets when building the final scrape endpoint
+            if '[' in p_result.netloc and ']' in p_result.netloc:
+                mgr_scrape_list.append(f"[{p_result.hostname}]:{port}")
+            else:
+                mgr_scrape_list.append(f"{p_result.hostname}:{port}")
         # scan all mgrs to generate deps and to get standbys too.
         # assume that they are all on the same port as the active mgr.
         for dd in self.mgr.cache.get_daemons_by_service('mgr'):
index 40a8ad6360ca828e6ee49e57cdd6ab6990407194..eef15e83056870cb9fe59e8eb5f4d1ebc714fafd 100644 (file)
@@ -19,6 +19,13 @@ def get_ceph_option(_, key):
     return __file__
 
 
+def get_module_option_ex(_, module, key, default=None):
+    if module == 'prometheus':
+        if key == 'server_port':
+            return 9283
+    return None
+
+
 def _run_cephadm(ret):
     def foo(s, host, entity, cmd, e, **kwargs):
         if cmd == 'gather-facts':
@@ -41,6 +48,7 @@ def with_cephadm_module(module_options=None, store=None):
     """
     with mock.patch("cephadm.module.CephadmOrchestrator.get_ceph_option", get_ceph_option),\
             mock.patch("cephadm.services.osd.RemoveUtil._run_mon_cmd"), \
+            mock.patch('cephadm.module.CephadmOrchestrator.get_module_option_ex', get_module_option_ex),\
             mock.patch("cephadm.module.CephadmOrchestrator.get_osdmap"), \
             mock.patch("cephadm.module.CephadmOrchestrator.remote"), \
             mock.patch('cephadm.offline_watcher.OfflineHostWatcher.run'):
index a753cfd9fb9bf066cb7dfded5b18e4caf980722a..a5823a8eedabac83a0dc241908bc7dd9a88aa19b 100644 (file)
@@ -301,7 +301,7 @@ class TestMonitoring:
                     honor_labels: true
                     static_configs:
                     - targets:
-                      - '[::1]:8081'
+                      - '[::1]:9283'
 
                   - job_name: 'node'
                     static_configs:
index c51c9ab3bbe4a7efaa1e238fbf6dbf6556b2de41..7890b5eec92793d0ea96fbc5f083d68361c07365 100644 (file)
@@ -550,7 +550,10 @@ class Module(MgrModule):
         ),
         Option(
             'server_port',
-            type='int'
+            type='int',
+            default=DEFAULT_PORT,
+            desc='the port on which the module listens for HTTP requests',
+            runtime=True
         ),
         Option(
             'scrape_interval',
@@ -815,6 +818,31 @@ class Module(MgrModule):
 
         return metrics
 
+    def get_server_addr(self) -> str:
+        """
+        Return the current mgr server IP.
+        """
+        server_addr = cast(str, self.get_localized_module_option('server_addr', get_default_addr()))
+        if server_addr in ['::', '0.0.0.0']:
+            return self.get_mgr_ip()
+        return server_addr
+
+    def config_notify(self) -> None:
+        """
+        This method is called whenever one of our config options is changed.
+        """
+        # https://stackoverflow.com/questions/7254845/change-cherrypy-port-and-restart-web-server
+        # if we omit the line: cherrypy.server.httpserver = None
+        # then the cherrypy server is not restarted correctly
+        self.log.info('Restarting engine...')
+        cherrypy.engine.stop()
+        cherrypy.server.httpserver = None
+        server_port = cast(int, self.get_localized_module_option('server_port', DEFAULT_PORT))
+        self.set_uri(build_url(scheme='http', host=self.get_server_addr(), port=server_port, path='/'))
+        cherrypy.config.update({'server.socket_port': server_port})
+        cherrypy.engine.start()
+        self.log.info('Engine started.')
+
     @profile_method()
     def get_health(self) -> None:
 
@@ -1728,9 +1756,7 @@ class Module(MgrModule):
         })
         # Publish the URI that others may use to access the service we're
         # about to start serving
-        if server_addr in ['::', '0.0.0.0']:
-            server_addr = self.get_mgr_ip()
-        self.set_uri(build_url(scheme='http', host=server_addr, port=server_port, path='/'))
+        self.set_uri(build_url(scheme='http', host=self.get_server_addr(), port=server_port, path='/'))
 
         cherrypy.tree.mount(Root(), "/")
         self.log.info('Starting engine...')
@@ -1742,6 +1768,7 @@ class Module(MgrModule):
         # tell metrics collection thread to stop collecting new metrics
         self.metrics_thread.stop()
         cherrypy.engine.stop()
+        cherrypy.server.httpserver = None
         self.log.info('Engine stopped.')
         self.shutdown_rbd_stats()
         # wait for the metrics collection thread to stop
@@ -1838,6 +1865,7 @@ class StandbyModule(MgrStandbyModule):
         self.shutdown_event.wait()
         self.shutdown_event.clear()
         cherrypy.engine.stop()
+        cherrypy.server.httpserver = None
         self.log.info('Engine stopped.')
 
     def shutdown(self) -> None: