mgr/cephadm: health alert for stray hosts and/or services

author Sage Weil <sage@redhat.com>

Tue, 21 Jan 2020 16:40:07 +0000 (10:40 -0600)

committer Sage Weil <sage@redhat.com>

Thu, 23 Jan 2020 17:13:40 +0000 (11:13 -0600)
author Sage Weil <sage@redhat.com>
Tue, 21 Jan 2020 16:40:07 +0000 (10:40 -0600)
committer Sage Weil <sage@redhat.com>
Thu, 23 Jan 2020 17:13:40 +0000 (11:13 -0600)
diff --git a/doc/mgr/cephadm.rst b/doc/mgr/cephadm.rst

index 1d5266b9d0b4d39f15782c60ffced3f2ce84e286..a6d4c88366a917c59ae1c144817b34ae43e68512 100644 (file)
--- a/doc/mgr/cephadm.rst
+++ b/doc/mgr/cephadm.rst
@@ -37,3 +37,44 @@ To clear this value use the command:
  ::
  
      # ceph cephadm clear-ssh-config
+
+Health checks
+-------------
+
+CEPHADM_STRAY_HOST
+^^^^^^^^^^^^^^^^^^
+
+One or more hosts have running Ceph daemons but are not registered as
+hosts managed by *cephadm*.  This means that those services cannot
+currently be managed by cephadm (e.g., restarted, upgraded, included
+in `ceph orchestrator service ls`).
+
+You can manage the host(s) with::
+
+  ceph orchestrator host add *<hostname>*
+
+Note that you may need to configure SSH access to the remote host
+before this will work.
+
+Alternatively, you can manually connect to the host and ensure that
+services on that host are removed and/or migrated to a host that is
+managed by *cephadm*.
+
+You can also disable this warning entirely with::
+
+  ceph config set mgr mgr/cephadm/warn_on_stray_hosts false
+
+CEPHADM_STRAY_SERVICE
+^^^^^^^^^^^^^^^^^^^^^
+
+One or more Ceph daemons are running but not are not managed by
+*cephadm*, perhaps because they were deploy using a different tool, or
+were started manually.  This means that those services cannot
+currently be managed by cephadm (e.g., restarted, upgraded, included
+in `ceph orchestrator service ls`).
+
+**FIXME:** We need to implement and document an adopt procedure here.
+
+You can also disable this warning entirely with::
+
+  ceph config set mgr mgr/cephadm/warn_on_stray_services false
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index 31ba5b321b3ba95a5b52bd24aa0a7bcf880572cf..6f46d1d877c251dc8879c0262f23c5248cb51e05 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -303,12 +303,30 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
              'desc': 'Container image name, without the tag',
              'runtime': True,
          },
+        {
+            'name': 'warn_on_stray_hosts',
+            'type': 'bool',
+            'default': True,
+            'desc': 'raise a health warning if services are detected on a host '
+                    'that is not managed by cephadm',
+        },
+        {
+            'name': 'warn_on_stray_services',
+            'type': 'bool',
+            'default': True,
+            'desc': 'raise a health warning if services are detected '
+                    'that are not managed by cephadm',
+        },
      ]
  
      def __init__(self, *args, **kwargs):
          super(CephadmOrchestrator, self).__init__(*args, **kwargs)
          self._cluster_fsid = self.get('mon_map')['fsid']
  
+        # for serve()
+        self.run = True
+        self.event = Event()
+
          self.config_notify()
  
          path = self.get_ceph_option('cephadm_path')
@@ -368,10 +386,6 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
              if h not in self.inventory:
                  del self.service_cache[h]
  
-        # for serve()
-        self.run = True
-        self.event = Event()
-
      def shutdown(self):
          self.log.info('shutdown')
          self._worker_pool.close()
@@ -405,8 +419,10 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                                (s.service_type, s.service_instance))
                  return True
  
-    def _clear_health_checks(self):
-        self.health_checks = {}
+    def _clear_upgrade_health_checks(self):
+        for k in ['UPGRADE_NO_STANDBY_MGR']:
+            if k in self.health_checks:
+                del self.health_checks[k]
          self.set_health_checks(self.health_checks)
  
      def _do_upgrade(self, daemons):
@@ -557,6 +573,61 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
          self._save_upgrade_state()
          return None
  
+    def _check_for_strays(self):
+        self.log.debug('_check_for_strays')
+        for k in ['CEPHADM_STRAY_HOST',
+                  'CEPHADM_STRAY_SERVICE']:
+            if k in self.health_checks:
+                del self.health_checks[k]
+        if self.warn_on_stray_hosts or self.warn_on_stray_services:
+            ls = self.list_servers()
+            managed = []
+            if self.warn_on_stray_services:
+                completion = self._get_services()
+                self._orchestrator_wait([completion])
+                orchestrator.raise_if_exception(completion)
+                self.log.debug('services %s' % completion.result)
+                for s in completion.result:
+                    managed.append(s.name())
+                self.log.debug('cephadm daemons %s' % managed)
+            host_detail = []     # type: List[str]
+            host_num_services = 0
+            service_detail = []  # type: List[str]
+            for item in ls:
+                host = item.get('hostname')
+                services = item.get('services')
+                missing_names = []
+                for s in services:
+                    name = '%s.%s' % (s.get('type'), s.get('id'))
+                    if host not in self.inventory:
+                        missing_names.append(name)
+                        host_num_services += 1
+                    if name not in managed:
+                        service_detail.append(
+                            'stray service %s on host %s not managed by cephadm' % (name, host))
+                if missing_names:
+                    host_detail.append(
+                        'stray host %s has %d stray daemons: %s' % (
+                            host, len(missing_names), missing_names))
+            if host_detail:
+                self.health_checks['CEPHADM_STRAY_HOST'] = {
+                    'severity': 'warning',
+                    'summary': '%d stray host(s) with %s service(s) '
+                    'not managed by cephadm' % (
+                        len(host_detail), host_num_services),
+                    'count': len(host_detail),
+                    'detail': host_detail,
+                }
+            if service_detail:
+                self.health_checks['CEPHADM_STRAY_SERVICE'] = {
+                    'severity': 'warning',
+                    'summary': '%d stray service(s) not managed by cephadm' % (
+                        len(service_detail)),
+                    'count': len(service_detail),
+                    'detail': service_detail,
+                }
+        self.set_health_checks(self.health_checks)
+
      def serve(self):
          # type: () -> None
          self.log.info("serve starting")
@@ -578,7 +649,9 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                      orchestrator.raise_if_exception(completion)
                  self.log.debug('did _do_upgrade')
  
-            sleep_interval = 60*60  # this really doesn't matter
+            self._check_for_strays()
+
+            sleep_interval = 600
              self.log.debug('Sleeping for %d seconds', sleep_interval)
              ret = self.event.wait(sleep_interval)
              self.event.clear()
@@ -599,6 +672,10 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                      opt,  # type: ignore
                      self.get_ceph_option(opt))
              self.log.debug(' native option %s = %s', opt, getattr(self, opt))  # type: ignore
+        self.event.set()
+
+    def notify(self, notify_type, notify_id):
+        self.event.set()
  
      def get_unique_name(self, existing, prefix=None, forcename=None):
          """
@@ -929,6 +1006,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
          self._save_inventory()
          self.inventory_cache[host] = orchestrator.OutdatableData()
          self.service_cache[host] = orchestrator.OutdatableData()
+        self.event.set()  # refresh stray health check
          return "Added host '{}'".format(host)
  
      @async_completion
@@ -942,6 +1020,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
          self._save_inventory()
          del self.inventory_cache[host]
          del self.service_cache[host]
+        self.event.set()  # refresh stray health check
          return "Removed host '{}'".format(host)
  
      @trivial_completion
@@ -1807,7 +1886,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
              'target_version': target_version,
          }
          self._save_upgrade_state()
-        self._clear_health_checks()
+        self._clear_upgrade_health_checks()
          self.event.set()
          return trivial_result('Initiating upgrade to %s %s' % (image, target_id))
  
@@ -1840,7 +1919,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
          target_name = self.upgrade_state.get('target_name')
          self.upgrade_state = None
          self._save_upgrade_state()
-        self._clear_health_checks()
+        self._clear_upgrade_health_checks()
          self.event.set()
          return trivial_result('Stopped upgrade to %s' % target_name)
author	Sage Weil <sage@redhat.com>
	Tue, 21 Jan 2020 16:40:07 +0000 (10:40 -0600)
committer	Sage Weil <sage@redhat.com>
	Thu, 23 Jan 2020 17:13:40 +0000 (11:13 -0600)
doc/mgr/cephadm.rst		patch \| blob \| history
src/pybind/mgr/cephadm/module.py		patch \| blob \| history