mgr/cephadm: periodically check hosts, and warn if the host check fails

author Sage Weil <sage@redhat.com>

Fri, 24 Jan 2020 17:46:40 +0000 (11:46 -0600)

committer Sage Weil <sage@redhat.com>

Thu, 30 Jan 2020 13:13:05 +0000 (07:13 -0600)
author Sage Weil <sage@redhat.com>
Fri, 24 Jan 2020 17:46:40 +0000 (11:46 -0600)
committer Sage Weil <sage@redhat.com>
Thu, 30 Jan 2020 13:13:05 +0000 (07:13 -0600)
diff --git a/doc/mgr/cephadm.rst b/doc/mgr/cephadm.rst

index a6d4c88366a917c59ae1c144817b34ae43e68512..f2241ea229d99d307efa2f0ccd4e36af2f7e4934 100644 (file)
--- a/doc/mgr/cephadm.rst
+++ b/doc/mgr/cephadm.rst
@@ -78,3 +78,24 @@ in `ceph orchestrator service ls`).
  You can also disable this warning entirely with::
  
    ceph config set mgr mgr/cephadm/warn_on_stray_services false
+
+CEPHADM_HOST_CHECK_FAILED
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+One or more hosts have failed the basic cephadm host check, which verifies
+that (1) the host is reachable and cephadm can be executed there, and (2)
+that the host satisfies basic prerequisites, like a working container
+runtime (podman or docker) and working time synchronization.
+If this test fails, cephadm will not be able to manage services on that host.
+
+You can manually run this check with::
+
+  ceph cephadm check-host *<hostname>*
+
+You can remove a broken host from management with::
+
+  ceph orchestrator host rm *<hostname>*
+
+You can disable this health warning with::
+
+  ceph config set mgr mgr/cephadm/warn_on_failed_host_check false
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py

index ca97fe3cb30e2b4b2e6a1cc9132d21a120705731..24578de1f8ea0e46e4f68201e1ab59489034f61e 100644 (file)
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -325,6 +325,12 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
              'desc': 'raise a health warning if services are detected '
                      'that are not managed by cephadm',
          },
+        {
+            'name': 'warn_on_failed_host_check',
+            'type': 'bool',
+            'default': True,
+            'desc': 'raise a health warning if the host check fails',
+        },
      ]
  
      def __init__(self, *args, **kwargs):
@@ -344,6 +350,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
              self.container_image_base = ''
              self.warn_on_stray_hosts = True
              self.warn_on_stray_services = True
+            self.warn_on_failed_host_check = True
  
          self._cons = {}  # type: Dict[str, Tuple[remoto.backends.BaseConnection,remoto.backends.LegacyModuleExecute]]
  
@@ -603,6 +610,30 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
          self._save_upgrade_state()
          return None
  
+    def _check_hosts(self):
+        self.log.debug('_check_hosts')
+        bad_hosts = []
+        for host, v in self.inventory.items():
+            self.log.debug(' checking %s' % host)
+            out, err, code = self._run_cephadm(host, 'client', 'check-host', [],
+                                               error_ok=True, no_fsid=True)
+            if code:
+                self.log.debug(' host %s failed check' % host)
+                if self.warn_on_failed_host_check:
+                    bad_hosts.append('host %s failed check: %s' % (host, err))
+            else:
+                self.log.debug(' host %s ok' % host)
+        if 'CEPHADM_HOST_CHECK_FAILED' in self.health_checks:
+            del self.health_checks['CEPHADM_HOST_CHECK_FAILED']
+        if bad_hosts:
+            self.health_checks['CEPHADM_HOST_CHECK_FAILED'] = {
+                'severity': 'warning',
+                'summary': '%d hosts fail cephadm check' % len(bad_hosts),
+                'count': len(bad_hosts),
+                'detail': bad_hosts,
+            }
+        self.set_health_checks(self.health_checks)
+
      def _check_for_strays(self):
          self.log.debug('_check_for_strays')
          for k in ['CEPHADM_STRAY_HOST',
@@ -662,6 +693,9 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
          # type: () -> None
          self.log.info("serve starting")
          while self.run:
+            self._check_hosts()
+            self._check_for_strays()
+
              while self.upgrade_state and not self.upgrade_state.get('paused'):
                  self.log.debug('Upgrade in progress, refreshing services')
                  completion = self._get_services()
@@ -679,8 +713,6 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                      orchestrator.raise_if_exception(completion)
                  self.log.debug('did _do_upgrade')
  
-            self._check_for_strays()
-
              sleep_interval = 600
              self.log.debug('Sleeping for %d seconds', sleep_interval)
              ret = self.event.wait(sleep_interval)
@@ -926,6 +958,13 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                                             error_ok=True, no_fsid=True)
          if code:
              return 1, '', ('check-host failed:\n' + '\n'.join(err))
+        # if we have an outstanding health alert for this host, give the
+        # serve thread a kick
+        if 'CEPHADM_HOST_CHECK_FAILED' in self.health_checks:
+            for item in self.health_checks['CEPHADM_HOST_CHECK_FAILED']['detail']:
+                if item.startswith('host %s ' % host):
+                    self.log.debug('kicking serve thread')
+                    self.event.set()
          return 0, '%s ok' % host, err
  
      def _get_connection(self, host):
author	Sage Weil <sage@redhat.com>
	Fri, 24 Jan 2020 17:46:40 +0000 (11:46 -0600)
committer	Sage Weil <sage@redhat.com>
	Thu, 30 Jan 2020 13:13:05 +0000 (07:13 -0600)
doc/mgr/cephadm.rst		patch \| blob \| history
src/pybind/mgr/cephadm/module.py		patch \| blob \| history