git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
mgr/cephadm: periodically check hosts, and warn if the host check fails
author Sage Weil <sage@redhat.com>
Fri, 24 Jan 2020 17:46:40 +0000 (11:46 -0600)
committer Sage Weil <sage@redhat.com>
Thu, 30 Jan 2020 13:13:05 +0000 (07:13 -0600)
If we manually run the check, wake up the serve thread to recheck things.

Signed-off-by: Sage Weil <sage@redhat.com>
doc/mgr/cephadm.rst
src/pybind/mgr/cephadm/module.py

index a6d4c88366a917c59ae1c144817b34ae43e68512..f2241ea229d99d307efa2f0ccd4e36af2f7e4934 100644 (file)
@@ -78,3 +78,24 @@ in `ceph orchestrator service ls`).
 You can also disable this warning entirely with::
 
   ceph config set mgr mgr/cephadm/warn_on_stray_services false
+
+CEPHADM_HOST_CHECK_FAILED
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+One or more hosts have failed the basic cephadm host check, which verifies
+that (1) the host is reachable and cephadm can be executed there, and (2)
+that the host satisfies basic prerequisites, like a working container
+runtime (podman or docker) and working time synchronization.
+If this test fails, cephadm will not be able to manage services on that host.
+
+You can manually run this check with::
+
+  ceph cephadm check-host *<hostname>*
+
+You can remove a broken host from management with::
+
+  ceph orchestrator host rm *<hostname>*
+
+You can disable this health warning with::
+
+  ceph config set mgr mgr/cephadm/warn_on_failed_host_check false
index ca97fe3cb30e2b4b2e6a1cc9132d21a120705731..24578de1f8ea0e46e4f68201e1ab59489034f61e 100644 (file)
@@ -325,6 +325,12 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
             'desc': 'raise a health warning if services are detected '
                     'that are not managed by cephadm',
         },
+        {
+            'name': 'warn_on_failed_host_check',
+            'type': 'bool',
+            'default': True,
+            'desc': 'raise a health warning if the host check fails',
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -344,6 +350,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
             self.container_image_base = ''
             self.warn_on_stray_hosts = True
             self.warn_on_stray_services = True
+            self.warn_on_failed_host_check = True
 
         self._cons = {}  # type: Dict[str, Tuple[remoto.backends.BaseConnection,remoto.backends.LegacyModuleExecute]]
 
@@ -603,6 +610,30 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
         self._save_upgrade_state()
         return None
 
+    def _check_hosts(self):
+        self.log.debug('_check_hosts')
+        bad_hosts = []
+        for host, v in self.inventory.items():
+            self.log.debug(' checking %s' % host)
+            out, err, code = self._run_cephadm(host, 'client', 'check-host', [],
+                                               error_ok=True, no_fsid=True)
+            if code:
+                self.log.debug(' host %s failed check' % host)
+                if self.warn_on_failed_host_check:
+                    bad_hosts.append('host %s failed check: %s' % (host, err))
+            else:
+                self.log.debug(' host %s ok' % host)
+        if 'CEPHADM_HOST_CHECK_FAILED' in self.health_checks:
+            del self.health_checks['CEPHADM_HOST_CHECK_FAILED']
+        if bad_hosts:
+            self.health_checks['CEPHADM_HOST_CHECK_FAILED'] = {
+                'severity': 'warning',
+                'summary': '%d hosts fail cephadm check' % len(bad_hosts),
+                'count': len(bad_hosts),
+                'detail': bad_hosts,
+            }
+        self.set_health_checks(self.health_checks)
+
     def _check_for_strays(self):
         self.log.debug('_check_for_strays')
         for k in ['CEPHADM_STRAY_HOST',
@@ -662,6 +693,9 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
         # type: () -> None
         self.log.info("serve starting")
         while self.run:
+            self._check_hosts()
+            self._check_for_strays()
+
             while self.upgrade_state and not self.upgrade_state.get('paused'):
                 self.log.debug('Upgrade in progress, refreshing services')
                 completion = self._get_services()
@@ -679,8 +713,6 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                     orchestrator.raise_if_exception(completion)
                 self.log.debug('did _do_upgrade')
 
-            self._check_for_strays()
-
             sleep_interval = 600
             self.log.debug('Sleeping for %d seconds', sleep_interval)
             ret = self.event.wait(sleep_interval)
@@ -926,6 +958,13 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin):
                                            error_ok=True, no_fsid=True)
         if code:
             return 1, '', ('check-host failed:\n' + '\n'.join(err))
+        # if we have an outstanding health alert for this host, give the
+        # serve thread a kick
+        if 'CEPHADM_HOST_CHECK_FAILED' in self.health_checks:
+            for item in self.health_checks['CEPHADM_HOST_CHECK_FAILED']['detail']:
+                if item.startswith('host %s ' % host):
+                    self.log.debug('kicking serve thread')
+                    self.event.set()
         return 0, '%s ok' % host, err
 
     def _get_connection(self, host):