From 16db65fee2297bfdf52e0dca4dc9a3fb89d984de Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 24 Jan 2020 11:46:40 -0600 Subject: [PATCH] mgr/cephadm: periodically check hosts, and warn if the host check fails If we manually run the check, wake up the serve thread to recheck things. Signed-off-by: Sage Weil --- doc/mgr/cephadm.rst | 21 ++++++++++++++++ src/pybind/mgr/cephadm/module.py | 43 ++++++++++++++++++++++++++++++-- 2 files changed, 62 insertions(+), 2 deletions(-) diff --git a/doc/mgr/cephadm.rst b/doc/mgr/cephadm.rst index a6d4c88366a91..f2241ea229d99 100644 --- a/doc/mgr/cephadm.rst +++ b/doc/mgr/cephadm.rst @@ -78,3 +78,24 @@ in `ceph orchestrator service ls`). You can also disable this warning entirely with:: ceph config set mgr mgr/cephadm/warn_on_stray_services false + +CEPHADM_HOST_CHECK_FAILED +^^^^^^^^^^^^^^^^^^^^^^^^^ + +One or more hosts have failed the basic cephadm host check, which verifies +that (1) the host is reachable and cephadm can be executed there, and (2) +that the host satisfies basic prerequisites, like a working container +runtime (podman or docker) and working time synchronization. +If this test fails, cephadm will no be able to manage services on that host. + +You can manually run this check with:: + + ceph cephadm check-host ** + +You can remove a broken host from management with:: + + ceph orchestrator host rm ** + +You can disable this health warning with:: + + ceph config set mgr mgr/cephadm/warn_on_failed_host_check false diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index ca97fe3cb30e2..24578de1f8ea0 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -325,6 +325,12 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): 'desc': 'raise a health warning if services are detected ' 'that are not managed by cephadm', }, + { + 'name': 'warn_on_failed_host_check', + 'type': 'bool', + 'default': True, + 'desc': 'raise a health warning if the host check fails', + }, ] def __init__(self, *args, **kwargs): @@ -344,6 +350,7 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): self.container_image_base = '' self.warn_on_stray_hosts = True self.warn_on_stray_services = True + self.warn_on_failed_host_check = True self._cons = {} # type: Dict[str, Tuple[remoto.backends.BaseConnection,remoto.backends.LegacyModuleExecute]] @@ -603,6 +610,30 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): self._save_upgrade_state() return None + def _check_hosts(self): + self.log.debug('_check_hosts') + bad_hosts = [] + for host, v in self.inventory.items(): + self.log.debug(' checking %s' % host) + out, err, code = self._run_cephadm(host, 'client', 'check-host', [], + error_ok=True, no_fsid=True) + if code: + self.log.debug(' host %s failed check' % host) + if self.warn_on_failed_host_check: + bad_hosts.append('host %s failed check: %s' % (host, err)) + else: + self.log.debug(' host %s ok' % host) + if 'CEPHADM_HOST_CHECK_FAILED' in self.health_checks: + del self.health_checks['CEPHADM_HOST_CHECK_FAILED'] + if bad_hosts: + self.health_checks['CEPHADM_HOST_CHECK_FAILED'] = { + 'severity': 'warning', + 'summary': '%d hosts fail cephadm check' % len(bad_hosts), + 'count': len(bad_hosts), + 'detail': bad_hosts, + } + self.set_health_checks(self.health_checks) + def _check_for_strays(self): self.log.debug('_check_for_strays') for k in ['CEPHADM_STRAY_HOST', @@ -662,6 +693,9 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): # type: () -> None self.log.info("serve starting") while self.run: + self._check_hosts() + self._check_for_strays() + while self.upgrade_state and not self.upgrade_state.get('paused'): self.log.debug('Upgrade in progress, refreshing services') completion = self._get_services() @@ -679,8 +713,6 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): orchestrator.raise_if_exception(completion) self.log.debug('did _do_upgrade') - self._check_for_strays() - sleep_interval = 600 self.log.debug('Sleeping for %d seconds', sleep_interval) ret = self.event.wait(sleep_interval) @@ -926,6 +958,13 @@ class CephadmOrchestrator(MgrModule, orchestrator.OrchestratorClientMixin): error_ok=True, no_fsid=True) if code: return 1, '', ('check-host failed:\n' + '\n'.join(err)) + # if we have an outstanding health alert for this host, give the + # serve thread a kick + if 'CEPHADM_HOST_CHECK_FAILED' in self.health_checks: + for item in self.health_checks['CEPHADM_HOST_CHECK_FAILED']['detail']: + if item.startswith('host %s ' % host): + self.log.debug('kicking serve thread') + self.event.set() return 0, '%s ok' % host, err def _get_connection(self, host): -- 2.39.5