You can also disable this warning entirely with::
ceph config set mgr mgr/cephadm/warn_on_stray_services false
+
+CEPHADM_HOST_CHECK_FAILED
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+One or more hosts have failed the basic cephadm host check, which verifies
+that (1) the host is reachable and cephadm can be executed there, and (2)
+that the host satisfies basic prerequisites, like a working container
+runtime (podman or docker) and working time synchronization.
+If this test fails, cephadm will no be able to manage services on that host.
+
+You can manually run this check with::
+
+ ceph cephadm check-host *<hostname>*
+
+You can remove a broken host from management with::
+
+ ceph orchestrator host rm *<hostname>*
+
+You can disable this health warning with::
+
+ ceph config set mgr mgr/cephadm/warn_on_failed_host_check false
'desc': 'raise a health warning if services are detected '
'that are not managed by cephadm',
},
+ {
+ 'name': 'warn_on_failed_host_check',
+ 'type': 'bool',
+ 'default': True,
+ 'desc': 'raise a health warning if the host check fails',
+ },
]
def __init__(self, *args, **kwargs):
self.container_image_base = ''
self.warn_on_stray_hosts = True
self.warn_on_stray_services = True
+ self.warn_on_failed_host_check = True
self._cons = {} # type: Dict[str, Tuple[remoto.backends.BaseConnection,remoto.backends.LegacyModuleExecute]]
self._save_upgrade_state()
return None
+ def _check_hosts(self):
+ self.log.debug('_check_hosts')
+ bad_hosts = []
+ for host, v in self.inventory.items():
+ self.log.debug(' checking %s' % host)
+ out, err, code = self._run_cephadm(host, 'client', 'check-host', [],
+ error_ok=True, no_fsid=True)
+ if code:
+ self.log.debug(' host %s failed check' % host)
+ if self.warn_on_failed_host_check:
+ bad_hosts.append('host %s failed check: %s' % (host, err))
+ else:
+ self.log.debug(' host %s ok' % host)
+ if 'CEPHADM_HOST_CHECK_FAILED' in self.health_checks:
+ del self.health_checks['CEPHADM_HOST_CHECK_FAILED']
+ if bad_hosts:
+ self.health_checks['CEPHADM_HOST_CHECK_FAILED'] = {
+ 'severity': 'warning',
+ 'summary': '%d hosts fail cephadm check' % len(bad_hosts),
+ 'count': len(bad_hosts),
+ 'detail': bad_hosts,
+ }
+ self.set_health_checks(self.health_checks)
+
def _check_for_strays(self):
self.log.debug('_check_for_strays')
for k in ['CEPHADM_STRAY_HOST',
# type: () -> None
self.log.info("serve starting")
while self.run:
+ self._check_hosts()
+ self._check_for_strays()
+
while self.upgrade_state and not self.upgrade_state.get('paused'):
self.log.debug('Upgrade in progress, refreshing services')
completion = self._get_services()
orchestrator.raise_if_exception(completion)
self.log.debug('did _do_upgrade')
- self._check_for_strays()
-
sleep_interval = 600
self.log.debug('Sleeping for %d seconds', sleep_interval)
ret = self.event.wait(sleep_interval)
error_ok=True, no_fsid=True)
if code:
return 1, '', ('check-host failed:\n' + '\n'.join(err))
+ # if we have an outstanding health alert for this host, give the
+ # serve thread a kick
+ if 'CEPHADM_HOST_CHECK_FAILED' in self.health_checks:
+ for item in self.health_checks['CEPHADM_HOST_CHECK_FAILED']['detail']:
+ if item.startswith('host %s ' % host):
+ self.log.debug('kicking serve thread')
+ self.event.set()
return 0, '%s ok' % host, err
def _get_connection(self, host):