if host in self.offline_hosts:
self.offline_hosts.remove(host)
+ def update_failed_daemon_health_check(self) -> None:
+ self.remove_health_warning('CEPHADM_FAILED_DAEMON')
+ failed_daemons = []
+ for dd in self.cache.get_daemons():
+ if dd.status is not None and dd.status == DaemonDescriptionStatus.error:
+ failed_daemons.append('daemon %s on %s is in %s state' % (
+ dd.name(), dd.hostname, dd.status_desc
+ ))
+ if failed_daemons:
+ self.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len(
+ failed_daemons), failed_daemons)
+
@staticmethod
def can_run() -> Tuple[bool, str]:
if asyncssh is not None:
self.remove_health_warning('HOST_IN_MAINTENANCE')
else:
s = "host is" if len(in_maintenance) == 1 else "hosts are"
- self.set_health_warning("HOST_IN_MAINTENANCE", f"{len(in_maintenance)} {s} in maintenance mode", 1, [f"{h} is in maintenance" for h in in_maintenance])
+ self.set_health_warning("HOST_IN_MAINTENANCE", f"{len(in_maintenance)} {s} in maintenance mode", 1, [
+ f"{h} is in maintenance" for h in in_maintenance])
@handle_orch_error
@host_exists()
def _update_paused_health(self) -> None:
if self.mgr.paused:
- self.mgr.set_health_warning('CEPHADM_PAUSED', 'cephadm background work is paused', 1, ["'ceph orch resume' to resume"])
+ self.mgr.set_health_warning('CEPHADM_PAUSED', 'cephadm background work is paused', 1, [
+ "'ceph orch resume' to resume"])
else:
self.mgr.remove_health_warning('CEPHADM_PAUSED')
for k in [
'CEPHADM_HOST_CHECK_FAILED',
- 'CEPHADM_FAILED_DAEMON',
'CEPHADM_REFRESH_FAILED',
]:
self.mgr.remove_health_warning(k)
if bad_hosts:
- self.mgr.set_health_warning('CEPHADM_HOST_CHECK_FAILED', f'{len(bad_hosts)} hosts fail cephadm check', len(bad_hosts), bad_hosts)
+ self.mgr.set_health_warning(
+ 'CEPHADM_HOST_CHECK_FAILED', f'{len(bad_hosts)} hosts fail cephadm check', len(bad_hosts), bad_hosts)
if failures:
- self.mgr.set_health_warning('CEPHADM_REFRESH_FAILED', 'failed to probe daemons or devices', len(failures), failures)
- failed_daemons = []
- for dd in self.mgr.cache.get_daemons():
- if dd.status is not None and dd.status == DaemonDescriptionStatus.error:
- failed_daemons.append('daemon %s on %s is in %s state' % (
- dd.name(), dd.hostname, dd.status_desc
- ))
- if failed_daemons:
- self.mgr.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len(failed_daemons), failed_daemons)
+ self.mgr.set_health_warning(
+ 'CEPHADM_REFRESH_FAILED', 'failed to probe daemons or devices', len(failures), failures)
+ self.mgr.update_failed_daemon_health_check()
def _check_host(self, host: str) -> Optional[str]:
if host not in self.mgr.inventory:
'stray host %s has %d stray daemons: %s' % (
host, len(missing_names), missing_names))
if self.mgr.warn_on_stray_hosts and host_detail:
- self.mgr.set_health_warning('CEPHADM_STRAY_HOST', f'{len(host_detail)} stray host(s) with {host_num_daemons} daemon(s) not managed by cephadm', len(host_detail), host_detail)
+ self.mgr.set_health_warning(
+ 'CEPHADM_STRAY_HOST', f'{len(host_detail)} stray host(s) with {host_num_daemons} daemon(s) not managed by cephadm', len(host_detail), host_detail)
if self.mgr.warn_on_stray_daemons and daemon_detail:
- self.mgr.set_health_warning('CEPHADM_STRAY_DAEMON', f'{len(daemon_detail)} stray daemon(s) not managed by cephadm', len(daemon_detail), daemon_detail)
+ self.mgr.set_health_warning(
+ 'CEPHADM_STRAY_DAEMON', f'{len(daemon_detail)} stray daemon(s) not managed by cephadm', len(daemon_detail), daemon_detail)
def _apply_all_services(self) -> bool:
r = False
options_failed_to_set.append(msg)
if invalid_config_options:
- self.mgr.set_health_warning('CEPHADM_INVALID_CONFIG_OPTION', f'Ignoring {len(invalid_config_options)} invalid config option(s)', len(invalid_config_options), invalid_config_options)
+ self.mgr.set_health_warning('CEPHADM_INVALID_CONFIG_OPTION', f'Ignoring {len(invalid_config_options)} invalid config option(s)', len(
+ invalid_config_options), invalid_config_options)
if options_failed_to_set:
- self.mgr.set_health_warning('CEPHADM_FAILED_SET_OPTION', f'Failed to set {len(options_failed_to_set)} option(s)', len(options_failed_to_set), options_failed_to_set)
+ self.mgr.set_health_warning('CEPHADM_FAILED_SET_OPTION', f'Failed to set {len(options_failed_to_set)} option(s)', len(
+ options_failed_to_set), options_failed_to_set)
def _apply_service(self, spec: ServiceSpec) -> bool:
"""
daemons.append(sd)
if daemon_place_fails:
- self.mgr.set_health_warning('CEPHADM_DAEMON_PLACE_FAIL', f'Failed to place {len(daemon_place_fails)} daemon(s)', len(daemon_place_fails), daemon_place_fails)
+ self.mgr.set_health_warning('CEPHADM_DAEMON_PLACE_FAIL', f'Failed to place {len(daemon_place_fails)} daemon(s)', len(
+ daemon_place_fails), daemon_place_fails)
# remove any?
def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool:
assert not r
assert cephadm_module.health_checks.get('CEPHADM_DAEMON_PLACE_FAIL') is not None
assert cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['count'] == 1
- assert 'Failed to place 1 daemon(s)' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['summary']
- assert 'Failed while placing mgr.a on test: fail' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['detail']
+ assert 'Failed to place 1 daemon(s)' in cephadm_module.health_checks[
+ 'CEPHADM_DAEMON_PLACE_FAIL']['summary']
+ assert 'Failed while placing mgr.a on test: fail' in cephadm_module.health_checks[
+ 'CEPHADM_DAEMON_PLACE_FAIL']['detail']
@mock.patch("cephadm.serve.CephadmServe._run_cephadm")
def test_apply_spec_fail_health_warning(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
assert cephadm_module.apply_spec_fails
assert cephadm_module.health_checks.get('CEPHADM_APPLY_SPEC_FAIL') is not None
assert cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['count'] == 1
- assert 'Failed to apply 1 service(s)' in cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['summary']
+ assert 'Failed to apply 1 service(s)' in cephadm_module.health_checks[
+ 'CEPHADM_APPLY_SPEC_FAIL']['summary']
@mock.patch("cephadm.module.CephadmOrchestrator.get_foreign_ceph_option")
@mock.patch("cephadm.serve.CephadmServe._run_cephadm")
with with_host(cephadm_module, 'test'):
ps = PlacementSpec(hosts=['test:0.0.0.0=a'], count=1)
get_foreign_ceph_option.side_effect = KeyError
- CephadmServe(cephadm_module)._apply_service_config(ServiceSpec('mgr', placement=ps, config={'test': 'foo'}))
+ CephadmServe(cephadm_module)._apply_service_config(
+ ServiceSpec('mgr', placement=ps, config={'test': 'foo'}))
assert cephadm_module.health_checks.get('CEPHADM_INVALID_CONFIG_OPTION') is not None
assert cephadm_module.health_checks['CEPHADM_INVALID_CONFIG_OPTION']['count'] == 1
- assert 'Ignoring 1 invalid config option(s)' in cephadm_module.health_checks['CEPHADM_INVALID_CONFIG_OPTION']['summary']
- assert 'Ignoring invalid mgr config option test' in cephadm_module.health_checks['CEPHADM_INVALID_CONFIG_OPTION']['detail']
+ assert 'Ignoring 1 invalid config option(s)' in cephadm_module.health_checks[
+ 'CEPHADM_INVALID_CONFIG_OPTION']['summary']
+ assert 'Ignoring invalid mgr config option test' in cephadm_module.health_checks[
+ 'CEPHADM_INVALID_CONFIG_OPTION']['detail']
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
@mock.patch("cephadm.services.nfs.NFSService.run_grace_tool", mock.MagicMock())