specs = [] # type: List[ServiceSpec]
for sn, spec in self.mgr.spec_store.active_specs.items():
specs.append(spec)
+ for name in ['CEPHADM_APPLY_SPEC_FAIL', 'CEPHADM_DAEMON_PLACE_FAIL']:
+ self.mgr.remove_health_warning(name)
+ self.mgr.apply_spec_fails = []
for spec in specs:
try:
if self._apply_service(spec):
r = True
except Exception as e:
- self.log.exception('Failed to apply %s spec %s: %s' % (
- spec.service_name(), spec, e))
+ msg = f'Failed to apply {spec.service_name()} spec {spec}: {str(e)}'
+ self.log.exception(msg)
self.mgr.events.for_service(spec, 'ERROR', 'Failed to apply: ' + str(e))
+ self.mgr.apply_spec_fails.append((spec.service_name(), str(e)))
+ warnings = []
+ for x in self.mgr.apply_spec_fails:
+ warnings.append(f'{x[0]}: {x[1]}')
+ self.mgr.set_health_warning('CEPHADM_APPLY_SPEC_FAIL',
+ f"Failed to apply {len(self.mgr.apply_spec_fails)} service(s): {','.join(x[0] for x in self.mgr.apply_spec_fails)}",
+ len(self.mgr.apply_spec_fails),
+ warnings)
return r
'status', '').lower() not in ['maintenance', 'offline'] and d.hostname not in self.mgr.offline_hosts)]
self.log.debug('Add %s, remove %s' % (slots_to_add, daemons_to_remove))
except OrchestratorError as e:
- self.log.error('Failed to apply %s spec %s: %s' % (
- spec.service_name(), spec, e))
+ msg = f'Failed to apply {spec.service_name()} spec {spec}: {str(e)}'
+ self.log.error(msg)
self.mgr.events.for_service(spec, 'ERROR', 'Failed to apply: ' + str(e))
+ self.mgr.apply_spec_fails.append((spec.service_name(), str(e)))
+ warnings = []
+ for x in self.mgr.apply_spec_fails:
+ warnings.append(f'{x[0]}: {x[1]}')
+ self.mgr.set_health_warning('CEPHADM_APPLY_SPEC_FAIL',
+ f"Failed to apply {len(self.mgr.apply_spec_fails)} service(s): {','.join(x[0] for x in self.mgr.apply_spec_fails)}",
+ len(self.mgr.apply_spec_fails),
+ warnings)
return False
r = None
svc.fence_old_ranks(spec, rank_map, len(all_slots))
# create daemons
+ daemon_place_fails = []
for slot in slots_to_add:
# first remove daemon on conflicting port?
if slot.ports:
f"on {slot.hostname}: {e}")
self.mgr.events.for_service(spec, 'ERROR', msg)
self.mgr.log.error(msg)
+ daemon_place_fails.append(msg)
# only return "no change" if no one else has already succeeded.
# later successes will also change to True
if r is None:
)
daemons.append(sd)
+ if daemon_place_fails:
+ self.mgr.set_health_warning('CEPHADM_DAEMON_PLACE_FAIL', f'Failed to place {len(daemon_place_fails)} daemon(s)', len(daemon_place_fails), daemon_place_fails)
+
# remove any?
def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool:
daemon_ids = [d.daemon_id for d in remove_daemons]
'entity': entity,
})
+ @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+ def test_daemon_place_fail_health_warning(self, _run_cephadm, cephadm_module):
+ _run_cephadm.return_value = ('{}', '', 0)
+ with with_host(cephadm_module, 'test'):
+ _run_cephadm.side_effect = OrchestratorError('fail')
+ ps = PlacementSpec(hosts=['test:0.0.0.0=a'], count=1)
+ r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps))
+ assert not r
+ assert cephadm_module.health_checks.get('CEPHADM_DAEMON_PLACE_FAIL') is not None
+ assert cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['count'] == 1
+ assert 'Failed to place 1 daemon(s)' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['summary']
+ assert 'Failed while placing mgr.a on test: fail' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['detail']
+
+ @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+ def test_apply_spec_fail_health_warning(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
+ _run_cephadm.return_value = ('{}', '', 0)
+ with with_host(cephadm_module, 'test'):
+ CephadmServe(cephadm_module)._apply_all_services()
+ ps = PlacementSpec(hosts=['fail'], count=1)
+ r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps))
+ assert not r
+ assert cephadm_module.apply_spec_fails
+ assert cephadm_module.health_checks.get('CEPHADM_APPLY_SPEC_FAIL') is not None
+ assert cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['count'] == 1
+ assert 'Failed to apply 1 service(s)' in cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['summary']
+
@mock.patch("cephadm.module.CephadmOrchestrator.get_foreign_ceph_option")
@mock.patch("cephadm.serve.CephadmServe._run_cephadm")
def test_invalid_config_option_health_warning(self, _run_cephadm, get_foreign_ceph_option, cephadm_module: CephadmOrchestrator):