From: Adam King Date: Wed, 29 Sep 2021 19:56:00 +0000 (-0400) Subject: mgr/cephadm: update CEPHADM_FAILED_DAEMON after receiving agent metadata X-Git-Tag: v17.1.0~638^2 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=1f43d8e618b68a8a1698745950bb095aa27886c5;p=ceph.git mgr/cephadm: update CEPHADM_FAILED_DAEMON after receiving agent metadata Without this update, there can be a period during which a daemon is incorrectly reported as failed, or as not failed, even though mgr/cephadm already knows whether it is in an error state. Signed-off-by: Adam King --- diff --git a/src/pybind/mgr/cephadm/agent.py b/src/pybind/mgr/cephadm/agent.py index ead4c0a3bee5..1350d079794d 100644 --- a/src/pybind/mgr/cephadm/agent.py +++ b/src/pybind/mgr/cephadm/agent.py @@ -178,6 +178,7 @@ class HostData: if 'ls' in data and data['ls']: self.mgr._process_ls_output(host, data['ls']) + self.mgr.update_failed_daemon_health_check() if 'networks' in data and data['networks']: self.mgr.cache.update_host_networks(host, data['networks']) if 'facts' in data and data['facts']: @@ -261,6 +262,7 @@ class AgentMessageThread(threading.Thread): secure_agent_socket.sendall(msg.encode('utf-8')) agent_response = secure_agent_socket.recv(1024).decode() self.mgr.log.info(f'Received "{agent_response}" from agent on host {self.host}') + self.mgr.cache.sending_agent_message[self.host] = False return except ConnectionError as e: # if it's a connection error, possibly try to connect again. 
diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 41476c1c9920..238b1757ec13 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -729,6 +729,18 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, if host in self.offline_hosts: self.offline_hosts.remove(host) + def update_failed_daemon_health_check(self) -> None: + self.remove_health_warning('CEPHADM_FAILED_DAEMON') + failed_daemons = [] + for dd in self.cache.get_daemons(): + if dd.status is not None and dd.status == DaemonDescriptionStatus.error: + failed_daemons.append('daemon %s on %s is in %s state' % ( + dd.name(), dd.hostname, dd.status_desc + )) + if failed_daemons: + self.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len( + failed_daemons), failed_daemons) + @staticmethod def can_run() -> Tuple[bool, str]: if asyncssh is not None: @@ -1503,7 +1515,8 @@ Then run the following: self.remove_health_warning('HOST_IN_MAINTENANCE') else: s = "host is" if len(in_maintenance) == 1 else "hosts are" - self.set_health_warning("HOST_IN_MAINTENANCE", f"{len(in_maintenance)} {s} in maintenance mode", 1, [f"{h} is in maintenance" for h in in_maintenance]) + self.set_health_warning("HOST_IN_MAINTENANCE", f"{len(in_maintenance)} {s} in maintenance mode", 1, [ + f"{h} is in maintenance" for h in in_maintenance]) @handle_orch_error @host_exists() diff --git a/src/pybind/mgr/cephadm/serve.py b/src/pybind/mgr/cephadm/serve.py index c93e95cc26c0..f42465647fae 100644 --- a/src/pybind/mgr/cephadm/serve.py +++ b/src/pybind/mgr/cephadm/serve.py @@ -135,7 +135,8 @@ class CephadmServe: def _update_paused_health(self) -> None: if self.mgr.paused: - self.mgr.set_health_warning('CEPHADM_PAUSED', 'cephadm background work is paused', 1, ["'ceph orch resume' to resume"]) + self.mgr.set_health_warning('CEPHADM_PAUSED', 'cephadm background work is paused', 1, [ + "'ceph orch resume' to resume"]) else: 
self.mgr.remove_health_warning('CEPHADM_PAUSED') @@ -379,22 +380,16 @@ class CephadmServe: for k in [ 'CEPHADM_HOST_CHECK_FAILED', - 'CEPHADM_FAILED_DAEMON', 'CEPHADM_REFRESH_FAILED', ]: self.mgr.remove_health_warning(k) if bad_hosts: - self.mgr.set_health_warning('CEPHADM_HOST_CHECK_FAILED', f'{len(bad_hosts)} hosts fail cephadm check', len(bad_hosts), bad_hosts) + self.mgr.set_health_warning( + 'CEPHADM_HOST_CHECK_FAILED', f'{len(bad_hosts)} hosts fail cephadm check', len(bad_hosts), bad_hosts) if failures: - self.mgr.set_health_warning('CEPHADM_REFRESH_FAILED', 'failed to probe daemons or devices', len(failures), failures) - failed_daemons = [] - for dd in self.mgr.cache.get_daemons(): - if dd.status is not None and dd.status == DaemonDescriptionStatus.error: - failed_daemons.append('daemon %s on %s is in %s state' % ( - dd.name(), dd.hostname, dd.status_desc - )) - if failed_daemons: - self.mgr.set_health_warning('CEPHADM_FAILED_DAEMON', f'{len(failed_daemons)} failed cephadm daemon(s)', len(failed_daemons), failed_daemons) + self.mgr.set_health_warning( + 'CEPHADM_REFRESH_FAILED', 'failed to probe daemons or devices', len(failures), failures) + self.mgr.update_failed_daemon_health_check() def _check_host(self, host: str) -> Optional[str]: if host not in self.mgr.inventory: @@ -547,9 +542,11 @@ class CephadmServe: 'stray host %s has %d stray daemons: %s' % ( host, len(missing_names), missing_names)) if self.mgr.warn_on_stray_hosts and host_detail: - self.mgr.set_health_warning('CEPHADM_STRAY_HOST', f'{len(host_detail)} stray host(s) with {host_num_daemons} daemon(s) not managed by cephadm', len(host_detail), host_detail) + self.mgr.set_health_warning( + 'CEPHADM_STRAY_HOST', f'{len(host_detail)} stray host(s) with {host_num_daemons} daemon(s) not managed by cephadm', len(host_detail), host_detail) if self.mgr.warn_on_stray_daemons and daemon_detail: - self.mgr.set_health_warning('CEPHADM_STRAY_DAEMON', f'{len(daemon_detail)} stray daemon(s) not managed by 
cephadm', len(daemon_detail), daemon_detail) + self.mgr.set_health_warning( + 'CEPHADM_STRAY_DAEMON', f'{len(daemon_detail)} stray daemon(s) not managed by cephadm', len(daemon_detail), daemon_detail) def _apply_all_services(self) -> bool: r = False @@ -624,9 +621,11 @@ class CephadmServe: options_failed_to_set.append(msg) if invalid_config_options: - self.mgr.set_health_warning('CEPHADM_INVALID_CONFIG_OPTION', f'Ignoring {len(invalid_config_options)} invalid config option(s)', len(invalid_config_options), invalid_config_options) + self.mgr.set_health_warning('CEPHADM_INVALID_CONFIG_OPTION', f'Ignoring {len(invalid_config_options)} invalid config option(s)', len( + invalid_config_options), invalid_config_options) if options_failed_to_set: - self.mgr.set_health_warning('CEPHADM_FAILED_SET_OPTION', f'Failed to set {len(options_failed_to_set)} option(s)', len(options_failed_to_set), options_failed_to_set) + self.mgr.set_health_warning('CEPHADM_FAILED_SET_OPTION', f'Failed to set {len(options_failed_to_set)} option(s)', len( + options_failed_to_set), options_failed_to_set) def _apply_service(self, spec: ServiceSpec) -> bool: """ @@ -867,7 +866,8 @@ class CephadmServe: daemons.append(sd) if daemon_place_fails: - self.mgr.set_health_warning('CEPHADM_DAEMON_PLACE_FAIL', f'Failed to place {len(daemon_place_fails)} daemon(s)', len(daemon_place_fails), daemon_place_fails) + self.mgr.set_health_warning('CEPHADM_DAEMON_PLACE_FAIL', f'Failed to place {len(daemon_place_fails)} daemon(s)', len( + daemon_place_fails), daemon_place_fails) # remove any? 
def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool: diff --git a/src/pybind/mgr/cephadm/tests/test_cephadm.py b/src/pybind/mgr/cephadm/tests/test_cephadm.py index 63d7eb67fee8..f3ccf69904f1 100644 --- a/src/pybind/mgr/cephadm/tests/test_cephadm.py +++ b/src/pybind/mgr/cephadm/tests/test_cephadm.py @@ -899,8 +899,10 @@ spec: assert not r assert cephadm_module.health_checks.get('CEPHADM_DAEMON_PLACE_FAIL') is not None assert cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['count'] == 1 - assert 'Failed to place 1 daemon(s)' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['summary'] - assert 'Failed while placing mgr.a on test: fail' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['detail'] + assert 'Failed to place 1 daemon(s)' in cephadm_module.health_checks[ + 'CEPHADM_DAEMON_PLACE_FAIL']['summary'] + assert 'Failed while placing mgr.a on test: fail' in cephadm_module.health_checks[ + 'CEPHADM_DAEMON_PLACE_FAIL']['detail'] @mock.patch("cephadm.serve.CephadmServe._run_cephadm") def test_apply_spec_fail_health_warning(self, _run_cephadm, cephadm_module: CephadmOrchestrator): @@ -913,7 +915,8 @@ spec: assert cephadm_module.apply_spec_fails assert cephadm_module.health_checks.get('CEPHADM_APPLY_SPEC_FAIL') is not None assert cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['count'] == 1 - assert 'Failed to apply 1 service(s)' in cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['summary'] + assert 'Failed to apply 1 service(s)' in cephadm_module.health_checks[ + 'CEPHADM_APPLY_SPEC_FAIL']['summary'] @mock.patch("cephadm.module.CephadmOrchestrator.get_foreign_ceph_option") @mock.patch("cephadm.serve.CephadmServe._run_cephadm") @@ -922,11 +925,14 @@ spec: with with_host(cephadm_module, 'test'): ps = PlacementSpec(hosts=['test:0.0.0.0=a'], count=1) get_foreign_ceph_option.side_effect = KeyError - CephadmServe(cephadm_module)._apply_service_config(ServiceSpec('mgr', placement=ps, config={'test': 
'foo'})) + CephadmServe(cephadm_module)._apply_service_config( + ServiceSpec('mgr', placement=ps, config={'test': 'foo'})) assert cephadm_module.health_checks.get('CEPHADM_INVALID_CONFIG_OPTION') is not None assert cephadm_module.health_checks['CEPHADM_INVALID_CONFIG_OPTION']['count'] == 1 - assert 'Ignoring 1 invalid config option(s)' in cephadm_module.health_checks['CEPHADM_INVALID_CONFIG_OPTION']['summary'] - assert 'Ignoring invalid mgr config option test' in cephadm_module.health_checks['CEPHADM_INVALID_CONFIG_OPTION']['detail'] + assert 'Ignoring 1 invalid config option(s)' in cephadm_module.health_checks[ + 'CEPHADM_INVALID_CONFIG_OPTION']['summary'] + assert 'Ignoring invalid mgr config option test' in cephadm_module.health_checks[ + 'CEPHADM_INVALID_CONFIG_OPTION']['detail'] @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @mock.patch("cephadm.services.nfs.NFSService.run_grace_tool", mock.MagicMock())