mgr/cephadm: set health check warning for apply spec failures and daemon place failures
author Melissa Li <li.melissa.kun@gmail.com>
Thu, 5 Aug 2021 18:14:38 +0000 (14:14 -0400)
committer Sebastian Wagner <sewagner@redhat.com>
Wed, 17 Nov 2021 10:25:57 +0000 (11:25 +0100)
Fixes: https://tracker.ceph.com/issues/44414
Signed-off-by: Melissa Li <li.melissa.kun@gmail.com>
(cherry picked from commit 1ccdd941c938d6fc3cd8996353495b28545427b1)

Conflicts:
src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py

src/pybind/mgr/cephadm/module.py
src/pybind/mgr/cephadm/serve.py
src/pybind/mgr/cephadm/tests/test_cephadm.py
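
With this change, apply-spec and daemon-placement failures that were previously only logged are also surfaced as the cluster health warnings CEPHADM_APPLY_SPEC_FAIL and CEPHADM_DAEMON_PLACE_FAIL. A hypothetical illustration of the check payloads the new tests below assert on; the field names mirror the health_checks assertions in test_cephadm.py, and the error texts are placeholders rather than actual cephadm output:

expected_checks = {
    'CEPHADM_APPLY_SPEC_FAIL': {
        'count': 1,
        'summary': 'Failed to apply 1 service(s): mgr',
        'detail': ['mgr: <error message>'],   # detail lines are '<service>: <error>'
    },
    'CEPHADM_DAEMON_PLACE_FAIL': {
        'count': 1,
        'summary': 'Failed to place 1 daemon(s)',
        'detail': ['Failed while placing mgr.a on test: fail'],
    },
}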

index 6f44e49502ffe7a42554caf5095af9bf2fd52b68..c88a4a6bdbc180e8d7df869b22452709dfc6b2b4 100644 (file)
@@ -407,7 +407,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
             self.default_registry = ''
             self.autotune_memory_target_ratio = 0.0
             self.autotune_interval = 0
-
+            self.apply_spec_fails: List[Tuple[str, str]] = []
             self.max_osd_draining_count = 10
 
         self._cons: Dict[str, Tuple[remoto.backends.BaseConnection,
index 186bd4dd7a14c20b53454300220c2c9ca2e5974b..acbdd9af53ad79a28fd57dcc67213f8c56b5a4c8 100644 (file)
@@ -548,14 +548,25 @@ class CephadmServe:
         specs = []  # type: List[ServiceSpec]
         for sn, spec in self.mgr.spec_store.active_specs.items():
             specs.append(spec)
+        for name in ['CEPHADM_APPLY_SPEC_FAIL', 'CEPHADM_DAEMON_PLACE_FAIL']:
+            self.mgr.remove_health_warning(name)
+        self.mgr.apply_spec_fails = []
         for spec in specs:
             try:
                 if self._apply_service(spec):
                     r = True
             except Exception as e:
-                self.log.exception('Failed to apply %s spec %s: %s' % (
-                    spec.service_name(), spec, e))
+                msg = f'Failed to apply {spec.service_name()} spec {spec}: {str(e)}'
+                self.log.exception(msg)
                 self.mgr.events.for_service(spec, 'ERROR', 'Failed to apply: ' + str(e))
+                self.mgr.apply_spec_fails.append((spec.service_name(), str(e)))
+                warnings = []
+                for x in self.mgr.apply_spec_fails:
+                    warnings.append(f'{x[0]}: {x[1]}')
+                self.mgr.set_health_warning('CEPHADM_APPLY_SPEC_FAIL',
+                                            f"Failed to apply {len(self.mgr.apply_spec_fails)} service(s): {','.join(x[0] for x in self.mgr.apply_spec_fails)}",
+                                            len(self.mgr.apply_spec_fails),
+                                            warnings)
 
         return r
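
The hunk above clears both warnings at the start of every apply pass and rebuilds CEPHADM_APPLY_SPEC_FAIL from the (service name, error) tuples accumulated in mgr.apply_spec_fails, so the warning always reflects the failures of the current pass only. A minimal standalone sketch of that pattern, assuming a FakeMgr stand-in rather than the real MgrModule helpers:

from typing import Dict, List, Tuple

class FakeMgr:
    # Illustrative stand-in for the mgr module; only the pieces the sketch
    # needs are modelled here.
    def __init__(self) -> None:
        self.apply_spec_fails: List[Tuple[str, str]] = []
        self.health_checks: Dict[str, dict] = {}

    def remove_health_warning(self, name: str) -> None:
        self.health_checks.pop(name, None)

    def set_health_warning(self, name: str, summary: str, count: int,
                           detail: List[str]) -> None:
        self.health_checks[name] = {'summary': summary, 'count': count,
                                    'detail': detail}

def record_spec_failure(mgr: FakeMgr, service_name: str, err: Exception) -> None:
    # Append the new failure, then rebuild the warning from everything seen
    # so far in this pass, as the hunk above does on each exception.
    mgr.apply_spec_fails.append((service_name, str(err)))
    detail = [f'{svc}: {msg}' for svc, msg in mgr.apply_spec_fails]
    mgr.set_health_warning(
        'CEPHADM_APPLY_SPEC_FAIL',
        f"Failed to apply {len(mgr.apply_spec_fails)} service(s): "
        f"{','.join(svc for svc, _ in mgr.apply_spec_fails)}",
        len(mgr.apply_spec_fails),
        detail)

mgr = FakeMgr()
# Each pass starts with a clean slate, mirroring the loop above.
for name in ['CEPHADM_APPLY_SPEC_FAIL', 'CEPHADM_DAEMON_PLACE_FAIL']:
    mgr.remove_health_warning(name)
mgr.apply_spec_fails = []
record_spec_failure(mgr, 'mgr', RuntimeError('fail'))
assert mgr.health_checks['CEPHADM_APPLY_SPEC_FAIL']['count'] == 1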
 
@@ -670,9 +681,17 @@ class CephadmServe:
                 'status', '').lower() not in ['maintenance', 'offline'] and d.hostname not in self.mgr.offline_hosts)]
             self.log.debug('Add %s, remove %s' % (slots_to_add, daemons_to_remove))
         except OrchestratorError as e:
-            self.log.error('Failed to apply %s spec %s: %s' % (
-                spec.service_name(), spec, e))
+            msg = f'Failed to apply {spec.service_name()} spec {spec}: {str(e)}'
+            self.log.error(msg)
             self.mgr.events.for_service(spec, 'ERROR', 'Failed to apply: ' + str(e))
+            self.mgr.apply_spec_fails.append((spec.service_name(), str(e)))
+            warnings = []
+            for x in self.mgr.apply_spec_fails:
+                warnings.append(f'{x[0]}: {x[1]}')
+            self.mgr.set_health_warning('CEPHADM_APPLY_SPEC_FAIL',
+                                        f"Failed to apply {len(self.mgr.apply_spec_fails)} service(s): {','.join(x[0] for x in self.mgr.apply_spec_fails)}",
+                                        len(self.mgr.apply_spec_fails),
+                                        warnings)
             return False
 
         r = None
@@ -746,6 +765,7 @@ class CephadmServe:
                 svc.fence_old_ranks(spec, rank_map, len(all_slots))
 
             # create daemons
+            daemon_place_fails = []
             for slot in slots_to_add:
                 # first remove daemon on conflicting port?
                 if slot.ports:
@@ -794,6 +814,7 @@ class CephadmServe:
                            f"on {slot.hostname}: {e}")
                     self.mgr.events.for_service(spec, 'ERROR', msg)
                     self.mgr.log.error(msg)
+                    daemon_place_fails.append(msg)
                     # only return "no change" if no one else has already succeeded.
                     # later successes will also change to True
                     if r is None:
@@ -810,6 +831,9 @@ class CephadmServe:
                 )
                 daemons.append(sd)
 
+            if daemon_place_fails:
+                self.mgr.set_health_warning('CEPHADM_DAEMON_PLACE_FAIL', f'Failed to place {len(daemon_place_fails)} daemon(s)', len(daemon_place_fails), daemon_place_fails)
+
             # remove any?
             def _ok_to_stop(remove_daemons: List[orchestrator.DaemonDescription]) -> bool:
                 daemon_ids = [d.daemon_id for d in remove_daemons]
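
Daemon placement failures, by contrast, are collected in a list local to _apply_service and turned into a single CEPHADM_DAEMON_PLACE_FAIL warning once the placement loop finishes. A rough sketch of that shape, assuming hypothetical Slot and place_one_daemon stand-ins for the real slot objects and create-daemon call:

from typing import List, NamedTuple, Optional, Tuple

class Slot(NamedTuple):
    daemon_type: str
    daemon_id: str
    hostname: str

def place_one_daemon(slot: Slot) -> None:
    # Hypothetical stand-in for the real daemon creation; it always fails
    # here so the sketch exercises the warning path.
    raise RuntimeError('fail')

def place_all(slots: List[Slot]) -> Optional[Tuple[str, str, int, List[str]]]:
    daemon_place_fails: List[str] = []
    for slot in slots:
        try:
            place_one_daemon(slot)
        except Exception as e:
            daemon_place_fails.append(
                f'Failed while placing {slot.daemon_type}.{slot.daemon_id} '
                f'on {slot.hostname}: {e}')
    if not daemon_place_fails:
        return None
    # arguments for one set_health_warning('CEPHADM_DAEMON_PLACE_FAIL', ...) call
    return ('CEPHADM_DAEMON_PLACE_FAIL',
            f'Failed to place {len(daemon_place_fails)} daemon(s)',
            len(daemon_place_fails),
            daemon_place_fails)

warning = place_all([Slot('mgr', 'a', 'test')])
assert warning is not None and warning[2] == 1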
index a83042cf49f22085a00256adbfb4a372d718932a..85bc87a9dab1d9edf3c24d8cc13e9196f5251afc 100644 (file)
@@ -858,6 +858,32 @@ class TestCephadm(object):
                             'entity': entity,
                         })
 
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+    def test_daemon_place_fail_health_warning(self, _run_cephadm, cephadm_module):
+        _run_cephadm.return_value = ('{}', '', 0)
+        with with_host(cephadm_module, 'test'):
+            _run_cephadm.side_effect = OrchestratorError('fail')
+            ps = PlacementSpec(hosts=['test:0.0.0.0=a'], count=1)
+            r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps))
+            assert not r
+            assert cephadm_module.health_checks.get('CEPHADM_DAEMON_PLACE_FAIL') is not None
+            assert cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['count'] == 1
+            assert 'Failed to place 1 daemon(s)' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['summary']
+            assert 'Failed while placing mgr.a on test: fail' in cephadm_module.health_checks['CEPHADM_DAEMON_PLACE_FAIL']['detail']
+
+    @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
+    def test_apply_spec_fail_health_warning(self, _run_cephadm, cephadm_module: CephadmOrchestrator):
+        _run_cephadm.return_value = ('{}', '', 0)
+        with with_host(cephadm_module, 'test'):
+            CephadmServe(cephadm_module)._apply_all_services()
+            ps = PlacementSpec(hosts=['fail'], count=1)
+            r = CephadmServe(cephadm_module)._apply_service(ServiceSpec('mgr', placement=ps))
+            assert not r
+            assert cephadm_module.apply_spec_fails
+            assert cephadm_module.health_checks.get('CEPHADM_APPLY_SPEC_FAIL') is not None
+            assert cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['count'] == 1
+            assert 'Failed to apply 1 service(s)' in cephadm_module.health_checks['CEPHADM_APPLY_SPEC_FAIL']['summary']
+
     @mock.patch("cephadm.module.CephadmOrchestrator.get_foreign_ceph_option")
     @mock.patch("cephadm.serve.CephadmServe._run_cephadm")
     def test_invalid_config_option_health_warning(self, _run_cephadm, get_foreign_ceph_option, cephadm_module: CephadmOrchestrator):