mgr/crash: separate RECENT_MGR_MODULE_CRASH error for mgr module crashes

author Sage Weil <sage@newdream.net>

Sat, 19 Jun 2021 16:21:47 +0000 (12:21 -0400)

committer Sage Weil <sage@newdream.net>

Wed, 23 Jun 2021 17:00:49 +0000 (13:00 -0400)
author Sage Weil <sage@newdream.net>
Sat, 19 Jun 2021 16:21:47 +0000 (12:21 -0400)
committer Sage Weil <sage@newdream.net>
Wed, 23 Jun 2021 17:00:49 +0000 (13:00 -0400)
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst

index f1f80c7fc53e9f17b3c49cc508299f2bb35d4a08..5b8f759b1134f65a84dc6785b6ea450b5af0d6f0 100644 (file)
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -1227,6 +1227,40 @@ Archived crashes will still be visible via ``ceph crash ls`` but not
  The time period for what "recent" means is controlled by the option
  ``mgr/crash/warn_recent_interval`` (default: two weeks).
  
+These warnings can be disabled entirely with::
+
+  ceph config set mgr/crash/warn_recent_interval 0
+
+RECENT_MGR_MODULE_CRASH
+_______________________
+
+One or more ceph-mgr modules have crashed recently, and the crash has
+not yet been archived (acknowledged) by the administrator.  This
+generally indicates a software bug in one of the software modules run
+inside the ceph-mgr daemon.  Although the module that experienced the
+problem may be disabled as a result, the function of other modules
+is normally unaffected.
+
+As with the *RECENT_CRASH* health alert, the crash can be inspected with::
+
+    ceph crash info <crash-id>
+
+This warning can be silenced by "archiving" the crash (perhaps after
+being examined by an administrator) so that it does not generate this
+warning::
+
+  ceph crash archive <crash-id>
+
+Similarly, all new crashes can be archived with::
+
+  ceph crash archive-all
+
+Archived crashes will still be visible via ``ceph crash ls`` but not
+``ceph crash ls-new``.
+
+The time period for what "recent" means is controlled by the option
+``mgr/crash/warn_recent_interval`` (default: two weeks).
+
  These warnings can be disabled entirely with::
  
    ceph config set mgr/crash/warn_recent_interval 0
diff --git a/src/pybind/mgr/crash/module.py b/src/pybind/mgr/crash/module.py

index 1cbe2026fb9365ac37c9eda8fde1f9100a91ed94..739555e627042ed3d4af8e027d33512cbd8b7036 100644 (file)
--- a/src/pybind/mgr/crash/module.py
+++ b/src/pybind/mgr/crash/module.py
@@ -100,25 +100,55 @@ class Module(MgrModule):
              if (self.time_from_string(cast(str, crash['timestamp'])) > cutoff
                  and 'archived' not in crash)
          }
-        num = len(recent)
-        health_checks: Dict[str, Dict[str, Union[int, str, List[str]]]] = {}
-        if recent:
-            detail = [
-                '%s crashed on host %s at %s' % (
-                    crash.get('entity_name', 'unidentified daemon'),
-                    crash.get('utsname_hostname', '(unknown)'),
-                    crash.get('timestamp', 'unknown time'))
-                for crash in recent.values()]
+
+        def prune_detail(ls: List[str]) -> int:
+            num = len(ls)
              if num > 30:
-                detail = detail[0:30]
-                detail.append('and %d more' % (num - 30))
-            self.log.debug('detail %s' % detail)
+                ls = ls[0:30]
+                ls.append('and %d more' % (num - 30))
+            return num
+
+        daemon_crashes = []
+        module_crashes = []
+        for c in recent.values():
+            if 'mgr_module' in c:
+                module_crashes.append(c)
+            else:
+                daemon_crashes.append(c)
+        daemon_detail = [
+            '%s crashed on host %s at %s' % (
+                crash.get('entity_name', 'unidentified daemon'),
+                crash.get('utsname_hostname', '(unknown)'),
+                crash.get('timestamp', 'unknown time'))
+            for crash in daemon_crashes]
+        module_detail = [
+            'mgr module %s crashed in daemon %s on host %s at %s' % (
+                crash.get('mgr_module', 'unidentified module'),
+                crash.get('entity_name', 'unidentified daemon'),
+                crash.get('utsname_hostname', '(unknown)'),
+                crash.get('timestamp', 'unknown time'))
+            for crash in module_crashes]
+        daemon_num = prune_detail(daemon_detail)
+        module_num = prune_detail(module_detail)
+
+        health_checks: Dict[str, Dict[str, Union[int, str, List[str]]]] = {}
+        if daemon_detail:
+            self.log.debug('daemon detail %s' % daemon_detail)
              health_checks['RECENT_CRASH'] = {
                  'severity': 'warning',
-                'summary': '%d daemons have recently crashed' % (num),
-                'count': num,
-                'detail': detail,
+                'summary': '%d daemons have recently crashed' % (daemon_num),
+                'count': daemon_num,
+                'detail': daemon_detail,
              }
+        if module_detail:
+            self.log.debug('module detail %s' % module_detail)
+            health_checks['RECENT_MGR_MODULE_CRASH'] = {
+                'severity': 'warning',
+                'summary': '%d mgr modules have recently crashed' % (module_num),
+                'count': module_num,
+                'detail': module_detail,
+            }
+
          self.set_health_checks(health_checks)
  
      def time_from_string(self, timestr: str) -> datetime.datetime:
author	Sage Weil <sage@newdream.net>
	Sat, 19 Jun 2021 16:21:47 +0000 (12:21 -0400)
committer	Sage Weil <sage@newdream.net>
	Wed, 23 Jun 2021 17:00:49 +0000 (13:00 -0400)
doc/rados/operations/health-checks.rst		patch \| blob \| history
src/pybind/mgr/crash/module.py		patch \| blob \| history