mgr/crash: raise RECENT_CRASH warning for recent (new) crashes

author Sage Weil <sage@redhat.com>

Sun, 14 Jul 2019 16:31:26 +0000 (11:31 -0500)

committer Sage Weil <sage@redhat.com>

Fri, 19 Jul 2019 14:43:04 +0000 (09:43 -0500)
author Sage Weil <sage@redhat.com>
Sun, 14 Jul 2019 16:31:26 +0000 (11:31 -0500)
committer Sage Weil <sage@redhat.com>
Fri, 19 Jul 2019 14:43:04 +0000 (09:43 -0500)
diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst

index 88ca193d8ad9ac955b6ae77c912c420d2d438d0e..e83cd479b6669bf82ab4f1f56ee31e302a7a2d9a 100644 (file)
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -857,3 +857,43 @@ happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
  You can manually initiate a scrub of a clean PG with::
  
    ceph pg deep-scrub <pgid>
+
+
+Miscellaneous
+-------------
+
+RECENT_CRASH
+____________
+
+One or more Ceph daemons have crashed recently, and the crash has not
+yet been archived (acknowledged) by the administrator.  This may
+indicate a software bug, a hardware problem (e.g., a failing disk), or
+some other problem.
+
+New crashes can be listed with::
+
+  ceph crash ls-new
+
+Information about a specific crash can be examined with::
+
+  ceph crash info <crash-id>
+
+This warning can be silenced by "archiving" the crash (perhaps after
+an administrator has examined it) so that it no longer generates this
+warning::
+
+  ceph crash archive <crash-id>
+
+Similarly, all new crashes can be archived with::
+
+  ceph crash archive-all
+
+Archived crashes will still be visible via ``ceph crash ls`` but not
+``ceph crash ls-new``.
+
+The time period for what "recent" means is controlled by the option
+``mgr/crash/warn_recent_interval`` (default: two weeks).
+
+These warnings can be disabled entirely with::
+
+  ceph config set mgr mgr/crash/warn_recent_interval 0
diff --git a/src/pybind/mgr/crash/module.py b/src/pybind/mgr/crash/module.py

index dec4f50a55705ebedde8d2046b3d5fe275df3980..133cd5d417bef057006f1565d9a191bc8d89e0d6 100644 (file)
--- a/src/pybind/mgr/crash/module.py
+++ b/src/pybind/mgr/crash/module.py
@@ -11,9 +11,18 @@ from threading import Event
  DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
  OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
  
+MAX_WAIT = 600
+MIN_WAIT = 60
  
  class Module(MgrModule):
      MODULE_OPTIONS = [
+        {
+            'name': 'warn_recent_interval',
+            'type': 'secs',
+            'default': 60*60*24*14,
+            'desc': 'time interval in which to warn about recent crashes',
+            'runtime': True,
+        },
      ]
  
      def __init__(self, *args, **kwargs):
@@ -29,7 +38,9 @@ class Module(MgrModule):
      def serve(self):
          self.config_notify()
          while self.run:
-            self.event.wait(self.warn_recent_interval / 100)
+            self._refresh_health_checks()
+            wait = min(MAX_WAIT, max(self.warn_recent_interval / 100, MIN_WAIT))
+            self.event.wait(wait)
              self.event.clear()
  
      def config_notify(self):
@@ -44,6 +55,35 @@ class Module(MgrModule):
          raw = self.get_store_prefix('crash/')
          self.crashes = {k[6:]: json.loads(m) for (k, m) in raw.items()}
  
+    def _refresh_health_checks(self):
+        if not self.crashes:
+            self._load_crashes()
+        cutoff = datetime.datetime.utcnow() - datetime.timedelta(
+            seconds=self.warn_recent_interval)
+        recent = {
+            crashid: crash for crashid, crash in self.crashes.items()
+            if self.time_from_string(crash['timestamp']) > cutoff and 'archived' not in crash
+        }
+        num = len(recent)
+        health_checks = {}
+        if recent:
+            detail = [
+                '%s crashed on host %s at %s' % (
+                    crash.get('entity_name', 'unidentified daemon'),
+                    crash.get('utsname_hostname', '(unknown)'),
+                    crash.get('timestamp', 'unknown time'))
+                    for (_, crash) in recent.items()]
+            if num > 30:
+                detail = detail[0:30]
+                detail.append('and %d more' % (num - 30))
+            self.log.debug('detail %s' % detail)
+            health_checks['RECENT_CRASH'] = {
+                'severity': 'warning',
+                'summary': '%d daemons have recently crashed' % (num),
+                'detail': detail,
+            }
+        self.set_health_checks(health_checks)
+
      def handle_command(self, inbuf, command):
          if not self.crashes:
              self._load_crashes()
@@ -137,6 +177,7 @@ class Module(MgrModule):
              del self.crashes[crashid]
              key = 'crash/%s' % crashid
              self.set_store(key, None)       # removes key
+            self._refresh_health_checks()
          return 0, '', ''
  
      def do_prune(self, cmd, inbuf):
@@ -159,6 +200,9 @@ class Module(MgrModule):
              del self.crashes[crashid]
              key = 'crash/%s' % crashid
              self.set_store(key, None)
+            removed_any = True
+        if removed_any:
+            self._refresh_health_checks()
  
      def do_archive(self, cmd, inbuf):
          crashid = cmd['id']
@@ -170,6 +214,7 @@ class Module(MgrModule):
              self.crashes[crashid] = crash
              key = 'crash/%s' % crashid
              self.set_store(key, json.dumps(crash))
+            self._refresh_health_checks()
          return 0, '', ''
  
      def do_archive_all(self, cmd, inbuf):
@@ -179,6 +224,7 @@ class Module(MgrModule):
                  self.crashes[crashid] = crash
                  key = 'crash/%s' % crashid
                  self.set_store(key, json.dumps(crash))
+        self._refresh_health_checks()
          return 0, '', ''
  
      def do_stat(self, cmd, inbuf):
author	Sage Weil <sage@redhat.com>
	Sun, 14 Jul 2019 16:31:26 +0000 (11:31 -0500)
committer	Sage Weil <sage@redhat.com>
	Fri, 19 Jul 2019 14:43:04 +0000 (09:43 -0500)
doc/rados/operations/health-checks.rst		patch \| blob \| history
src/pybind/mgr/crash/module.py		patch \| blob \| history