git.apps.os.sepia.ceph.com Git - ceph.git/commitdiff
qa/tasks/mgr: add tests for insights module
authorNoah Watkins <nwatkins@redhat.com>
Thu, 16 Aug 2018 18:24:01 +0000 (11:24 -0700)
committerNoah Watkins <nwatkins@redhat.com>
Thu, 16 Aug 2018 22:29:46 +0000 (15:29 -0700)
Signed-off-by: Noah Watkins <nwatkins@redhat.com>
qa/suites/rados/mgr/tasks/insights.yaml [new file with mode: 0644]
qa/tasks/mgr/test_insights.py [new file with mode: 0644]

diff --git a/qa/suites/rados/mgr/tasks/insights.yaml b/qa/suites/rados/mgr/tasks/insights.yaml
new file mode 100644 (file)
index 0000000..5b262eb
--- /dev/null
@@ -0,0 +1,16 @@
+
+tasks:
+  - install:
+  - ceph:
+      # tests may leave mgrs broken, so don't try and call into them
+      # to invoke e.g. pg dump during teardown.
+      wait-for-scrub: false
+      log-whitelist:
+        - overall HEALTH_
+        - \(MGR_DOWN\)
+        - \(PG_
+        - replacing it with standby
+        - No standby daemons available
+  - cephfs_test_runner:
+      modules:
+        - tasks.mgr.test_insights
diff --git a/qa/tasks/mgr/test_insights.py b/qa/tasks/mgr/test_insights.py
new file mode 100644 (file)
index 0000000..48f34dc
--- /dev/null
@@ -0,0 +1,224 @@
+import logging
+import json
+import datetime
+import time
+from mgr_test_case import MgrTestCase
+
log = logging.getLogger(__name__)

# Fixed UUID appended to generated crash ids so the ids are deterministic
# and easy to recognize in cluster state left behind by a failed test.
UUID = 'd5775432-0742-44a3-a435-45095e32e6b2'
# Timestamp format embedded in crash ids (a trailing 'Z' is appended when
# the id is built in _add_crash).
DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
+
class TestInsights(MgrTestCase):
    """Tests for the mgr ``insights`` module.

    Covers the shape of the insights report, health-check history
    (accumulation, deduplication, persistence across a mgr restart, and
    pruning) and integration with the ``crash`` module.
    """

    def setUp(self):
        self.setup_mgrs()
        self._load_module("insights")
        self._load_module("crash")
        self._load_module("selftest")
        # ids of crashes posted via _add_crash; removed again in tearDown
        self.crash_ids = []

    def tearDown(self):
        self._clear_crashes()

    def _insights(self):
        """Run ``ceph insights`` and return the parsed JSON report."""
        retstr = self.mgr_cluster.mon_manager.raw_cluster_cmd("insights")
        return json.loads(retstr)

    def _add_crash(self, hours, make_invalid=False):
        """Post a synthetic crash dated ``hours`` in the past.

        With ``make_invalid`` the crash carries an unparseable timestamp,
        which downstream consumers should surface as an error rather than
        a crash summary entry.
        """
        now = datetime.datetime.utcnow()
        timestamp = now - datetime.timedelta(hours=hours)
        timestamp = timestamp.strftime(DATEFMT) + 'Z'
        crash_id = '_'.join((timestamp, UUID)).replace(' ', '_')
        crash = {
            'crash_id': crash_id,
            'timestamp': timestamp,
        }
        if make_invalid:
            crash["timestamp"] = "not a timestamp"

        ret = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            'crash', 'post', '-i', '-',
            stdin=json.dumps(crash)
        )
        # record the id before asserting so tearDown still cleans up even
        # if the post unexpectedly failed
        self.crash_ids.append(crash_id)
        self.assertEqual(0, ret)

    def _clear_crashes(self):
        """Best-effort removal of every crash posted by _add_crash."""
        for crash_id in self.crash_ids:
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                'crash', 'rm', crash_id
            )

    def _wait_for_health_history_checks(self, *args):
        """Wait for a set of health checks to appear in the health history"""
        timeout = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=15)
        while True:
            history = self._insights()["health"]["history"]["checks"]
            if all(check in history for check in args):
                return
            self.assertGreater(timeout, datetime.datetime.utcnow())
            time.sleep(0.25)

    def _wait_for_curr_health_cleared(self, check):
        """Wait for ``check`` to leave the current health checks."""
        timeout = datetime.datetime.utcnow() + \
            datetime.timedelta(seconds=15)
        while True:
            report = self._insights()
            if check not in report["health"]["current"]["checks"]:
                return
            self.assertGreater(timeout, datetime.datetime.utcnow())
            time.sleep(0.25)

    def test_health_history(self):
        """Health checks accumulate in the history, survive a mgr
        restart, and are removed by prune-health."""
        # use empty health history as starting point
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

        # generate health check history entries by shifting the insights
        # module's perception of "now" into the past.
        # NOTE(review): the original computed a "now" rounded to half past
        # the hour here (to dodge top-of-the-hour edge cases) but never
        # used the value; that dead code has been dropped.
        check_names = set()
        for hours in [-18, -11, -5, -1, 0]:
            # change the insight module's perception of "now" ...
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "insights_set_now_offset", str(hours))

            # ... to simulate health check arrivals in the past
            unique_check_name = "insights_health_check_{}".format(hours)
            health_check = {
                unique_check_name: {
                    "severity": "warning",
                    "summary": "summary",
                    "detail": ["detail"]
                }
            }
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "set",
                json.dumps(health_check))

            check_names.add(unique_check_name)

            # also raise a check whose name repeats on every iteration to
            # test deduplication in the history (the original applied a
            # no-op .format(hours) here; removed)
            dupe_check_name = "insights_health_check"
            health_check = {
                dupe_check_name: {
                    "severity": "warning",
                    "summary": "summary",
                    "detail": ["detail"]
                }
            }
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "set",
                json.dumps(health_check))

            check_names.add(dupe_check_name)

            # wait for the health check to show up in the history report
            self._wait_for_health_history_checks(unique_check_name,
                                                 dupe_check_name)

            # clear out the current health checks before moving on
            self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
                "mgr", "self-test", "health", "clear")
            self._wait_for_curr_health_cleared(unique_check_name)

        report = self._insights()
        for check in check_names:
            self.assertIn(check, report["health"]["history"]["checks"])

        # restart the manager
        active_id = self.mgr_cluster.get_active_id()
        self.mgr_cluster.mgr_restart(active_id)

        # ensure that at least one of the checks is present after the
        # restart. we don't wait for all of them to be present because
        # "earlier" checks may not have sat in memory long enough to be
        # flushed.
        report = self._insights()
        self.assertTrue(any(
            check in report["health"]["history"]["checks"]
            for check in check_names))

        # pruning really removes history
        self.mgr_cluster.mon_manager.raw_cluster_cmd_result(
            "insights", "prune-health", "0")
        report = self._insights()
        self.assertFalse(report["health"]["history"]["checks"])

    def test_insights_health(self):
        """The insights module reports health checks"""
        self._add_crash(1, True)  # add invalid crash data
        try:
            # should observe a health check because the module can't read
            # the invalid crash data created above
            for _ in range(10):
                time.sleep(1)
                report = self._insights()
                if "MGR_INSIGHTS_WARNING" in \
                        report["health"]["current"]["checks"]:
                    return
            self.fail("Insights module did not set health check")
        finally:
            # remove the invalid crash on every exit path (the original
            # duplicated the cleanup call before return and fail, and
            # would have leaked it on an unexpected exception)
            self._clear_crashes()

    def test_schema(self):
        """All top-level report sections are present.

        TODO: assert conformance to a full schema specification?
        """
        report = self._insights()
        for key in ["osd_metadata",
                    "pg_summary",
                    "mon_status",
                    "manager_map",
                    "service_map",
                    "mon_map",
                    "crush_map",
                    "fs_map",
                    "osd_tree",
                    "df",
                    "osd_dump",
                    "config",
                    "health",
                    "crashes",
                    "version",
                    "errors"]:
            self.assertIn(key, report)

    def test_crash_history(self):
        """Valid crashes populate the summary; invalid crash data is
        surfaced through the errors section instead."""
        self._clear_crashes()
        report = self._insights()
        self.assertFalse(report["crashes"]["summary"])
        self.assertFalse(report["errors"])

        # crashes show up in the report
        self._add_crash(1)
        report = self._insights()
        self.assertTrue(report["crashes"]["summary"])
        self.assertFalse(report["errors"])
        log.warning("{}".format(json.dumps(report["crashes"], indent=2)))

        # handling of comm. error with crash module
        self._add_crash(1, True)
        report = self._insights()
        self.assertFalse(report["crashes"]["summary"])
        self.assertTrue(report["errors"])

        self._clear_crashes()