roles:
- [mgr.x, mon.a, mon.c, mds.a, mds.c, osd.0, client.0]
-- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1]
+- [mgr.y, mgr.z, mon.b, mds.b, osd.1, osd.2, client.1]
log-rotate:
ceph-mds: 10G
ceph-osd: 10G
--- /dev/null
+
+tasks:
+ - install:
+ - ceph:
+ # tests may leave mgrs broken, so don't try to call into them
+ # to invoke e.g. pg dump during teardown.
+ wait-for-scrub: false
+ log-whitelist:
+ - overall HEALTH_
+ - \(MGR_DOWN\)
+ - \(PG_
+ - replacing it with standby
+ - No standby daemons available
+ - cephfs_test_runner:
+ modules:
+ - tasks.mgr.test_dashboard
- \(PG_
- replacing it with standby
- No standby daemons available
+ - Reduced data availability
+ - Degraded data redundancy
+ - objects misplaced
- cephfs_test_runner:
modules:
- tasks.mgr.test_module_selftest
from unittest import case
import json
+import logging
from teuthology import misc
from tasks.ceph_test_case import CephTestCase
-# TODO move definition of CephCluster
+# TODO move definition of CephCluster away from the CephFS stuff
from tasks.cephfs.filesystem import CephCluster
+log = logging.getLogger(__name__)
+
+
class MgrCluster(CephCluster):
def __init__(self, ctx):
super(MgrCluster, self).__init__(ctx)
def get_standby_ids(self):
return [s['name'] for s in self.get_mgr_map()["standbys"]]
+ def set_module_localized_conf(self, module, mgr_id, key, val):
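+ """
+ Store a per-daemon module setting as a config-key of the form
+ mgr/<module>/<mgr_id>/<key>, e.g. mgr/dashboard/x/server_port.
+ """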
+ self.mon_manager.raw_cluster_cmd("config-key", "set",
+ "mgr/{0}/{1}/{2}".format(
+ module, mgr_id, key
+ ), val)
+
class MgrTestCase(CephTestCase):
MGRS_REQUIRED = 1
self.wait_until_true(
lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
timeout=20)
+
+ def _load_module(self, module_name):
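+ """
+ Enable a mgr module (if it is not already enabled) and wait for the
+ active mgr to restart with the module loaded.
+ """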
+ loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "ls"))
+ if module_name in loaded:
+ # The enable command is idempotent, but our wait for a restart
+ # isn't, so let's return now if it's already loaded
+ return
+
+ initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
+ module_name)
+
+ # Wait for the module to load
+ def has_restarted():
+ mgr_map = self.mgr_cluster.get_mgr_map()
+ done = mgr_map['active_gid'] != initial_gid and mgr_map['available']
+ if done:
+ log.info("Restarted after module load (new active {0}/{1})".format(
+ mgr_map['active_name'], mgr_map['active_gid']))
+ return done
+ self.wait_until_true(has_restarted, timeout=30)
+
+
+ def _get_uri(self, service_name):
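+ """
+ Poll the mgr map until the active mgr publishes a URI for
+ `service_name` (e.g. "dashboard"), then return it.
+ """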
+ # Little dict hack so that the _get_or_none closure below can assign
+ # into it (Python 2 has no nonlocal)
+ mgr_map = {'x': None}
+
+ def _get_or_none():
+ mgr_map['x'] = self.mgr_cluster.get_mgr_map()
+ result = mgr_map['x']['services'].get(service_name, None)
+ return result
+
+ self.wait_until_true(lambda: _get_or_none() is not None, timeout=30)
+
+ uri = mgr_map['x']['services'][service_name]
+
+ log.info("Found {0} at {1} (daemon {2}/{3})".format(
+ service_name, uri, mgr_map['x']['active_name'],
+ mgr_map['x']['active_gid']))
+
+ return uri
+
+
+ def _assign_ports(self, module_name, config_name, min_port=7789):
+ """
+ To avoid the need to run lots of hosts in teuthology tests to
+ get different URLs per mgr, we will hand out different ports
+ to each mgr here.
+
+ This is already taken care of for us when running in a vstart
+ environment.
+ """
+ # Start handing out ports well above Ceph's range.
+ assign_port = min_port
+
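+ # Take all the mgrs down first; each one is given its own port below
+ # and they are all restarted once the config-keys are set.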
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_stop(mgr_id)
+ self.mgr_cluster.mgr_fail(mgr_id)
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ log.info("Using port {0} for {1} on mgr.{2}".format(
+ assign_port, module_name, mgr_id
+ ))
+ self.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
+ config_name,
+ str(assign_port))
+ assign_port += 1
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_restart(mgr_id)
+
+ def is_available():
+ mgr_map = self.mgr_cluster.get_mgr_map()
+ done = mgr_map['available']
+ if done:
+ log.info("Available after assign ports (new active {0}/{1})".format(
+ mgr_map['active_name'], mgr_map['active_gid']))
+ return done
+ self.wait_until_true(is_available, timeout=30)
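+
+ # Typical usage from a test case (see TestDashboard below):
+ #   self._assign_ports("dashboard", "server_port")
+ #   self._load_module("dashboard")
+ #   uri = self._get_uri("dashboard")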
--- /dev/null
+
+
+import logging
+import requests
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDashboard(MgrTestCase):
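+ # test_standby needs standby daemons to fail over to, so require
+ # multiple mgrs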
+ MGRS_REQUIRED = 3
+
+ def test_standby(self):
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ original_active = self.mgr_cluster.get_active_id()
+
+ original_uri = self._get_uri("dashboard")
+ log.info("Originally running at {0}".format(original_uri))
+
+ self.mgr_cluster.mgr_fail(original_active)
+
+ failed_over_uri = self._get_uri("dashboard")
+ log.info("After failover running at {0}".format(original_uri))
+
+ self.assertNotEqual(original_uri, failed_over_uri)
+
+ # The original active daemon should have come back up as a standby
+ # and be doing redirects to the new active daemon
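+ # (a 303 "See Other" pointing at the new active daemon's URI)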
+ r = requests.get(original_uri, allow_redirects=False)
+ self.assertEqual(r.status_code, 303)
+ self.assertEqual(r.headers['Location'], failed_over_uri)
+
+ def test_urls(self):
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ base_uri = self._get_uri("dashboard")
+
+ # This is a very simple smoke test to check that the dashboard can
+ # give us a 200 response to requests. We're not testing that
+ # the content is correct or even renders!
+
+ urls = [
+ "/health",
+ "/servers",
+ "/osd/",
+ "/osd/perf/0",
+ "/rbd_mirroring",
+ "/rbd_iscsi"
+ ]
+
+ failures = []
+
+ for url in urls:
+ r = requests.get(base_uri + url, allow_redirects=False)
+ if 300 <= r.status_code < 400:
+ log.error("Unexpected redirect to: {0} (from {1})".format(
+ r.headers['Location'], base_uri))
+ if r.status_code != 200:
+ failures.append(url)
+
+ log.info("{0}: {1} ({2} bytes)".format(
+ url, r.status_code, len(r.content)
+ ))
+
+ self.assertListEqual(failures, [])
+import time
+import requests
+
from tasks.mgr.mgr_test_case import MgrTestCase
"""
MGRS_REQUIRED = 1
- def _selftest_plugin(self, plugin_name):
- initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
- self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
- plugin_name)
-
- # Wait for the module to load
- def has_restarted():
- map = self.mgr_cluster.get_mgr_map()
- return map['active_gid'] != initial_gid and map['available']
- self.wait_until_true(has_restarted, timeout=30)
+ def _selftest_plugin(self, module_name):
+ self._load_module(module_name)
# Execute the module's self-test routine
- self.mgr_cluster.mon_manager.raw_cluster_cmd(plugin_name, "self-test")
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test")
def test_zabbix(self):
self._selftest_plugin("zabbix")
def test_influx(self):
self._selftest_plugin("influx")
+
+ def test_selftest_run(self):
+ self._load_module("selftest")
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run")
+
+ def test_selftest_command_spam(self):
+ # Use the selftest module to stress the mgr daemon
+ self._load_module("selftest")
+
+ # Use the dashboard to test that the mgr is still able to do its job
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+ "background", "start",
+ "command_spam")
+
+ dashboard_uri = self._get_uri("dashboard")
+
+ delay = 10
+ periods = 10
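+ # check responsiveness roughly every `delay` seconds for `periods`
+ # iterations (~100 seconds in total)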
+ for i in range(0, periods):
+ t1 = time.time()
+ # Check that an HTTP module remains responsive
+ r = requests.get(dashboard_uri)
+ self.assertEqual(r.status_code, 200)
+
+ # Check that a native non-module command remains responsive
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df")
+
+ # don't pass time.sleep() a negative value if the checks above took
+ # longer than `delay`
+ time.sleep(max(0, delay - (time.time() - t1)))
+
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+ "background", "stop")
+
+ # Check that all mgr daemons are still running
+ self.assertEqual(original_active, self.mgr_cluster.get_active_id())
+ self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids())