From 05e648be6a51d3efa110ad9826bbdd0adcc4dd4d Mon Sep 17 00:00:00 2001
From: John Spray
Date: Thu, 19 Oct 2017 07:50:19 -0400
Subject: [PATCH] qa: expand mgr testing

Some extra coverage of the dashboard, including its standby
redirect mode and the publishing of URIs.

Also invoking the command_spam mode of the selftest module.

Signed-off-by: John Spray
---
 qa/suites/rados/mgr/clusters/2-node-mgr.yaml   |  2 +-
 qa/suites/rados/mgr/tasks/dashboard.yaml       | 16 ++++
 .../rados/mgr/tasks/module_selftest.yaml       |  3 +
 qa/tasks/mgr/mgr_test_case.py                  | 92 ++++++++++++++++++-
 qa/tasks/mgr/test_dashboard.py                 | 70 ++++++++++++++
 qa/tasks/mgr/test_module_selftest.py           | 58 +++++++++---
 6 files changed, 228 insertions(+), 13 deletions(-)
 create mode 100644 qa/suites/rados/mgr/tasks/dashboard.yaml
 create mode 100644 qa/tasks/mgr/test_dashboard.py

diff --git a/qa/suites/rados/mgr/clusters/2-node-mgr.yaml b/qa/suites/rados/mgr/clusters/2-node-mgr.yaml
index bc950e5afff5c..abc90e22d3587 100644
--- a/qa/suites/rados/mgr/clusters/2-node-mgr.yaml
+++ b/qa/suites/rados/mgr/clusters/2-node-mgr.yaml
@@ -1,6 +1,6 @@
 roles:
 - [mgr.x, mon.a, mon.c, mds.a, mds.c, osd.0, client.0]
-- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1]
+- [mgr.y, mgr.z, mon.b, mds.b, osd.1, osd.2, client.1]
 log-rotate:
   ceph-mds: 10G
   ceph-osd: 10G
diff --git a/qa/suites/rados/mgr/tasks/dashboard.yaml b/qa/suites/rados/mgr/tasks/dashboard.yaml
new file mode 100644
index 0000000000000..3065e11bcb59e
--- /dev/null
+++ b/qa/suites/rados/mgr/tasks/dashboard.yaml
@@ -0,0 +1,16 @@
+
+tasks:
+  - install:
+  - ceph:
+      # tests may leave mgrs broken, so don't try and call into them
+      # to invoke e.g. pg dump during teardown.
+      wait-for-scrub: false
+      log-whitelist:
+        - overall HEALTH_
+        - \(MGR_DOWN\)
+        - \(PG_
+        - replacing it with standby
+        - No standby daemons available
+  - cephfs_test_runner:
+      modules:
+        - tasks.mgr.test_dashboard
diff --git a/qa/suites/rados/mgr/tasks/module_selftest.yaml b/qa/suites/rados/mgr/tasks/module_selftest.yaml
index e9d90ffb1be05..ffdfe8be2c2f2 100644
--- a/qa/suites/rados/mgr/tasks/module_selftest.yaml
+++ b/qa/suites/rados/mgr/tasks/module_selftest.yaml
@@ -11,6 +11,9 @@ tasks:
         - \(PG_
         - replacing it with standby
         - No standby daemons available
+        - Reduced data availability
+        - Degraded data redundancy
+        - objects misplaced
   - cephfs_test_runner:
       modules:
         - tasks.mgr.test_module_selftest
diff --git a/qa/tasks/mgr/mgr_test_case.py b/qa/tasks/mgr/mgr_test_case.py
index a5531d33ed2e1..0734842fa0569 100644
--- a/qa/tasks/mgr/mgr_test_case.py
+++ b/qa/tasks/mgr/mgr_test_case.py
@@ -1,14 +1,18 @@
 
 from unittest import case
 import json
+import logging
 
 from teuthology import misc
 from tasks.ceph_test_case import CephTestCase
 
-# TODO move definition of CephCluster
+# TODO move definition of CephCluster away from the CephFS stuff
 from tasks.cephfs.filesystem import CephCluster
 
 
+log = logging.getLogger(__name__)
+
+
 class MgrCluster(CephCluster):
     def __init__(self, ctx):
         super(MgrCluster, self).__init__(ctx)
@@ -43,6 +47,12 @@ class MgrCluster(CephCluster):
     def get_standby_ids(self):
         return [s['name'] for s in self.get_mgr_map()["standbys"]]
 
+    def set_module_localized_conf(self, module, mgr_id, key, val):
+        self.mon_manager.raw_cluster_cmd("config-key", "set",
+                                         "mgr/{0}/{1}/{2}".format(
+                                             module, mgr_id, key
+                                         ), val)
+
 
 class MgrTestCase(CephTestCase):
     MGRS_REQUIRED = 1
@@ -77,3 +87,83 @@ class MgrTestCase(CephTestCase):
         self.wait_until_true(
             lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
             timeout=20)
+
+    def _load_module(self, module_name):
+        loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "ls"))
+        if module_name in loaded:
+            # The enable command is idempotent, but our wait for a restart
+            # isn't, so let's return now if it's already loaded
+            return
+
+        initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
+                                                     module_name)
+
+        # Wait for the module to load
+        def has_restarted():
+            mgr_map = self.mgr_cluster.get_mgr_map()
+            done = mgr_map['active_gid'] != initial_gid and mgr_map['available']
+            if done:
+                log.info("Restarted after module load (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+            return done
+        self.wait_until_true(has_restarted, timeout=30)
+
+
+    def _get_uri(self, service_name):
+        # Little dict hack so that I can assign into this from
+        # the get_or_none function
+        mgr_map = {'x': None}
+
+        def _get_or_none():
+            mgr_map['x'] = self.mgr_cluster.get_mgr_map()
+            result = mgr_map['x']['services'].get(service_name, None)
+            return result
+
+        self.wait_until_true(lambda: _get_or_none() is not None, 30)
+
+        uri = mgr_map['x']['services'][service_name]
+
+        log.info("Found {0} at {1} (daemon {2}/{3})".format(
+            service_name, uri, mgr_map['x']['active_name'],
+            mgr_map['x']['active_gid']))
+
+        return uri
+
+
+    def _assign_ports(self, module_name, config_name, min_port=7789):
+        """
+        To avoid the need to run lots of hosts in teuthology tests to
+        get different URLs per mgr, we will hand out different ports
+        to each mgr here.
+
+        This is already taken care of for us when running in a vstart
+        environment.
+        """
+        # Start handing out ports well above Ceph's range.
+        assign_port = min_port
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_stop(mgr_id)
+            self.mgr_cluster.mgr_fail(mgr_id)
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            log.info("Using port {0} for {1} on mgr.{2}".format(
+                assign_port, module_name, mgr_id
+            ))
+            self.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
+                                                       config_name,
+                                                       str(assign_port))
+            assign_port += 1
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_restart(mgr_id)
+
+        def is_available():
+            mgr_map = self.mgr_cluster.get_mgr_map()
+            done = mgr_map['available']
+            if done:
+                log.info("Available after assign ports (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+            return done
+        self.wait_until_true(is_available, timeout=30)
diff --git a/qa/tasks/mgr/test_dashboard.py b/qa/tasks/mgr/test_dashboard.py
new file mode 100644
index 0000000000000..3b8a2cc80c1d8
--- /dev/null
+++ b/qa/tasks/mgr/test_dashboard.py
@@ -0,0 +1,70 @@
+
+
+from mgr_test_case import MgrTestCase
+
+import logging
+import requests
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDashboard(MgrTestCase):
+    MGRS_REQUIRED = 3
+
+    def test_standby(self):
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        original_active = self.mgr_cluster.get_active_id()
+
+        original_uri = self._get_uri("dashboard")
+        log.info("Originally running at {0}".format(original_uri))
+
+        self.mgr_cluster.mgr_fail(original_active)
+
+        failed_over_uri = self._get_uri("dashboard")
+        log.info("After failover running at {0}".format(failed_over_uri))
+
+        self.assertNotEqual(original_uri, failed_over_uri)
+
+        # The original active daemon should have come back up as a standby
+        # and be doing redirects to the new active daemon
+        r = requests.get(original_uri, allow_redirects=False)
+        self.assertEqual(r.status_code, 303)
+        self.assertEqual(r.headers['Location'], failed_over_uri)
+
+    def test_urls(self):
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        base_uri = self._get_uri("dashboard")
+
+        # This is a very simple smoke test to check that the dashboard can
+        # give us a 200 response to requests. We're not testing that
+        # the content is correct or even renders!
+
+        urls = [
+            "/health",
+            "/servers",
+            "/osd/",
+            "/osd/perf/0",
+            "/rbd_mirroring",
+            "/rbd_iscsi"
+        ]
+
+        failures = []
+
+        for url in urls:
+            r = requests.get(base_uri + url, allow_redirects=False)
+            if r.status_code >= 300 and r.status_code < 400:
+                log.error("Unexpected redirect to: {0} (from {1})".format(
+                    r.headers['Location'], base_uri))
+            if r.status_code != 200:
+                failures.append(url)
+
+            log.info("{0}: {1} ({2} bytes)".format(
+                url, r.status_code, len(r.content)
+            ))
+
+        self.assertListEqual(failures, [])
diff --git a/qa/tasks/mgr/test_module_selftest.py b/qa/tasks/mgr/test_module_selftest.py
index 31bc27f8a3db3..2776fb8729824 100644
--- a/qa/tasks/mgr/test_module_selftest.py
+++ b/qa/tasks/mgr/test_module_selftest.py
@@ -1,4 +1,7 @@
 
+import time
+import requests
+
 from tasks.mgr.mgr_test_case import MgrTestCase
 
 
@@ -14,19 +17,11 @@ class TestModuleSelftest(MgrTestCase):
     """
     MGRS_REQUIRED = 1
 
-    def _selftest_plugin(self, plugin_name):
-        initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
-        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
-                                                     plugin_name)
-
-        # Wait for the module to load
-        def has_restarted():
-            map = self.mgr_cluster.get_mgr_map()
-            return map['active_gid'] != initial_gid and map['available']
-        self.wait_until_true(has_restarted, timeout=30)
+    def _selftest_plugin(self, module_name):
+        self._load_module(module_name)
 
         # Execute the module's self-test routine
-        self.mgr_cluster.mon_manager.raw_cluster_cmd(plugin_name, "self-test")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test")
 
     def test_zabbix(self):
         self._selftest_plugin("zabbix")
@@ -36,3 +31,44 @@
 
     def test_influx(self):
         self._selftest_plugin("influx")
+
+    def test_selftest_run(self):
+        self._load_module("selftest")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run")
+
+    def test_selftest_command_spam(self):
+        # Use the selftest module to stress the mgr daemon
+        self._load_module("selftest")
+
+        # Use the dashboard to test that the mgr is still able to do its job
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "start",
+                                                     "command_spam")
+
+        dashboard_uri = self._get_uri("dashboard")
+
+        delay = 10
+        periods = 10
+        for i in range(0, periods):
+            t1 = time.time()
+            # Check that an HTTP module remains responsive
+            r = requests.get(dashboard_uri)
+            self.assertEqual(r.status_code, 200)
+
+            # Check that a native non-module command remains responsive
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df")
+
+            time.sleep(delay - (time.time() - t1))
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "stop")
+
+        # Check that all mgr daemons are still running
+        self.assertEqual(original_active, self.mgr_cluster.get_active_id())
+        self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids())
--
2.39.5
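
Editor's note, not part of the patch: the standby behaviour that TestDashboard.test_standby automates can be cross-checked by hand against a running cluster with a few lines of Python using requests (the same library the tests use). The sketch below is illustrative only; the two URIs are made-up placeholders standing in for whatever the mgr map's "services" section (as read by _get_uri above) reports for the standby and active daemons on your cluster.

import requests

# Hypothetical URIs -- substitute the values your cluster's mgr map reports.
standby_uri = "http://mgr-standby.example:7789/"
active_uri = "http://mgr-active.example:7790/"

# Ask the standby without following redirects, mirroring test_standby.
r = requests.get(standby_uri, allow_redirects=False)

# A standby dashboard is expected to answer 303 with a Location header
# pointing at the currently active daemon's URI.
assert r.status_code == 303, r.status_code
assert r.headers['Location'] == active_uri, r.headers.get('Location')
print("standby redirects to {0}".format(r.headers['Location']))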