From 05e648be6a51d3efa110ad9826bbdd0adcc4dd4d Mon Sep 17 00:00:00 2001
From: John Spray
Date: Thu, 19 Oct 2017 07:50:19 -0400
Subject: [PATCH] qa: expand mgr testing

Some extra coverage of the dashboard, including its standby
redirect mode and the publishing of URIs.

Also invoking the command_spam mode of the selftest module.

Signed-off-by: John Spray
---
 qa/suites/rados/mgr/clusters/2-node-mgr.yaml   |  2 +-
 qa/suites/rados/mgr/tasks/dashboard.yaml       | 16 ++++
 .../rados/mgr/tasks/module_selftest.yaml       |  3 +
 qa/tasks/mgr/mgr_test_case.py                  | 92 ++++++++++++++++++-
 qa/tasks/mgr/test_dashboard.py                 | 70 ++++++++++++++
 qa/tasks/mgr/test_module_selftest.py           | 58 +++++++++---
 6 files changed, 228 insertions(+), 13 deletions(-)
 create mode 100644 qa/suites/rados/mgr/tasks/dashboard.yaml
 create mode 100644 qa/tasks/mgr/test_dashboard.py

diff --git a/qa/suites/rados/mgr/clusters/2-node-mgr.yaml b/qa/suites/rados/mgr/clusters/2-node-mgr.yaml
index bc950e5afff5c..abc90e22d3587 100644
--- a/qa/suites/rados/mgr/clusters/2-node-mgr.yaml
+++ b/qa/suites/rados/mgr/clusters/2-node-mgr.yaml
@@ -1,6 +1,6 @@
 roles:
 - [mgr.x, mon.a, mon.c, mds.a, mds.c, osd.0, client.0]
-- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1]
+- [mgr.y, mgr.z, mon.b, mds.b, osd.1, osd.2, client.1]
 log-rotate:
   ceph-mds: 10G
   ceph-osd: 10G
diff --git a/qa/suites/rados/mgr/tasks/dashboard.yaml b/qa/suites/rados/mgr/tasks/dashboard.yaml
new file mode 100644
index 0000000000000..3065e11bcb59e
--- /dev/null
+++ b/qa/suites/rados/mgr/tasks/dashboard.yaml
@@ -0,0 +1,16 @@
+
+tasks:
+  - install:
+  - ceph:
+      # tests may leave mgrs broken, so don't try and call into them
+      # to invoke e.g. pg dump during teardown.
+      wait-for-scrub: false
+      log-whitelist:
+        - overall HEALTH_
+        - \(MGR_DOWN\)
+        - \(PG_
+        - replacing it with standby
+        - No standby daemons available
+  - cephfs_test_runner:
+      modules:
+        - tasks.mgr.test_dashboard
diff --git a/qa/suites/rados/mgr/tasks/module_selftest.yaml b/qa/suites/rados/mgr/tasks/module_selftest.yaml
index e9d90ffb1be05..ffdfe8be2c2f2 100644
--- a/qa/suites/rados/mgr/tasks/module_selftest.yaml
+++ b/qa/suites/rados/mgr/tasks/module_selftest.yaml
@@ -11,6 +11,9 @@ tasks:
         - \(PG_
         - replacing it with standby
         - No standby daemons available
+        - Reduced data availability
+        - Degraded data redundancy
+        - objects misplaced
   - cephfs_test_runner:
       modules:
         - tasks.mgr.test_module_selftest
diff --git a/qa/tasks/mgr/mgr_test_case.py b/qa/tasks/mgr/mgr_test_case.py
index a5531d33ed2e1..0734842fa0569 100644
--- a/qa/tasks/mgr/mgr_test_case.py
+++ b/qa/tasks/mgr/mgr_test_case.py
@@ -1,14 +1,18 @@
 
 from unittest import case
 import json
+import logging
 
 from teuthology import misc
 from tasks.ceph_test_case import CephTestCase
 
-# TODO move definition of CephCluster
+# TODO move definition of CephCluster away from the CephFS stuff
 from tasks.cephfs.filesystem import CephCluster
 
 
+log = logging.getLogger(__name__)
+
+
 class MgrCluster(CephCluster):
     def __init__(self, ctx):
         super(MgrCluster, self).__init__(ctx)
@@ -43,6 +47,12 @@ class MgrCluster(CephCluster):
     def get_standby_ids(self):
         return [s['name'] for s in self.get_mgr_map()["standbys"]]
 
+    def set_module_localized_conf(self, module, mgr_id, key, val):
+        self.mon_manager.raw_cluster_cmd("config-key", "set",
+                                         "mgr/{0}/{1}/{2}".format(
+                                             module, mgr_id, key
+                                         ), val)
+
 
 class MgrTestCase(CephTestCase):
     MGRS_REQUIRED = 1
@@ -77,3 +87,83 @@ class MgrTestCase(CephTestCase):
         self.wait_until_true(
             lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
             timeout=20)
+
+    def _load_module(self, module_name):
+        loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "ls"))
+        if module_name in loaded:
+            # The enable command is idempotent, but our wait for a restart
+            # isn't, so let's return now if it's already loaded
+            return
+
+        initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
+                                                     module_name)
+
+        # Wait for the module to load
+        def has_restarted():
+            mgr_map = self.mgr_cluster.get_mgr_map()
+            done = mgr_map['active_gid'] != initial_gid and mgr_map['available']
+            if done:
+                log.info("Restarted after module load (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+            return done
+        self.wait_until_true(has_restarted, timeout=30)
+
+
+    def _get_uri(self, service_name):
+        # Little dict hack so that I can assign into this from
+        # the get_or_none function
+        mgr_map = {'x': None}
+
+        def _get_or_none():
+            mgr_map['x'] = self.mgr_cluster.get_mgr_map()
+            result = mgr_map['x']['services'].get(service_name, None)
+            return result
+
+        self.wait_until_true(lambda: _get_or_none() is not None, 30)
+
+        uri = mgr_map['x']['services'][service_name]
+
+        log.info("Found {0} at {1} (daemon {2}/{3})".format(
+            service_name, uri, mgr_map['x']['active_name'],
+            mgr_map['x']['active_gid']))
+
+        return uri
+
+
+    def _assign_ports(self, module_name, config_name, min_port=7789):
+        """
+        To avoid the need to run lots of hosts in teuthology tests to
+        get different URLs per mgr, we will hand out different ports
+        to each mgr here.
+
+        This is already taken care of for us when running in a vstart
+        environment.
+        """
+        # Start handing out ports well above Ceph's range.
+        assign_port = min_port
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_stop(mgr_id)
+            self.mgr_cluster.mgr_fail(mgr_id)
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            log.info("Using port {0} for {1} on mgr.{2}".format(
+                assign_port, module_name, mgr_id
+            ))
+            self.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
+                                                       config_name,
+                                                       str(assign_port))
+            assign_port += 1
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_restart(mgr_id)
+
+        def is_available():
+            mgr_map = self.mgr_cluster.get_mgr_map()
+            done = mgr_map['available']
+            if done:
+                log.info("Available after assign ports (new active {0}/{1})".format(
+                    mgr_map['active_name'], mgr_map['active_gid']))
+            return done
+        self.wait_until_true(is_available, timeout=30)
diff --git a/qa/tasks/mgr/test_dashboard.py b/qa/tasks/mgr/test_dashboard.py
new file mode 100644
index 0000000000000..3b8a2cc80c1d8
--- /dev/null
+++ b/qa/tasks/mgr/test_dashboard.py
@@ -0,0 +1,70 @@
+
+
+from mgr_test_case import MgrTestCase
+
+import logging
+import requests
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDashboard(MgrTestCase):
+    MGRS_REQUIRED = 3
+
+    def test_standby(self):
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        original_active = self.mgr_cluster.get_active_id()
+
+        original_uri = self._get_uri("dashboard")
+        log.info("Originally running at {0}".format(original_uri))
+
+        self.mgr_cluster.mgr_fail(original_active)
+
+        failed_over_uri = self._get_uri("dashboard")
+        log.info("After failover running at {0}".format(failed_over_uri))
+
+        self.assertNotEqual(original_uri, failed_over_uri)
+
+        # The original active daemon should have come back up as a standby
+        # and be doing redirects to the new active daemon
+        r = requests.get(original_uri, allow_redirects=False)
+        self.assertEqual(r.status_code, 303)
+        self.assertEqual(r.headers['Location'], failed_over_uri)
+
+    def test_urls(self):
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        base_uri = self._get_uri("dashboard")
+
+        # This is a very simple smoke test to check that the dashboard can
+        # give us a 200 response to requests. We're not testing that
+        # the content is correct or even renders!
+
+        urls = [
+            "/health",
+            "/servers",
+            "/osd/",
+            "/osd/perf/0",
+            "/rbd_mirroring",
+            "/rbd_iscsi"
+        ]
+
+        failures = []
+
+        for url in urls:
+            r = requests.get(base_uri + url, allow_redirects=False)
+            if r.status_code >= 300 and r.status_code < 400:
+                log.error("Unexpected redirect to: {0} (from {1})".format(
+                    r.headers['Location'], base_uri))
+            if r.status_code != 200:
+                failures.append(url)
+
+            log.info("{0}: {1} ({2} bytes)".format(
+                url, r.status_code, len(r.content)
+            ))
+
+        self.assertListEqual(failures, [])
diff --git a/qa/tasks/mgr/test_module_selftest.py b/qa/tasks/mgr/test_module_selftest.py
index 31bc27f8a3db3..2776fb8729824 100644
--- a/qa/tasks/mgr/test_module_selftest.py
+++ b/qa/tasks/mgr/test_module_selftest.py
@@ -1,4 +1,7 @@
 
+import time
+import requests
+
 from tasks.mgr.mgr_test_case import MgrTestCase
 
 
@@ -14,19 +17,11 @@ class TestModuleSelftest(MgrTestCase):
     """
     MGRS_REQUIRED = 1
 
-    def _selftest_plugin(self, plugin_name):
-        initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
-        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
-                                                     plugin_name)
-
-        # Wait for the module to load
-        def has_restarted():
-            map = self.mgr_cluster.get_mgr_map()
-            return map['active_gid'] != initial_gid and map['available']
-        self.wait_until_true(has_restarted, timeout=30)
+    def _selftest_plugin(self, module_name):
+        self._load_module(module_name)
 
         # Execute the module's self-test routine
-        self.mgr_cluster.mon_manager.raw_cluster_cmd(plugin_name, "self-test")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test")
 
     def test_zabbix(self):
         self._selftest_plugin("zabbix")
@@ -36,3 +31,44 @@
 
     def test_influx(self):
         self._selftest_plugin("influx")
+
+    def test_selftest_run(self):
+        self._load_module("selftest")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run")
+
+    def test_selftest_command_spam(self):
+        # Use the selftest module to stress the mgr daemon
+        self._load_module("selftest")
+
+        # Use the dashboard to test that the mgr is still able to do its job
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "start",
+                                                     "command_spam")
+
+        dashboard_uri = self._get_uri("dashboard")
+
+        delay = 10
+        periods = 10
+        for i in range(0, periods):
+            t1 = time.time()
+            # Check that an HTTP module remains responsive
+            r = requests.get(dashboard_uri)
+            self.assertEqual(r.status_code, 200)
+
+            # Check that a native non-module command remains responsive
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df")
+
+            time.sleep(delay - (time.time() - t1))
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "stop")
+
+        # Check that all mgr daemons are still running
+        self.assertEqual(original_active, self.mgr_cluster.get_active_id())
+        self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids())
--
2.39.5
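
Editor's note, not part of the patch: the standby behaviour that TestDashboard.test_standby automates can be cross-checked by hand against a running cluster with a few lines of Python using requests (the same library the tests use). The sketch below is illustrative only; the two URIs are made-up placeholders standing in for whatever the mgr map's "services" section (as read by _get_uri above) reports for the standby and active daemons on your cluster.

import requests

# Hypothetical URIs -- substitute the values your cluster's mgr map reports.
standby_uri = "http://mgr-standby.example:7789/"
active_uri = "http://mgr-active.example:7790/"

# Ask the standby without following redirects, mirroring test_standby.
r = requests.get(standby_uri, allow_redirects=False)

# A standby dashboard is expected to answer 303 with a Location header
# pointing at the currently active daemon's URI.
assert r.status_code == 303, r.status_code
assert r.headers['Location'] == active_uri, r.headers.get('Location')
print("standby redirects to {0}".format(r.headers['Location']))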