roles:
- [mgr.x, mon.a, mon.c, mds.a, mds.c, osd.0, client.0]
-- [mgr.y, mon.b, mds.b, osd.1, osd.2, client.1]
+- [mgr.y, mgr.z, mon.b, mds.b, osd.1, osd.2, client.1]
log-rotate:
ceph-mds: 10G
ceph-osd: 10G
--- /dev/null
+
+tasks:
+ - install:
+ - ceph:
+ # tests may leave mgrs broken, so don't try to call into them
+ # to invoke e.g. pg dump during teardown.
+ wait-for-scrub: false
+ log-whitelist:
+ - overall HEALTH_
+ - \(MGR_DOWN\)
+ - \(PG_
+ - replacing it with standby
+ - No standby daemons available
+ - cephfs_test_runner:
+ modules:
+ - tasks.mgr.test_dashboard
- \(PG_
- replacing it with standby
- No standby daemons available
+ - Reduced data availability
+ - Degraded data redundancy
+ - objects misplaced
- cephfs_test_runner:
modules:
- tasks.mgr.test_module_selftest
from unittest import case
import json
+import logging
from teuthology import misc
from tasks.ceph_test_case import CephTestCase
-# TODO move definition of CephCluster
+# TODO move definition of CephCluster away from the CephFS stuff
from tasks.cephfs.filesystem import CephCluster
+log = logging.getLogger(__name__)
+
+
class MgrCluster(CephCluster):
def __init__(self, ctx):
super(MgrCluster, self).__init__(ctx)
def get_standby_ids(self):
return [s['name'] for s in self.get_mgr_map()["standbys"]]
+ def set_module_localized_conf(self, module, mgr_id, key, val):
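+ """
+ Store a per-daemon module setting as a config-key of the form
+ mgr/<module>/<mgr_id>/<key>, e.g. mgr/dashboard/x/server_port.
+ """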
+ self.mon_manager.raw_cluster_cmd("config-key", "set",
+ "mgr/{0}/{1}/{2}".format(
+ module, mgr_id, key
+ ), val)
+
class MgrTestCase(CephTestCase):
MGRS_REQUIRED = 1
self.wait_until_true(
lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
timeout=20)
+
+ def _load_module(self, module_name):
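+ """
+ Enable a mgr module (if it is not already enabled) and wait for the
+ active mgr to restart with the module loaded.
+ """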
+ loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "ls"))
+ if module_name in loaded:
+ # The enable command is idempotent, but our wait for a restart
+ # isn't, so let's return now if it's already loaded
+ return
+
+ initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
+ module_name)
+
+ # Wait for the module to load
+ def has_restarted():
+ mgr_map = self.mgr_cluster.get_mgr_map()
+ done = mgr_map['active_gid'] != initial_gid and mgr_map['available']
+ if done:
+ log.info("Restarted after module load (new active {0}/{1})".format(
+ mgr_map['active_name'], mgr_map['active_gid']))
+ return done
+ self.wait_until_true(has_restarted, timeout=30)
+
+
+ def _get_uri(self, service_name):
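+ """
+ Poll the mgr map until the active mgr publishes a URI for
+ `service_name` (e.g. "dashboard"), then return it.
+ """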
+ # Little dict hack so that the _get_or_none closure below can assign
+ # into it (Python 2 has no nonlocal)
+ mgr_map = {'x': None}
+
+ def _get_or_none():
+ mgr_map['x'] = self.mgr_cluster.get_mgr_map()
+ result = mgr_map['x']['services'].get(service_name, None)
+ return result
+
+ self.wait_until_true(lambda: _get_or_none() is not None, timeout=30)
+
+ uri = mgr_map['x']['services'][service_name]
+
+ log.info("Found {0} at {1} (daemon {2}/{3})".format(
+ service_name, uri, mgr_map['x']['active_name'],
+ mgr_map['x']['active_gid']))
+
+ return uri
+
+
+ def _assign_ports(self, module_name, config_name, min_port=7789):
+ """
+ To avoid the need to run lots of hosts in teuthology tests to
+ get different URLs per mgr, we will hand out different ports
+ to each mgr here.
+
+ This is already taken care of for us when running in a vstart
+ environment.
+ """
+ # Start handing out ports well above Ceph's range.
+ assign_port = min_port
+
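+ # Take all the mgrs down first; each one is given its own port below
+ # and they are all restarted once the config-keys are set.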
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_stop(mgr_id)
+ self.mgr_cluster.mgr_fail(mgr_id)
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ log.info("Using port {0} for {1} on mgr.{2}".format(
+ assign_port, module_name, mgr_id
+ ))
+ self.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
+ config_name,
+ str(assign_port))
+ assign_port += 1
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_restart(mgr_id)
+
+ def is_available():
+ mgr_map = self.mgr_cluster.get_mgr_map()
+ done = mgr_map['available']
+ if done:
+ log.info("Available after assign ports (new active {0}/{1})".format(
+ mgr_map['active_name'], mgr_map['active_gid']))
+ return done
+ self.wait_until_true(is_available, timeout=30)
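+
+ # Typical usage from a test case (see TestDashboard below):
+ #   self._assign_ports("dashboard", "server_port")
+ #   self._load_module("dashboard")
+ #   uri = self._get_uri("dashboard")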
--- /dev/null
+
+
+import logging
+import requests
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDashboard(MgrTestCase):
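+ # test_standby needs standby daemons to fail over to, so require
+ # multiple mgrs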
+ MGRS_REQUIRED = 3
+
+ def test_standby(self):
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ original_active = self.mgr_cluster.get_active_id()
+
+ original_uri = self._get_uri("dashboard")
+ log.info("Originally running at {0}".format(original_uri))
+
+ self.mgr_cluster.mgr_fail(original_active)
+
+ failed_over_uri = self._get_uri("dashboard")
+ log.info("After failover running at {0}".format(original_uri))
+
+ self.assertNotEqual(original_uri, failed_over_uri)
+
+ # The original active daemon should have come back up as a standby
+ # and be doing redirects to the new active daemon
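+ # (a 303 "See Other" pointing at the new active daemon's URI)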
+ r = requests.get(original_uri, allow_redirects=False)
+ self.assertEqual(r.status_code, 303)
+ self.assertEqual(r.headers['Location'], failed_over_uri)
+
+ def test_urls(self):
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ base_uri = self._get_uri("dashboard")
+
+ # This is a very simple smoke test to check that the dashboard can
+ # give us a 200 response to requests. We're not testing that
+ # the content is correct or even renders!
+
+ urls = [
+ "/health",
+ "/servers",
+ "/osd/",
+ "/osd/perf/0",
+ "/rbd_mirroring",
+ "/rbd_iscsi"
+ ]
+
+ failures = []
+
+ for url in urls:
+ r = requests.get(base_uri + url, allow_redirects=False)
+ if 300 <= r.status_code < 400:
+ log.error("Unexpected redirect to: {0} (from {1})".format(
+ r.headers['Location'], base_uri))
+ if r.status_code != 200:
+ failures.append(url)
+
+ log.info("{0}: {1} ({2} bytes)".format(
+ url, r.status_code, len(r.content)
+ ))
+
+ self.assertListEqual(failures, [])
+import time
+import requests
+
from tasks.mgr.mgr_test_case import MgrTestCase
"""
MGRS_REQUIRED = 1
- def _selftest_plugin(self, plugin_name):
- initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
- self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
- plugin_name)
-
- # Wait for the module to load
- def has_restarted():
- map = self.mgr_cluster.get_mgr_map()
- return map['active_gid'] != initial_gid and map['available']
- self.wait_until_true(has_restarted, timeout=30)
+ def _selftest_plugin(self, module_name):
+ self._load_module(module_name)
# Execute the module's self-test routine
- self.mgr_cluster.mon_manager.raw_cluster_cmd(plugin_name, "self-test")
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test")
def test_zabbix(self):
self._selftest_plugin("zabbix")
def test_influx(self):
self._selftest_plugin("influx")
+
+ def test_selftest_run(self):
+ self._load_module("selftest")
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run")
+
+ def test_selftest_command_spam(self):
+ # Use the selftest module to stress the mgr daemon
+ self._load_module("selftest")
+
+ # Use the dashboard to test that the mgr is still able to do its job
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+ "background", "start",
+ "command_spam")
+
+ dashboard_uri = self._get_uri("dashboard")
+
+ delay = 10
+ periods = 10
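+ # check responsiveness roughly every `delay` seconds for `periods`
+ # iterations (~100 seconds in total)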
+ for i in range(0, periods):
+ t1 = time.time()
+ # Check that an HTTP module remains responsive
+ r = requests.get(dashboard_uri)
+ self.assertEqual(r.status_code, 200)
+
+ # Check that a native non-module command remains responsive
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df")
+
+ # don't pass time.sleep() a negative value if the checks above took
+ # longer than `delay`
+ time.sleep(max(0, delay - (time.time() - t1)))
+
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+ "background", "stop")
+
+ # Check that all mgr daemons are still running
+ self.assertEqual(original_active, self.mgr_cluster.get_active_id())
+ self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids())