roles:
- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0]
-- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+- [mon.b, mon.c, mgr.x, mds.b, osd.4, osd.5, osd.6, osd.7]
openstack:
- volumes: # attached to each instance
count: 4
roles:
-- [mon.a, mgr.y, mds.a, mds.x-s, osd.0, osd.1, osd.2, osd.3]
-- [mon.b, mon.c, mgr.x, mds.y-s, osd.4, osd.5, osd.6, osd.7]
+- [mon.a, mgr.y, mds.a, mds.c, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mon.c, mgr.x, mds.b, osd.4, osd.5, osd.6, osd.7]
- [client.0]
openstack:
- volumes: # attached to each instance
roles:
- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0]
-- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7, client.1]
+- [mon.b, mon.c, mgr.x, mds.b, osd.4, osd.5, osd.6, osd.7, client.1]
openstack:
- volumes: # attached to each instance
count: 4
roles:
-- [mon.a, mgr.y, mds.a, mds.x-s, osd.0, osd.1, osd.2, osd.3]
-- [mon.b, mon.c, mgr.x, mds.y-s, osd.4, osd.5, osd.6, osd.7]
+- [mon.a, mgr.y, mds.a, mds.c, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mon.c, mgr.x, mds.b, osd.4, osd.5, osd.6, osd.7]
- [client.0]
- [client.1]
openstack:
roles:
- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
-- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+- [mon.b, mon.c, mgr.x, mds.b, osd.4, osd.5, osd.6, osd.7]
- [client.0]
- [client.1]
- [client.2]
roles:
- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0, client.1]
-- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7, client.2, client.3]
+- [mon.b, mon.c, mgr.x, mds.b, osd.4, osd.5, osd.6, osd.7, client.2, client.3]
openstack:
- volumes: # attached to each instance
count: 4
roles:
-- [mon.a, mgr.y, mds.a, mds.x-s, osd.0, osd.1, osd.2, osd.3]
-- [mon.b, mon.c, mgr.x, mds.y-s, osd.4, osd.5, osd.6, osd.7]
+- [mon.a, mgr.y, mds.a, mds.b, osd.0, osd.1, osd.2, osd.3]
+- [mon.b, mon.c, mgr.x, mds.c, osd.4, osd.5, osd.6, osd.7]
- [client.0]
- [client.1]
- [client.2]
roles:
-- [mon.a, mgr.y, mds.a, mds.w-s, osd.0, osd.1, osd.2, osd.3, client.0]
-- [mon.b, mon.c, mgr.x, mds.x-s, mds.y-s, osd.4, osd.5, osd.6, osd.7]
+- [mon.a, mgr.y, mds.a, mds.c, osd.0, osd.1, osd.2, osd.3, client.0]
+- [mon.b, mon.c, mgr.x, mds.b, mds.d, osd.4, osd.5, osd.6, osd.7]
openstack:
- volumes: # attached to each instance
count: 4
roles:
-- [mon.a, mgr.y, mds.a, mds.w-s, osd.0, osd.1, osd.2, osd.3, client.0]
-- [mon.b, mon.c, mgr.x, mds.x-s, mds.y-s, osd.4, osd.5, osd.6, osd.7, client.1]
+- [mon.a, mgr.y, mds.a, mds.c, osd.0, osd.1, osd.2, osd.3, client.0]
+- [mon.b, mon.c, mgr.x, mds.b, mds.d, osd.4, osd.5, osd.6, osd.7, client.1]
openstack:
- volumes: # attached to each instance
count: 4
- [mon.a, mon.c, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3]
- [mon.b, mgr.x, mds.b, mds.c, osd.4, osd.5, osd.6, osd.7]
- [client.0, client.1]
+overrides:
+ ceph:
+ max_mds: 3
openstack:
- volumes: # attached to each instance
count: 4
- [mon.a, mon.c, mgr.y, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2, osd.3]
- [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.4, osd.5, osd.6, osd.7]
- [client.0, client.1]
+overrides:
+ ceph:
+ max_mds: 9
openstack:
- volumes: # attached to each instance
count: 4
roles:
- [mon.a, mgr.y, mds.a, osd.0, osd.1, osd.2, osd.3, client.0]
-- [mon.b, mon.c, mgr.x, mds.a-s, osd.4, osd.5, osd.6, osd.7]
+- [mon.b, mon.c, mgr.x, mds.b, osd.4, osd.5, osd.6, osd.7]
openstack:
- volumes: # attached to each instance
count: 4
roles:
- [mon.a, mds.a, mgr.x, osd.0, osd.1]
-- [mon.b, mds.a-s, mon.c, mgr.y, osd.2, osd.3]
+- [mon.b, mds.b, mon.c, mgr.y, osd.2, osd.3]
- [client.0]
openstack:
- volumes: # attached to each instance
ceph:
conf:
osd:
- osd shutdown pgref assert: true
\ No newline at end of file
+ osd shutdown pgref assert: true
roles:
-- [mon.a, mgr.x, mds.a, mds.a-s]
-- [mon.b, mgr.y, mds.b, mds.b-s]
-- [mon.c, mgr.z, mds.c, mds.c-s]
+- [mon.a, mgr.x, mds.a, mds.d]
+- [mon.b, mgr.y, mds.b, mds.e]
+- [mon.c, mgr.z, mds.c, mds.f]
- [osd.0]
- [osd.1]
- [osd.2]
roles:
-- [mon.a, mon.c, mds.a, osd.0, osd.1, osd.2, mds.d-s]
-- [mon.b, mgr.x, mds.b, mds.c, osd.3, osd.4, osd.5, mds.e-s]
+- [mon.a, mon.c, mds.a, mds.c, mds.e, osd.0, osd.1, osd.2]
+- [mon.b, mgr.x, mds.b, mds.d, osd.3, osd.4, osd.5]
- [client.0]
roles:
-- [mon.a, mon.c, mds.a, mds.b, mds.c, mds.d, osd.0, osd.1, osd.2, mds.j-s, mds.k-s]
-- [mon.b, mgr.x, mds.e, mds.f, mds.g, mds.h, mds.i, osd.3, osd.4, osd.5, mds.l-s]
+- [mon.a, mon.c, mds.a, mds.c, mds.e, mds.g, mds.i, mds.k, osd.0, osd.1, osd.2]
+- [mon.b, mgr.x, mds.b, mds.d, mds.f, mds.h, mds.j, mds.l, osd.3, osd.4, osd.5]
- [client.0]
fs = Filesystem(ctx, name='cephfs', create=True,
ec_profile=config.get('cephfs_ec_profile', None))
- is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
- all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
- num_active = len([r for r in all_roles if is_active_mds(r)])
-
- fs.set_max_mds(config.get('max_mds', num_active))
+ max_mds = config.get('max_mds', 1)
+ if max_mds > 1:
+ fs.set_max_mds(max_mds)
yield
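
The task no longer infers the active MDS count from the old "-s" role names; it reads max_mds straight from the ceph task config, which the "overrides: ceph: max_mds:" fragments above feed into. A minimal sketch of that flow, assuming a hypothetical merged config dict rather than the real teuthology plumbing:

    # Sketch only: "overrides: ceph: max_mds: 3" in the cluster YAML merges
    # into the ceph task config that config.get('max_mds', 1) reads here.
    config = {'max_mds': 3}

    max_mds = config.get('max_mds', 1)
    if max_mds > 1:
        # roughly equivalent to: ceph fs set cephfs max_mds 3
        print("ceph fs set cephfs max_mds %d" % max_mds)
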
if is_mds(role):
name = teuthology.ceph_role(role)
conf.setdefault(name, {})
- if '-s-' in name:
- standby_mds = name[name.find('-s-') + 3:]
- conf[name]['mds standby for name'] = standby_mds
return conf
def create_simple_monmap(ctx, remote, conf, mons,
"""
fs = self.get_fsmap(fscid)
for info in fs['mdsmap']['info'].values():
- if info['rank'] >= 0:
+ if info['rank'] >= 0 and info['state'] != 'up:standby-replay':
yield info
def get_rank(self, fscid, rank):
def set_max_mds(self, max_mds):
self.set_var("max_mds", "%d" % max_mds)
+ def set_allow_standby_replay(self, yes):
+ self.set_var("allow_standby_replay", str(yes).lower())
+
def set_allow_new_snaps(self, yes):
self.set_var("allow_new_snaps", str(yes).lower(), '--yes-i-really-mean-it')
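
set_allow_standby_replay() follows the same set_var() pattern as the other per-filesystem flags. A hedged sketch of the CLI commands these setters correspond to, assuming a filesystem named cephfs; the framework itself presumably issues them through its own cluster-command helpers rather than subprocess:

    # Hedged sketch: CLI equivalents of the setters above.
    import subprocess

    def set_allow_standby_replay(fs_name, yes):
        # ceph fs set <fs_name> allow_standby_replay true|false
        subprocess.check_call(["ceph", "fs", "set", fs_name,
                               "allow_standby_replay", str(yes).lower()])

    def set_max_mds(fs_name, n):
        # ceph fs set <fs_name> max_mds <n>
        subprocess.check_call(["ceph", "fs", "set", fs_name, "max_mds", str(n)])
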
status = self.getinfo()
return status.get_rank(self.id, rank)
+ def rank_restart(self, rank=0, status=None):
+ name = self.get_rank(rank=rank, status=status)['name']
+ self.mds_restart(mds_id=name)
+
+ def rank_signal(self, signal, rank=0, status=None):
+ name = self.get_rank(rank=rank, status=status)['name']
+ self.mds_signal(name, signal)
+
+ def rank_fail(self, rank=0):
+ self.mon_manager.raw_cluster_cmd("mds", "fail", "{}:{}".format(self.id, rank))
+
def get_ranks(self, status=None):
if status is None:
status = self.getinfo()
return status.get_ranks(self.id)
+ def get_replays(self, status=None):
+ if status is None:
+ status = self.getinfo()
+ return status.get_replays(self.id)
+
+ def get_replay(self, rank=0, status=None):
+ for replay in self.get_replays(status=status):
+ if replay['rank'] == rank:
+ return replay
+ return None
+
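
get_ranks() and the new get_replays()/get_replay() split the mdsmap info entries by state, matching the filter added to get_ranks above: a standby-replay daemon holds a rank but is excluded from the active list. A self-contained sketch of that split over an assumed mdsmap shape (gids and names are made up):

    # Sketch of the rank/replay split; entry shape assumed from the filter
    # above, values illustrative.
    mdsmap_info = {
        '4101': {'gid': 4101, 'name': 'a', 'rank': 0, 'state': 'up:active'},
        '4102': {'gid': 4102, 'name': 'b', 'rank': 0, 'state': 'up:standby-replay'},
        '4103': {'gid': 4103, 'name': 'c', 'rank': -1, 'state': 'up:standby'},
    }

    ranks = [i for i in mdsmap_info.values()
             if i['rank'] >= 0 and i['state'] != 'up:standby-replay']
    replays = [i for i in mdsmap_info.values()
               if i['rank'] >= 0 and i['state'] == 'up:standby-replay']

    assert [i['name'] for i in ranks] == ['a']
    assert [i['name'] for i in replays] == ['b']
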
def get_rank_names(self, status=None):
"""
Return MDS daemon names of those daemons holding a rank,
import json
import logging
from unittest import case, SkipTest
+from random import randint
from cephfs_test_case import CephFSTestCase
from teuthology.exceptions import CommandFailedError
class TestStandbyReplay(CephFSTestCase):
MDSS_REQUIRED = 4
- REQUIRE_FILESYSTEM = False
- def set_standby_for(self, leader, follower, replay):
- self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
+ def _confirm_no_replay(self):
+ status = self.fs.status()
+ standby_count = len(list(status.get_standbys()))
+ self.assertEqual(0, len(list(self.fs.get_replays(status=status))))
+ return status
+
+ def _confirm_single_replay(self, full=True, status=None):
+ status = self.fs.wait_for_daemons(status=status)
+ ranks = sorted(self.fs.get_mds_map(status=status)['in'])
+ replays = list(self.fs.get_replays(status=status))
+ checked_replays = set()
+ for rank in ranks:
+ has_replay = False
+ for replay in replays:
+ if replay['rank'] == rank:
+ self.assertFalse(has_replay)
+ has_replay = True
+ checked_replays.add(replay['gid'])
+ if full and not has_replay:
+ raise RuntimeError("rank "+str(rank)+" has no standby-replay follower")
+ self.assertEqual(checked_replays, set(info['gid'] for info in replays))
+ return status
+
+ def _check_replay_takeover(self, status, rank=0):
+ replay = self.fs.get_replay(rank=rank, status=status)
+ new_status = self.fs.wait_for_daemons()
+ new_active = self.fs.get_rank(rank=rank, status=new_status)
if replay:
- self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")
-
- def get_info_by_name(self, mds_name):
- status = self.mds_cluster.status()
- info = status.get_mds(mds_name)
- if info is None:
- log.warn(str(status))
- raise RuntimeError("MDS '{0}' not found".format(mds_name))
+ self.assertEqual(replay['gid'], new_active['gid'])
else:
- return info
+ # double check takeover came from a standby (or some new daemon via restart)
+ found = False
+ for info in status.get_standbys():
+ if info['gid'] == new_active['gid']:
+ found = True
+ break
+ if not found:
+ for info in status.get_all():
+ self.assertNotEqual(info['gid'], new_active['gid'])
+ return new_status
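
Together these helpers encode the invariants the tests below rely on: each active rank has at most one standby-replay follower, and after a takeover the follower's gid becomes the new active's gid (or, failing that, the new active is a daemon that was not in the old map). A small sketch of the per-rank singleton check, with made-up replay records:

    # Sketch of the check performed by _confirm_single_replay();
    # rank/gid values are illustrative.
    replays = [{'rank': 0, 'gid': 4102}, {'rank': 1, 'gid': 4105}]
    ranks = [0, 1]

    for rank in ranks:
        followers = [r for r in replays if r['rank'] == rank]
        assert len(followers) <= 1, "more than one standby-replay for rank %d" % rank
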
- def test_standby_replay_unused(self):
- # Pick out exactly 3 daemons to be run during test
- use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
- mds_a, mds_b, mds_c = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
+ def test_standby_replay_singleton(self):
+ """
+ That only one MDS becomes standby-replay.
+ """
- # B and C should both follow A, but only one will
- # really get into standby replay state.
- self.set_standby_for(mds_a, mds_b, True)
- self.set_standby_for(mds_a, mds_c, True)
+ self._confirm_no_replay()
+ self.fs.set_allow_standby_replay(True)
+ time.sleep(30)
+ self._confirm_single_replay()
- # Create FS and start A
- fs_a = self.mds_cluster.newfs("alpha")
- self.mds_cluster.mds_restart(mds_a)
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_a])
+ def test_standby_replay_singleton_fail(self):
+ """
+        That failures don't violate the singleton constraint.
+ """
- # Start B, he should go into standby replay
- self.mds_cluster.mds_restart(mds_b)
- self.wait_for_daemon_start([mds_b])
- info_b = self.get_info_by_name(mds_b)
- self.assertEqual(info_b['state'], "up:standby-replay")
- self.assertEqual(info_b['standby_for_name'], mds_a)
- self.assertEqual(info_b['rank'], 0)
-
- # Start C, he should go into standby (*not* replay)
- self.mds_cluster.mds_restart(mds_c)
- self.wait_for_daemon_start([mds_c])
- info_c = self.get_info_by_name(mds_c)
- self.assertEqual(info_c['state'], "up:standby")
- self.assertEqual(info_c['standby_for_name'], mds_a)
- self.assertEqual(info_c['rank'], -1)
-
- # Kill B, C should go into standby replay
- self.mds_cluster.mds_stop(mds_b)
- self.mds_cluster.mds_fail(mds_b)
- self.wait_until_equal(
- lambda: self.get_info_by_name(mds_c)['state'],
- "up:standby-replay",
- 60)
- info_c = self.get_info_by_name(mds_c)
- self.assertEqual(info_c['state'], "up:standby-replay")
- self.assertEqual(info_c['standby_for_name'], mds_a)
- self.assertEqual(info_c['rank'], 0)
-
- def test_standby_failure(self):
+ self._confirm_no_replay()
+ self.fs.set_allow_standby_replay(True)
+ status = self._confirm_single_replay()
+
+ for i in range(10):
+ time.sleep(randint(1, 5))
+ self.fs.rank_restart(status=status)
+ status = self._check_replay_takeover(status)
+ status = self._confirm_single_replay(status=status)
+
+ for i in range(10):
+ time.sleep(randint(1, 5))
+ self.fs.rank_fail()
+ status = self._check_replay_takeover(status)
+ status = self._confirm_single_replay(status=status)
+
+ def test_standby_replay_singleton_fail_multimds(self):
+ """
+        That failures don't violate the singleton constraint with multiple actives.
+ """
+
+ status = self._confirm_no_replay()
+ new_max_mds = randint(2, len(list(status.get_standbys())))
+ self.fs.set_max_mds(new_max_mds)
+ self.fs.wait_for_daemons() # wait for actives to come online!
+ self.fs.set_allow_standby_replay(True)
+ status = self._confirm_single_replay(full=False)
+
+ for i in range(10):
+ time.sleep(randint(1, 5))
+ victim = randint(0, new_max_mds-1)
+ self.fs.rank_restart(rank=victim, status=status)
+ status = self._check_replay_takeover(status, rank=victim)
+ status = self._confirm_single_replay(status=status, full=False)
+
+ for i in range(10):
+ time.sleep(randint(1, 5))
+ victim = randint(0, new_max_mds-1)
+ self.fs.rank_fail(rank=victim)
+ status = self._check_replay_takeover(status, rank=victim)
+ status = self._confirm_single_replay(status=status, full=False)
+
+ def test_standby_replay_failure(self):
"""
That the failure of a standby-replay daemon happens cleanly
and doesn't interrupt anything else.
"""
- # Pick out exactly 2 daemons to be run during test
- use_daemons = sorted(self.mds_cluster.mds_ids[0:2])
- mds_a, mds_b = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- # Configure two pairs of MDSs that are standby for each other
- self.set_standby_for(mds_a, mds_b, True)
- self.set_standby_for(mds_b, mds_a, False)
- # Create FS alpha and get mds_a to come up as active
- fs_a = self.mds_cluster.newfs("alpha")
- self.mds_cluster.mds_restart(mds_a)
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_a])
+ status = self._confirm_no_replay()
+ self.fs.set_max_mds(1)
+ self.fs.set_allow_standby_replay(True)
+ status = self._confirm_single_replay()
- # Start the standbys
- self.mds_cluster.mds_restart(mds_b)
- self.wait_for_daemon_start([mds_b])
-
- # See the standby come up as the correct rank
- info_b = self.get_info_by_name(mds_b)
- self.assertEqual(info_b['state'], "up:standby-replay")
- self.assertEqual(info_b['standby_for_name'], mds_a)
- self.assertEqual(info_b['rank'], 0)
-
- # Kill the standby
- self.mds_cluster.mds_stop(mds_b)
- self.mds_cluster.mds_fail(mds_b)
-
- # See that the standby is gone and the active remains
- self.assertEqual(fs_a.get_active_names(), [mds_a])
- mds_map = fs_a.get_mds_map()
- self.assertEqual(len(mds_map['info']), 1)
- self.assertEqual(mds_map['failed'], [])
- self.assertEqual(mds_map['damaged'], [])
- self.assertEqual(mds_map['stopped'], [])
+ for i in range(10):
+ time.sleep(randint(1, 5))
+ victim = self.fs.get_replay(status=status)
+ self.fs.mds_restart(mds_id=victim['name'])
+ status = self._confirm_single_replay(status=status)
def test_rank_stopped(self):
"""
That when a rank is STOPPED, standby replays for
that rank get torn down
"""
- # Pick out exactly 2 daemons to be run during test
- use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
- mds_a, mds_b, mds_a_s, mds_b_s = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
- # a and b both get a standby
- self.set_standby_for(mds_a, mds_a_s, True)
- self.set_standby_for(mds_b, mds_b_s, True)
-
- # Create FS alpha and get mds_a to come up as active
- fs_a = self.mds_cluster.newfs("alpha")
- fs_a.set_max_mds(2)
+ status = self._confirm_no_replay()
+ standby_count = len(list(status.get_standbys()))
+ self.fs.set_max_mds(2)
+ self.fs.set_allow_standby_replay(True)
+ status = self._confirm_single_replay()
- self.mds_cluster.mds_restart(mds_a)
- self.wait_until_equal(lambda: fs_a.get_active_names(), [mds_a], 30)
- self.mds_cluster.mds_restart(mds_b)
- fs_a.wait_for_daemons()
- self.assertEqual(sorted(fs_a.get_active_names()), [mds_a, mds_b])
-
- # Start the standbys
- self.mds_cluster.mds_restart(mds_b_s)
- self.wait_for_daemon_start([mds_b_s])
- self.mds_cluster.mds_restart(mds_a_s)
- self.wait_for_daemon_start([mds_a_s])
- info_b_s = self.get_info_by_name(mds_b_s)
- self.assertEqual(info_b_s['state'], "up:standby-replay")
- info_a_s = self.get_info_by_name(mds_a_s)
- self.assertEqual(info_a_s['state'], "up:standby-replay")
-
- # Shrink the cluster
- fs_a.set_max_mds(1)
- self.wait_until_equal(
- lambda: fs_a.get_active_names(), [mds_a],
- 60
- )
+ self.fs.set_max_mds(1) # stop rank 1
- # Both 'b' and 'b_s' should go back to being standbys
- self.wait_until_equal(
- lambda: self.mds_cluster.get_standby_daemons(), {mds_b, mds_b_s},
- 60
- )
+ status = self._confirm_single_replay()
+        self.assertEqual(standby_count, len(list(status.get_standbys())))
class TestMultiFilesystems(CephFSTestCase):
fs_a.set_max_mds(3)
self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60,
reject_fn=lambda v: v > 3 or v < 2)
-
- def test_standby_for_name(self):
- # Pick out exactly 4 daemons to be run during test
- use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
- mds_a, mds_b, mds_c, mds_d = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- def set_standby_for(leader, follower, replay):
- self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
- if replay:
- self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")
-
- # Configure two pairs of MDSs that are standby for each other
- set_standby_for(mds_a, mds_b, True)
- set_standby_for(mds_b, mds_a, False)
- set_standby_for(mds_c, mds_d, True)
- set_standby_for(mds_d, mds_c, False)
-
- # Create FS alpha and get mds_a to come up as active
- fs_a = self.mds_cluster.newfs("alpha")
- self.mds_cluster.mds_restart(mds_a)
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_a])
-
- # Create FS bravo and get mds_c to come up as active
- fs_b = self.mds_cluster.newfs("bravo")
- self.mds_cluster.mds_restart(mds_c)
- fs_b.wait_for_daemons()
- self.assertEqual(fs_b.get_active_names(), [mds_c])
-
- # Start the standbys
- self.mds_cluster.mds_restart(mds_b)
- self.mds_cluster.mds_restart(mds_d)
- self.wait_for_daemon_start([mds_b, mds_d])
-
- def get_info_by_name(fs, mds_name):
- mds_map = fs.get_mds_map()
- for gid_str, info in mds_map['info'].items():
- if info['name'] == mds_name:
- return info
-
- log.warn(json.dumps(mds_map, indent=2))
- raise RuntimeError("MDS '{0}' not found in filesystem MDSMap".format(mds_name))
-
- # See both standbys come up as standby replay for the correct ranks
- # mds_b should be in filesystem alpha following mds_a
- info_b = get_info_by_name(fs_a, mds_b)
- self.assertEqual(info_b['state'], "up:standby-replay")
- self.assertEqual(info_b['standby_for_name'], mds_a)
- self.assertEqual(info_b['rank'], 0)
- # mds_d should be in filesystem alpha following mds_c
- info_d = get_info_by_name(fs_b, mds_d)
- self.assertEqual(info_d['state'], "up:standby-replay")
- self.assertEqual(info_d['standby_for_name'], mds_c)
- self.assertEqual(info_d['rank'], 0)
-
- # Kill both active daemons
- self.mds_cluster.mds_stop(mds_a)
- self.mds_cluster.mds_fail(mds_a)
- self.mds_cluster.mds_stop(mds_c)
- self.mds_cluster.mds_fail(mds_c)
-
- # Wait for standbys to take over
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_b])
- fs_b.wait_for_daemons()
- self.assertEqual(fs_b.get_active_names(), [mds_d])
-
- # Start the original active daemons up again
- self.mds_cluster.mds_restart(mds_a)
- self.mds_cluster.mds_restart(mds_c)
- self.wait_for_daemon_start([mds_a, mds_c])
-
- self.assertEqual(set(self.mds_cluster.get_standby_daemons()),
- {mds_a, mds_c})
-
- def test_standby_for_rank(self):
- use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
- mds_a, mds_b, mds_c, mds_d = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- def set_standby_for(leader_rank, leader_fs, follower_id):
- self.set_conf("mds.{0}".format(follower_id),
- "mds_standby_for_rank", leader_rank)
-
- fscid = leader_fs.get_namespace_id()
- self.set_conf("mds.{0}".format(follower_id),
- "mds_standby_for_fscid", fscid)
-
- fs_a = self.mds_cluster.newfs("alpha")
- fs_b = self.mds_cluster.newfs("bravo")
- set_standby_for(0, fs_a, mds_a)
- set_standby_for(0, fs_a, mds_b)
- set_standby_for(0, fs_b, mds_c)
- set_standby_for(0, fs_b, mds_d)
-
- self.mds_cluster.mds_restart(mds_a)
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_a])
-
- self.mds_cluster.mds_restart(mds_c)
- fs_b.wait_for_daemons()
- self.assertEqual(fs_b.get_active_names(), [mds_c])
-
- self.mds_cluster.mds_restart(mds_b)
- self.mds_cluster.mds_restart(mds_d)
- self.wait_for_daemon_start([mds_b, mds_d])
-
- self.mds_cluster.mds_stop(mds_a)
- self.mds_cluster.mds_fail(mds_a)
- self.mds_cluster.mds_stop(mds_c)
- self.mds_cluster.mds_fail(mds_c)
-
- fs_a.wait_for_daemons()
- self.assertEqual(fs_a.get_active_names(), [mds_b])
- fs_b.wait_for_daemons()
- self.assertEqual(fs_b.get_active_names(), [mds_d])
-
- def test_standby_for_fscid(self):
- """
- That I can set a standby FSCID with no rank, and the result is
- that daemons join any rank for that filesystem.
- """
- use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
- mds_a, mds_b, mds_c, mds_d = use_daemons
-
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- def set_standby_for(leader_fs, follower_id):
- fscid = leader_fs.get_namespace_id()
- self.set_conf("mds.{0}".format(follower_id),
- "mds_standby_for_fscid", fscid)
-
- # Create two filesystems which should have two ranks each
- fs_a = self.mds_cluster.newfs("alpha")
-
- fs_b = self.mds_cluster.newfs("bravo")
-
- fs_a.set_max_mds(2)
- fs_b.set_max_mds(2)
-
- # Set all the daemons to have a FSCID assignment but no other
- # standby preferences.
- set_standby_for(fs_a, mds_a)
- set_standby_for(fs_a, mds_b)
- set_standby_for(fs_b, mds_c)
- set_standby_for(fs_b, mds_d)
-
- # Now when we start all daemons at once, they should fall into
- # ranks in the right filesystem
- self.mds_cluster.mds_restart(mds_a)
- self.mds_cluster.mds_restart(mds_b)
- self.mds_cluster.mds_restart(mds_c)
- self.mds_cluster.mds_restart(mds_d)
- self.wait_for_daemon_start([mds_a, mds_b, mds_c, mds_d])
- fs_a.wait_for_daemons()
- fs_b.wait_for_daemons()
- self.assertEqual(set(fs_a.get_active_names()), {mds_a, mds_b})
- self.assertEqual(set(fs_b.get_active_names()), {mds_c, mds_d})
-
- def test_standby_for_invalid_fscid(self):
- """
- That an invalid standby_fscid does not cause a mon crash
- """
- use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
- mds_a, mds_b, mds_c = use_daemons
- log.info("Using MDS daemons: {0}".format(use_daemons))
-
- def set_standby_for_rank(leader_rank, follower_id):
- self.set_conf("mds.{0}".format(follower_id),
- "mds_standby_for_rank", leader_rank)
-
- # Create one fs
- fs_a = self.mds_cluster.newfs("cephfs")
-
- # Get configured mons in the cluster, so we can see if any
- # crashed later.
- configured_mons = fs_a.mon_manager.get_mon_quorum()
-
- # Set all the daemons to have a rank assignment but no other
- # standby preferences.
- set_standby_for_rank(0, mds_a)
- set_standby_for_rank(0, mds_b)
-
- # Set third daemon to have invalid fscid assignment and no other
- # standby preferences
- invalid_fscid = 123
- self.set_conf("mds.{0}".format(mds_c), "mds_standby_for_fscid", invalid_fscid)
-
- #Restart all the daemons to make the standby preference applied
- self.mds_cluster.mds_restart(mds_a)
- self.mds_cluster.mds_restart(mds_b)
- self.mds_cluster.mds_restart(mds_c)
- self.wait_for_daemon_start([mds_a, mds_b, mds_c])
-
- #Stop active mds daemon service of fs
- if (fs_a.get_active_names(), [mds_a]):
- self.mds_cluster.mds_stop(mds_a)
- self.mds_cluster.mds_fail(mds_a)
- fs_a.wait_for_daemons()
- else:
- self.mds_cluster.mds_stop(mds_b)
- self.mds_cluster.mds_fail(mds_b)
- fs_a.wait_for_daemons()
-
- #Get active mons from cluster
- active_mons = fs_a.mon_manager.get_mon_quorum()
-
- #Check for active quorum mon status and configured mon status
- self.assertEqual(active_mons, configured_mons,
- "Not all mons are in quorum Invalid standby invalid fscid test failed!")
old_journal_version = JOURNAL_FORMAT_LEGACY
new_journal_version = JOURNAL_FORMAT_RESILIENT
- # Pick out two daemons to use
- mds_a, mds_b = sorted(self.mds_cluster.mds_ids[0:2])
-
self.mount_a.umount_wait()
self.fs.mds_stop()
- # Enable standby replay, to cover the bug case #8811 where
- # a standby replay might mistakenly end up trying to rewrite
- # the journal at the same time as an active daemon.
- self.fs.set_ceph_conf('mds', 'mds standby replay', "true")
- self.fs.set_ceph_conf('mds', 'mds standby for rank', "0")
-
# Create a filesystem using the older journal format.
self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
+ self.fs.mds_restart()
self.fs.recreate()
- self.fs.mds_restart(mds_id=mds_a)
- self.fs.wait_for_daemons()
- self.assertEqual(self.fs.get_active_names(), [mds_a])
- def replay_names():
- return [s['name']
- for s in self.fs.status().get_replays(fscid = self.fs.id)]
+ # Enable standby replay, to cover the bug case #8811 where
+ # a standby replay might mistakenly end up trying to rewrite
+ # the journal at the same time as an active daemon.
+ self.fs.set_allow_standby_replay(True)
- # Start the standby and wait for it to come up
- self.fs.mds_restart(mds_id=mds_b)
- self.wait_until_equal(
- replay_names,
- [mds_b],
- timeout = 30)
+ status = self.fs.wait_for_daemons()
+
+ self.assertTrue(self.fs.get_replay(status=status) is not None)
# Do some client work so that the log is populated with something.
with self.mount_a.mounted():
self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)
# Restart the MDS.
- self.fs.mds_fail_restart(mds_id=mds_a)
- self.fs.mds_fail_restart(mds_id=mds_b)
+ self.fs.mds_fail_restart()
# This ensures that all daemons come up into a valid state
- self.fs.wait_for_daemons()
+ status = self.fs.wait_for_daemons()
# Check that files created in the initial client workload are still visible
# in a client mount.
})
# Check that both an active and a standby replay are still up
- self.assertEqual(len(replay_names()), 1)
- self.assertEqual(len(self.fs.get_active_names()), 1)
- self.assertTrue(self.mds_cluster.mds_daemons[mds_a].running())
- self.assertTrue(self.mds_cluster.mds_daemons[mds_b].running())
-
+ status = self.fs.status()
+ self.assertEqual(len(list(self.fs.get_replays(status=status))), 1)
+ self.assertEqual(len(list(self.fs.get_ranks(status=status))), 1)
import logging
+import signal
import time
from textwrap import dedent
from tasks.cephfs.fuse_mount import FuseMount
class TestSnapshots(CephFSTestCase):
MDSS_REQUIRED = 3
- def _check_subtree(self, status, rank, path):
- got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
+ def _check_subtree(self, rank, path, status=None):
+ got_subtrees = self.fs.rank_asok(["get", "subtrees"], rank=rank, status=status)
for s in got_subtrees:
if s['dir']['path'] == path and s['auth_first'] == rank:
return True
return False
- def _stop_standby_mds(self):
- for i in self.fs.get_standby_daemons():
- self.fs.mds_stop(i)
- self.fs.mds_fail(i)
- self.wait_until_equal(lambda: len(self.fs.get_standby_daemons()), expect_val=0, timeout=30)
+ def _get_snapclient_dump(self, rank=0, status=None):
+ return self.fs.rank_asok(["dump", "snaps"], rank=rank, status=status)
- def _get_snapclient_dump(self, rank_id):
- return self.fs.mds_asok(["dump", "snaps"], rank_id)
+ def _get_snapserver_dump(self, rank=0, status=None):
+ return self.fs.rank_asok(["dump", "snaps", "--server"], rank=rank, status=status)
- def _get_snapserver_dump(self, rank0_id):
- return self.fs.mds_asok(["dump", "snaps", "--server"], rank0_id)
+ def _get_last_created_snap(self, rank=0, status=None):
+        return int(self._get_snapserver_dump(rank, status=status)["last_created"])
- def _get_last_created_snap(self, rank0_id):
- return int(self._get_snapserver_dump(rank0_id)["last_created"])
+ def _get_last_destroyed_snap(self, rank=0, status=None):
+        return int(self._get_snapserver_dump(rank, status=status)["last_destroyed"])
- def _get_last_destroyed_snap(self, rank0_id):
- return int(self._get_snapserver_dump(rank0_id)["last_destroyed"])
+ def _get_pending_snap_update(self, rank=0, status=None):
+        return self._get_snapserver_dump(rank, status=status)["pending_update"]
- def _get_pending_snap_update(self, rank0_id):
- return self._get_snapserver_dump(rank0_id)["pending_update"]
-
- def _get_pending_snap_destroy(self, rank0_id):
- return self._get_snapserver_dump(rank0_id)["pending_destroy"]
+ def _get_pending_snap_destroy(self, rank=0, status=None):
+        return self._get_snapserver_dump(rank, status=status)["pending_destroy"]
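
These snap-table helpers all wrap the MDS admin-socket "dump snaps" command, with --server asked of the rank hosting the snap server (rank 0). A hedged sketch of reading the same counters directly over the admin socket, assuming a daemon named mds.a:

    # Hedged sketch: the asok command the helpers above wrap, run via the CLI
    # against one MDS (daemon name is illustrative).
    import json
    import subprocess

    out = subprocess.check_output(
        ["ceph", "daemon", "mds.a", "dump", "snaps", "--server"])
    dump = json.loads(out)
    print(dump["last_created"], dump["last_destroyed"])
    print(dump["pending_update"], dump["pending_destroy"])
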
def test_kill_mdstable(self):
"""
self.fs.set_allow_new_snaps(True);
self.fs.set_max_mds(2)
- self.fs.wait_for_daemons()
+ status = self.fs.wait_for_daemons()
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
- self._stop_standby_mds()
-
- status = self.fs.status()
- rank0_id=status.get_rank(self.fs.id, 0)['name']
- rank1_id=status.get_rank(self.fs.id, 1)['name']
- self.set_conf("mds.{0}".format(rank0_id), "mds standby for rank", 0)
- self.set_conf("mds.{0}".format(rank1_id), "mds standby for rank", 1)
-
# setup subtrees
self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
- self.wait_until_true(lambda: self._check_subtree(status, 1, '/d1'), timeout=30)
+ self.wait_until_true(lambda: self._check_subtree(1, '/d1', status=status), timeout=30)
- last_created = self._get_last_created_snap(rank0_id)
+ last_created = self._get_last_created_snap(rank=0,status=status)
# mds_kill_mdstable_at:
# 1: MDSTableServer::handle_prepare
# 6: MDSTableServer::_commit_logged
for i in [1,2,5,6]:
log.info("testing snapserver mds_kill_mdstable_at={0}".format(i))
- self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank0_id)
+
+ status = self.fs.status()
+ rank0 = self.fs.get_rank(rank=0, status=status)
+ self.fs.rank_freeze(True, rank=0)
+ self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
- self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank0_id), timeout=grace*2);
- self.delete_mds_coredump(rank0_id);
+ self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+ self.delete_mds_coredump(rank0['name']);
- self.mds_cluster.mds_fail_restart(rank0_id)
- self.wait_for_daemon_start([rank0_id])
- self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+ self.fs.rank_fail(rank=0)
+ self.fs.mds_restart(rank0['name'])
+ self.wait_for_daemon_start([rank0['name']])
+ status = self.fs.wait_for_daemons()
proc.wait()
last_created += 1
- self.wait_until_true(lambda: self._get_last_created_snap(rank0_id) == last_created, timeout=30)
+ self.wait_until_true(lambda: self._get_last_created_snap(rank=0) == last_created, timeout=30)
self.set_conf("mds", "mds_reconnect_timeout", "5")
# set mds_kill_mdstable_at, also kill snapclient
for i in [2,5,6]:
log.info("testing snapserver mds_kill_mdstable_at={0}, also kill snapclient".format(i))
- last_created = self._get_last_created_snap(rank0_id)
-
- self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank0_id)
+ status = self.fs.status()
+ last_created = self._get_last_created_snap(rank=0, status=status)
+
+ rank0 = self.fs.get_rank(rank=0, status=status)
+ rank1 = self.fs.get_rank(rank=1, status=status)
+ self.fs.rank_freeze(True, rank=0) # prevent failover...
+ self.fs.rank_freeze(True, rank=1) # prevent failover...
+ self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
- self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank0_id), timeout=grace*2);
- self.delete_mds_coredump(rank0_id);
+ self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+ self.delete_mds_coredump(rank0['name']);
- self.mds_cluster.mds_stop(rank1_id)
- self.mds_cluster.mds_fail(rank1_id);
+ self.fs.rank_signal(signal.SIGKILL, rank=1)
self.mount_a.kill()
self.mount_a.kill_cleanup()
- self.mds_cluster.mds_fail_restart(rank0_id)
- self.wait_for_daemon_start([rank0_id])
+ self.fs.rank_fail(rank=0)
+ self.fs.mds_restart(rank0['name'])
+ self.wait_for_daemon_start([rank0['name']])
self.fs.wait_for_state('up:resolve', rank=0, timeout=MDS_RESTART_GRACE)
if i in [2,5]:
- self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 1)
+ self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
elif i == 6:
- self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 0)
- self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+ self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0)
+ self.assertGreater(self._get_last_created_snap(rank=0), last_created)
- self.mds_cluster.mds_restart(rank1_id)
- self.wait_for_daemon_start([rank1_id])
+ self.fs.rank_fail(rank=1)
+ self.fs.mds_restart(rank1['name'])
+ self.wait_for_daemon_start([rank1['name']])
self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
if i in [2,5]:
- self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
+ self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
if i == 2:
- self.assertEqual(self._get_last_created_snap(rank0_id), last_created)
+ self.assertEqual(self._get_last_created_snap(rank=0), last_created)
else:
- self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+ self.assertGreater(self._get_last_created_snap(rank=0), last_created)
self.mount_a.mount()
self.mount_a.wait_until_mounted()
# 7: MDSTableClient::handle_request (got ack)
for i in [3,4,7]:
log.info("testing snapclient mds_kill_mdstable_at={0}".format(i))
- last_created = self._get_last_created_snap(rank0_id)
+ last_created = self._get_last_created_snap(rank=0)
- self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank1_id)
+ status = self.fs.status()
+ rank1 = self.fs.get_rank(rank=1, status=status)
+ self.fs.rank_freeze(True, rank=1) # prevent failover...
+ self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status)
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
- self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank1_id), timeout=grace*2);
- self.delete_mds_coredump(rank1_id);
+ self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+ self.delete_mds_coredump(rank1['name']);
self.mount_a.kill()
self.mount_a.kill_cleanup()
if i in [3,4]:
- self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 1)
+ self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
elif i == 7:
- self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 0)
- self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+ self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0)
+ self.assertGreater(self._get_last_created_snap(rank=0), last_created)
- self.mds_cluster.mds_fail_restart(rank1_id)
- self.wait_for_daemon_start([rank1_id])
- self.fs.wait_for_state('up:active', rank=1, timeout=MDS_RESTART_GRACE)
+ self.fs.rank_fail(rank=1)
+ self.fs.mds_restart(rank1['name'])
+ self.wait_for_daemon_start([rank1['name']])
+ status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
if i in [3,4]:
- self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
+ self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
if i == 3:
- self.assertEqual(self._get_last_created_snap(rank0_id), last_created)
+ self.assertEqual(self._get_last_created_snap(rank=0), last_created)
else:
- self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+ self.assertGreater(self._get_last_created_snap(rank=0), last_created)
self.mount_a.mount()
self.mount_a.wait_until_mounted()
# 3: MDSTableClient::handle_request (got agree)
# 8: MDSTableServer::handle_rollback
log.info("testing snapclient mds_kill_mdstable_at=3, snapserver mds_kill_mdstable_at=8")
- last_created = self._get_last_created_snap(rank0_id)
+ last_created = self._get_last_created_snap(rank=0)
- self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "8".format(i)], rank0_id)
- self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "3".format(i)], rank1_id)
+ status = self.fs.status()
+ rank0 = self.fs.get_rank(rank=0, status=status)
+ rank1 = self.fs.get_rank(rank=1, status=status)
+ self.fs.rank_freeze(True, rank=0)
+ self.fs.rank_freeze(True, rank=1)
+        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status)
+        self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status)
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4".format(i)], wait=False)
- self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank1_id), timeout=grace*2);
- self.delete_mds_coredump(rank1_id);
+ self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+ self.delete_mds_coredump(rank1['name']);
self.mount_a.kill()
self.mount_a.kill_cleanup()
- self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 1)
+ self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
- self.mds_cluster.mds_fail_restart(rank1_id)
+ self.fs.rank_fail(rank=1)
+ self.fs.mds_restart(rank1['name'])
+ self.wait_for_daemon_start([rank1['name']])
# rollback triggers assertion
- self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank0_id), timeout=grace*2);
- self.delete_mds_coredump(rank0_id);
-
- self.mds_cluster.mds_fail_restart(rank0_id)
- self.wait_for_daemon_start([rank0_id])
+ self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+ self.delete_mds_coredump(rank0['name']);
+ self.fs.rank_fail(rank=0)
+ self.fs.mds_restart(rank0['name'])
+ self.wait_for_daemon_start([rank0['name']])
self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
- # mds.1 should re-send rollack message
- self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
- self.assertEqual(self._get_last_created_snap(rank0_id), last_created)
+
+ # mds.1 should re-send rollback message
+ self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+ self.assertEqual(self._get_last_created_snap(rank=0), last_created)
self.mount_a.mount()
self.mount_a.wait_until_mounted()
"""
self.fs.set_allow_new_snaps(True);
self.fs.set_max_mds(3)
- self.fs.wait_for_daemons()
+ status = self.fs.wait_for_daemons()
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
- self._stop_standby_mds()
-
- status = self.fs.status()
- rank0_id=status.get_rank(self.fs.id, 0)['name']
- rank1_id=status.get_rank(self.fs.id, 1)['name']
- rank2_id=status.get_rank(self.fs.id, 2)['name']
- self.set_conf("mds.{0}".format(rank0_id), "mds standby for rank", 0)
- self.set_conf("mds.{0}".format(rank1_id), "mds standby for rank", 1)
- self.set_conf("mds.{0}".format(rank2_id), "mds standby for rank", 2)
-
self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1")
self.mount_a.setfattr("d0/d2", "ceph.dir.pin", "2")
- self.wait_until_true(lambda: self._check_subtree(status, 2, '/d0/d2'), timeout=30)
- self.wait_until_true(lambda: self._check_subtree(status, 1, '/d0/d1'), timeout=5)
- self.wait_until_true(lambda: self._check_subtree(status, 0, '/d0'), timeout=5)
+ self.wait_until_true(lambda: self._check_subtree(2, '/d0/d2', status=status), timeout=30)
+ self.wait_until_true(lambda: self._check_subtree(1, '/d0/d1', status=status), timeout=5)
+ self.wait_until_true(lambda: self._check_subtree(0, '/d0', status=status), timeout=5)
- def _check_snapclient_cache(snaps_dump, cache_dump=None, rank_id=-1):
+ def _check_snapclient_cache(snaps_dump, cache_dump=None, rank=0):
if cache_dump is None:
- cache_dump = self._get_snapclient_dump(rank_id)
+ cache_dump = self._get_snapclient_dump(rank=rank)
for key, value in cache_dump.iteritems():
if value != snaps_dump[key]:
return False
return True;
# sync after mksnap
- last_created = self._get_last_created_snap(rank0_id)
- self.mount_a.run_shell(["mkdir", Raw("d0/d1/dir/.snap/{s1,s2}")])
- self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
- self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+ last_created = self._get_last_created_snap(rank=0)
+ self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s1", "d0/d1/dir/.snap/s2"])
+ self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
+ self.assertGreater(self._get_last_created_snap(rank=0), last_created)
- snaps_dump = self._get_snapserver_dump(rank0_id)
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank0_id));
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank1_id));
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank2_id));
+ snaps_dump = self._get_snapserver_dump(rank=0)
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0));
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1));
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
# sync after rmsnap
- last_destroyed = self._get_last_destroyed_snap(rank0_id)
+ last_destroyed = self._get_last_destroyed_snap(rank=0)
self.mount_a.run_shell(["rmdir", "d0/d1/dir/.snap/s1"])
- self.wait_until_true(lambda: len(self._get_pending_snap_destroy(rank0_id)) == 0, timeout=30)
- self.assertGreater(self._get_last_destroyed_snap(rank0_id), last_destroyed)
+ self.wait_until_true(lambda: len(self._get_pending_snap_destroy(rank=0)) == 0, timeout=30)
+ self.assertGreater(self._get_last_destroyed_snap(rank=0), last_destroyed)
- snaps_dump = self._get_snapserver_dump(rank0_id)
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank0_id));
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank1_id));
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank2_id));
+ snaps_dump = self._get_snapserver_dump(rank=0)
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0));
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1));
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
# sync during mds recovers
- self.mds_cluster.mds_stop(rank2_id)
- self.mds_cluster.mds_fail_restart(rank2_id)
- self.wait_for_daemon_start([rank2_id])
- self.fs.wait_for_state('up:active', rank=2, timeout=MDS_RESTART_GRACE)
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank2_id));
-
- self.mds_cluster.mds_stop(rank0_id)
- self.mds_cluster.mds_stop(rank1_id)
- self.mds_cluster.mds_fail_restart(rank0_id)
- self.mds_cluster.mds_fail_restart(rank1_id)
- self.wait_for_daemon_start([rank0_id, rank1_id])
+ self.fs.rank_fail(rank=2)
+ status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
+
+ self.fs.rank_fail(rank=0)
+ self.fs.rank_fail(rank=1)
+ status = self.fs.wait_for_daemons()
self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank0_id));
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank1_id));
- self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank2_id));
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0));
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1));
+ self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2));
# kill at MDSTableClient::handle_notify_prep
- self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank2_id)
+ status = self.fs.status()
+ rank2 = self.fs.get_rank(rank=2, status=status)
+ self.fs.rank_freeze(True, rank=2)
+ self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status)
proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
- self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank2_id), timeout=grace*2);
- self.delete_mds_coredump(rank2_id);
+ self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+ self.delete_mds_coredump(rank2['name']);
# mksnap should wait for notify ack from mds.2
self.assertFalse(proc.finished);
# mksnap should proceed after mds.2 fails
- self.mds_cluster.mds_fail(rank2_id)
+ self.fs.rank_fail(rank=2)
self.wait_until_true(lambda: proc.finished, timeout=30);
- self.mds_cluster.mds_fail_restart(rank2_id)
- self.wait_for_daemon_start([rank2_id])
- self.fs.wait_for_state('up:active', rank=2, timeout=MDS_RESTART_GRACE)
+ self.fs.mds_restart(rank2['name'])
+ self.wait_for_daemon_start([rank2['name']])
+ status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
self.mount_a.run_shell(["rmdir", Raw("d0/d1/dir/.snap/*")])
# the recovering mds should sync all mds' cache when it enters resolve stage
self.set_conf("mds", "mds_reconnect_timeout", "5")
for i in range(1, 4):
- self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank2_id)
- last_created = self._get_last_created_snap(rank0_id)
+ status = self.fs.status()
+ rank2 = self.fs.get_rank(rank=2, status=status)
+ self.fs.rank_freeze(True, rank=2)
+ self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status)
+ last_created = self._get_last_created_snap(rank=0)
proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
- self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank2_id), timeout=grace*2);
- self.delete_mds_coredump(rank2_id);
+ self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+ self.delete_mds_coredump(rank2['name']);
self.mount_a.kill()
self.mount_a.kill_cleanup()
- self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 1)
+ self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1)
if i in [2,4]:
- self.mds_cluster.mds_stop(rank0_id)
- self.mds_cluster.mds_fail_restart(rank0_id)
+ self.fs.rank_fail(rank=0)
if i in [3,4]:
- self.mds_cluster.mds_stop(rank1_id)
- self.mds_cluster.mds_fail_restart(rank1_id)
+ self.fs.rank_fail(rank=1)
- self.mds_cluster.mds_fail_restart(rank2_id)
- self.wait_for_daemon_start([rank0_id, rank1_id, rank2_id])
+ self.fs.rank_fail(rank=2)
+ self.fs.mds_restart(rank2['name'])
+ self.wait_for_daemon_start([rank2['name']])
+ status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE)
- self.fs.wait_for_state('up:active', rank=2, timeout=MDS_RESTART_GRACE)
- rank0_cache = self._get_snapclient_dump(rank0_id)
- rank1_cache = self._get_snapclient_dump(rank1_id)
- rank2_cache = self._get_snapclient_dump(rank2_id)
+ rank0_cache = self._get_snapclient_dump(rank=0)
+ rank1_cache = self._get_snapclient_dump(rank=1)
+ rank2_cache = self._get_snapclient_dump(rank=2)
self.assertGreater(int(rank0_cache["last_created"]), last_created)
self.assertEqual(rank0_cache, rank1_cache);
self.assertEqual(rank0_cache, rank2_cache);
- self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
+ self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30)
- snaps_dump = self._get_snapserver_dump(rank0_id)
+ snaps_dump = self._get_snapserver_dump(rank=0)
self.assertEqual(snaps_dump["last_created"], rank0_cache["last_created"])
self.assertTrue(_check_snapclient_cache(snaps_dump, cache_dump=rank0_cache));
"""
self.fs.set_allow_new_snaps(True);
self.fs.set_max_mds(2)
- self.fs.wait_for_daemons()
-
- status = self.fs.status()
+ status = self.fs.wait_for_daemons()
self.mount_a.run_shell(["mkdir", "-p", "d0/d1"])
self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1")
- self.wait_until_true(lambda: self._check_subtree(status, 1, '/d0/d1'), timeout=30)
- self.wait_until_true(lambda: self._check_subtree(status, 0, '/d0'), timeout=5)
+ self.wait_until_true(lambda: self._check_subtree(1, '/d0/d1', status=status), timeout=30)
+ self.wait_until_true(lambda: self._check_subtree(0, '/d0', status=status), timeout=5)
self.mount_a.write_test_pattern("d0/d1/file_a", 8 * 1024 * 1024)
self.mount_a.run_shell(["mkdir", "d0/.snap/s1"])
"""
self.fs.set_allow_new_snaps(True);
self.fs.set_max_mds(2)
- self.fs.wait_for_daemons()
-
- status = self.fs.status()
+ status = self.fs.wait_for_daemons()
self.mount_a.run_shell(["mkdir", "d0", "d1"])
self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
- self.wait_until_true(lambda: self._check_subtree(status, 1, '/d1'), timeout=30)
- self.wait_until_true(lambda: self._check_subtree(status, 0, '/d0'), timeout=5)
+ self.wait_until_true(lambda: self._check_subtree(1, '/d1', status=status), timeout=30)
+ self.wait_until_true(lambda: self._check_subtree(0, '/d0', status=status), timeout=5)
self.mount_a.run_shell(["mkdir", "d0/d3"])
self.mount_a.run_shell(["mkdir", "d0/.snap/s1"])
"""
self.fs.set_allow_new_snaps(True);
self.fs.set_max_mds(2)
- self.fs.wait_for_daemons()
-
- status = self.fs.status()
+ status = self.fs.wait_for_daemons()
self.mount_a.run_shell(["mkdir", "d0", "d1"])
self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
- self.wait_until_true(lambda: self._check_subtree(status, 1, '/d1'), timeout=30)
- self.wait_until_true(lambda: self._check_subtree(status, 0, '/d0'), timeout=5)
+ self.wait_until_true(lambda: self._check_subtree(1, '/d1', status=status), timeout=30)
+ self.wait_until_true(lambda: self._check_subtree(0, '/d0', status=status), timeout=5)
self.mount_a.run_python(dedent("""
import os
"powercycling requested but RemoteConsole is not "
"initialized. Check ipmi config.")
- def revive_mds(self, mds, standby_for_rank=None):
+ def revive_mds(self, mds):
"""
        Revive mds -- do an ipmi powercycle (if indicated by the config)
- and then restart (using --hot-standby if specified.
+ and then restart.
"""
if self.config.get('powercycle'):
(remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
remote.console.power_on()
self.manager.make_admin_daemon_dir(self.ctx, remote)
args = []
- if standby_for_rank:
- args.extend(['--hot-standby', standby_for_rank])
self.ctx.daemons.get_daemon('mds', mds).restart(*args)
def wait_for_stable(self, rank = None, gid = None):