))
raise
- def assert_mds_crash(self, daemon_id):
- """
- Assert that the a particular MDS daemon crashes (block until
- it does)
- """
- try:
- self.mds_cluster.mds_daemons[daemon_id].proc.wait()
- except CommandFailedError as e:
- log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
- self.mds_cluster.mds_daemons[daemon_id].proc = None
-
- # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
- # catch it later and treat it as a failure.
- p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
- "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
- core_pattern = p.stdout.getvalue().strip()
- if os.path.dirname(core_pattern): # Non-default core_pattern with a directory in it
- # We have seen a core_pattern that looks like it's from teuthology's coredump
- # task, so proceed to clear out the core file
- log.info("Clearing core from pattern: {0}".format(core_pattern))
-
- # Determine the PID of the crashed MDS by inspecting the MDSMap, it had
- # to talk to the mons to get assigned a rank to reach the point of crashing
- addr = self.mds_cluster.status().get_mds(daemon_id)['addr']
- pid_str = addr.split("/")[1]
- log.info("Determined crasher PID was {0}".format(pid_str))
-
- # Substitute PID into core_pattern to get a glob
- core_glob = core_pattern.replace("%p", pid_str)
- core_glob = re.sub("%[a-z]", "*", core_glob) # Match all for all other % tokens
-
- # Verify that we see the expected single coredump matching the expected pattern
- ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
- "sudo", "ls", run.Raw(core_glob)
- ], stdout=StringIO())
- cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
- log.info("Enumerated cores: {0}".format(cores))
- self.assertEqual(len(cores), 1)
-
- log.info("Found core file {0}, deleting it".format(cores[0]))
-
- self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
- "sudo", "rm", "-f", cores[0]
- ])
- else:
- log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
-
+ def delete_mds_coredump(self, daemon_id):
+ # Delete the coredump file; otherwise teuthology.internal.coredump will
+ # catch it later and treat it as a failure.
+ p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+ "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
+ core_dir = os.path.dirname(p.stdout.getvalue().strip())
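+ # Assumed setup, not verified here: with teuthology's internal.coredump
+ # task enabled, core_pattern is typically an absolute path along the lines
+ # of "<archive>/coredump/%t.%p.core", so core_dir is non-empty; the stock
+ # kernel default ("core") has no directory component.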
+ if core_dir: # Non-default core_pattern with a directory in it
+ # We have seen a core_pattern that looks like it's from teuthology's coredump
+ # task, so proceed to clear out the core file
+ log.info("Clearing core from directory: {0}".format(core_dir))
+
+ # Verify that we see the expected single coredump
+ ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+ "cd", core_dir, run.Raw('&&'),
+ "sudo", "ls", run.Raw('|'), "sudo", "xargs", "file"
+ ], stdout=StringIO())
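+ # "file" on each candidate typically reports the originating command line,
+ # e.g. "1234.core: ELF 64-bit LSB core file ... from 'ceph-mds -i a'", so
+ # matching "ceph-mds" plus "-i <daemon_id>" keeps only this daemon's core.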
+ cores = [l.partition(":")[0]
+ for l in ls_proc.stdout.getvalue().strip().split("\n")
+ if re.match(r'.*ceph-mds.* -i +{0}'.format(daemon_id), l)]
+
+ log.info("Enumerated cores: {0}".format(cores))
+ self.assertEqual(len(cores), 1)
+
+ log.info("Found core file {0}, deleting it".format(cores[0]))
+
+ self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+ "cd", core_dir, run.Raw('&&'), "sudo", "rm", "-f", cores[0]
+ ])
else:
- raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))
+ log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
log = logging.getLogger(__name__)
+MDS_RESTART_GRACE = 60
+
class TestSnapshots(CephFSTestCase):
MDSS_REQUIRED = 2
return True
return False
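+
+ # Stop and fail any standbys so a deliberately crashed rank stays down
+ # until the test explicitly restarts it, instead of a standby taking over.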
+ def _stop_standby_mds(self):
+ for i in self.fs.get_standby_daemons():
+ self.fs.mds_stop(i)
+ self.fs.mds_fail(i)
+ self.wait_until_equal(lambda: len(self.fs.get_standby_daemons()), expect_val=0, timeout=30)
+
+ def _get_snapserver_dump(self, rank0_id):
+ return self.fs.mds_asok(["dump", "snaps", "--server"], rank0_id)
+
+ def _get_last_created_snap(self, rank0_id):
+ return int(self._get_snapserver_dump(rank0_id)["last_created"])
+
+ def _get_pending_snap_update(self, rank0_id):
+ return self._get_snapserver_dump(rank0_id)["pending_update"]
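+
+ # A minimal sketch of the server dump these helpers consume (the real
+ # "dump snaps --server" output carries additional keys):
+ #   {"last_created": 1, "pending_update": [...], ...}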
+
+ def test_kill_mdstable(self):
+ """
+ check snaptable transaction handling when MDSs or clients are killed at various points
+ """
+ self.fs.set_allow_new_snaps(True)
+ self.fs.set_max_mds(2)
+ self.fs.wait_for_daemons()
+
+ grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
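+ # The mon marks an MDS laggy once beacons stop for mds_beacon_grace;
+ # grace*2 bounds the waits for "laggy_since" below.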
+
+ self._stop_standby_mds()
+
+ status = self.fs.status()
+ rank0_id = status.get_rank(self.fs.id, 0)['name']
+ rank1_id = status.get_rank(self.fs.id, 1)['name']
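+ # Pin each daemon to its rank so a restarted daemon comes back as the
+ # same rank; rank0_id and rank1_id then stay valid across the restarts below.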
+ self.set_conf("mds.{0}".format(rank0_id), "mds standby for rank", 0)
+ self.set_conf("mds.{0}".format(rank1_id), "mds standby for rank", 1)
+
+ # setup subtrees
+ self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
+ self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
+ self.wait_until_true(lambda: self._check_subtree(status, 1, '/d1'), timeout=30)
+
+ last_created = self._get_last_created_snap(rank0_id)
+
+ # mds_kill_mdstable_at:
+ # 1: MDSTableServer::handle_prepare
+ # 2: MDSTableServer::_prepare_logged
+ # 5: MDSTableServer::handle_commit
+ # 6: MDSTableServer::_commit_logged
+ for i in [1,2,5,6]:
+ log.info("testing snapserver mds_kill_mdstable_at={0}".format(i))
+ self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank0_id)
+ proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
+ self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank0_id), timeout=grace*2)
+ self.delete_mds_coredump(rank0_id)
+
+ self.mds_cluster.mds_fail_restart(rank0_id)
+ self.wait_for_daemon_start([rank0_id])
+ self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+
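+ # The blocked mkdir should now complete: on restart, journal replay
+ # re-drives the prepared snaptable transaction through to commit.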
+ proc.wait()
+ last_created += 1
+ self.wait_until_true(lambda: self._get_last_created_snap(rank0_id) == last_created, timeout=30)
+
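+ # The remaining cases kill the client outright, so shorten the reconnect
+ # phase a restarting MDS waits through rather than sitting out the default.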
+ self.set_conf("mds", "mds_reconnect_timeout", "5")
+
+ self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])
+
+ # set mds_kill_mdstable_at, also kill snapclient
+ for i in [2,5,6]:
+ log.info("testing snapserver mds_kill_mdstable_at={0}, also kill snapclient".format(i))
+ last_created = self._get_last_created_snap(rank0_id)
+
+ self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank0_id)
+ proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
+ self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank0_id), timeout=grace*2)
+ self.delete_mds_coredump(rank0_id)
+
+ self.mds_cluster.mds_stop(rank1_id)
+ self.mds_cluster.mds_fail(rank1_id)
+
+ self.mount_a.kill()
+ self.mount_a.kill_cleanup()
+
+ self.mds_cluster.mds_fail_restart(rank0_id)
+ self.wait_for_daemon_start([rank0_id])
+
+ self.fs.wait_for_state('up:resolve', rank=0, timeout=MDS_RESTART_GRACE)
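+ # Killed before the commit hit the journal (2, 5): the update is still
+ # pending; killed after _commit_logged (6): the snap was already created.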
+ if i in [2,5]:
+ self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 1)
+ elif i == 6:
+ self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 0)
+ self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+
+ self.mds_cluster.mds_restart(rank1_id)
+ self.wait_for_daemon_start([rank1_id])
+ self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+
+ if i in [2,5]:
+ self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
+ if i == 2:
+ self.assertEqual(self._get_last_created_snap(rank0_id), last_created)
+ else:
+ self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])
+
+ # mds_kill_mdstable_at:
+ # 3: MDSTableClient::handle_request (got agree)
+ # 4: MDSTableClient::commit
+ # 7: MDSTableClient::handle_request (got ack)
+ for i in [3,4,7]:
+ log.info("testing snapclient mds_kill_mdstable_at={0}".format(i))
+ last_created = self._get_last_created_snap(rank0_id)
+
+ self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank1_id)
+ proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
+ self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank1_id), timeout=grace*2)
+ self.delete_mds_coredump(rank1_id)
+
+ self.mount_a.kill()
+ self.mount_a.kill_cleanup()
+
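+ # Killed before the client-side commit took effect (3, 4): the server
+ # still holds a pending update; killed after the ack (7): the snap exists.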
+ if i in [3,4]:
+ self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 1)
+ elif i == 7:
+ self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 0)
+ self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+
+ self.mds_cluster.mds_fail_restart(rank1_id)
+ self.wait_for_daemon_start([rank1_id])
+ self.fs.wait_for_state('up:active', rank=1, timeout=MDS_RESTART_GRACE)
+
+ if i in [3,4]:
+ self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
+ if i == 3:
+ self.assertEqual(self._get_last_created_snap(rank0_id), last_created)
+ else:
+ self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")])
+
+ # mds_kill_mdstable_at:
+ # 3: MDSTableClient::handle_request (got agree)
+ # 8: MDSTableServer::handle_rollback
+ log.info("testing snapclient mds_kill_mdstable_at=3, snapserver mds_kill_mdstable_at=8")
+ last_created = self._get_last_created_snap(rank0_id)
+
+ self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "8".format(i)], rank0_id)
+ self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "3".format(i)], rank1_id)
+ proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4".format(i)], wait=False)
+ self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank1_id), timeout=grace*2)
+ self.delete_mds_coredump(rank1_id)
+
+ self.mount_a.kill()
+ self.mount_a.kill_cleanup()
+
+ self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 1)
+
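+ # Restarting rank 1 makes it roll back the prepared update during
+ # resolve, which hits kill point 8 in the server's handle_rollback.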
+ self.mds_cluster.mds_fail_restart(rank1_id)
+
+ # rollback triggers assertion
+ self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank0_id), timeout=grace*2)
+ self.delete_mds_coredump(rank0_id)
+
+ self.mds_cluster.mds_fail_restart(rank0_id)
+ self.wait_for_daemon_start([rank0_id])
+ self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+ # rank 1 should re-send the rollback message
+ self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
+ self.assertEqual(self._get_last_created_snap(rank0_id), last_created)
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
def test_multimds_mksnap(self):
"""
check if snapshot takes effect across authority subtrees