From d532e66087a1f98428195fc81fe2a1ef2d1199af Mon Sep 17 00:00:00 2001
From: "Yan, Zheng"
Date: Thu, 1 Feb 2018 13:47:48 +0800
Subject: [PATCH] qa/cephfs: add tests for snapclient cache

Signed-off-by: "Yan, Zheng"
---
Review notes (text in this section is ignored by git-am):
 - dict.iteritems() replaced by dict.items() so the helper also runs on Python 3
 - wait for rank 1 (not only rank 0) to become active after both are restarted
 - dropped stray trailing semicolons; added docstrings and review comments
 - the index blob id below predates these amendments

 qa/tasks/cephfs/test_snapshots.py | 164 +++++++++++++++++++++++++++++-
 src/mds/MDSTableClient.cc         |   1 +
 2 files changed, 164 insertions(+), 1 deletion(-)

diff --git a/qa/tasks/cephfs/test_snapshots.py b/qa/tasks/cephfs/test_snapshots.py
index e2c34908d83cf..511e768a7f750 100644
--- a/qa/tasks/cephfs/test_snapshots.py
+++ b/qa/tasks/cephfs/test_snapshots.py
@@ -10,7 +10,7 @@ log = logging.getLogger(__name__)
 MDS_RESTART_GRACE = 60
 
 class TestSnapshots(CephFSTestCase):
-    MDSS_REQUIRED = 2
+    MDSS_REQUIRED = 3
 
     def _check_subtree(self, status, rank, path):
         got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
@@ -25,15 +25,27 @@ class TestSnapshots(CephFSTestCase):
             self.fs.mds_fail(i)
         self.wait_until_equal(lambda: len(self.fs.get_standby_daemons()), expect_val=0, timeout=30)
 
+    def _get_snapclient_dump(self, rank_id):
+        """Return the result of "dump snaps" issued via mds_asok to rank_id."""
+        return self.fs.mds_asok(["dump", "snaps"], rank_id)
+
     def _get_snapserver_dump(self, rank0_id):
         return self.fs.mds_asok(["dump", "snaps", "--server"], rank0_id)
 
     def _get_last_created_snap(self, rank0_id):
         return int(self._get_snapserver_dump(rank0_id)["last_created"])
 
+    def _get_last_destroyed_snap(self, rank0_id):
+        """Return the "last_destroyed" field of the snapserver dump as an int."""
+        return int(self._get_snapserver_dump(rank0_id)["last_destroyed"])
+
     def _get_pending_snap_update(self, rank0_id):
         return self._get_snapserver_dump(rank0_id)["pending_update"]
 
+    def _get_pending_snap_destroy(self, rank0_id):
+        """Return the "pending_destroy" field of the snapserver dump."""
+        return self._get_snapserver_dump(rank0_id)["pending_destroy"]
+
     def test_kill_mdstable(self):
         """
         check snaptable transcation
@@ -196,6 +208,156 @@ class TestSnapshots(CephFSTestCase):
         self.mount_a.mount()
         self.mount_a.wait_until_mounted()
 
+    def test_snapclient_cache(self):
+        """
+        check if snapclient cache gets synced properly
+
+        Every rank's "dump snaps" output is compared against rank 0's
+        "dump snaps --server" output after mksnap, after rmsnap, after MDS
+        restarts, and after rank 2 is killed in the middle of a snaptable
+        update (mds_kill_mdstable_at set to 9, then 4).
+        """
+        self.fs.set_allow_new_snaps(True)
+        self.fs.set_max_mds(3)
+        self.fs.wait_for_daemons()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        self._stop_standby_mds()
+
+        status = self.fs.status()
+        rank0_id = status.get_rank(self.fs.id, 0)['name']
+        rank1_id = status.get_rank(self.fs.id, 1)['name']
+        rank2_id = status.get_rank(self.fs.id, 2)['name']
+        self.set_conf("mds.{0}".format(rank0_id), "mds standby for rank", 0)
+        self.set_conf("mds.{0}".format(rank1_id), "mds standby for rank", 1)
+        self.set_conf("mds.{0}".format(rank2_id), "mds standby for rank", 2)
+
+        self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
+        self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
+        self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1")
+        self.mount_a.setfattr("d0/d2", "ceph.dir.pin", "2")
+        self.wait_until_true(lambda: self._check_subtree(status, 2, '/d0/d2'), timeout=30)
+        self.wait_until_true(lambda: self._check_subtree(status, 1, '/d0/d1'), timeout=5)
+        self.wait_until_true(lambda: self._check_subtree(status, 0, '/d0'), timeout=5)
+
+        def _check_snapclient_cache(snaps_dump, cache_dump=None, rank_id=-1):
+            # True if every entry of the snapclient dump matches the same key
+            # in snaps_dump; the cache is fetched from rank_id when not given.
+            if cache_dump is None:
+                cache_dump = self._get_snapclient_dump(rank_id)
+            for key, value in cache_dump.items():
+                if value != snaps_dump[key]:
+                    return False
+            return True
+
+        # sync after mksnap
+        last_created = self._get_last_created_snap(rank0_id)
+        self.mount_a.run_shell(["mkdir", Raw("d0/d1/dir/.snap/{s1,s2}")])
+        self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
+        self.assertGreater(self._get_last_created_snap(rank0_id), last_created)
+
+        snaps_dump = self._get_snapserver_dump(rank0_id)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank0_id))
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank1_id))
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank2_id))
+
+        # sync after rmsnap
+        last_destroyed = self._get_last_destroyed_snap(rank0_id)
+        self.mount_a.run_shell(["rmdir", "d0/d1/dir/.snap/s1"])
+        self.wait_until_true(lambda: len(self._get_pending_snap_destroy(rank0_id)) == 0, timeout=30)
+        self.assertGreater(self._get_last_destroyed_snap(rank0_id), last_destroyed)
+
+        snaps_dump = self._get_snapserver_dump(rank0_id)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank0_id))
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank1_id))
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank2_id))
+
+        # sync during mds recovers
+        self.mds_cluster.mds_stop(rank2_id)
+        self.mds_cluster.mds_fail_restart(rank2_id)
+        self.wait_for_daemon_start([rank2_id])
+        self.fs.wait_for_state('up:active', rank=2, timeout=MDS_RESTART_GRACE)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank2_id))
+
+        self.mds_cluster.mds_stop(rank0_id)
+        self.mds_cluster.mds_stop(rank1_id)
+        self.mds_cluster.mds_fail_restart(rank0_id)
+        self.mds_cluster.mds_fail_restart(rank1_id)
+        self.wait_for_daemon_start([rank0_id, rank1_id])
+        self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE)
+        self.fs.wait_for_state('up:active', rank=1, timeout=MDS_RESTART_GRACE)
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank0_id))
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank1_id))
+        self.assertTrue(_check_snapclient_cache(snaps_dump, rank_id=rank2_id))
+
+        # kill at MDSTableClient::handle_notify_prep
+        self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank2_id)
+        proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
+        self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank2_id), timeout=grace*2)
+        self.delete_mds_coredump(rank2_id)
+
+        # mksnap should wait for notify ack from mds.2
+        self.assertFalse(proc.finished)
+
+        # mksnap should proceed after mds.2 fails
+        self.mds_cluster.mds_fail(rank2_id)
+        self.wait_until_true(lambda: proc.finished, timeout=30)
+
+        self.mds_cluster.mds_fail_restart(rank2_id)
+        self.wait_for_daemon_start([rank2_id])
+        self.fs.wait_for_state('up:active', rank=2, timeout=MDS_RESTART_GRACE)
+
+        self.mount_a.run_shell(["rmdir", Raw("d0/d1/dir/.snap/*")])
+
+        # kill at MDSTableClient::commit
+        # the recovering mds should sync all mds' cache when it enters resolve stage
+        self.set_conf("mds", "mds_reconnect_timeout", "5")
+        # NOTE(review): i only takes the values 1..3, so the "4" entries in the
+        # two checks below never match; confirm whether range(1, 5) was intended.
+        for i in range(1, 4):
+            self.fs.mds_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank2_id)
+            last_created = self._get_last_created_snap(rank0_id)
+            proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
+            self.wait_until_true(lambda: "laggy_since" in self.fs.status().get_mds(rank2_id), timeout=grace*2)
+            self.delete_mds_coredump(rank2_id)
+
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+            self.assertEqual(len(self._get_pending_snap_update(rank0_id)), 1)
+
+            if i in [2, 4]:
+                self.mds_cluster.mds_stop(rank0_id)
+                self.mds_cluster.mds_fail_restart(rank0_id)
+            if i in [3, 4]:
+                self.mds_cluster.mds_stop(rank1_id)
+                self.mds_cluster.mds_fail_restart(rank1_id)
+
+            self.mds_cluster.mds_fail_restart(rank2_id)
+            self.wait_for_daemon_start([rank0_id, rank1_id, rank2_id])
+
+            self.fs.wait_for_state('up:active', rank=2, timeout=MDS_RESTART_GRACE)
+            rank0_cache = self._get_snapclient_dump(rank0_id)
+            rank1_cache = self._get_snapclient_dump(rank1_id)
+            rank2_cache = self._get_snapclient_dump(rank2_id)
+
+            self.assertGreater(int(rank0_cache["last_created"]), last_created)
+            self.assertEqual(rank0_cache, rank1_cache)
+            self.assertEqual(rank0_cache, rank2_cache)
+
+            self.wait_until_true(lambda: len(self._get_pending_snap_update(rank0_id)) == 0, timeout=30)
+
+            snaps_dump = self._get_snapserver_dump(rank0_id)
+            self.assertEqual(snaps_dump["last_created"], rank0_cache["last_created"])
+            self.assertTrue(_check_snapclient_cache(snaps_dump, cache_dump=rank0_cache))
+
+            self.mount_a.mount()
+            self.mount_a.wait_until_mounted()
+
+        self.mount_a.run_shell(["rmdir", Raw("d0/d2/dir/.snap/*")])
+
     def test_multimds_mksnap(self):
         """
         check if snapshot takes effect across authority subtrees
diff --git a/src/mds/MDSTableClient.cc b/src/mds/MDSTableClient.cc
index 417a8d687c4f0..20c5e0a4f9a76 100644
--- a/src/mds/MDSTableClient.cc
+++ b/src/mds/MDSTableClient.cc
@@ -71,6 +71,7 @@ void MDSTableClient::handle_request(class MMDSTableRequest *m)
     break;
 
   case TABLESERVER_OP_NOTIFY_PREP:
+    assert(g_conf->mds_kill_mdstable_at != 9);
     handle_notify_prep(m);
     break;
 
-- 
2.39.5