From: Leonid Usov Date: Wed, 22 May 2024 11:46:50 +0000 (+0300) Subject: qa/cephfs: check that a completed quiesce doesn't hold remote auth pins X-Git-Tag: v20.0.0~1853^2~6 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=e32fb12b8ea105cef82cf5b9304c28bc4dc8e7a5;p=ceph.git qa/cephfs: check that a completed quiesce doesn't hold remote auth pins Signed-off-by: Leonid Usov --- diff --git a/qa/tasks/cephfs/test_quiesce.py b/qa/tasks/cephfs/test_quiesce.py index c29ad8971059b..bccfbd35ccdd1 100644 --- a/qa/tasks/cephfs/test_quiesce.py +++ b/qa/tasks/cephfs/test_quiesce.py @@ -697,8 +697,66 @@ class TestQuiesceMultiRank(QuiesceTestCase): op = self.fs.rank_tell(["quiesce", "path", self.subvolume, '--await'], rank=0, check_status=False)['op'] self.assertEqual(op['result'], -1) # EPERM - @unittest.skip("https://tracker.ceph.com/issues/66152") - def test_quiesce_drops_remote_authpins_on_failure(self): + def test_quiesce_drops_remote_authpins_when_done(self): + """ + That a quiesce operation drops remote authpins after marking the node as quiesced + + It's important that a remote quiesce doesn't stall freezing ops on the auth + """ + self._configure_subvolume() + + # create two dirs for pinning + self.mount_a.run_shell_payload("mkdir -p pin0 pin1") + # enable export by populating the directories + self.mount_a.run_shell_payload("touch pin0/export_dummy pin1/export_dummy") + # pin the files to different ranks + self.mount_a.setfattr("pin0", "ceph.dir.pin", "0") + self.mount_a.setfattr("pin1", "ceph.dir.pin", "1") + + # prepare the patient at rank 0 + self.mount_a.write_file("pin0/thefile", "I'm ready, doc") + + # wait for the export to settle + self._wait_subtrees([(f"{self.mntpnt}/pin0", 0), (f"{self.mntpnt}/pin1", 1)]) + + def reqid(cmd): + J = json.loads(cmd.stdout.getvalue()) + J = J.get('type_data', J) # for op get + J = J.get('op', J) # for quiesce path + # lock path returns the op directly + return self._reqid_tostr(J['reqid']) + + def assertQuiesceOpDone(expected_done, quiesce_op, rank): + cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:{rank} op get {quiesce_op}", stdout=StringIO()) + + J = json.loads(cmd.stdout.getvalue()) + self.assertEqual(J['type_data']['result'], 0 if expected_done else None) + + # Take the policy lock on the auth to cause a quiesce operation to request the remote authpin + # This is needed to cause the next command to block + cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:0 lock path {self.mntpnt}/pin0/thefile policy:x --await", stdout=StringIO()) + policy_block_op = reqid(cmd) + + # Try quiescing on the replica. This should block for the policy lock + # As a side effect, it should take the remote authpin + cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:1 quiesce path {self.mntpnt}/pin0/thefile", stdout=StringIO()) + quiesce_op = reqid(cmd) + + # verify the quiesce is pending + assertQuiesceOpDone(False, quiesce_op, rank=1) + + # kill the op that holds the policy lock exclusively and verify the quiesce succeeds + self.fs.kill_op(policy_block_op, rank=0) + assertQuiesceOpDone(True, quiesce_op, rank=1) + + # If all is good, the ap-freeze operation below should succeed + # despite the quiesce_op that's still active. + # We payload this with some lock that we know shouldn't block + # The call below will block on freezing if the quiesce failed to release + # remote authpins, and after the lifetime elapses will return ECANCELED + cmd = self.fs.run_ceph_cmd(f"tell mds.{self.fs.name}:1 lock path {self.mntpnt}/pin0/thefile policy:r --ap-freeze --await --lifetime 5") + + def test_request_drops_remote_authpins_when_waiting_for_quiescelock(self): """ That remote authpins are dropped when the request fails to acquire the quiesce lock