From: Leonid Usov Date: Mon, 20 May 2024 22:03:15 +0000 (+0300) Subject: squid: mds/quiesce: quiesce_inode should not hold on to remote auth pins X-Git-Tag: testing/wip-lusov-testing-20240611.123850-squid~27^2~7 X-Git-Url: http://git.apps.os.sepia.ceph.com/?a=commitdiff_plain;h=92eab9e5906b84b0ad0a7abb7e19b60f6b584cf1;p=ceph-ci.git squid: mds/quiesce: quiesce_inode should not hold on to remote auth pins 1. avoid taking a remote authpin for the quiesce lock 2. drop remote authpins that were taken because of other locks We should not be forcing a mustpin when taking quiesce lock. This creates unnecessary overhead due to the distributed nature of the quiesce: all ranks will execute quiesce_inode, including the auth rank, which will authpin the inode. Auth pinning on the auth rank is important to synchronize quiesce with operations that are managed by the auth, like fragmenting and exporting. If we let a remote quiesce process take a foreign authpin then it may block freezing on the auth, which will stall quiesce locally. This wouldn't be a problem if the quiesce that is blocked on the auth and the quiesce that's holding a remote authpin from the replica side were unrelated, but in our case it may be the same logical quiesce that effectively steps on its own toes. This creates an opportunity for a deadlock. Fixes: https://tracker.ceph.com/issues/66152 Signed-off-by: Leonid Usov (cherry picked from commit b1cb6d985622c6164d99d3fd79b6eeaf6530894c) Fixes: https://tracker.ceph.com/issues/66258 --- diff --git a/qa/tasks/cephfs/test_quiesce.py b/qa/tasks/cephfs/test_quiesce.py index bccfbd35ccd..3de9d8c321a 100644 --- a/qa/tasks/cephfs/test_quiesce.py +++ b/qa/tasks/cephfs/test_quiesce.py @@ -804,65 +804,71 @@ class TestQuiesceMultiRank(QuiesceTestCase): That a quiesce_inode op with outstanding remote authpin requests can be killed. """ - self.config_set('mds', 'mds_heartbeat_grace', '120') - self.fs.set_session_timeout(240) # avoid spurious session warnings - self._configure_subvolume() - self.mount_a.setfattr(".", "ceph.dir.pin.distributed", "1") - self._client_background_workload() - self._wait_distributed_subtrees(2*2, rank="all", path=self.mntpnt) - status = self.fs.status() + # create two dirs for pinning + self.mount_a.run_shell_payload("mkdir -p pin0 pin1") + # enable export by populating the directories + self.mount_a.run_shell_payload("touch pin0/export_dummy pin1/export_dummy") + # pin the files to different ranks + self.mount_a.setfattr("pin0", "ceph.dir.pin", "0") + self.mount_a.setfattr("pin1", "ceph.dir.pin", "1") - p = self.mount_a.run_shell_payload("ls", stdout=StringIO()) - dirs = p.stdout.getvalue().strip().split() + # prepare the patient at rank 0 + self.mount_a.write_file("pin0/thefile", "I'm ready, doc") - # make rank 1 unresponsive to auth pin requests - p = self.run_ceph_cmd("tell", f"mds.{self.fs.id}:1", "lockup", "90000", wait=False) + # wait for the export to settle + self._wait_subtrees([("/pin0", 0), ("/pin1", 1)]) - qops = [] - for d in dirs: - path = os.path.join(self.mntpnt, d) - op = self.fs.rank_tell("quiesce", "path", path, rank=0)['op'] - reqid = self._reqid_tostr(op['reqid']) - log.info(f"created {reqid}") - qops.append(reqid) + path = "/pin0/thefile" + + # take the policy lock on the file to cause remote authpin from the replica + # when we get to quiescing the path + op = self.fs.rank_tell("lock", "path", path, "policy:x", "--await", rank=0) + policy_reqid = self._reqid_tostr(op['reqid']) + + # We need to simulate a freezing inode to have quiesce block on the authpin. + # This can be done with authpin freeze feature, but it only works when sent from the replica. + # We'll rdlock the policy lock, but it doesn't really matter as the quiesce won't get that far + op = self.fs.rank_tell("lock", "path", path, "policy:r", "--ap-freeze", rank=1) + freeze_reqid = self._reqid_tostr(op['reqid']) - def find_quiesce(blocked_on_remote_auth_pin): - # verify no quiesce ops - ops = self.fs.get_ops(locks=False, rank=0, path="/tmp/mds.0-ops", status=status)['ops'] + # we should quiesce the same path from the replica side to cause the remote authpin (due to file xlock) + op = self.fs.rank_tell("quiesce", "path", path, rank=1)['op'] + quiesce_reqid = self._reqid_tostr(op['reqid']) + + def has_quiesce(*, blocked_on_remote_auth_pin): + ops = self.fs.get_ops(locks=False, rank=1, path="/tmp/mds.1-ops")['ops'] + log.debug(ops) for op in ops: type_data = op['type_data'] flag_point = type_data['flag_point'] - op_type = type_data['op_type'] - if op_type == 'client_request' or op_type == 'peer_request': - continue if type_data['op_name'] == "quiesce_inode": if blocked_on_remote_auth_pin: - if flag_point == "requesting remote authpins": - return True - else: - return True + self.assertEqual("requesting remote authpins", flag_point) + return True return False - with safe_while(sleep=1, tries=30, action='wait for quiesce op with outstanding remote authpin requests') as proceed: - while proceed(): - if find_quiesce(True): - break + # The quiesce should be pending + self.assertTrue(has_quiesce(blocked_on_remote_auth_pin=True)) - # okay, now kill all quiesce ops - for reqid in qops: - self.fs.kill_op(reqid, rank=0) + # even after killing the quiesce op, it should still stay pending + self.fs.kill_op(quiesce_reqid, rank=1) + self.assertTrue(has_quiesce(blocked_on_remote_auth_pin=True)) - # verify some quiesce_inode ops still exist because authpin acks have not been received - if not find_quiesce(True): - self.fail("did not find quiesce_inode op blocked on remote authpins! (did the lockup on rank 1 complete?)") + # first, kill the policy xlock to release the freezing request + self.fs.kill_op(policy_reqid, rank=0) + time.sleep(1) - # wait for sleep to complete - p.wait() + # this should have let the freezing request to progress + op = self.fs.rank_tell("op", "get", freeze_reqid, rank=1) + log.debug(op) + self.assertEqual(0, op['type_data']['result']) + + # now unfreeze the inode + self.fs.kill_op(freeze_reqid, rank=1) + time.sleep(1) - with safe_while(sleep=1, tries=30, action='wait for quiesce kill') as proceed: - while proceed(): - if not find_quiesce(False): - break + # the quiesce op should be gone + self.assertFalse(has_quiesce(blocked_on_remote_auth_pin=False)) def test_quiesce_block_file_replicated(self): """ diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 34769454712..8e45504b74d 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -230,7 +230,6 @@ struct MarkEventOnDestruct { bool Locker::acquire_locks(const MDRequestRef& mdr, MutationImpl::LockOpVec& lov, CInode* auth_pin_freeze, - std::set mustpin, bool auth_pin_nonblocking, bool skip_quiesce) { @@ -245,6 +244,7 @@ bool Locker::acquire_locks(const MDRequestRef& mdr, client_t client = mdr->get_client(); + std::set mustpin; if (auth_pin_freeze) mustpin.insert(auth_pin_freeze); @@ -294,7 +294,10 @@ bool Locker::acquire_locks(const MDRequestRef& mdr, dout(20) << " must xlock " << *lock << " " << *object << dendl; - mustpin.insert(object); + // only take the authpin for the quiesce lock on the auth + if (lock->get_type() != CEPH_LOCK_IQUIESCE || object->is_auth()) { + mustpin.insert(object); + } // augment xlock with a versionlock? if (lock->get_type() == CEPH_LOCK_DN) { diff --git a/src/mds/Locker.h b/src/mds/Locker.h index a1ea4a26098..aa037ac6abd 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -54,7 +54,6 @@ public: bool acquire_locks(const MDRequestRef& mdr, MutationImpl::LockOpVec& lov, CInode *auth_pin_freeze=NULL, - std::set mustpin = {}, bool auth_pin_nonblocking=false, bool skip_quiesce=false); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index dce21f26bd8..dad6d486494 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -12076,7 +12076,7 @@ void MDCache::dispatch_fragment_dir(const MDRequestRef& mdr, bool abort_if_freez // prevent a racing gather on any other scatterlocks too lov.lock_scatter_gather(&diri->nestlock); lov.lock_scatter_gather(&diri->filelock); - if (mds->locker->acquire_locks(mdr, lov, NULL, {}, true)) { + if (mds->locker->acquire_locks(mdr, lov, NULL, true)) { mdr->locking_state |= MutationImpl::ALL_LOCKED; } else { if (!mdr->aborted) { @@ -13680,7 +13680,7 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr) // as a result of the auth taking the above locks. } - if (!mds->locker->acquire_locks(mdr, lov, nullptr, {in}, false, true)) { + if (!mds->locker->acquire_locks(mdr, lov, nullptr, false, true)) { return; } mdr->locking_state |= MutationImpl::ALL_LOCKED; @@ -13711,6 +13711,14 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr) mds->locker->drop_lock(mdr.get(), &in->linklock); mds->locker->drop_lock(mdr.get(), &in->xattrlock); } + + // The quiescelock doesn't attempt to take remote authpins, + // but one could have been acquired when rdlock-ing the policylock. + // We are calling drop_remote_locks here to get rid of any remote + // authpins. That's the only side effect of this call, since we + // aren't really holding any remote locks (x/wr) + // See https://tracker.ceph.com/issues/66152 for more info. + mds->locker->request_drop_remote_locks(mdr); } if (in->is_dir()) { @@ -14010,7 +14018,7 @@ void MDCache::dispatch_lock_path(const MDRequestRef& mdr) } } - if (!mds->locker->acquire_locks(mdr, lov, lps.config.ap_freeze ? in : nullptr, {in}, lps.config.ap_dont_block, true)) { + if (!mds->locker->acquire_locks(mdr, lov, lps.config.ap_freeze ? in : nullptr, lps.config.ap_dont_block, true)) { if (lps.config.ap_dont_block && mdr->aborted) { mds->server->respond_to_request(mdr, -CEPHFS_EWOULDBLOCK); } @@ -14023,6 +14031,7 @@ void MDCache::dispatch_lock_path(const MDRequestRef& mdr) } mdr->mark_event("object locked"); + mdr->result = 0; if (auto c = mdr->internal_op_finish) { mdr->internal_op_finish = nullptr; c->complete(0); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 8ef61c65abb..3f708022200 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -3080,7 +3080,7 @@ void Server::dispatch_peer_request(const MDRequestRef& mdr) // will want to take other locks is to prevent issuing unwanted client capabilities, // but since replicas can't issue capabilities, it should be fine allowing remote locks // without taking the quiesce lock. - if (!mds->locker->acquire_locks(mdr, lov, nullptr, {}, false, true)) + if (!mds->locker->acquire_locks(mdr, lov, nullptr, false, true)) return; // ack