mds/quiesce: quiesce_inode should not hold on to remote auth pins
author    Leonid Usov <leonid.usov@ibm.com>
          Mon, 20 May 2024 22:03:15 +0000 (01:03 +0300)
committer Leonid Usov <leonid.usov@ibm.com>
          Sun, 26 May 2024 08:33:52 +0000 (11:33 +0300)
1. avoid taking a remote authpin for the quiesce lock
2. drop remote authpins that were taken because of other locks

We should not force a mustpin when taking the quiesce lock.
It creates unnecessary overhead due to the distributed nature
of quiesce: all ranks execute quiesce_inode, including the auth
rank, which will authpin the inode locally anyway.
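
A minimal sketch of change (1), mirroring the condition added to
Locker::acquire_locks in the diff below (simplified, not the full
lock-gathering loop):

    // Sketch only: the mustpin decision for an xlock request.
    // A replica no longer authpins the object for an IQUIESCE xlock;
    // any other lock type, and the quiesce lock on the auth itself,
    // still forces the pin.
    if (lock->get_type() != CEPH_LOCK_IQUIESCE || object->is_auth()) {
      mustpin.insert(object);
    }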

Auth pinning on the auth rank remains important: it synchronizes
quiesce with operations that are managed by the auth, such as
fragmenting and exporting.

If we let a remote quiesce process take a foreign authpin, it may
block freezing on the auth, which will in turn stall quiesce locally.
This wouldn't be a problem if the quiesce blocked on the auth and
the quiesce holding the remote authpin from the replica side were
unrelated, but in our case it may be the same logical quiesce,
effectively stepping on its own toes. This creates an opportunity
for a deadlock.
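
Change (2), sketched below, covers remote authpins that are taken
anyway on behalf of the other locks: once a replica has acquired its
local quiesce locks in MDCache::dispatch_quiesce_inode, it releases
any remote authpins picked up along the way (see the hunk below):

    // Sketch only: no remote x/wr locks are held at this point, so
    // releasing the remote authpins is the only effect of this call.
    mds->locker->request_drop_remote_locks(mdr);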

Fixes: https://tracker.ceph.com/issues/66152
Signed-off-by: Leonid Usov <leonid.usov@ibm.com>
qa/tasks/cephfs/test_quiesce.py
src/mds/Locker.cc
src/mds/Locker.h
src/mds/MDCache.cc
src/mds/Server.cc

index bccfbd35ccdd10d518ba6caa1769d9807af8ed11..3de9d8c321a932a5ac69d6f9f8822da2d6b84918 100644 (file)
@@ -804,65 +804,71 @@ class TestQuiesceMultiRank(QuiesceTestCase):
         That a quiesce_inode op with outstanding remote authpin requests can be killed.
         """
 
-        self.config_set('mds', 'mds_heartbeat_grace', '120')
-        self.fs.set_session_timeout(240) # avoid spurious session warnings
-        self._configure_subvolume()
-        self.mount_a.setfattr(".", "ceph.dir.pin.distributed", "1")
-        self._client_background_workload()
-        self._wait_distributed_subtrees(2*2, rank="all", path=self.mntpnt)
-        status = self.fs.status()
+        # create two dirs for pinning
+        self.mount_a.run_shell_payload("mkdir -p pin0 pin1")
+        # enable export by populating the directories
+        self.mount_a.run_shell_payload("touch pin0/export_dummy pin1/export_dummy")
+        # pin the directories to different ranks
+        self.mount_a.setfattr("pin0", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("pin1", "ceph.dir.pin", "1")
 
-        p = self.mount_a.run_shell_payload("ls", stdout=StringIO())
-        dirs = p.stdout.getvalue().strip().split()
+        # prepare the patient at rank 0
+        self.mount_a.write_file("pin0/thefile", "I'm ready, doc")
 
-        # make rank 1 unresponsive to auth pin requests
-        p = self.run_ceph_cmd("tell", f"mds.{self.fs.id}:1", "lockup", "90000", wait=False)
+        # wait for the export to settle
+        self._wait_subtrees([("/pin0", 0), ("/pin1", 1)])
 
-        qops = []
-        for d in dirs:
-            path = os.path.join(self.mntpnt, d)
-            op = self.fs.rank_tell("quiesce", "path", path, rank=0)['op']
-            reqid = self._reqid_tostr(op['reqid'])
-            log.info(f"created {reqid}")
-            qops.append(reqid)
+        path = "/pin0/thefile"
+
+        # take the policy lock on the file to cause a remote authpin from the replica
+        # when we get to quiescing the path
+        op = self.fs.rank_tell("lock", "path", path, "policy:x", "--await", rank=0)
+        policy_reqid = self._reqid_tostr(op['reqid'])
+
+        # We need to simulate a freezing inode to have quiesce block on the authpin.
+        # This can be done with the authpin freeze feature, but it only works when sent from the replica.
+        # We'll rdlock the policy lock, though the lock choice doesn't really matter, as the quiesce won't get that far.
+        op = self.fs.rank_tell("lock", "path", path, "policy:r", "--ap-freeze", rank=1)
+        freeze_reqid = self._reqid_tostr(op['reqid'])
 
-        def find_quiesce(blocked_on_remote_auth_pin):
-            # verify no quiesce ops
-            ops = self.fs.get_ops(locks=False, rank=0, path="/tmp/mds.0-ops", status=status)['ops']
+        # quiesce the same path from the replica side to cause the remote authpin (due to the file xlock)
+        op = self.fs.rank_tell("quiesce", "path", path, rank=1)['op']
+        quiesce_reqid = self._reqid_tostr(op['reqid'])
+
+        def has_quiesce(*, blocked_on_remote_auth_pin):
+            ops = self.fs.get_ops(locks=False, rank=1, path="/tmp/mds.1-ops")['ops']
+            log.debug(ops)
             for op in ops:
                 type_data = op['type_data']
                 flag_point = type_data['flag_point']
-                op_type = type_data['op_type']
-                if op_type == 'client_request' or op_type == 'peer_request':
-                    continue
                 if type_data['op_name'] == "quiesce_inode":
                     if blocked_on_remote_auth_pin:
-                        if flag_point == "requesting remote authpins":
-                            return True
-                    else:
-                        return True
+                        self.assertEqual("requesting remote authpins", flag_point)
+                    return True
             return False
 
-        with safe_while(sleep=1, tries=30, action='wait for quiesce op with outstanding remote authpin requests') as proceed:
-            while proceed():
-                if find_quiesce(True):
-                    break
+        # The quiesce should be pending
+        self.assertTrue(has_quiesce(blocked_on_remote_auth_pin=True))
 
-        # okay, now kill all quiesce ops
-        for reqid in qops:
-            self.fs.kill_op(reqid, rank=0)
+        # even after killing the quiesce op, it should still stay pending
+        self.fs.kill_op(quiesce_reqid, rank=1)
+        self.assertTrue(has_quiesce(blocked_on_remote_auth_pin=True))
 
-        # verify some quiesce_inode ops still exist because authpin acks have not been received
-        if not find_quiesce(True):
-            self.fail("did not find quiesce_inode op blocked on remote authpins! (did the lockup on rank 1 complete?)")
+        # first, kill the policy xlock to release the freezing request
+        self.fs.kill_op(policy_reqid, rank=0)
+        time.sleep(1)
 
-        # wait for sleep to complete
-        p.wait()
+        # this should have allowed the freezing request to progress
+        op = self.fs.rank_tell("op", "get", freeze_reqid, rank=1)
+        log.debug(op)
+        self.assertEqual(0, op['type_data']['result'])
+
+        # now unfreeze the inode
+        self.fs.kill_op(freeze_reqid, rank=1)
+        time.sleep(1)
 
-        with safe_while(sleep=1, tries=30, action='wait for quiesce kill') as proceed:
-            while proceed():
-                if not find_quiesce(False):
-                    break
+        # the quiesce op should be gone
+        self.assertFalse(has_quiesce(blocked_on_remote_auth_pin=False))
 
     def test_quiesce_block_file_replicated(self):
         """
index 02115453027d6be1ae45d7936aa709254194574a..b84f7b59ee538dc9aade0fd6df64706ba96eab27 100644 (file)
@@ -230,7 +230,6 @@ struct MarkEventOnDestruct {
 bool Locker::acquire_locks(const MDRequestRef& mdr,
                           MutationImpl::LockOpVec& lov,
                           CInode* auth_pin_freeze,
-                           std::set<MDSCacheObject*> mustpin,
                           bool auth_pin_nonblocking,
                            bool skip_quiesce)
 {
@@ -245,6 +244,7 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
 
   client_t client = mdr->get_client();
 
+  std::set<MDSCacheObject*> mustpin;
   if (auth_pin_freeze)
     mustpin.insert(auth_pin_freeze);
 
@@ -294,7 +294,10 @@ bool Locker::acquire_locks(const MDRequestRef& mdr,
 
       dout(20) << " must xlock " << *lock << " " << *object << dendl;
 
-      mustpin.insert(object);
+      // only take the authpin for the quiesce lock on the auth
+      if (lock->get_type() != CEPH_LOCK_IQUIESCE || object->is_auth()) {
+        mustpin.insert(object);
+      }
 
       // augment xlock with a versionlock?
       if (lock->get_type() == CEPH_LOCK_DN) {
index a1ea4a260989c6bdd501576dcb56d9a4a112c0f6..aa037ac6abda728f729fdb8d8c0012a6882675b8 100644 (file)
@@ -54,7 +54,6 @@ public:
   bool acquire_locks(const MDRequestRef& mdr,
                     MutationImpl::LockOpVec& lov,
                     CInode *auth_pin_freeze=NULL,
-                     std::set<MDSCacheObject*> mustpin = {},
                     bool auth_pin_nonblocking=false,
                      bool skip_quiesce=false);
 
index a96a2a03e4908e57c715dc6b6bd73323750822ac..a545daded17b4af6a44cbb37c3948a5319a11667 100644 (file)
@@ -12095,7 +12095,7 @@ void MDCache::dispatch_fragment_dir(const MDRequestRef& mdr, bool abort_if_freez
       // prevent a racing gather on any other scatterlocks too
       lov.lock_scatter_gather(&diri->nestlock);
       lov.lock_scatter_gather(&diri->filelock);
-      if (mds->locker->acquire_locks(mdr, lov, NULL, {}, true)) {
+      if (mds->locker->acquire_locks(mdr, lov, NULL, true)) {
         mdr->locking_state |= MutationImpl::ALL_LOCKED;
       } else {
         if (!mdr->aborted) {
@@ -13702,7 +13702,7 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr)
       // as a result of the auth taking the above locks.
     }
 
-    if (!mds->locker->acquire_locks(mdr, lov, nullptr, {in}, false, true)) {
+    if (!mds->locker->acquire_locks(mdr, lov, nullptr, false, true)) {
       return;
     }
     mdr->locking_state |= MutationImpl::ALL_LOCKED;
@@ -13733,6 +13733,14 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr)
       mds->locker->drop_lock(mdr.get(), &in->linklock);
       mds->locker->drop_lock(mdr.get(), &in->xattrlock);
     }
+
+    // The quiescelock doesn't attempt to take remote authpins,
+    // but one could have been acquired when rdlocking the policylock.
+    // We call request_drop_remote_locks here to get rid of any remote
+    // authpins. That's the only side effect of this call, since we
+    // aren't really holding any remote locks (x/wr).
+    // See https://tracker.ceph.com/issues/66152 for more info.
+    mds->locker->request_drop_remote_locks(mdr);
   }
 
   if (in->is_dir()) {
@@ -14031,7 +14039,7 @@ void MDCache::dispatch_lock_path(const MDRequestRef& mdr)
     }
   }
 
-  if (!mds->locker->acquire_locks(mdr, lov, lps.config.ap_freeze ? in : nullptr, {in}, lps.config.ap_dont_block, true)) {
+  if (!mds->locker->acquire_locks(mdr, lov, lps.config.ap_freeze ? in : nullptr, lps.config.ap_dont_block, true)) {
     if (lps.config.ap_dont_block && mdr->aborted) {
       mds->server->respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
     }
@@ -14044,6 +14052,7 @@ void MDCache::dispatch_lock_path(const MDRequestRef& mdr)
   }
 
   mdr->mark_event("object locked");
+  mdr->result = 0;
   if (auto c = mdr->internal_op_finish) {
     mdr->internal_op_finish = nullptr;
     c->complete(0);
index 681663cb65276d03def491721cbf1942d1e9b1d8..918cc4e613c1d3dd532848b3e849178a9921c8a0 100644 (file)
@@ -3105,7 +3105,7 @@ void Server::dispatch_peer_request(const MDRequestRef& mdr)
         // will want to take other locks is to prevent issuing unwanted client capabilities,
         // but since replicas can't issue capabilities, it should be fine allowing remote locks
         // without taking the quiesce lock.
-       if (!mds->locker->acquire_locks(mdr, lov, nullptr, {}, false, true))
+       if (!mds->locker->acquire_locks(mdr, lov, nullptr, false, true))
          return;
        
        // ack