From db5c9dc2e6cc95a8d112c2131e4cac5340ca9dd0 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Fri, 27 Jun 2025 14:46:07 -0400 Subject: [PATCH] mds: nudge log for unstable locks after early_reply A getattr/lookup can cause a wrlock or xlock to become unstable after a request (like rename) acquires it but before early reply. The MDS will not nudge the log in this situation and the getattr/lookup will need to wait for the eventual journal flush before the lock is released. Now looks like: 2025-06-27T19:41:32.043+0000 7f11d21a9640 5 mds.0.log _submit_thread 25185408~2845 : EUpdate rename [metablob 0x1, 3 dirs] 2025-06-27T19:41:32.043+0000 7f11d99b8640 1 -- [v2:172.21.10.4:6868/56297870,v1:172.21.10.4:6869/56297870] --> [v2:172.21.10.4:6818/999439823,v1:172.21.10.4:6819/999439823] -- osd_op(unknown.0.23:135 2.e 2:7bf7e7b5:::200.00000006:head [write 19584~2865 [fadvise_dontneed] in=2865b] snapc 0=[] ondisk+write+known_if_redirected+full_force+supports_pool_eio e72) -- 0x563e9b730000 con 0x563e9a674800 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) on [dentry #0x1/a/file [2,head] auth (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) (dversion lock w=1 last_client=4393) pv=14558 v=14556 ap=2 ino=0x10000000002 remote_ino=0x0 referent_inode_ptr=(nil) referent_ino=0x0 state=1610612736 | request=1 lock=2 inodepin=1 dirty=1 waiter=0 authpin=1 0x563e95629900] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) on [dentry #0x1/b/file [2,head] auth NULL (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) (dversion lock w=1 last_client=4393) pv=14557 v=14555 ap=2 ino=(nil) remote_ino=0x0 referent_inode_ptr=(nil) referent_ino=0x0 state=1610612736 | request=1 lock=2 inodepin=0 dirty=1 waiter=0 authpin=1 0x563e95629b80] 2025-06-27T19:41:32.043+0000 7f11d99b8640 
10 mds.0.locker nudge_log NO (dversion lock w=1 last_client=4393) on [dentry #0x1/a/file [2,head] auth (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) (dversion lock w=1 last_client=4393) pv=14558 v=14556 ap=2 ino=0x10000000002 remote_ino=0x0 referent_inode_ptr=(nil) referent_ino=0x0 state=1610612736 | request=1 lock=2 inodepin=1 dirty=1 waiter=0 authpin=1 0x563e95629900] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (dversion lock w=1 last_client=4393) on [dentry #0x1/b/file [2,head] auth NULL (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) (dversion lock w=1 last_client=4393) pv=14557 v=14555 ap=2 ino=(nil) remote_ino=0x0 referent_inode_ptr=(nil) referent_ino=0x0 state=1610612736 | request=1 lock=2 inodepin=0 dirty=1 waiter=0 authpin=1 0x563e95629b80] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (ifile excl w=1) on [inode 0x10000000001 [...2,head] /b/ auth v29159 pv29165 ap=1 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000) n(v0 rc2025-06-27T19:41:32.017971+0000 1=0+1) (isnap sync r=1) (inest lock w=1) (ifile excl w=1) (iversion lock w=1 last_client=4393) caps={4393=pAsLsXsFsx/-@955},l=4393 | request=0 lock=4 dirfrag=1 caps=1 dirtyparent=1 dirty=1 authpin=1 0x563e9a6f5080] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (inest lock w=1 dirty) on [inode 0x1 [...2,head] / auth v179 snaprealm=0x563e9a6ce6c0 f(v0 m2025-06-27T19:09:29.187695+0000 2=0+2) n(v45 rc2025-06-27T19:41:31.249722+0000 3=0+3)/n(v0 rc2025-06-27T19:08:43.024940+0000 1=0+1) (isnap sync r=2) (inest lock w=1 dirty) caps={4359=pAsLsXsFs/-@83,4393=pAsLsXsFs/-@1245} | dirtyscattered=1 request=0 lock=2 dirfrag=1 caps=1 openingsnapparents=0 dirty=1 authpin=0 0x563e9a6f4b00] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (inest lock w=1) on [inode 0x10000000001 [...2,head] /b/ auth v29159 pv29165 ap=1 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000) n(v0 
rc2025-06-27T19:41:32.017971+0000 1=0+1) (isnap sync r=1) (inest lock w=1) (ifile excl w=1) (iversion lock w=1 last_client=4393) caps={4393=pAsLsXsFsx/-@955},l=4393 | request=0 lock=4 dirfrag=1 caps=1 dirtyparent=1 dirty=1 authpin=1 0x563e9a6f5080] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (iquiesce lock w=1 last_client=4393) on [inode 0x10000000000 [...2,head] /a/ auth v29161 pv29163 ap=3 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000 1=1+0) n(v0 rc2025-06-27T19:41:32.017971+0000 2=1+1) (isnap sync r=2) (inest lock w=1) (ifile lock->sync w=1) (iversion lock w=1 last_client=4393) (iquiesce lock w=1 last_client=4393) caps={4359=pAsLsXs/-@109,4393=pAsLsXs/-@955} | request=1 lock=5 dirfrag=1 caps=1 dirtyparent=1 dirty=1 waiter=1 authpin=1 0x563e9a6f5600] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (isnap sync r=1) on [inode 0x10000000001 [...2,head] /b/ auth v29159 pv29165 ap=1 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000) n(v0 rc2025-06-27T19:41:32.017971+0000 1=0+1) (isnap sync r=1) (inest lock w=1) (ifile excl w=1) (iversion lock w=1 last_client=4393) caps={4393=pAsLsXsFsx/-@955},l=4393 | request=0 lock=4 dirfrag=1 caps=1 dirtyparent=1 dirty=1 authpin=1 0x563e9a6f5080] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (isnap sync r=2) on [inode 0x1 [...2,head] / auth v179 snaprealm=0x563e9a6ce6c0 f(v0 m2025-06-27T19:09:29.187695+0000 2=0+2) n(v45 rc2025-06-27T19:41:31.249722+0000 3=0+3)/n(v0 rc2025-06-27T19:08:43.024940+0000 1=0+1) (isnap sync r=2) (inest lock w=1 dirty) caps={4359=pAsLsXsFs/-@83,4393=pAsLsXsFs/-@1245} | dirtyscattered=1 request=0 lock=2 dirfrag=1 caps=1 openingsnapparents=0 dirty=1 authpin=0 0x563e9a6f4b00] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (iversion lock w=1 last_client=4393) on [inode 0x10000000000 [...2,head] /a/ auth v29161 pv29163 ap=3 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000 1=1+0) n(v0 
rc2025-06-27T19:41:32.017971+0000 2=1+1) (isnap sync r=2) (inest lock w=1) (ifile lock->sync w=1) (iversion lock w=1 last_client=4393) (iquiesce lock w=1 last_client=4393) caps={4359=pAsLsXs/-@109,4393=pAsLsXs/-@955} | request=1 lock=5 dirfrag=1 caps=1 dirtyparent=1 dirty=1 waiter=1 authpin=1 0x563e9a6f5600] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log NO (iversion lock w=1 last_client=4393) on [inode 0x10000000001 [...2,head] /b/ auth v29159 pv29165 ap=1 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000) n(v0 rc2025-06-27T19:41:32.017971+0000 1=0+1) (isnap sync r=1) (inest lock w=1) (ifile excl w=1) (iversion lock w=1 last_client=4393) caps={4393=pAsLsXsFsx/-@955},l=4393 | request=0 lock=4 dirfrag=1 caps=1 dirtyparent=1 dirty=1 authpin=1 0x563e9a6f5080] 2025-06-27T19:41:32.043+0000 7f11d99b8640 10 mds.0.locker nudge_log YES (ifile lock->sync w=1) on [inode 0x10000000000 [...2,head] /a/ auth v29161 pv29163 ap=3 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000 1=1+0) n(v0 rc2025-06-27T19:41:32.017971+0000 2=1+1) (isnap sync r=2) (inest lock w=1) (ifile lock->sync w=1) (iversion lock w=1 last_client=4393) (iquiesce lock w=1 last_client=4393) caps={4359=pAsLsXs/-@109,4393=pAsLsXs/-@955} | request=1 lock=5 dirfrag=1 caps=1 dirtyparent=1 dirty=1 waiter=1 authpin=1 0x563e9a6f5600] 2025-06-27T19:41:32.043+0000 7f11d99b8640 20 mds.0.locker : request(client.4393:20702 nref=5 cr=0x563e9b83ae00) 2025-06-27T19:41:32.043+0000 7f11d99b8640 20 mds.0.log _submit_entry EUpdate rename [metablob 0x1, 3 dirs] Easily reproducible with two ceph-fuse clients. One session doing: $ mkdir a b $ touch a/file $ while true; do mv -v a/file b/file; mv -v b/file a/file; done and the other $ while true; do stat a/file; done You can observe the rename/stat stalls (up to 5 seconds; MDS tick interval) without this patch. 
Fixes: https://tracker.ceph.com/issues/71876 Signed-off-by: Patrick Donnelly --- src/mds/Locker.cc | 26 +++++++++++++++++++++----- src/mds/Locker.h | 4 ++-- src/mds/Server.cc | 2 +- src/mds/SimpleLock.h | 3 +++ 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 7b67097ae66..221da02f751 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -900,16 +900,26 @@ void Locker::drop_non_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue) issue_caps_set(*pneed_issue); } -void Locker::drop_rdlocks_for_early_reply(MutationImpl *mut) +void Locker::handle_locks_for_early_reply(MutationImpl *mut) { set<CInode*> need_issue; + bool nudged = false; + + dout(10) << __func__ << ": " << *mut << dendl; for (auto it = mut->locks.begin(); it != mut->locks.end(); ) { + SimpleLock *lock = it->lock; + if (!nudged) { + /* A request may finally early reply only after another request has + * triggered an unstable state change. We need to nudge the log now to + * move things along. 
+ */ + nudged = nudge_log(lock); + } if (!it->is_rdlock()) { ++it; continue; } - SimpleLock *lock = it->lock; // make later mksnap/setlayout (at other mds) wait for this unsafe request if (lock->get_type() == CEPH_LOCK_ISNAP || lock->get_type() == CEPH_LOCK_IPOLICY) { @@ -1839,11 +1849,17 @@ bool Locker::rdlock_start(SimpleLock *lock, const MDRequestRef& mut, bool as_ano return false; } -void Locker::nudge_log(SimpleLock *lock) +bool Locker::nudge_log(SimpleLock *lock) { - dout(10) << "nudge_log " << *lock << " on " << *lock->get_parent() << dendl; - if (lock->get_parent()->is_auth() && lock->is_unstable_and_locked()) // as with xlockdone, or cap flush + // as with xlockdone, or cap flush + if (lock->get_parent()->is_auth() && lock->is_unstable_and_locked() && lock->has_any_waiter()) { + dout(10) << __func__ << " YES " << *lock << " on " << *lock->get_parent() << dendl; mds->mdlog->flush(); + return true; + } else { + dout(20) << __func__ << " NO " << *lock << " on " << *lock->get_parent() << dendl; + return false; + } } void Locker::rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue) diff --git a/src/mds/Locker.h b/src/mds/Locker.h index 9f030b119cb..bce080d12db 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -64,7 +64,7 @@ public: void tick(); - void nudge_log(SimpleLock *lock); + bool nudge_log(SimpleLock *lock); bool acquire_locks(const MDRequestRef& mdr, MutationImpl::LockOpVec& lov, @@ -80,7 +80,7 @@ public: void drop_locks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0); void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false); void drop_non_rdlocks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0); - void drop_rdlocks_for_early_reply(MutationImpl *mut); + void handle_locks_for_early_reply(MutationImpl *mut); void drop_lock(MutationImpl* mut, SimpleLock* what); void drop_locks_for_fragment_unfreeze(MutationImpl *mut); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 00e04b23bf0..605c4fb7eac 100644 --- 
a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2132,7 +2132,7 @@ void Server::journal_and_reply(const MDRequestRef& mdr, CInode *in, CDentry *dn, mdr->set_queued_next_replay_op(); mds->queue_one_replay(); } else if (mdr->did_early_reply) - mds->locker->drop_rdlocks_for_early_reply(mdr.get()); + mds->locker->handle_locks_for_early_reply(mdr.get()); else mdlog->flush(); } diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index e443fc07485..a46f0d6d328 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -250,6 +250,9 @@ public: bool is_waiter_for(uint64_t mask) const { return parent->is_waiter_for(getmask(mask)); } + bool has_any_waiter() const { + return is_waiter_for(std::numeric_limits<uint64_t>::max()); + } bool is_cached() const { return state_flags & CACHED; -- 2.39.5