A getattr/lookup can cause a wrlock or xlock to become unstable after a request
(like rename) acquires it but before early reply. The MDS will not nudge the
log in this situation and the getattr/lookup will need to wait for the eventual
journal flush before the lock is released.
With this patch applied, the MDS debug log for the nudge_log decision now looks like:
2025-06-27T19:41:32.043+0000
7f11d21a9640 5 mds.0.log _submit_thread
25185408~2845 : EUpdate rename [metablob 0x1, 3 dirs]
2025-06-27T19:41:32.043+0000
7f11d99b8640 1 -- [v2:172.21.10.4:6868/
56297870,v1:172.21.10.4:6869/
56297870] --> [v2:172.21.10.4:6818/
999439823,v1:172.21.10.4:6819/
999439823] -- osd_op(unknown.0.23:135 2.e 2:
7bf7e7b5:::200.
00000006:head [write 19584~2865 [fadvise_dontneed] in=2865b] snapc 0=[] ondisk+write+known_if_redirected+full_force+supports_pool_eio e72) -- 0x563e9b730000 con 0x563e9a674800
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) on [dentry #0x1/a/file [2,head] auth (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) (dversion lock w=1 last_client=4393) pv=14558 v=14556 ap=2 ino=0x10000000002 remote_ino=0x0 referent_inode_ptr=(nil) referent_ino=0x0 state=
1610612736 | request=1 lock=2 inodepin=1 dirty=1 waiter=0 authpin=1 0x563e95629900]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) on [dentry #0x1/b/file [2,head] auth NULL (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) (dversion lock w=1 last_client=4393) pv=14557 v=14555 ap=2 ino=(nil) remote_ino=0x0 referent_inode_ptr=(nil) referent_ino=0x0 state=
1610612736 | request=1 lock=2 inodepin=0 dirty=1 waiter=0 authpin=1 0x563e95629b80]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (dversion lock w=1 last_client=4393) on [dentry #0x1/a/file [2,head] auth (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) (dversion lock w=1 last_client=4393) pv=14558 v=14556 ap=2 ino=0x10000000002 remote_ino=0x0 referent_inode_ptr=(nil) referent_ino=0x0 state=
1610612736 | request=1 lock=2 inodepin=1 dirty=1 waiter=0 authpin=1 0x563e95629900]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (dversion lock w=1 last_client=4393) on [dentry #0x1/b/file [2,head] auth NULL (dn xlock x=1 by request(client.4393:20702 nref=6 cr=0x563e9b83ae00)) (dversion lock w=1 last_client=4393) pv=14557 v=14555 ap=2 ino=(nil) remote_ino=0x0 referent_inode_ptr=(nil) referent_ino=0x0 state=
1610612736 | request=1 lock=2 inodepin=0 dirty=1 waiter=0 authpin=1 0x563e95629b80]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (ifile excl w=1) on [inode 0x10000000001 [...2,head] /b/ auth v29159 pv29165 ap=1 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000) n(v0 rc2025-06-27T19:41:32.017971+0000 1=0+1) (isnap sync r=1) (inest lock w=1) (ifile excl w=1) (iversion lock w=1 last_client=4393) caps={4393=pAsLsXsFsx/-@955},l=4393 | request=0 lock=4 dirfrag=1 caps=1 dirtyparent=1 dirty=1 authpin=1 0x563e9a6f5080]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (inest lock w=1 dirty) on [inode 0x1 [...2,head] / auth v179 snaprealm=0x563e9a6ce6c0 f(v0 m2025-06-27T19:09:29.187695+0000 2=0+2) n(v45 rc2025-06-27T19:41:31.249722+0000 3=0+3)/n(v0 rc2025-06-27T19:08:43.024940+0000 1=0+1) (isnap sync r=2) (inest lock w=1 dirty) caps={4359=pAsLsXsFs/-@83,4393=pAsLsXsFs/-@1245} | dirtyscattered=1 request=0 lock=2 dirfrag=1 caps=1 openingsnapparents=0 dirty=1 authpin=0 0x563e9a6f4b00]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (inest lock w=1) on [inode 0x10000000001 [...2,head] /b/ auth v29159 pv29165 ap=1 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000) n(v0 rc2025-06-27T19:41:32.017971+0000 1=0+1) (isnap sync r=1) (inest lock w=1) (ifile excl w=1) (iversion lock w=1 last_client=4393) caps={4393=pAsLsXsFsx/-@955},l=4393 | request=0 lock=4 dirfrag=1 caps=1 dirtyparent=1 dirty=1 authpin=1 0x563e9a6f5080]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (iquiesce lock w=1 last_client=4393) on [inode 0x10000000000 [...2,head] /a/ auth v29161 pv29163 ap=3 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000 1=1+0) n(v0 rc2025-06-27T19:41:32.017971+0000 2=1+1) (isnap sync r=2) (inest lock w=1) (ifile lock->sync w=1) (iversion lock w=1 last_client=4393) (iquiesce lock w=1 last_client=4393) caps={4359=pAsLsXs/-@109,4393=pAsLsXs/-@955} | request=1 lock=5 dirfrag=1 caps=1 dirtyparent=1 dirty=1 waiter=1 authpin=1 0x563e9a6f5600]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (isnap sync r=1) on [inode 0x10000000001 [...2,head] /b/ auth v29159 pv29165 ap=1 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000) n(v0 rc2025-06-27T19:41:32.017971+0000 1=0+1) (isnap sync r=1) (inest lock w=1) (ifile excl w=1) (iversion lock w=1 last_client=4393) caps={4393=pAsLsXsFsx/-@955},l=4393 | request=0 lock=4 dirfrag=1 caps=1 dirtyparent=1 dirty=1 authpin=1 0x563e9a6f5080]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (isnap sync r=2) on [inode 0x1 [...2,head] / auth v179 snaprealm=0x563e9a6ce6c0 f(v0 m2025-06-27T19:09:29.187695+0000 2=0+2) n(v45 rc2025-06-27T19:41:31.249722+0000 3=0+3)/n(v0 rc2025-06-27T19:08:43.024940+0000 1=0+1) (isnap sync r=2) (inest lock w=1 dirty) caps={4359=pAsLsXsFs/-@83,4393=pAsLsXsFs/-@1245} | dirtyscattered=1 request=0 lock=2 dirfrag=1 caps=1 openingsnapparents=0 dirty=1 authpin=0 0x563e9a6f4b00]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (iversion lock w=1 last_client=4393) on [inode 0x10000000000 [...2,head] /a/ auth v29161 pv29163 ap=3 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000 1=1+0) n(v0 rc2025-06-27T19:41:32.017971+0000 2=1+1) (isnap sync r=2) (inest lock w=1) (ifile lock->sync w=1) (iversion lock w=1 last_client=4393) (iquiesce lock w=1 last_client=4393) caps={4359=pAsLsXs/-@109,4393=pAsLsXs/-@955} | request=1 lock=5 dirfrag=1 caps=1 dirtyparent=1 dirty=1 waiter=1 authpin=1 0x563e9a6f5600]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log NO (iversion lock w=1 last_client=4393) on [inode 0x10000000001 [...2,head] /b/ auth v29159 pv29165 ap=1 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000) n(v0 rc2025-06-27T19:41:32.017971+0000 1=0+1) (isnap sync r=1) (inest lock w=1) (ifile excl w=1) (iversion lock w=1 last_client=4393) caps={4393=pAsLsXsFsx/-@955},l=4393 | request=0 lock=4 dirfrag=1 caps=1 dirtyparent=1 dirty=1 authpin=1 0x563e9a6f5080]
2025-06-27T19:41:32.043+0000
7f11d99b8640 10 mds.0.locker nudge_log YES (ifile lock->sync w=1) on [inode 0x10000000000 [...2,head] /a/ auth v29161 pv29163 ap=3 DIRTYPARENT f(v0 m2025-06-27T19:41:32.017971+0000 1=1+0) n(v0 rc2025-06-27T19:41:32.017971+0000 2=1+1) (isnap sync r=2) (inest lock w=1) (ifile lock->sync w=1) (iversion lock w=1 last_client=4393) (iquiesce lock w=1 last_client=4393) caps={4359=pAsLsXs/-@109,4393=pAsLsXs/-@955} | request=1 lock=5 dirfrag=1 caps=1 dirtyparent=1 dirty=1 waiter=1 authpin=1 0x563e9a6f5600]
2025-06-27T19:41:32.043+0000
7f11d99b8640 20 mds.0.locker : request(client.4393:20702 nref=5 cr=0x563e9b83ae00)
2025-06-27T19:41:32.043+0000
7f11d99b8640 20 mds.0.log _submit_entry EUpdate rename [metablob 0x1, 3 dirs]
Easily reproducible with two ceph-fuse clients. One session doing:
$ mkdir a b
$ touch a/file
$ while true; do mv -v a/file b/file; mv -v b/file a/file; done
and the other session running:
$ while true; do stat a/file; done
Without this patch, you can observe the rename/stat operations stall for up to 5 seconds (the MDS tick interval).
Fixes: https://tracker.ceph.com/issues/71876
Signed-off-by: Patrick Donnelly <pdonnell@ibm.com>
issue_caps_set(*pneed_issue);
}
-void Locker::drop_rdlocks_for_early_reply(MutationImpl *mut)
+void Locker::handle_locks_for_early_reply(MutationImpl *mut)
{
set<CInode*> need_issue;
+ bool nudged = false;
+
+ dout(10) << __func__ << ": " << *mut << dendl;
for (auto it = mut->locks.begin(); it != mut->locks.end(); ) {
+ SimpleLock *lock = it->lock;
+ if (!nudged) {
+ /* A request may finally early reply only after another request has
+ * triggered an unstable state change. We need to nudge the log now to
+ * move things along.
+ */
+ nudged = nudge_log(lock);
+ }
if (!it->is_rdlock()) {
++it;
continue;
}
- SimpleLock *lock = it->lock;
// make later mksnap/setlayout (at other mds) wait for this unsafe request
if (lock->get_type() == CEPH_LOCK_ISNAP ||
lock->get_type() == CEPH_LOCK_IPOLICY) {
return false;
}
-void Locker::nudge_log(SimpleLock *lock)
+bool Locker::nudge_log(SimpleLock *lock)
{
- dout(10) << "nudge_log " << *lock << " on " << *lock->get_parent() << dendl;
- if (lock->get_parent()->is_auth() && lock->is_unstable_and_locked()) // as with xlockdone, or cap flush
+ // as with xlockdone, or cap flush
+ if (lock->get_parent()->is_auth() && lock->is_unstable_and_locked() && lock->has_any_waiter()) {
+ dout(10) << __func__ << " YES " << *lock << " on " << *lock->get_parent() << dendl;
mds->mdlog->flush();
+ return true;
+ } else {
+ dout(20) << __func__ << " NO " << *lock << " on " << *lock->get_parent() << dendl;
+ return false;
+ }
}
void Locker::rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
void tick();
- void nudge_log(SimpleLock *lock);
+ bool nudge_log(SimpleLock *lock);
bool acquire_locks(const MDRequestRef& mdr,
MutationImpl::LockOpVec& lov,
void drop_locks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0);
void set_xlocks_done(MutationImpl *mut, bool skip_dentry=false);
void drop_non_rdlocks(MutationImpl *mut, std::set<CInode*> *pneed_issue=0);
- void drop_rdlocks_for_early_reply(MutationImpl *mut);
+ void handle_locks_for_early_reply(MutationImpl *mut);
void drop_lock(MutationImpl* mut, SimpleLock* what);
void drop_locks_for_fragment_unfreeze(MutationImpl *mut);
mdr->set_queued_next_replay_op();
mds->queue_one_replay();
} else if (mdr->did_early_reply)
- mds->locker->drop_rdlocks_for_early_reply(mdr.get());
+ mds->locker->handle_locks_for_early_reply(mdr.get());
else
mdlog->flush();
}
bool is_waiter_for(uint64_t mask) const {
return parent->is_waiter_for(getmask(mask));
}
+ bool has_any_waiter() const {
+ return is_waiter_for(std::numeric_limits<uint64_t>::max());
+ }
bool is_cached() const {
return state_flags & CACHED;