From 5178674573497a511f3f9e4ebfce5b902a34fbff Mon Sep 17 00:00:00 2001 From: Leonid Usov Date: Thu, 9 May 2024 04:39:12 +0300 Subject: [PATCH] squid: mds/quiesce: overdrive fragmenting that's still freezing Quiesce requires revocation of capabilities, which is not working for a freezing/frozen nodes. Since it is best effort, abort an ongoing fragmenting for the sake of a faster quiesce. Signed-off-by: Leonid Usov Fixes: https://tracker.ceph.com/issues/65716 (cherry picked from commit 8b6440652d501644d641c1c8b3255c3720738ec6) Fixes: https://tracker.ceph.com/issues/66154 --- src/mds/CInode.cc | 7 ++++ src/mds/CInode.h | 1 + src/mds/Locker.cc | 3 ++ src/mds/MDCache.cc | 92 +++++++++++++++++++++++++++++++++++++-------- src/mds/MDCache.h | 5 ++- src/mds/Migrator.cc | 7 ++-- 6 files changed, 95 insertions(+), 20 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 030a5488166..b588e017abf 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -5597,4 +5597,11 @@ void CInode::get_subtree_dirfrags(std::vector& v) const } } +bool CInode::will_block_for_quiesce(const MDRequestRef& mdr) { + if (mdr && mdr->is_wrlocked(&quiescelock)) { + return false; + } + return !quiescelock.can_wrlock(); +} + MEMPOOL_DEFINE_OBJECT_FACTORY(CInode, co_inode, mds_co); diff --git a/src/mds/CInode.h b/src/mds/CInode.h index d943b6e17d7..d3478389e68 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -663,6 +663,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counteris_symlink(); } bool is_dir() const { return get_inode()->is_dir(); } bool is_quiesced() const { return quiescelock.is_xlocked(); } + bool will_block_for_quiesce(const MDRequestRef& mdr = MDRequestRef {}); bool is_head() const { return last == CEPH_NOSNAP; } diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index f59826216af..fe39f578986 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -464,6 +464,9 @@ bool Locker::acquire_locks(const MDRequestRef& mdr, marker.message = 
"failed to authpin, inode is being exported"; } dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl; + if (CDentry* dn = dynamic_cast(object)) { + dout(10) << " can't auth_pin dir: " << *dn->get_dir() << dendl; + } object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); if (mdr->is_any_remote_auth_pin()) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 29af7fbc270..f715426802f 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -8285,7 +8285,7 @@ void MDCache::dispatch(const cref_t &m) int MDCache::path_traverse(const MDRequestRef& mdr, MDSContextFactory& cf, const filepath& path, int flags, - vector *pdnvec, CInode **pin) + vector *pdnvec, CInode **pin, CDir **pdir) { bool discover = (flags & MDS_TRAVERSE_DISCOVER); bool forward = !discover; @@ -8351,6 +8351,8 @@ int MDCache::path_traverse(const MDRequestRef& mdr, MDSContextFactory& cf, pdnvec->clear(); if (pin) *pin = cur; + if (pdir) + *pdir = nullptr; CInode *target_inode = nullptr; MutationImpl::LockOpVec lov; @@ -8397,6 +8399,9 @@ int MDCache::path_traverse(const MDRequestRef& mdr, MDSContextFactory& cf, // open dir frag_t fg = cur->pick_dirfrag(path[depth]); CDir *curdir = cur->get_dirfrag(fg); + if (pdir) { + *pdir = curdir; + } if (!curdir) { if (cur->is_auth()) { // parent dir frozen_dir? 
@@ -8422,6 +8427,9 @@ int MDCache::path_traverse(const MDRequestRef& mdr, MDSContextFactory& cf, } } ceph_assert(curdir); + if (pdir) { + *pdir = curdir; + } #ifdef MDS_VERIFY_FRAGSTAT if (curdir->is_complete()) @@ -12069,8 +12077,8 @@ void MDCache::fragment_frozen(const MDRequestRef& mdr, int r) { dirfrag_t basedirfrag = mdr->more()->fragment_base; map::iterator it = fragments.find(basedirfrag); - if (it == fragments.end() || it->second.mdr != mdr) { - dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl; + if (it == fragments.end() || it->second.mdr != mdr || r < 0) { + dout(7) << "fragment_frozen " << basedirfrag << " must have aborted; rc=" << r << dendl; request_finish(mdr); return; } @@ -12084,12 +12092,12 @@ void MDCache::fragment_frozen(const MDRequestRef& mdr, int r) dispatch_fragment_dir(mdr); } -void MDCache::dispatch_fragment_dir(const MDRequestRef& mdr) +void MDCache::dispatch_fragment_dir(const MDRequestRef& mdr, bool abort_if_freezing) { dirfrag_t basedirfrag = mdr->more()->fragment_base; map::iterator it = fragments.find(basedirfrag); if (it == fragments.end() || it->second.mdr != mdr) { - dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl; + dout(7) << __func__ << ": " << basedirfrag << " must have aborted" << dendl; request_finish(mdr); return; } @@ -12097,20 +12105,30 @@ void MDCache::dispatch_fragment_dir(const MDRequestRef& mdr) fragment_info_t& info = it->second; CInode *diri = info.dirs.front()->get_inode(); - dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits - << " on " << *diri << dendl; + dout(10) << __func__ << ": " << basedirfrag << " all_frozen=" << info.all_frozen << " bits: " << info.bits + << " on " << *diri << dendl; if (mdr->more()->peer_error) mdr->aborted = true; + if (abort_if_freezing) { + if (info.all_frozen) { + dout(20) << __func__ << ": abort_if_freezing: too late, won't abort" << dendl; + return; + } + dout(20) << __func__ << ": 
abort_if_freezing: will abort" << dendl; + mdr->aborted = true; + } + if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) { - /* If quiescelock cannot be wrlocked, we cannot block with tree frozen. + /* We cannot afford blocking for quiesce with fragments frozen. * Otherwise, this can create deadlock where some quiesce_inode requests - * (on inodes in the dirfrag) are blocked on a frozen tree and the + * (on inodes in the dirfrag) are blocked on a frozen cdir and the * fragment_dir request is blocked on the queiscelock for the directory * inode's quiescelock. */ - if (!mdr->is_wrlocked(&diri->quiescelock) && !diri->quiescelock.can_wrlock()) { + if (diri->will_block_for_quiesce(mdr)) { + dout(10) << __func__ << ": aborting to avoid a deadlock with quiesce" << dendl; mdr->aborted = true; } @@ -12131,7 +12149,7 @@ void MDCache::dispatch_fragment_dir(const MDRequestRef& mdr) } if (mdr->aborted) { - dout(10) << " can't auth_pin or acquire quiescelock on " + dout(10) << __func__ << " aborted fragmenting of " << *diri << ", requeuing dir " << info.dirs.front()->dirfrag() << dendl; if (info.bits > 0) @@ -13585,6 +13603,43 @@ void MDCache::clear_dirty_bits_for_stray(CInode* diri) { } } +void MDCache::quiesce_overdrive_fragmenting(CDir* dir, bool async) { + if (!dir || !dir->state_test(CDir::STATE_FRAGMENTING)) { + return; + } + dout(20) << __func__ << ": will check fragmenting dir " << *dir << dendl; + + auto diri = dir->get_inode(); + auto mydf = dir->dirfrag(); + for (auto it = fragments.lower_bound({diri->ino(), {}}); + it != fragments.end() && it->first.ino == diri->ino(); + ++it) { + if (it->first.frag.contains(mydf.frag)) { + dout(20) << __func__ << ": dirfrag " << it->first << " contains my dirfrag " << mydf << dendl; + auto const& mdr = it->second.mdr; + + if (async) { + dout(10) << __func__ << ": will schedule async abort_if_freezing for " << *mdr << dendl; + mds->queue_waiter(new MDSInternalContextWrapper(mds, new LambdaContext( [this, mdr] { + if (!mdr->dead) { 
+ dispatch_fragment_dir(mdr, true); + } + }))); + } else { + if (mdr->dead) { + dout(20) << __func__ << ": the request is already dead: " << *mdr << dendl; + } else { + dout(10) << __func__ << ": will call abort_if_freezing for " << *mdr << dendl; + dispatch_fragment_dir(mdr, true); + } + } + + // there can't be (shouldn't be) more than one containing fragment + break; + } + } +} + void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr) { if (mdr->internal_op_finish == nullptr) { @@ -13733,6 +13788,8 @@ void MDCache::dispatch_quiesce_inode(const MDRequestRef& mdr) std::vector todispatch; for (auto& dir : in->get_dirfrags()) { dout(25) << " iterating " << *dir << dendl; + // overdrive synchronously since we aren't yet on the waiting list + quiesce_overdrive_fragmenting(dir, false); for (auto& [dnk, dn] : *dir) { dout(25) << " evaluating (" << dnk << ", " << *dn << ")" << dendl; auto* in = dn->get_projected_inode(); @@ -13830,7 +13887,7 @@ void MDCache::dispatch_quiesce_path(const MDRequestRef& mdr) ceph_assert(mdr->internal_op_finish); - dout(5) << __func__ << ": dispatching" << dendl; + dout(5) << __func__ << ": dispatching " << *mdr << dendl; C_MDS_QuiescePath* qfinisher = static_cast<C_MDS_QuiescePath*>(mdr->internal_op_finish); auto& qs = *qfinisher->qs; @@ -13847,10 +13904,15 @@ void MDCache::dispatch_quiesce_path(const MDRequestRef& mdr) | MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_WANT_INODE ; - int r = path_traverse(mdr, cf, mdr->get_filepath(), ptflags, nullptr, &diri); - if (r > 0) + + CDir* curdir = nullptr; + int r = path_traverse(mdr, cf, mdr->get_filepath(), ptflags, nullptr, &diri, &curdir); + if (r > 0) { + // we must abort asynchronously, since we may be on the unfreeze waiter list, + // which will be flushed synchronously with the abort + quiesce_overdrive_fragmenting(curdir, true); return; - if (r < 0) { + } else if (r < 0) { mds->server->respond_to_request(mdr, r); return; } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index a8e410adaab..fdee140a2cd 100644
--- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -951,7 +951,7 @@ private: */ int path_traverse(const MDRequestRef& mdr, MDSContextFactory& cf, const filepath& path, int flags, - std::vector *pdnvec, CInode **pin=nullptr); + std::vector *pdnvec, CInode **pin=nullptr, CDir **pdir = nullptr); int maybe_request_forward_to_auth(const MDRequestRef& mdr, MDSContextFactory& cf, MDSCacheObject *p); @@ -1456,7 +1456,7 @@ private: void fragment_unmark_unfreeze_dirs(const std::vector& dirs); void fragment_drop_locks(fragment_info_t &info); void fragment_maybe_finish(const fragment_info_iterator& it); - void dispatch_fragment_dir(const MDRequestRef& mdr); + void dispatch_fragment_dir(const MDRequestRef& mdr, bool abort_if_freezing=false); void _fragment_logged(const MDRequestRef& mdr); void _fragment_stored(const MDRequestRef& mdr); void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr); @@ -1470,6 +1470,7 @@ private: void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op); void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags); + void quiesce_overdrive_fragmenting(CDir* dir, bool async); void dispatch_quiesce_path(const MDRequestRef& mdr); void dispatch_quiesce_inode(const MDRequestRef& mdr); diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 5f84468f394..3600f78c572 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1055,13 +1055,14 @@ void Migrator::dispatch_export_dir(const MDRequestRef& mdr, int count) // locks? if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) { - /* If quiescelock cannot be wrlocked, we cannot block with tree frozen. + /* We cannot afford blocking for quiesce with tree frozen. * Otherwise, this can create deadlock where some quiesce_inode requests * (on inodes in the dirfrag) are blocked on a frozen tree and the - * fragment_dir request is blocked on the queiscelock for the directory + * export_dir request is blocked on the quiescelock for the directory * inode's quiescelock.
*/ - if (!mdr->is_wrlocked(&diri->quiescelock) && !diri->quiescelock.can_wrlock()) { + if (diri->will_block_for_quiesce(mdr)) { + dout(10) << __func__ << ": aborting to avoid a deadlock with quiesce" << dendl; mdr->aborted = true; export_try_cancel(dir); return; -- 2.39.5