From 9dfef48f8584fcf61c6f95e41df537ea5b0693b7 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 30 Sep 2019 14:20:17 +0800 Subject: [PATCH] mds: suppress frozen inode when locks of dir operation is cached. Frozen inode in directory with lock caches may cause ABBA deadlock. The requeset that owns the frozen inode can't get locks because lock cache is holding conflicting locks. The request that uses lock cache can't auth pin the frozen inode. The solution is not creating lock cache when directory contains freezing or frozen inode. When mds starts freezing an inode, invalidate all lock caches on its parent directory. Signed-off-by: "Yan, Zheng" --- src/mds/CDir.cc | 24 ++++++++++++++ src/mds/CDir.h | 14 +++++++++ src/mds/CInode.cc | 79 ++++++++++++++++++++++++++++++++++------------- src/mds/CInode.h | 9 ++++-- src/mds/Locker.cc | 21 +++++++++++-- 5 files changed, 120 insertions(+), 27 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 745b681e73d..72ff92d053e 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -196,6 +196,7 @@ CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) : dirty_dentries(member_offset(CDentry, item_dir_dirty)), item_dirty(this), item_new(this), lock_caches_with_auth_pins(member_offset(MDLockCache::DirItem, item_dir)), + freezing_inodes(member_offset(CInode, item_freezing_inode)), dir_rep(REP_NONE), pop_me(mdcache->decayrate), pop_nested(mdcache->decayrate), @@ -591,6 +592,11 @@ void CDir::link_inode_work( CDentry *dn, CInode *in) if (in->auth_pins) dn->adjust_nested_auth_pins(in->auth_pins, NULL); + if (in->is_freezing_inode()) + freezing_inodes.push_back(&in->item_freezing_inode); + else if (in->is_frozen_inode() || in->is_frozen_auth_pin()) + num_frozen_inodes++; + // verify open snaprealm parent if (in->snaprealm) in->snaprealm->adjust_parent(); @@ -672,6 +678,11 @@ void CDir::unlink_inode_work(CDentry *dn) if (in->auth_pins) dn->adjust_nested_auth_pins(-in->auth_pins, nullptr); + if (in->is_freezing_inode()) + in->item_freezing_inode.remove_myself(); + else if (in->is_frozen_inode() || in->is_frozen_auth_pin()) + num_frozen_inodes--; + // detach inode in->remove_primary_parent(dn); if (in->is_dir()) @@ -3171,6 +3182,19 @@ void CDir::unfreeze_dir() } } +void CDir::enable_frozen_inode() +{ + ceph_assert(frozen_inode_suppressed > 0); + if (--frozen_inode_suppressed == 0) { + for (auto p = freezing_inodes.begin(); !p.end(); ) { + CInode *in = *p; + ++p; + ceph_assert(in->is_freezing_inode()); + in->maybe_finish_freeze_inode(); + } + } +} + /** * Slightly less complete than operator<<, because this is intended * for identifying a directory and its state rather than for dumping diff --git a/src/mds/CDir.h b/src/mds/CDir.h index c60bf57421f..43ebfcb53c8 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -578,6 +578,15 @@ public: return true; } + bool is_any_freezing_or_frozen_inode() const { + return num_frozen_inodes || !freezing_inodes.empty(); + } + void disable_frozen_inode() { + ceph_assert(num_frozen_inodes == 0); + frozen_inode_suppressed++; + } + void enable_frozen_inode(); + ostream& print_db_line_prefix(ostream& out) override; void print(ostream& out) override; void dump(Formatter *f, int flags = DUMP_DEFAULT) const; @@ -683,6 +692,11 @@ protected: static int num_frozen_trees; static int num_freezing_trees; + // freezing/frozen inodes in this dirfrag + int num_frozen_inodes = 0; + int frozen_inode_suppressed = 0; + elist freezing_inodes; + int dir_auth_pins = 0; // cache control (defined for authority; hints for replicas) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index e3c4ef1e24a..f45617553c1 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -1587,6 +1587,7 @@ void CInode::decode_store(bufferlist::const_iterator& bl) SimpleLock* CInode::get_lock(int type) { switch (type) { + case CEPH_LOCK_IVERSION: return &versionlock; case CEPH_LOCK_IFILE: return &filelock; case CEPH_LOCK_IAUTH: return &authlock; case CEPH_LOCK_ILINK: return &linklock; @@ -2680,25 +2681,64 @@ void CInode::take_waiting(uint64_t mask, MDSContext::vec& ls) MDSCacheObject::take_waiting(mask, ls); } +void CInode::maybe_finish_freeze_inode() +{ + CDir *dir = get_parent_dir(); + if (auth_pins > auth_pin_freeze_allowance || dir->frozen_inode_suppressed) + return; + + dout(10) << "maybe_finish_freeze_inode - frozen" << dendl; + ceph_assert(auth_pins == auth_pin_freeze_allowance); + get(PIN_FROZEN); + put(PIN_FREEZING); + state_clear(STATE_FREEZING); + state_set(STATE_FROZEN); + + item_freezing_inode.remove_myself(); + dir->num_frozen_inodes++; + + finish_waiting(WAIT_FROZEN); +} + bool CInode::freeze_inode(int auth_pin_allowance) { + CDir *dir = get_parent_dir(); + ceph_assert(dir); + ceph_assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins ceph_assert(auth_pins >= auth_pin_allowance); - if (auth_pins > auth_pin_allowance) { - dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl; - auth_pin_freeze_allowance = auth_pin_allowance; - get(PIN_FREEZING); - state_set(STATE_FREEZING); - return false; + if (auth_pins == auth_pin_allowance && !dir->frozen_inode_suppressed) { + dout(10) << "freeze_inode - frozen" << dendl; + if (!state_test(STATE_FROZEN)) { + get(PIN_FROZEN); + state_set(STATE_FROZEN); + dir->num_frozen_inodes++; + } + return true; } - dout(10) << "freeze_inode - frozen" << dendl; - ceph_assert(auth_pins == auth_pin_allowance); - if (!state_test(STATE_FROZEN)) { - get(PIN_FROZEN); - state_set(STATE_FROZEN); + dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl; + auth_pin_freeze_allowance = auth_pin_allowance; + dir->freezing_inodes.push_back(&item_freezing_inode); + + get(PIN_FREEZING); + state_set(STATE_FREEZING); + + if (!dir->lock_caches_with_auth_pins.empty()) + mdcache->mds->locker->invalidate_lock_caches(dir); + + const static int lock_types[] = { + CEPH_LOCK_IVERSION, CEPH_LOCK_IFILE, CEPH_LOCK_IAUTH, CEPH_LOCK_ILINK, CEPH_LOCK_IDFT, + CEPH_LOCK_IXATTR, CEPH_LOCK_ISNAP, CEPH_LOCK_INEST, CEPH_LOCK_IFLOCK, CEPH_LOCK_IPOLICY, 0 + }; + for (int i = 0; lock_types[i]; ++i) { + auto lock = get_lock(lock_types[i]); + if (lock->is_cached()) + mdcache->mds->locker->invalidate_lock_caches(lock); } - return true; + // invalidate_lock_caches() may decrease dir->frozen_inode_suppressed + // and finish freezing the inode + return state_test(STATE_FROZEN); } void CInode::unfreeze_inode(MDSContext::vec& finished) @@ -2707,9 +2747,11 @@ void CInode::unfreeze_inode(MDSContext::vec& finished) if (state_test(STATE_FREEZING)) { state_clear(STATE_FREEZING); put(PIN_FREEZING); + item_freezing_inode.remove_myself(); } else if (state_test(STATE_FROZEN)) { state_clear(STATE_FROZEN); put(PIN_FROZEN); + get_parent_dir()->num_frozen_inodes--; } else ceph_abort(); take_waiting(WAIT_UNFREEZE, finished); @@ -2726,12 +2768,14 @@ void CInode::freeze_auth_pin() { ceph_assert(state_test(CInode::STATE_FROZEN)); state_set(CInode::STATE_FROZENAUTHPIN); + get_parent_dir()->num_frozen_inodes++; } void CInode::unfreeze_auth_pin() { ceph_assert(state_test(CInode::STATE_FROZENAUTHPIN)); state_clear(CInode::STATE_FROZENAUTHPIN); + get_parent_dir()->num_frozen_inodes--; if (!state_test(STATE_FREEZING|STATE_FROZEN)) { MDSContext::vec finished; take_waiting(WAIT_UNFREEZE, finished); @@ -2808,15 +2852,8 @@ void CInode::auth_unpin(void *by) if (parent) parent->adjust_nested_auth_pins(-1, by); - if (is_freezing_inode() && - auth_pins == auth_pin_freeze_allowance) { - dout(10) << "auth_unpin freezing!" << dendl; - get(PIN_FROZEN); - put(PIN_FREEZING); - state_clear(STATE_FREEZING); - state_set(STATE_FROZEN); - finish_waiting(WAIT_FROZEN); - } + if (is_freezing_inode()) + maybe_finish_freeze_inode(); } // authority diff --git a/src/mds/CInode.h b/src/mds/CInode.h index f3b6659cd68..accceb6858f 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -197,7 +197,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter::item& item_recover_queue = item_dirty_dirfrag_dir; elist::item& item_recover_queue_front = item_dirty_dirfrag_nest; - int auth_pin_freeze_allowance = 0; - inode_load_vec_t pop; elist::item item_pop_lru; @@ -1069,6 +1067,11 @@ protected: // -- waiting -- mempool::mds_co::compact_map waiting_on_dir; + + // -- freezing inode -- + int auth_pin_freeze_allowance = 0; + elist::item item_freezing_inode; + void maybe_finish_freeze_inode(); private: friend class ValidationContinuation; diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index fd9311b9fe9..a6a19600420 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -380,6 +380,7 @@ bool Locker::acquire_locks(MDRequestRef& mdr, if (mdr->lock_cache) { CDir *dir; if (CInode *in = dynamic_cast(object)) { + ceph_assert(!in->is_frozen_inode() && !in->is_frozen_auth_pin()); dir = in->get_projected_parent_dir(); } else if (CDentry *dn = dynamic_cast(object)) { dir = dn->get_dir(); @@ -783,6 +784,14 @@ void Locker::put_lock_cache(MDLockCache* lock_cache) return; ceph_assert(lock_cache->invalidating); + + CInode *diri = lock_cache->get_dir_inode(); + for (auto dir : lock_cache->auth_pinned_dirfrags) { + if (dir->get_inode() != diri) + continue; + dir->enable_frozen_inode(); + } + mds->queue_waiter(new C_MDL_DropCache(this, lock_cache)); } @@ -879,6 +888,10 @@ void Locker::create_lock_cache(MDRequestRef& mdr, CInode *diri) dout(10) << " can't auth_pin(!auth|freezing?) dirfrag " << *dir << ", noop" << dendl; return; } + if (dir->is_any_freezing_or_frozen_inode()) { + dout(10) << " there is freezing/frozen inode in " << *dir << ", noop" << dendl; + return; + } } for (auto& p : mdr->locks) { @@ -893,9 +906,12 @@ void Locker::create_lock_cache(MDRequestRef& mdr, CInode *diri) auto lock_cache = new MDLockCache(cap, opcode); - // prevent subtree migration - for (auto dir : dfv) + for (auto dir : dfv) { + // prevent subtree migration lock_cache->auth_pin(dir); + // prevent frozen inode + dir->disable_frozen_inode(); + } for (auto& p : mdr->object_states) { if (p.first != diri && !ancestors.count(p.first)) @@ -2155,7 +2171,6 @@ Capability* Locker::issue_new_caps(CInode *in, return cap; } - void Locker::issue_caps_set(set& inset) { for (set::iterator p = inset.begin(); p != inset.end(); ++p) -- 2.39.5