From 09b3e942cdf09ccd24c60931a3a0533aa5ba4891 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 30 Sep 2019 14:20:17 +0800 Subject: [PATCH] mds: initial code for lock cache The lock cache preserves locks and authpins required for directory operations. MDS can create a lock cache when it has acquired all locks of a directory operation. The lock cache can be used for later operations of the same type on the same directory. For example, when MDS has acquired all locks of an unlink operation, it creates a lock cache, which holds wrlocks on the directory inode's filelock and nestlock, and rdlocks on ancestor inodes' snaplocks. For later unlink operations on the same directory, MDS only needs to xlock the dentry to unlink and xlock the linklock of the inode to unlink. Signed-off-by: "Yan, Zheng" --- src/mds/CInode.h | 2 +- src/mds/Capability.cc | 1 + src/mds/Capability.h | 3 + src/mds/Locker.cc | 190 ++++++++++++++++++++++++++++++++++++++++++ src/mds/Locker.h | 5 ++ src/mds/MDCache.cc | 2 +- src/mds/Mutation.cc | 18 ++++ src/mds/Mutation.h | 38 ++++++--- src/mds/journal.cc | 2 +- 9 files changed, 246 insertions(+), 15 deletions(-) diff --git a/src/mds/CInode.h b/src/mds/CInode.h index c0b8a5fd657..1d29dd700f8 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -513,7 +513,7 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counter& snaps); - bool has_dirfrags() { return !dirfrags.empty(); } + size_t get_num_dirfrags() const { return dirfrags.size(); } CDir* get_dirfrag(frag_t fg) { auto pi = dirfrags.find(fg); if (pi != dirfrags.end()) { diff --git a/src/mds/Capability.cc b/src/mds/Capability.cc index ceaa4e069a4..32f780ee202 100644 --- a/src/mds/Capability.cc +++ b/src/mds/Capability.cc @@ -148,6 +148,7 @@ void Capability::revoke_info::generate_test_instances(std::list::item item_revoking_caps; xlist::item item_client_revoking_caps; + elist lock_caches; private: void calc_issued() { _issued = _pending; diff --git a/src/mds/Locker.cc 
b/src/mds/Locker.cc index 60db205f891..66d015e9fcf 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -362,6 +362,7 @@ bool Locker::acquire_locks(MDRequestRef& mdr, } if (!object->is_auth()) { + ceph_assert(!mdr->lock_cache); /* a request that carries a lock cache must not reach a non-auth object here */ if (object->is_ambiguous_auth()) { // wait dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl; @@ -376,6 +377,20 @@ bool Locker::acquire_locks(MDRequestRef& mdr, } int err = 0; if (!object->can_auth_pin(&err)) { + if (mdr->lock_cache) { + CDir *dir; /* assigned by one of the two branches below; any other object type asserts */ + if (CInode *in = dynamic_cast(object)) { + dir = in->get_projected_parent_dir(); + } else if (CDentry *dn = dynamic_cast(object)) { + dir = dn->get_dir(); + } else { + ceph_assert(0 == "unknown type of lock parent"); + } + ceph_assert(dir->get_inode() == mdr->lock_cache->get_dir_inode()); /* the object must live in the directory the lock cache was created for */ + /* forcibly auth pin if lock cache is used */ + continue; + } + // wait drop_locks(mdr.get()); mdr->drop_local_auth_pins(); @@ -640,6 +655,13 @@ void Locker::_drop_locks(MutationImpl *mut, set *pneed_issue, } } + if (drop_rdlocks) { /* the lock cache reference is released only when rdlocks are being dropped as well */ + if (mut->lock_cache) { + put_lock_cache(mut->lock_cache); + mut->lock_cache = nullptr; + } + } + for (set::iterator p = slaves.begin(); p != slaves.end(); ++p) { if (!mds->is_cluster_degraded() || mds->mdsmap->get_state(*p) >= MDSMap::STATE_REJOIN) { @@ -742,6 +764,169 @@ void Locker::drop_locks_for_fragment_unfreeze(MutationImpl *mut) issue_caps_set(need_issue); } +class C_MDL_DropCache : public LockerContext { /* Context that tears down a lock cache: finish() drops its locks, calls cleanup() and deletes it. */ + MDLockCache *lock_cache; +public: + C_MDL_DropCache(Locker *l, MDLockCache *lc) : + LockerContext(l), lock_cache(lc) { } + void finish(int r) override { + locker->drop_locks(lock_cache); + lock_cache->cleanup(); + delete lock_cache; + } +}; + /* Release one reference on lock_cache. When the count reaches zero the cache must already be marked invalidating, and a C_MDL_DropCache is queued through mds->queue_waiter() to destroy it. */ +void Locker::put_lock_cache(MDLockCache* lock_cache) +{ + ceph_assert(lock_cache->ref > 0); + if (--lock_cache->ref > 0) + return; + + ceph_assert(lock_cache->invalidating); + mds->queue_waiter(new C_MDL_DropCache(this, lock_cache)); +} + /* Mark lock_cache as invalidating, unlink its item_cap_lock_cache list item and drop one reference. Asserts that the cache is still on a list and has not been invalidated before. */ +void Locker::invalidate_lock_cache(MDLockCache *lock_cache) +{ + 
ceph_assert(lock_cache->item_cap_lock_cache.is_on_list()); + ceph_assert(!lock_cache->invalidating); + lock_cache->invalidating = true; + // XXX check issued caps + lock_cache->item_cap_lock_cache.remove_myself(); + put_lock_cache(lock_cache); /* drops one reference; put_lock_cache() queues the teardown once the count reaches zero */ +} + /* Build a lock cache for mdr's client and opcode on directory inode diri, taking over pins and locks that mdr holds on diri and its ancestors. Returns without doing anything if mdr already has a lock cache or if any of the checks below fails. */ +void Locker::create_lock_cache(MDRequestRef& mdr, CInode *diri) +{ + if (mdr->lock_cache) + return; + + client_t client = mdr->get_client(); + int opcode = mdr->client_request->get_op(); + dout(10) << "create_lock_cache for client." << client << "/" << ceph_mds_op_name(opcode)<< " on " << *diri << dendl; + + if (!diri->is_auth()) { + dout(10) << " dir inode is not auth, noop" << dendl; + return; + } + + if (mdr->has_more() && !mdr->more()->slaves.empty()) { + dout(10) << " there are slaves requests for " << *mdr << ", noop" << dendl; + return; + } + + Capability *cap = diri->get_client_cap(client); + if (!cap) { + dout(10) << " there is no cap for client." << client << ", noop" << dendl; + return; + } + /* collect every ancestor inode of diri by following projected parent dentries until there is none */ + set ancestors; + for (CInode *in = diri; ; ) { + CDentry *pdn = in->get_projected_parent_dn(); + if (!pdn) + break; + // ancestors.insert(pdn); + in = pdn->get_dir()->get_inode(); + ancestors.insert(in); + } + /* give up if an object that mdr auth-pinned (diri or one of its ancestors) can no longer be auth-pinned */ + for (auto& p : mdr->object_states) { + if (p.first != diri && !ancestors.count(p.first)) + continue; + auto& stat = p.second; + if (stat.auth_pinned && !p.first->can_auth_pin()) { + dout(10) << " can't auth_pin(freezing?) lock parent " << *p.first << ", noop" << dendl; + return; + } + } + /* every dirfrag of diri must be auth and auth-pinnable */ + std::vector dfv; + dfv.reserve(diri->get_num_dirfrags()); + + diri->get_dirfrags(dfv); + for (auto dir : dfv) { + if (!dir->is_auth() || !dir->can_auth_pin()) { + dout(10) << " can't auth_pin(!auth|freezing?) 
dirfrag " << *dir << ", noop" << dendl; + return; + } + } + /* every lock mdr holds on diri or one of its ancestors must be stable */ + for (auto& p : mdr->locks) { + MDSCacheObject *obj = p.lock->get_parent(); + if (obj != diri && !ancestors.count(obj)) + continue; + if (!p.lock->is_stable()) { + dout(10) << " unstable " << *p.lock << " on " << *obj << ", noop" << dendl; + return; + } + } + /* all checks passed: create the cache (its constructor links it onto cap->lock_caches) */ + auto lock_cache = new MDLockCache(cap, opcode); + + // prevent subtree migration + for (auto dir : dfv) + lock_cache->auth_pin(dir); + /* mirror mdr's pins on diri and its ancestors: auth_pin what mdr auth-pinned, plain pin the rest */ + for (auto& p : mdr->object_states) { + if (p.first != diri && !ancestors.count(p.first)) + continue; + auto& stat = p.second; + if (stat.auth_pinned) + lock_cache->auth_pin(p.first); + else + lock_cache->pin(p.first); + } + /* move rdlocks on diri and its ancestors, and wrlocks on diri itself, from mdr into the cache; wrlocks on ancestors stay with mdr */ + for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) { + MDSCacheObject *obj = it->lock->get_parent(); + if (obj != diri && !ancestors.count(obj)) { + ++it; + continue; + } + unsigned lock_flag = 0; + if (it->is_wrlock()) { + // skip wrlocks that were added by MDCache::predirty_journal_parent() + if (obj == diri) + lock_flag = MutationImpl::LockOp::WRLOCK; + } else { + ceph_assert(it->is_rdlock()); + lock_flag = MutationImpl::LockOp::RDLOCK; + } + if (lock_flag) { + lock_cache->emplace_lock(it->lock, lock_flag); + mdr->locks.erase(it++); + } else { + ++it; + } + } + /* take an additional reference on behalf of mdr (MDLockCache::ref starts at 1) */ + lock_cache->ref++; + mdr->lock_cache = lock_cache; +} + /* Attach to mdr a lock cache from the lock_caches list of the client's cap on diri whose opcode equals the request's opcode, taking a reference on it. Returns true if mdr has a lock cache on return, false if the client has no cap on diri or no cache matches. */ +bool Locker::find_and_attach_lock_cache(MDRequestRef& mdr, CInode *diri) +{ + if (mdr->lock_cache) + return true; + + Capability *cap = diri->get_client_cap(mdr->get_client()); + if (!cap) + return false; + + int opcode = mdr->client_request->get_op(); + for (auto p = cap->lock_caches.begin(); !p.end(); ++p) { + MDLockCache *lock_cache = *p; + if (lock_cache->opcode == opcode) { + dout(10) << "found lock cache for " << ceph_mds_op_name(opcode) << " on " << *diri << dendl; + mdr->lock_cache = lock_cache; + mdr->lock_cache->ref++; + return true; + } + } + return false; +} + // generics void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, 
MDSContext::vec *pfinishers) @@ -3599,6 +3784,11 @@ void Locker::remove_client_cap(CInode *in, Capability *cap, bool kill) if (!in->client_need_snapflush.empty()) _do_null_snapflush(in, client); + while (!cap->lock_caches.empty()) { /* invalidate every lock cache of this cap; invalidate_lock_cache() unlinks the cache's list item, so the loop terminates */ + MDLockCache* lock_cache = cap->lock_caches.front(); + invalidate_lock_cache(lock_cache); + } + bool notable = cap->is_notable(); in->remove_client_cap(client); if (!notable) diff --git a/src/mds/Locker.h b/src/mds/Locker.h index 5fbf124db9b..8735d92cee8 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -67,6 +67,11 @@ public: void drop_rdlocks_for_early_reply(MutationImpl *mut); void drop_locks_for_fragment_unfreeze(MutationImpl *mut); + void create_lock_cache(MDRequestRef& mdr, CInode *diri); /* build a lock cache from the pins and locks mdr holds on diri and its ancestors */ + bool find_and_attach_lock_cache(MDRequestRef& mdr, CInode *diri); /* true if mdr has a lock cache on return */ + void invalidate_lock_cache(MDLockCache *lock_cache); /* mark invalidating, unlink from its list, drop one reference */ + void put_lock_cache(MDLockCache* lock_cache); /* drop one reference; the cache is destroyed after the last one */ + void eval_gather(SimpleLock *lock, bool first=false, bool *need_issue=0, MDSContext::vec *pfinishers=0); void eval(SimpleLock *lock, bool *need_issue); void eval_any(SimpleLock *lock, bool *need_issue, MDSContext::vec *pfinishers=0, bool first=false) { diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 65db633c723..c6169090297 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -5969,7 +5969,7 @@ void MDCache::opened_undef_inode(CInode *in) { if (in->is_dir()) { // FIXME: re-hash dentries if necessary ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash); - if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) { + if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) { CDir *dir = in->get_dirfrag(frag_t()); ceph_assert(dir); rejoin_undef_dirfrags.erase(dir); diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc index b14d6983758..5d59475b7c0 100644 --- a/src/mds/Mutation.cc +++ b/src/mds/Mutation.cc @@ -82,6 +82,24 @@ void MutationImpl::finish_locking(SimpleLock *lock) locking_target_mds = -1; } +bool 
MutationImpl::is_rdlocked(SimpleLock *lock) const { /* true if lock is rdlocked in this mutation's own lock set or, failing that, in its attached lock cache */ + auto it = locks.find(lock); + if (it != locks.end() && it->is_rdlock()) + return true; + if (lock_cache) + return static_cast(lock_cache)->is_rdlocked(lock); + return false; +} + /* true if lock is wrlocked in this mutation's own lock set or, failing that, in its attached lock cache */ +bool MutationImpl::is_wrlocked(SimpleLock *lock) const { + auto it = locks.find(lock); + if (it != locks.end() && it->is_wrlock()) + return true; + if (lock_cache) + return static_cast(lock_cache)->is_wrlocked(lock); + return false; +} + void MutationImpl::LockOpVec::erase_rdlock(SimpleLock* lock) { for (int i = size() - 1; i >= 0; --i) { diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index d8227e0e1bb..1deb9630008 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -31,13 +31,13 @@ #include "messages/MClientReply.h" class LogSegment; -class Capability; class CInode; class CDir; class CDentry; class Session; class ScatterLock; struct sr_t; +struct MDLockCache; struct MutationImpl : public TrackedOp { metareqid_t reqid; @@ -137,23 +137,19 @@ public: using lock_iterator = lock_set::iterator; lock_set locks; // full ordering + MDLockCache* lock_cache = nullptr; /* lock cache attached to this mutation, or nullptr; consulted by is_rdlocked() and is_wrlocked() */ + lock_iterator emplace_lock(SimpleLock *l, unsigned f=0, mds_rank_t t=MDS_RANK_NONE) { last_locked = l; return locks.emplace(l, f, t).first; } - bool is_rdlocked(SimpleLock *lock) const { - auto it = locks.find(lock); - return it != locks.end() && it->is_rdlock(); - } + bool is_rdlocked(SimpleLock *lock) const; + bool is_wrlocked(SimpleLock *lock) const; bool is_xlocked(SimpleLock *lock) const { auto it = locks.find(lock); return it != locks.end() && it->is_xlock(); } - bool is_wrlocked(SimpleLock *lock) const { - auto it = locks.find(lock); - return it != locks.end() && it->is_wrlock(); - } bool is_remote_wrlocked(SimpleLock *lock) const { auto it = locks.find(lock); return it != locks.end() && it->is_remote_wrlock(); @@ -198,7 +194,8 @@ public: reqid(ri), attempt(att), slave_to_mds(slave_to) { } ~MutationImpl() override { - ceph_assert(locking == NULL); + 
ceph_assert(!locking); + ceph_assert(!lock_cache); /* the lock cache must have been detached before the mutation is destroyed */ ceph_assert(num_pins == 0); ceph_assert(num_auth_pins == 0); } @@ -228,8 +225,8 @@ public: } // pin items in cache - void pin(MDSCacheObject *o); - void unpin(MDSCacheObject *o); + void pin(MDSCacheObject *object); + void unpin(MDSCacheObject *object); void set_stickydirs(CInode *in); void put_stickydirs(); void drop_pins(); @@ -487,5 +484,22 @@ struct MDSlaveUpdate { } }; +struct MDLockCache : public MutationImpl { /* Pins and locks kept for one client cap and one request opcode; the constructor links the cache onto the cap's lock_caches list. */ + CInode *diri; /* inode that owns the cap, taken from cap->get_inode() */ + Capability *client_cap; + int opcode; /* request opcode this cache serves; matched in Locker::find_and_attach_lock_cache() */ + + elist::item item_cap_lock_cache; /* list item on client_cap->lock_caches */ + + int ref = 1; /* reference count; Locker::put_lock_cache() queues destruction when it reaches zero */ + bool invalidating = false; /* set by Locker::invalidate_lock_cache() */ + + MDLockCache(Capability *cap, int op) : + MutationImpl(), diri(cap->get_inode()), client_cap(cap), opcode(op) { + client_cap->lock_caches.push_back(&item_cap_lock_cache); + } + + CInode *get_dir_inode() { return diri; } +}; #endif diff --git a/src/mds/journal.cc b/src/mds/journal.cc index f60e0e7dfff..e62c0e0a353 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -531,7 +531,7 @@ void EMetaBlob::fullbit::update_inode(MDSRank *mds, CInode *in) << dirfragtree << " on " << *in << dendl; in->dirfragtree = dirfragtree; in->force_dirfrags(); - if (in->has_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) { + if (in->get_num_dirfrags() && in->authority() == CDIR_AUTH_UNDEF) { auto&& ls = in->get_nested_dirfrags(); for (const auto& dir : ls) { if (dir->get_num_any() == 0 && -- 2.39.5