From 2ab89287a7fbffb738fe1e5761e62cbdb8792271 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Tue, 20 Feb 2024 17:08:32 -0500 Subject: [PATCH] mds: use 128 bits for waiters on MDSCacheObject Adding a new inode lock will overflow inode wait bits into the MDSCacheObject wait bits. Make space for the quiescelock. This includes a minor refactor to no longer attempt scoping the set of masks we test in MDSCacheObject::waiting when calling MDSCacheObject::is_waiter_for. This optimization wasn't worth the overhead and would be awkard to keep as std::bitset cannot be used as a key for a std::multimap (easily). Instead, we use the sequence number as a key which helps us to avoid allocating another map whenever we call MDSCacheObject::take_waiting. Signed-off-by: Patrick Donnelly (cherry picked from commit 84b33ea9fe6268adfaca1e17b282e68788e38bf4) --- src/mds/CDir.h | 3 +- src/mds/CInode.h | 3 +- src/mds/MDSCacheObject.cc | 53 +++++++++++------------------------ src/mds/MDSCacheObject.h | 59 ++++++++++++++++++++++++++++----------- src/mds/SimpleLock.cc | 24 ++++++++-------- src/mds/SimpleLock.h | 25 +++++++++++++---- 6 files changed, 95 insertions(+), 72 deletions(-) diff --git a/src/mds/CDir.h b/src/mds/CDir.h index 234da01b10c..b70725f714e 100644 --- a/src/mds/CDir.h +++ b/src/mds/CDir.h @@ -182,10 +182,11 @@ public: static const uint64_t WAIT_COMPLETE = (1<<1); // wait for complete dir contents static const uint64_t WAIT_FROZEN = (1<<2); // auth pins removed static const uint64_t WAIT_CREATED = (1<<3); // new dirfrag is logged + static const uint64_t WAIT_BITS = 4; static const int WAIT_DNLOCK_OFFSET = 4; - static const uint64_t WAIT_ANY_MASK = (uint64_t)(-1); + static const uint64_t WAIT_ANY_MASK = ((1ul << WAIT_BITS) - 1); static const uint64_t WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); // -- dump flags -- diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 7d91becb97b..1030b746623 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -398,8 +398,9 @@ class CInode : public MDSCacheObject, public InodeStoreBase, public Counterdump_string("state", "rejoinundef"); } -bool MDSCacheObject::is_waiter_for(uint64_t mask, uint64_t min) { - if (!min) { - min = mask; - while (min & (min-1)) // if more than one bit is set - min &= min-1; // clear LSB - } - for (auto p = waiting.lower_bound(min); p != waiting.end(); ++p) { - if (p->first & mask) return true; - if (p->first > mask) return false; +bool MDSCacheObject::is_waiter_for(waitmask_t mask) { + for ([[maybe_unused]] auto& [seq, waiter] : waiting) { + if ((waiter.mask & mask).any()) { + return true; + } } return false; } -void MDSCacheObject::take_waiting(uint64_t mask, MDSContext::vec& ls) { - if (waiting.empty()) return; - - // process ordered waiters in the same order that they were added. - std::map ordered_waiters; - +void MDSCacheObject::take_waiting(waitmask_t mask, MDSContext::vec& ls) { + if (waiting.empty()) { + return; + } for (auto it = waiting.begin(); it != waiting.end(); ) { - if (it->first & mask) { - if (it->second.first > 0) { - ordered_waiters.insert(it->second); - } else { - ls.push_back(it->second.second); - } -// pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this)) -// << "take_waiting mask " << hex << mask << dec << " took " << it->second -// << " tag " << hex << it->first << dec -// << " on " << *this -// << dendl; - waiting.erase(it++); + auto& waiter = it->second; + if ((waiter.mask & mask).any()) { + ls.push_back(waiter.c); + it = waiting.erase(it); } else { -// pdout(10,g_conf()->debug_mds) << "take_waiting mask " << hex << mask << dec << " SKIPPING " << it->second -// << " tag " << hex << it->first << dec -// << " on " << *this -// << dendl; - ++it; + ++it; } } - for (auto it = ordered_waiters.begin(); it != ordered_waiters.end(); ++it) { - ls.push_back(it->second); - } if (waiting.empty()) { put(PIN_WAITER); - waiting.clear(); + waiting.clear(); // free internal map } } diff --git a/src/mds/MDSCacheObject.h b/src/mds/MDSCacheObject.h index 8a319b4404d..6fa40ea7f8d 100644 --- a/src/mds/MDSCacheObject.h +++ b/src/mds/MDSCacheObject.h @@ -58,6 +58,16 @@ class MDSCacheObject { public: typedef mempool::mds_co::compact_map replica_map_type; + /* Mask for waiters. It is 128 bits to accomodate lock waiters. Its layout: + * + * Most-significant Least significant + * [ SimpleLock 64 bits | MDSCacheObject 64 bits ] + * + * It is organized this way for simplicity not for compactness and because + * the total bits will be >64 bits. + */ + using waitmask_t = std::bitset<128>; + struct ptr_lt { bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const { return l->is_lt(r); @@ -261,27 +271,39 @@ class MDSCacheObject { unsigned get_replica_nonce() const { return replica_nonce; } void set_replica_nonce(unsigned n) { replica_nonce = n; } - bool is_waiter_for(uint64_t mask, uint64_t min=0); + /* A uint64_t mask is accepted to accomodate existing code that expects the + * mask to actually be a 64 bit integer. + */ + bool is_waiter_for(uint64_t mask) { + return is_waiter_for(waitmask_t(mask)); + } + bool is_waiter_for(waitmask_t mask); + virtual void add_waiter(uint64_t mask, MDSContext *c) { + add_waiter(waitmask_t(mask), c); + } + void add_waiter(waitmask_t mask, MDSContext *c) { if (waiting.empty()) get(PIN_WAITER); - uint64_t seq = 0; - if (mask & WAIT_ORDERED) { + waiter_seq_t seq = 0; + if ((mask & waitmask_t(WAIT_ORDERED)).any()) { seq = ++last_wait_seq; - mask &= ~WAIT_ORDERED; + mask &= ~waitmask_t(WAIT_ORDERED); + } else { + /* always at the front */ + seq = 0; } - waiting.insert(std::pair >( - mask, - std::pair(seq, c))); -// pdout(10,g_conf()->debug_mds) << (mdsco_db_line_prefix(this)) -// << "add_waiter " << hex << mask << dec << " " << c -// << " on " << *this -// << dendl; - + waiting.insert(std::pair(seq, waiter(mask, c))); + } + virtual void take_waiting(uint64_t mask, MDSContext::vec& ls) { + take_waiting(waitmask_t(mask), ls); } - virtual void take_waiting(uint64_t mask, MDSContext::vec& ls); - void finish_waiting(uint64_t mask, int result = 0); + void take_waiting(waitmask_t mask, MDSContext::vec& ls); + void finish_waiting(uint64_t mask, int result = 0) { + finish_waiting(waitmask_t(mask), result); + } + void finish_waiting(waitmask_t mask, int result = 0); // --------------------------------------------- // locking @@ -322,8 +344,13 @@ class MDSCacheObject { // --------------------------------------------- // waiting private: - mempool::mds_co::compact_multimap> waiting; - static uint64_t last_wait_seq; + using waiter_seq_t = uint64_t; + struct waiter { + waitmask_t mask; + MDSContext* c; + }; + mempool::mds_co::compact_multimap waiting; + static waiter_seq_t last_wait_seq; }; inline std::ostream& operator<<(std::ostream& out, const mdsco_db_line_prefix& o) { diff --git a/src/mds/SimpleLock.cc b/src/mds/SimpleLock.cc index b23915f9452..4dea69f2b5d 100644 --- a/src/mds/SimpleLock.cc +++ b/src/mds/SimpleLock.cc @@ -45,18 +45,18 @@ void SimpleLock::dump(ceph::Formatter *f) const { int SimpleLock::get_wait_shift() const { switch (get_type()) { - case CEPH_LOCK_DN: return 8; - case CEPH_LOCK_DVERSION: return 8 + 1*SimpleLock::WAIT_BITS; - case CEPH_LOCK_IAUTH: return 8 + 2*SimpleLock::WAIT_BITS; - case CEPH_LOCK_ILINK: return 8 + 3*SimpleLock::WAIT_BITS; - case CEPH_LOCK_IDFT: return 8 + 4*SimpleLock::WAIT_BITS; - case CEPH_LOCK_IFILE: return 8 + 5*SimpleLock::WAIT_BITS; - case CEPH_LOCK_IVERSION: return 8 + 6*SimpleLock::WAIT_BITS; - case CEPH_LOCK_IXATTR: return 8 + 7*SimpleLock::WAIT_BITS; - case CEPH_LOCK_ISNAP: return 8 + 8*SimpleLock::WAIT_BITS; - case CEPH_LOCK_INEST: return 8 + 9*SimpleLock::WAIT_BITS; - case CEPH_LOCK_IFLOCK: return 8 +10*SimpleLock::WAIT_BITS; - case CEPH_LOCK_IPOLICY: return 8 +11*SimpleLock::WAIT_BITS; + case CEPH_LOCK_DN: return 0; + case CEPH_LOCK_DVERSION: return 1*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IAUTH: return 2*SimpleLock::WAIT_BITS; + case CEPH_LOCK_ILINK: return 3*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IDFT: return 4*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IFILE: return 5*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IVERSION: return 6*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IXATTR: return 7*SimpleLock::WAIT_BITS; + case CEPH_LOCK_ISNAP: return 8*SimpleLock::WAIT_BITS; + case CEPH_LOCK_INEST: return 9*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IFLOCK: return 10*SimpleLock::WAIT_BITS; + case CEPH_LOCK_IPOLICY: return 11*SimpleLock::WAIT_BITS; default: ceph_abort(); } diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 3ef474c19a4..8d3ea32e244 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -200,7 +200,6 @@ public: int get_type() const { return type->type; } const sm_t* get_sm() const { return type->sm; } - int get_wait_shift() const; int get_cap_shift() const; int get_cap_mask() const; @@ -210,17 +209,33 @@ public: void encode_locked_state(ceph::buffer::list& bl) { parent->encode_lock_state(type->type, bl); } + + using waitmask_t = MDSCacheObject::waitmask_t; + static constexpr auto WAIT_ORDERED = waitmask_t(MDSCacheObject::WAIT_ORDERED); + int get_wait_shift() const; + MDSCacheObject::waitmask_t getmask(uint64_t mask) const { + /* See definition of MDSCacheObject::waitmask_t for waiter bits reserved + * for SimpleLock. + */ + static constexpr int simplelock_shift = 64; + auto waitmask = waitmask_t(mask); + int shift = get_wait_shift(); + ceph_assert(shift < 64); + shift += simplelock_shift; + waitmask <<= shift; + return waitmask; + } void finish_waiters(uint64_t mask, int r=0) { - parent->finish_waiting(mask << get_wait_shift(), r); + parent->finish_waiting(getmask(mask), r); } void take_waiting(uint64_t mask, MDSContext::vec& ls) { - parent->take_waiting(mask << get_wait_shift(), ls); + parent->take_waiting(getmask(mask), ls); } void add_waiter(uint64_t mask, MDSContext *c) { - parent->add_waiter((mask << get_wait_shift()) | MDSCacheObject::WAIT_ORDERED, c); + parent->add_waiter(getmask(mask) | WAIT_ORDERED, c); } bool is_waiter_for(uint64_t mask) const { - return parent->is_waiter_for(mask << get_wait_shift()); + return parent->is_waiter_for(getmask(mask)); } bool is_cached() const { -- 2.39.5