From: Yan, Zheng Date: Wed, 18 Oct 2017 12:58:15 +0000 (+0800) Subject: mds: don't rdlock locks in replica object while auth mds is recovering X-Git-Tag: v12.2.3~153^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cb8eff43b1abd8c268df9e57906d677ff4be8d95;p=ceph.git mds: don't rdlock locks in replica object while auth mds is recovering Auth mds may take xlock on the lock and change the object when replaying unsafe requests. To guarantee new requests and replayed unsafe requests (on auth mds) get processed in proper order, we shouldn't rdlock locks in replica object while auth mds of the object is recovering Signed-off-by: "Yan, Zheng" (cherry picked from commit 0afbc0338e1b9f32340eaa74899d8d43ac8608fe) --- diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc index ecd4a7095911..7d2473493158 100644 --- a/src/mds/CDentry.cc +++ b/src/mds/CDentry.cc @@ -402,15 +402,18 @@ void CDentry::decode_replica(bufferlist::iterator& p, bool is_new) inodeno_t rino; unsigned char rdtype; - __s32 ls; ::decode(rino, p); ::decode(rdtype, p); - ::decode(ls, p); + lock.decode_state(p, is_new); + + bool need_recover; + ::decode(need_recover, p); if (is_new) { if (rino) dir->link_remote_inode(this, rino, rdtype); - lock.set_state(ls); + if (need_recover) + lock.mark_need_recover(); } } diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index e9416104ad15..d5c628b699a0 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -244,7 +244,7 @@ public: void clear_new() { state_clear(STATE_NEW); } // -- replication - void encode_replica(mds_rank_t mds, bufferlist& bl) { + void encode_replica(mds_rank_t mds, bufferlist& bl, bool need_recover) { if (!is_replicated()) lock.replicate_relax(); @@ -253,8 +253,8 @@ public: ::encode(first, bl); ::encode(linkage.remote_ino, bl); ::encode(linkage.remote_d_type, bl); - __s32 ls = lock.get_replica_state(); - ::encode(ls, bl); + lock.encode_state_for_replica(bl); + ::encode(need_recover, bl); } void decode_replica(bufferlist::iterator& p, bool is_new); diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 633e6477553d..25931d019d6d 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -3553,7 +3553,7 @@ void CInode::_decode_locks_full(bufferlist::iterator& p) want_loner_cap = loner_cap; // for now, we'll eval() shortly. } -void CInode::_encode_locks_state_for_replica(bufferlist& bl) +void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover) { authlock.encode_state_for_replica(bl); linklock.encode_state_for_replica(bl); @@ -3564,7 +3564,9 @@ void CInode::_encode_locks_state_for_replica(bufferlist& bl) snaplock.encode_state_for_replica(bl); flocklock.encode_state_for_replica(bl); policylock.encode_state_for_replica(bl); + ::encode(need_recover, bl); } + void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep) { authlock.encode_state_for_replica(bl); @@ -3577,6 +3579,7 @@ void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep) flocklock.encode_state_for_replica(bl); policylock.encode_state_for_replica(bl); } + void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new) { authlock.decode_state(p, is_new); @@ -3588,19 +3591,35 @@ void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new) snaplock.decode_state(p, is_new); flocklock.decode_state(p, is_new); policylock.decode_state(p, is_new); + + bool need_recover; + ::decode(need_recover, p); + if (need_recover && is_new) { + // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock + // and change the object when replaying unsafe requests. + authlock.mark_need_recover(); + linklock.mark_need_recover(); + dirfragtreelock.mark_need_recover(); + filelock.mark_need_recover(); + nestlock.mark_need_recover(); + xattrlock.mark_need_recover(); + snaplock.mark_need_recover(); + flocklock.mark_need_recover(); + policylock.mark_need_recover(); + } } void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list& waiters, - list& eval_locks) -{ - authlock.decode_state_rejoin(p, waiters); - linklock.decode_state_rejoin(p, waiters); - dirfragtreelock.decode_state_rejoin(p, waiters); - filelock.decode_state_rejoin(p, waiters); - nestlock.decode_state_rejoin(p, waiters); - xattrlock.decode_state_rejoin(p, waiters); - snaplock.decode_state_rejoin(p, waiters); - flocklock.decode_state_rejoin(p, waiters); - policylock.decode_state_rejoin(p, waiters); + list& eval_locks, bool survivor) +{ + authlock.decode_state_rejoin(p, waiters, survivor); + linklock.decode_state_rejoin(p, waiters, survivor); + dirfragtreelock.decode_state_rejoin(p, waiters, survivor); + filelock.decode_state_rejoin(p, waiters, survivor); + nestlock.decode_state_rejoin(p, waiters, survivor); + xattrlock.decode_state_rejoin(p, waiters, survivor); + snaplock.decode_state_rejoin(p, waiters, survivor); + flocklock.decode_state_rejoin(p, waiters, survivor); + policylock.decode_state_rejoin(p, waiters, survivor); if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked()) eval_locks.push_back(&dirfragtreelock); diff --git a/src/mds/CInode.h b/src/mds/CInode.h index f1b371678c08..464b75cfc2c0 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -772,7 +772,7 @@ public: void encode_store(bufferlist& bl, uint64_t features); void decode_store(bufferlist::iterator& bl); - void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features) { + void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features, bool need_recover) { assert(is_auth()); // relax locks? @@ -783,7 +783,7 @@ public: ::encode(nonce, bl); _encode_base(bl, features); - _encode_locks_state_for_replica(bl); + _encode_locks_state_for_replica(bl, need_recover); } void decode_replica(bufferlist::iterator& p, bool is_new) { __u32 nonce; @@ -811,11 +811,11 @@ public: void _decode_base(bufferlist::iterator& p); void _encode_locks_full(bufferlist& bl); void _decode_locks_full(bufferlist::iterator& p); - void _encode_locks_state_for_replica(bufferlist& bl); + void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover); void _encode_locks_state_for_rejoin(bufferlist& bl, int rep); void _decode_locks_state(bufferlist::iterator& p, bool is_new); void _decode_locks_rejoin(bufferlist::iterator& p, std::list& waiters, - std::list& eval_locks); + std::list& eval_locks, bool survivor); // -- import/export -- void encode_export(bufferlist& bl); diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index ccf42a67f9bb..90be7a08f63d 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -585,33 +585,20 @@ bool Locker::acquire_locks(MDRequestRef& mdr, } } else { assert(mdr->is_master()); - if ((*p)->is_scatterlock()) { - ScatterLock *slock = static_cast(*p); - if (slock->is_rejoin_mix()) { - // If there is a recovering mds who replcated an object when it failed - // and scatterlock in the object was in MIX state, It's possible that - // the recovering mds needs to take wrlock on the scatterlock when it - // replays unsafe requests. So this mds should delay taking rdlock on - // the scatterlock until the recovering mds finishes replaying unsafe. - // Otherwise unsafe requests may get replayed after current request. - // - // For example: - // The recovering mds is auth mds of a dirfrag, this mds is auth mds - // of correspinding inode. when 'rm -rf' the direcotry, this mds should - // delay the rmdir request until the recovering mds has replayed unlink - // requests. - if (mds->is_cluster_degraded()) { - if (!mdr->is_replay()) { - drop_locks(mdr.get()); - mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr)); - dout(10) << " rejoin mix scatterlock " << *slock << " " << *(*p)->get_parent() - << ", waiting for cluster recovered" << dendl; - marker.message = "rejoin mix scatterlock, waiting for cluster recovered"; - return false; - } - } else { - slock->clear_rejoin_mix(); + if ((*p)->needs_recover()) { + if (mds->is_cluster_degraded()) { + if (!mdr->is_replay()) { + // see comments in SimpleLock::set_state_rejoin() and + // ScatterLock::encode_state_for_rejoin() + drop_locks(mdr.get()); + mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr)); + dout(10) << " rejoin recovering " << **p << " " << *(*p)->get_parent() + << ", waiting for cluster recovered" << dendl; + marker.message = "rejoin recovering lock, waiting for cluster recovered"; + return false; } + } else { + (*p)->clear_need_recover(); } } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index b40833fd1cab..074a86bd844e 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -4898,6 +4898,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl; mds_rank_t from = mds_rank_t(ack->get_source().num()); + assert(mds->get_state() >= MDSMap::STATE_REJOIN); + bool survivor = !mds->is_rejoin(); + // for sending cache expire message set isolated_inodes; set refragged_inodes; @@ -5007,7 +5010,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) } dn->set_replica_nonce(q->second.nonce); - dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters); + dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor); dn->state_clear(CDentry::STATE_REJOINING); dout(10) << " got " << *dn << dendl; } @@ -5070,7 +5073,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) assert(in); in->set_replica_nonce(nonce); bufferlist::iterator q = lockbl.begin(); - in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks); + in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor); in->state_clear(CInode::STATE_REJOINING); dout(10) << " got inode locks " << *in << dendl; } @@ -5114,7 +5117,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) // done? assert(rejoin_ack_gather.count(from)); rejoin_ack_gather.erase(from); - if (mds->is_rejoin()) { + if (!survivor) { if (rejoin_gather.empty()) { // eval unstable scatter locks after all wrlocks are rejoined. @@ -10182,6 +10185,29 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) // ---------------------------- // REPLICAS + +void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl) +{ + dirfrag_t df = dir->dirfrag(); + ::encode(df, bl); + dir->encode_replica(to, bl); +} + +void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) +{ + ::encode(dn->name, bl); + ::encode(dn->last, bl); + dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE); +} + +void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl, + uint64_t features) +{ + ::encode(in->inode.ino, bl); // bleh, minor assymetry here + ::encode(in->last, bl); + in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE); +} + CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list& finished) { diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 61a170bf6fd5..099445233aef 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -1035,22 +1035,10 @@ protected: friend class C_MDC_Join; public: - void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl) { - dirfrag_t df = dir->dirfrag(); - ::encode(df, bl); - dir->encode_replica(to, bl); - } - void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) { - ::encode(dn->name, bl); - ::encode(dn->last, bl); - dn->encode_replica(to, bl); - } + void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl); + void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl); void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl, - uint64_t features) { - ::encode(in->inode.ino, bl); // bleh, minor assymetry here - ::encode(in->last, bl); - in->encode_replica(to, bl, features); - } + uint64_t features); CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list& finished); CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir, list& finished); diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h index 872428229aad..ab963986bbc2 100644 --- a/src/mds/ScatterLock.h +++ b/src/mds/ScatterLock.h @@ -43,7 +43,6 @@ class ScatterLock : public SimpleLock { DIRTY = 1 << 10, FLUSHING = 1 << 11, FLUSHED = 1 << 12, - REJOIN_MIX = 1 << 13, }; public: @@ -125,9 +124,6 @@ public: bool is_dirty_or_flushing() const { return is_dirty() || is_flushing(); } - bool is_rejoin_mix() const { - return state_flags & REJOIN_MIX; - } void mark_dirty() { if (!is_dirty()) { @@ -159,9 +155,6 @@ public: void clear_flushed() override { state_flags &= ~FLUSHED; } - void clear_rejoin_mix() { - state_flags &= ~REJOIN_MIX; - } void infer_state_from_strong_rejoin(int rstate, bool locktoo) { if (rstate == LOCK_MIX || @@ -182,14 +175,26 @@ public: s = LOCK_MIX_LOCK; } + // If there is a recovering mds who replcated an object when it failed + // and scatterlock in the object was in MIX state, It's possible that + // the recovering mds needs to take wrlock on the scatterlock when it + // replays unsafe requests. So this mds should delay taking rdlock on + // the scatterlock until the recovering mds finishes replaying unsafe. + // Otherwise unsafe requests may get replayed after current request. + // + // For example: + // The recovering mds is auth mds of a dirfrag, this mds is auth mds + // of correspinding inode. when 'rm -rf' the direcotry, this mds should + // delay the rmdir request until the recovering mds has replayed unlink + // requests. if (s == LOCK_MIX || s == LOCK_MIX_LOCK || s == LOCK_MIX_SYNC) - state_flags |= REJOIN_MIX; + mark_need_recover(); ::encode(s, bl); } - void decode_state_rejoin(bufferlist::iterator& p, list& waiters) { - SimpleLock::decode_state_rejoin(p, waiters); + void decode_state_rejoin(bufferlist::iterator& p, list& waiters, bool survivor) { + SimpleLock::decode_state_rejoin(p, waiters, survivor); if (is_flushing()) { set_dirty(); clear_flushing(); diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index f97714310b5b..6e60d2a7015f 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -168,7 +168,8 @@ protected: __s16 state_flags; enum { - LEASED = 1 << 0, + LEASED = 1 << 0, + NEED_RECOVER = 1 << 1, }; private: @@ -325,13 +326,17 @@ public: //assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. return s; } - void set_state_rejoin(int s, list& waiters) { - if (!is_stable() && get_parent()->is_auth()) { - state = s; - get_parent()->auth_unpin(this); - } else { - state = s; - } + void set_state_rejoin(int s, list& waiters, bool survivor) { + assert(!get_parent()->is_auth()); + + // If lock in the replica object was not in SYNC state when auth mds of the object failed. + // Auth mds of the object may take xlock on the lock and change the object when replaying + // unsafe requests. + if (!survivor || state != LOCK_SYNC) + mark_need_recover(); + + state = s; + if (is_stable()) take_waiting(SimpleLock::WAIT_ALL, waiters); } @@ -545,6 +550,16 @@ public: return is_xlocked() || is_rdlocked() || is_wrlocked() || is_leased(); } + bool needs_recover() const { + return state_flags & NEED_RECOVER; + } + void mark_need_recover() { + state_flags |= NEED_RECOVER; + } + void clear_need_recover() { + state_flags &= ~NEED_RECOVER; + } + // encode/decode void encode(bufferlist& bl) const { ENCODE_START(2, 2, bl); @@ -574,10 +589,10 @@ public: if (is_new) state = s; } - void decode_state_rejoin(bufferlist::iterator& p, list& waiters) { + void decode_state_rejoin(bufferlist::iterator& p, list& waiters, bool survivor) { __s16 s; ::decode(s, p); - set_state_rejoin(s, waiters); + set_state_rejoin(s, waiters, survivor); }