From: Yan, Zheng Date: Sun, 19 Jan 2014 04:13:06 +0000 (+0800) Subject: mds: fix scattered wrlock rejoin X-Git-Tag: v0.78~165^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cc77ef2d52666b62152024ae9b8b4ac98cb54950;p=ceph.git mds: fix scattered wrlock rejoin If an unstable scatter lock is encountered when handling a weak cache rejoin, don't remove the recovering MDS from the scatter lock's gather list. The reason is that the recovering MDS may hold rejoined wrlocks on the scatter lock. (Rejoined wrlocks were created when handling strong cache rejoins from survivor MDS.) When composing the cache rejoin ack, if the recovering MDS is in the lock's gather list, set the lock state of the recovering MDS to a compatible unstable state. Signed-off-by: Yan, Zheng --- diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 2e2529851393..1ce2ee80b7c4 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -3158,6 +3158,18 @@ void CInode::_encode_locks_state_for_replica(bufferlist& bl) flocklock.encode_state_for_replica(bl); policylock.encode_state_for_replica(bl); } +void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep) +{ + authlock.encode_state_for_replica(bl); + linklock.encode_state_for_replica(bl); + dirfragtreelock.encode_state_for_rejoin(bl, rep); + filelock.encode_state_for_rejoin(bl, rep); + nestlock.encode_state_for_rejoin(bl, rep); + xattrlock.encode_state_for_replica(bl); + snaplock.encode_state_for_replica(bl); + flocklock.encode_state_for_replica(bl); + policylock.encode_state_for_replica(bl); +} void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new) { authlock.decode_state(p, is_new); @@ -3170,7 +3182,8 @@ void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new) flocklock.decode_state(p, is_new); policylock.decode_state(p, is_new); } -void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list& waiters) +void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list& waiters, + list& 
eval_locks) { authlock.decode_state_rejoin(p, waiters); linklock.decode_state_rejoin(p, waiters); @@ -3181,6 +3194,13 @@ void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list& waite snaplock.decode_state_rejoin(p, waiters); flocklock.decode_state_rejoin(p, waiters); policylock.decode_state_rejoin(p, waiters); + + if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked()) + eval_locks.push_back(&dirfragtreelock); + if (!filelock.is_stable() && !filelock.is_wrlocked()) + eval_locks.push_back(&filelock); + if (!nestlock.is_stable() && !nestlock.is_wrlocked()) + eval_locks.push_back(&nestlock); } diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 28400aa3744d..3977859821de 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -590,8 +590,10 @@ public: void _encode_locks_full(bufferlist& bl); void _decode_locks_full(bufferlist::iterator& p); void _encode_locks_state_for_replica(bufferlist& bl); + void _encode_locks_state_for_rejoin(bufferlist& bl, int rep); void _decode_locks_state(bufferlist::iterator& p, bool is_new); - void _decode_locks_rejoin(bufferlist::iterator& p, list& waiters); + void _decode_locks_rejoin(bufferlist::iterator& p, list& waiters, + list& eval_locks); // -- import/export -- void encode_export(bufferlist& bl); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 42575679d5b8..6d604d7df1ce 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -4084,7 +4084,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) assert(in); if (survivor && in->is_replica(from)) - inode_remove_replica(in, from, gather_locks); + inode_remove_replica(in, from, true, gather_locks); unsigned inonce = in->add_replica(from); dout(10) << " have " << *in << dendl; @@ -4095,7 +4095,9 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) if (ack) { acked_inodes.insert(in->vino()); ack->add_inode_base(in); - ack->add_inode_locks(in, inonce); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, from); + 
ack->add_inode_locks(in, inonce, bl); } } } @@ -4107,14 +4109,16 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) CInode *in = get_inode(*p); assert(in); // hmm fixme wrt stray? if (survivor && in->is_replica(from)) - inode_remove_replica(in, from, gather_locks); + inode_remove_replica(in, from, true, gather_locks); unsigned inonce = in->add_replica(from); dout(10) << " have base " << *in << dendl; if (ack) { acked_inodes.insert(in->vino()); ack->add_inode_base(in); - ack->add_inode_locks(in, inonce); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, from); + ack->add_inode_locks(in, inonce, bl); } } @@ -4300,7 +4304,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, if (in->is_auth() && in->is_replica(from) && (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) { - inode_remove_replica(in, from, gather_locks); + inode_remove_replica(in, from, false, gather_locks); dout(10) << " rem " << *in << dendl; } @@ -4820,7 +4824,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) assert(in); in->set_replica_nonce(nonce); bufferlist::iterator q = lockbl.begin(); - in->_decode_locks_rejoin(q, rejoin_waiters); + in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks); in->state_clear(CInode::STATE_REJOINING); dout(10) << " got inode locks " << *in << dendl; } @@ -4863,6 +4867,17 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) assert(rejoin_ack_gather.count(from)); rejoin_ack_gather.erase(from); if (mds->is_rejoin()) { + + if (rejoin_gather.empty()) { + // eval unstable scatter locks after all wrlocks are rejoined. + while (!rejoin_eval_locks.empty()) { + SimpleLock *lock = rejoin_eval_locks.front(); + rejoin_eval_locks.pop_front(); + if (!lock->is_stable()) + mds->locker->eval_gather(lock); + } + } + if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. 
rejoin_ack_gather.empty()) { // finally, kickstart past snap parent opens @@ -5728,7 +5743,9 @@ void MDCache::rejoin_send_acks() r != in->replicas_end(); ++r) { ack[r->first]->add_inode_base(in); - ack[r->first]->add_inode_locks(in, ++r->second); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, r->first); + ack[r->first]->add_inode_locks(in, ++r->second, bl); } // subdirs in this subtree? @@ -5743,14 +5760,18 @@ void MDCache::rejoin_send_acks() r != root->replicas_end(); ++r) { ack[r->first]->add_inode_base(root); - ack[r->first]->add_inode_locks(root, ++r->second); + bufferlist bl; + root->_encode_locks_state_for_rejoin(bl, r->first); + ack[r->first]->add_inode_locks(root, ++r->second, bl); } if (myin) for (map::iterator r = myin->replicas_begin(); r != myin->replicas_end(); ++r) { ack[r->first]->add_inode_base(myin); - ack[r->first]->add_inode_locks(myin, ++r->second); + bufferlist bl; + myin->_encode_locks_state_for_rejoin(bl, r->first); + ack[r->first]->add_inode_locks(myin, ++r->second, bl); } // include inode base for any inodes whose scatterlocks may have updated @@ -6795,7 +6816,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m) // remove from our cached_by dout(7) << " inode expire on " << *in << " from mds." << from << " cached_by was " << in->get_replicas() << dendl; - inode_remove_replica(in, from, gather_locks); + inode_remove_replica(in, from, false, gather_locks); } else { // this is an old nonce, ignore expire. 
@@ -6922,7 +6943,8 @@ void MDCache::discard_delayed_expire(CDir *dir) delayed_expire.erase(dir); } -void MDCache::inode_remove_replica(CInode *in, int from, set& gather_locks) +void MDCache::inode_remove_replica(CInode *in, int from, bool rejoin, + set& gather_locks) { in->remove_replica(from); in->mds_caps_wanted.erase(from); @@ -6931,14 +6953,17 @@ void MDCache::inode_remove_replica(CInode *in, int from, set& gath // fix lock if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock); if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock); - if (in->dirfragtreelock.remove_replica(from)) gather_locks.insert(&in->dirfragtreelock); - if (in->filelock.remove_replica(from)) gather_locks.insert(&in->filelock); if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock); if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock); - - if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock); if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock); if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock); + + // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state. + // Don't remove the recovering mds from lock's gathering list because + // it may hold rejoined wrlocks. 
+ if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock); + if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock); + if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock); } void MDCache::dentry_remove_replica(CDentry *dn, int from, set& gather_locks) diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 060a8044c194..3aab1fa77a2d 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -428,6 +428,7 @@ protected: map > rejoin_unlinked_inodes; vector rejoin_recover_q, rejoin_check_q; + list rejoin_eval_locks; list rejoin_waiters; void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); @@ -661,7 +662,8 @@ public: } protected: - void inode_remove_replica(CInode *in, int rep, set& gather_locks); + void inode_remove_replica(CInode *in, int rep, bool rejoin, + set& gather_locks); void dentry_remove_replica(CDentry *dn, int rep, set& gather_locks); void rename_file(CDentry *srcdn, CDentry *destdn); diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h index 42745ce9fdb1..a85caa824ace 100644 --- a/src/mds/ScatterLock.h +++ b/src/mds/ScatterLock.h @@ -189,6 +189,29 @@ public: state = LOCK_LOCK; } + void encode_state_for_rejoin(bufferlist& bl, int rep) const { + __s16 s = get_replica_state(); + if (is_gathering(rep)) { + // the recovering mds may hold rejoined wrlocks + if (state == LOCK_MIX_SYNC) + s = LOCK_MIX_SYNC; + else + s = LOCK_MIX_LOCK; + } + ::encode(s, bl); + } + + bool remove_replica(int from, bool rejoin) { + if (rejoin && + (state == LOCK_MIX || + state == LOCK_MIX_SYNC || + state == LOCK_MIX_LOCK2 || + state == LOCK_MIX_TSYN || + state == LOCK_MIX_EXCL)) + return false; + return SimpleLock::remove_replica(from); + } + virtual void print(ostream& out) const { out << "("; _print(out); diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 37c823593ecc..4b9b7f5150ff 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -330,7 +330,8 @@ 
public: } else { state = s; } - take_waiting(SimpleLock::WAIT_ALL, waiters); + if (is_stable()) + take_waiting(SimpleLock::WAIT_ALL, waiters); } bool is_stable() const { @@ -373,8 +374,10 @@ public: ++p) more()->gather_set.insert(p->first); } - bool is_gathering() { return have_more() && !more()->gather_set.empty(); } - bool is_gathering(int i) { + bool is_gathering() const { + return have_more() && !more()->gather_set.empty(); + } + bool is_gathering(int i) const { return have_more() && more()->gather_set.count(i); } void clear_gather() { diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index d48cd67a4285..9cbee3a7bf6c 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -221,12 +221,10 @@ public: void add_strong_inode(vinodeno_t i, int n, int cw, int dl, int nl, int dftl) { strong_inodes[i] = inode_strong(n, cw, dl, nl, dftl); } - void add_inode_locks(CInode *in, __u32 nonce) { + void add_inode_locks(CInode *in, __u32 nonce, bufferlist& bl) { ::encode(in->inode.ino, inode_locks); ::encode(in->last, inode_locks); ::encode(nonce, inode_locks); - bufferlist bl; - in->_encode_locks_state_for_replica(bl); ::encode(bl, inode_locks); } void add_inode_base(CInode *in) {