From: Yan, Zheng Date: Sun, 19 Jan 2014 04:13:06 +0000 (+0800) Subject: mds: fix scattered wrlock rejoin X-Git-Tag: v0.78~165^2~7 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=cc77ef2d52666b62152024ae9b8b4ac98cb54950;p=ceph.git mds: fix scattered wrlock rejoin If an unstable scatter lock is encountered when handling weak cache rejoin, don't remove the recovering MDS from the scatter lock's gather list. The reason is that the recovering MDS may hold a rejoined wrlock on the scatter lock. (Rejoined wrlocks were created when handling strong cache rejoins from the survivor MDSes.) When composing the cache rejoin ack, if the recovering MDS is in the lock's gather list, set the lock state of the recovering MDS to a compatible unstable state. Signed-off-by: Yan, Zheng --- diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 2e252985139..1ce2ee80b7c 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -3158,6 +3158,18 @@ void CInode::_encode_locks_state_for_replica(bufferlist& bl) flocklock.encode_state_for_replica(bl); policylock.encode_state_for_replica(bl); } +void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep) +{ + authlock.encode_state_for_replica(bl); + linklock.encode_state_for_replica(bl); + dirfragtreelock.encode_state_for_rejoin(bl, rep); + filelock.encode_state_for_rejoin(bl, rep); + nestlock.encode_state_for_rejoin(bl, rep); + xattrlock.encode_state_for_replica(bl); + snaplock.encode_state_for_replica(bl); + flocklock.encode_state_for_replica(bl); + policylock.encode_state_for_replica(bl); +} void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new) { authlock.decode_state(p, is_new); @@ -3170,7 +3182,8 @@ void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new) flocklock.decode_state(p, is_new); policylock.decode_state(p, is_new); } -void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list& waiters) +void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list& waiters, + list& eval_locks) 
{ authlock.decode_state_rejoin(p, waiters); linklock.decode_state_rejoin(p, waiters); @@ -3181,6 +3194,13 @@ void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list& waite snaplock.decode_state_rejoin(p, waiters); flocklock.decode_state_rejoin(p, waiters); policylock.decode_state_rejoin(p, waiters); + + if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked()) + eval_locks.push_back(&dirfragtreelock); + if (!filelock.is_stable() && !filelock.is_wrlocked()) + eval_locks.push_back(&filelock); + if (!nestlock.is_stable() && !nestlock.is_wrlocked()) + eval_locks.push_back(&nestlock); } diff --git a/src/mds/CInode.h b/src/mds/CInode.h index 28400aa3744..3977859821d 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -590,8 +590,10 @@ public: void _encode_locks_full(bufferlist& bl); void _decode_locks_full(bufferlist::iterator& p); void _encode_locks_state_for_replica(bufferlist& bl); + void _encode_locks_state_for_rejoin(bufferlist& bl, int rep); void _decode_locks_state(bufferlist::iterator& p, bool is_new); - void _decode_locks_rejoin(bufferlist::iterator& p, list& waiters); + void _decode_locks_rejoin(bufferlist::iterator& p, list& waiters, + list& eval_locks); // -- import/export -- void encode_export(bufferlist& bl); diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 42575679d5b..6d604d7df1c 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -4084,7 +4084,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) assert(in); if (survivor && in->is_replica(from)) - inode_remove_replica(in, from, gather_locks); + inode_remove_replica(in, from, true, gather_locks); unsigned inonce = in->add_replica(from); dout(10) << " have " << *in << dendl; @@ -4095,7 +4095,9 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) if (ack) { acked_inodes.insert(in->vino()); ack->add_inode_base(in); - ack->add_inode_locks(in, inonce); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, from); + ack->add_inode_locks(in, 
inonce, bl); } } } @@ -4107,14 +4109,16 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) CInode *in = get_inode(*p); assert(in); // hmm fixme wrt stray? if (survivor && in->is_replica(from)) - inode_remove_replica(in, from, gather_locks); + inode_remove_replica(in, from, true, gather_locks); unsigned inonce = in->add_replica(from); dout(10) << " have base " << *in << dendl; if (ack) { acked_inodes.insert(in->vino()); ack->add_inode_base(in); - ack->add_inode_locks(in, inonce); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, from); + ack->add_inode_locks(in, inonce, bl); } } @@ -4300,7 +4304,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack, if (in->is_auth() && in->is_replica(from) && (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) { - inode_remove_replica(in, from, gather_locks); + inode_remove_replica(in, from, false, gather_locks); dout(10) << " rem " << *in << dendl; } @@ -4820,7 +4824,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) assert(in); in->set_replica_nonce(nonce); bufferlist::iterator q = lockbl.begin(); - in->_decode_locks_rejoin(q, rejoin_waiters); + in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks); in->state_clear(CInode::STATE_REJOINING); dout(10) << " got inode locks " << *in << dendl; } @@ -4863,6 +4867,17 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) assert(rejoin_ack_gather.count(from)); rejoin_ack_gather.erase(from); if (mds->is_rejoin()) { + + if (rejoin_gather.empty()) { + // eval unstable scatter locks after all wrlocks are rejoined. + while (!rejoin_eval_locks.empty()) { + SimpleLock *lock = rejoin_eval_locks.front(); + rejoin_eval_locks.pop_front(); + if (!lock->is_stable()) + mds->locker->eval_gather(lock); + } + } + if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. 
rejoin_ack_gather.empty()) { // finally, kickstart past snap parent opens @@ -5728,7 +5743,9 @@ void MDCache::rejoin_send_acks() r != in->replicas_end(); ++r) { ack[r->first]->add_inode_base(in); - ack[r->first]->add_inode_locks(in, ++r->second); + bufferlist bl; + in->_encode_locks_state_for_rejoin(bl, r->first); + ack[r->first]->add_inode_locks(in, ++r->second, bl); } // subdirs in this subtree? @@ -5743,14 +5760,18 @@ void MDCache::rejoin_send_acks() r != root->replicas_end(); ++r) { ack[r->first]->add_inode_base(root); - ack[r->first]->add_inode_locks(root, ++r->second); + bufferlist bl; + root->_encode_locks_state_for_rejoin(bl, r->first); + ack[r->first]->add_inode_locks(root, ++r->second, bl); } if (myin) for (map::iterator r = myin->replicas_begin(); r != myin->replicas_end(); ++r) { ack[r->first]->add_inode_base(myin); - ack[r->first]->add_inode_locks(myin, ++r->second); + bufferlist bl; + myin->_encode_locks_state_for_rejoin(bl, r->first); + ack[r->first]->add_inode_locks(myin, ++r->second, bl); } // include inode base for any inodes whose scatterlocks may have updated @@ -6795,7 +6816,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m) // remove from our cached_by dout(7) << " inode expire on " << *in << " from mds." << from << " cached_by was " << in->get_replicas() << dendl; - inode_remove_replica(in, from, gather_locks); + inode_remove_replica(in, from, false, gather_locks); } else { // this is an old nonce, ignore expire. 
@@ -6922,7 +6943,8 @@ void MDCache::discard_delayed_expire(CDir *dir) delayed_expire.erase(dir); } -void MDCache::inode_remove_replica(CInode *in, int from, set& gather_locks) +void MDCache::inode_remove_replica(CInode *in, int from, bool rejoin, + set& gather_locks) { in->remove_replica(from); in->mds_caps_wanted.erase(from); @@ -6931,14 +6953,17 @@ void MDCache::inode_remove_replica(CInode *in, int from, set& gath // fix lock if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock); if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock); - if (in->dirfragtreelock.remove_replica(from)) gather_locks.insert(&in->dirfragtreelock); - if (in->filelock.remove_replica(from)) gather_locks.insert(&in->filelock); if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock); if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock); - - if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock); if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock); if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock); + + // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state. + // Don't remove the recovering mds from lock's gathering list because + // it may hold rejoined wrlocks. 
+ if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock); + if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock); + if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock); } void MDCache::dentry_remove_replica(CDentry *dn, int from, set& gather_locks) diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 060a8044c19..3aab1fa77a2 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -428,6 +428,7 @@ protected: map > rejoin_unlinked_inodes; vector rejoin_recover_q, rejoin_check_q; + list rejoin_eval_locks; list rejoin_waiters; void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); @@ -661,7 +662,8 @@ public: } protected: - void inode_remove_replica(CInode *in, int rep, set& gather_locks); + void inode_remove_replica(CInode *in, int rep, bool rejoin, + set& gather_locks); void dentry_remove_replica(CDentry *dn, int rep, set& gather_locks); void rename_file(CDentry *srcdn, CDentry *destdn); diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h index 42745ce9fdb..a85caa824ac 100644 --- a/src/mds/ScatterLock.h +++ b/src/mds/ScatterLock.h @@ -189,6 +189,29 @@ public: state = LOCK_LOCK; } + void encode_state_for_rejoin(bufferlist& bl, int rep) const { + __s16 s = get_replica_state(); + if (is_gathering(rep)) { + // the recovering mds may hold rejoined wrlocks + if (state == LOCK_MIX_SYNC) + s = LOCK_MIX_SYNC; + else + s = LOCK_MIX_LOCK; + } + ::encode(s, bl); + } + + bool remove_replica(int from, bool rejoin) { + if (rejoin && + (state == LOCK_MIX || + state == LOCK_MIX_SYNC || + state == LOCK_MIX_LOCK2 || + state == LOCK_MIX_TSYN || + state == LOCK_MIX_EXCL)) + return false; + return SimpleLock::remove_replica(from); + } + virtual void print(ostream& out) const { out << "("; _print(out); diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 37c823593ec..4b9b7f5150f 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -330,7 +330,8 @@ public: } 
else { state = s; } - take_waiting(SimpleLock::WAIT_ALL, waiters); + if (is_stable()) + take_waiting(SimpleLock::WAIT_ALL, waiters); } bool is_stable() const { @@ -373,8 +374,10 @@ public: ++p) more()->gather_set.insert(p->first); } - bool is_gathering() { return have_more() && !more()->gather_set.empty(); } - bool is_gathering(int i) { + bool is_gathering() const { + return have_more() && !more()->gather_set.empty(); + } + bool is_gathering(int i) const { return have_more() && more()->gather_set.count(i); } void clear_gather() { diff --git a/src/messages/MMDSCacheRejoin.h b/src/messages/MMDSCacheRejoin.h index d48cd67a428..9cbee3a7bf6 100644 --- a/src/messages/MMDSCacheRejoin.h +++ b/src/messages/MMDSCacheRejoin.h @@ -221,12 +221,10 @@ public: void add_strong_inode(vinodeno_t i, int n, int cw, int dl, int nl, int dftl) { strong_inodes[i] = inode_strong(n, cw, dl, nl, dftl); } - void add_inode_locks(CInode *in, __u32 nonce) { + void add_inode_locks(CInode *in, __u32 nonce, bufferlist& bl) { ::encode(in->inode.ino, inode_locks); ::encode(in->last, inode_locks); ::encode(nonce, inode_locks); - bufferlist bl; - in->_encode_locks_state_for_replica(bl); ::encode(bl, inode_locks); } void add_inode_base(CInode *in) {