]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: fix scattered wrlock rejoin
author: Yan, Zheng <zheng.z.yan@intel.com>
Sun, 19 Jan 2014 04:13:06 +0000 (12:13 +0800)
committer: Yan, Zheng <zheng.z.yan@intel.com>
Mon, 17 Feb 2014 01:37:51 +0000 (09:37 +0800)
If an unstable scatter lock is encountered when handling a weak cache
rejoin, don't remove the recovering MDS from the scatter lock's
gather list. The reason is that the recovering MDS may hold rejoined
wrlocks on the scatter lock. (Rejoined wrlocks were created when
handling strong cache rejoins from survivor MDSes.)

When composing the cache rejoin ack, if the recovering MDS is in the
lock's gather list, set the lock state for the recovering MDS to a
compatible unstable state.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
src/mds/CInode.cc
src/mds/CInode.h
src/mds/MDCache.cc
src/mds/MDCache.h
src/mds/ScatterLock.h
src/mds/SimpleLock.h
src/messages/MMDSCacheRejoin.h

index 2e2529851393f57001be1d442ff6953255951e40..1ce2ee80b7c4c210b83521529c496a39e329240d 100644 (file)
@@ -3158,6 +3158,18 @@ void CInode::_encode_locks_state_for_replica(bufferlist& bl)
   flocklock.encode_state_for_replica(bl);
   policylock.encode_state_for_replica(bl);
 }
+void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
+{
+  authlock.encode_state_for_replica(bl);
+  linklock.encode_state_for_replica(bl);
+  dirfragtreelock.encode_state_for_rejoin(bl, rep);
+  filelock.encode_state_for_rejoin(bl, rep);
+  nestlock.encode_state_for_rejoin(bl, rep);
+  xattrlock.encode_state_for_replica(bl);
+  snaplock.encode_state_for_replica(bl);
+  flocklock.encode_state_for_replica(bl);
+  policylock.encode_state_for_replica(bl);
+}
 void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new)
 {
   authlock.decode_state(p, is_new);
@@ -3170,7 +3182,8 @@ void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new)
   flocklock.decode_state(p, is_new);
   policylock.decode_state(p, is_new);
 }
-void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waiters)
+void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waiters,
+                                 list<SimpleLock*>& eval_locks)
 {
   authlock.decode_state_rejoin(p, waiters);
   linklock.decode_state_rejoin(p, waiters);
@@ -3181,6 +3194,13 @@ void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waite
   snaplock.decode_state_rejoin(p, waiters);
   flocklock.decode_state_rejoin(p, waiters);
   policylock.decode_state_rejoin(p, waiters);
+
+  if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
+    eval_locks.push_back(&dirfragtreelock);
+  if (!filelock.is_stable() && !filelock.is_wrlocked())
+    eval_locks.push_back(&filelock);
+  if (!nestlock.is_stable() && !nestlock.is_wrlocked())
+    eval_locks.push_back(&nestlock);
 }
 
 
index 28400aa3744dd95554ab5101e7aea229961d41fa..3977859821de11acda66525221fad1f5319531bf 100644 (file)
@@ -590,8 +590,10 @@ public:
   void _encode_locks_full(bufferlist& bl);
   void _decode_locks_full(bufferlist::iterator& p);
   void _encode_locks_state_for_replica(bufferlist& bl);
+  void _encode_locks_state_for_rejoin(bufferlist& bl, int rep);
   void _decode_locks_state(bufferlist::iterator& p, bool is_new);
-  void _decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waiters);
+  void _decode_locks_rejoin(bufferlist::iterator& p, list<Context*>& waiters,
+                           list<SimpleLock*>& eval_locks);
 
   // -- import/export --
   void encode_export(bufferlist& bl);
index 42575679d5b83df283bd016fd861ca35dfe32184..6d604d7df1cebef852e7f45d5694ca1de9b47fc2 100644 (file)
@@ -4084,7 +4084,7 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
       assert(in);
 
       if (survivor && in->is_replica(from)) 
-       inode_remove_replica(in, from, gather_locks);
+       inode_remove_replica(in, from, true, gather_locks);
       unsigned inonce = in->add_replica(from);
       dout(10) << " have " << *in << dendl;
 
@@ -4095,7 +4095,9 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
       if (ack) {
        acked_inodes.insert(in->vino());
        ack->add_inode_base(in);
-       ack->add_inode_locks(in, inonce);
+       bufferlist bl;
+       in->_encode_locks_state_for_rejoin(bl, from);
+       ack->add_inode_locks(in, inonce, bl);
       }
     }
   }
@@ -4107,14 +4109,16 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
     CInode *in = get_inode(*p);
     assert(in);   // hmm fixme wrt stray?
     if (survivor && in->is_replica(from)) 
-      inode_remove_replica(in, from, gather_locks);
+      inode_remove_replica(in, from, true, gather_locks);
     unsigned inonce = in->add_replica(from);
     dout(10) << " have base " << *in << dendl;
     
     if (ack) {
       acked_inodes.insert(in->vino());
       ack->add_inode_base(in);
-      ack->add_inode_locks(in, inonce);
+      bufferlist bl;
+      in->_encode_locks_state_for_rejoin(bl, from);
+      ack->add_inode_locks(in, inonce, bl);
     }
   }
 
@@ -4300,7 +4304,7 @@ void MDCache::rejoin_scour_survivor_replicas(int from, MMDSCacheRejoin *ack,
     if (in->is_auth() &&
        in->is_replica(from) &&
        (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) {
-      inode_remove_replica(in, from, gather_locks);
+      inode_remove_replica(in, from, false, gather_locks);
       dout(10) << " rem " << *in << dendl;
     }
 
@@ -4820,7 +4824,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
     assert(in);
     in->set_replica_nonce(nonce);
     bufferlist::iterator q = lockbl.begin();
-    in->_decode_locks_rejoin(q, rejoin_waiters);
+    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks);
     in->state_clear(CInode::STATE_REJOINING);
     dout(10) << " got inode locks " << *in << dendl;
   }
@@ -4863,6 +4867,17 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
   assert(rejoin_ack_gather.count(from));
   rejoin_ack_gather.erase(from);
   if (mds->is_rejoin()) {
+
+    if (rejoin_gather.empty()) {
+      // eval unstable scatter locks after all wrlocks are rejoined.
+      while (!rejoin_eval_locks.empty()) {
+       SimpleLock *lock = rejoin_eval_locks.front();
+       rejoin_eval_locks.pop_front();
+       if (!lock->is_stable())
+         mds->locker->eval_gather(lock);
+      }
+    }
+
     if (rejoin_gather.empty() &&     // make sure we've gotten our FULL inodes, too.
        rejoin_ack_gather.empty()) {
       // finally, kickstart past snap parent opens
@@ -5728,7 +5743,9 @@ void MDCache::rejoin_send_acks()
             r != in->replicas_end();
             ++r) {
          ack[r->first]->add_inode_base(in);
-         ack[r->first]->add_inode_locks(in, ++r->second);
+         bufferlist bl;
+         in->_encode_locks_state_for_rejoin(bl, r->first);
+         ack[r->first]->add_inode_locks(in, ++r->second, bl);
        }
        
        // subdirs in this subtree?
@@ -5743,14 +5760,18 @@ void MDCache::rejoin_send_acks()
         r != root->replicas_end();
         ++r) {
       ack[r->first]->add_inode_base(root);
-      ack[r->first]->add_inode_locks(root, ++r->second);
+      bufferlist bl;
+      root->_encode_locks_state_for_rejoin(bl, r->first);
+      ack[r->first]->add_inode_locks(root, ++r->second, bl);
     }
   if (myin)
     for (map<int,unsigned>::iterator r = myin->replicas_begin();
         r != myin->replicas_end();
         ++r) {
       ack[r->first]->add_inode_base(myin);
-      ack[r->first]->add_inode_locks(myin, ++r->second);
+      bufferlist bl;
+      myin->_encode_locks_state_for_rejoin(bl, r->first);
+      ack[r->first]->add_inode_locks(myin, ++r->second, bl);
     }
 
   // include inode base for any inodes whose scatterlocks may have updated
@@ -6795,7 +6816,7 @@ void MDCache::handle_cache_expire(MCacheExpire *m)
        // remove from our cached_by
        dout(7) << " inode expire on " << *in << " from mds." << from 
                << " cached_by was " << in->get_replicas() << dendl;
-       inode_remove_replica(in, from, gather_locks);
+       inode_remove_replica(in, from, false, gather_locks);
       } 
       else {
        // this is an old nonce, ignore expire.
@@ -6922,7 +6943,8 @@ void MDCache::discard_delayed_expire(CDir *dir)
   delayed_expire.erase(dir);  
 }
 
-void MDCache::inode_remove_replica(CInode *in, int from, set<SimpleLock *>& gather_locks)
+void MDCache::inode_remove_replica(CInode *in, int from, bool rejoin,
+                                  set<SimpleLock *>& gather_locks)
 {
   in->remove_replica(from);
   in->mds_caps_wanted.erase(from);
@@ -6931,14 +6953,17 @@ void MDCache::inode_remove_replica(CInode *in, int from, set<SimpleLock *>& gath
   // fix lock
   if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
   if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
-  if (in->dirfragtreelock.remove_replica(from)) gather_locks.insert(&in->dirfragtreelock);
-  if (in->filelock.remove_replica(from)) gather_locks.insert(&in->filelock);
   if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
   if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
-
-  if (in->nestlock.remove_replica(from)) gather_locks.insert(&in->nestlock);
   if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
   if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
+
+  // If 'rejoin' is true and the scatter lock is in a LOCK_MIX_* state,
+  // don't remove the recovering mds from the lock's gather list, because
+  // it may hold rejoined wrlocks.
+  if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
+  if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
+  if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
 }
 
 void MDCache::dentry_remove_replica(CDentry *dn, int from, set<SimpleLock *>& gather_locks)
index 060a8044c194d6f8ffd88978ffc48b5c5975d956..3aab1fa77a2d05f6beee6e17d62055243edda116 100644 (file)
@@ -428,6 +428,7 @@ protected:
   map<int, set<CInode*> > rejoin_unlinked_inodes;
 
   vector<CInode*> rejoin_recover_q, rejoin_check_q;
+  list<SimpleLock*> rejoin_eval_locks;
   list<Context*> rejoin_waiters;
 
   void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin);
@@ -661,7 +662,8 @@ public:
   }
 protected:
 
-  void inode_remove_replica(CInode *in, int rep, set<SimpleLock *>& gather_locks);
+  void inode_remove_replica(CInode *in, int rep, bool rejoin,
+                           set<SimpleLock *>& gather_locks);
   void dentry_remove_replica(CDentry *dn, int rep, set<SimpleLock *>& gather_locks);
 
   void rename_file(CDentry *srcdn, CDentry *destdn);
index 42745ce9fdb103bdaa2f9fbb72112a52919d1381..a85caa824ace6d569c2a899798d5a65699309d6b 100644 (file)
@@ -189,6 +189,29 @@ public:
       state = LOCK_LOCK;
   }
 
+  void encode_state_for_rejoin(bufferlist& bl, int rep) const {
+    __s16 s = get_replica_state();
+    if (is_gathering(rep)) {
+      // the recovering mds may hold rejoined wrlocks
+      if (state == LOCK_MIX_SYNC)
+       s = LOCK_MIX_SYNC;
+      else
+       s = LOCK_MIX_LOCK;
+    }
+    ::encode(s, bl);
+  }
+
+  bool remove_replica(int from, bool rejoin) {
+    if (rejoin &&
+       (state == LOCK_MIX ||
+        state == LOCK_MIX_SYNC ||
+        state == LOCK_MIX_LOCK2 ||
+        state == LOCK_MIX_TSYN ||
+        state == LOCK_MIX_EXCL))
+      return false;
+    return SimpleLock::remove_replica(from);
+  }
+
   virtual void print(ostream& out) const {
     out << "(";
     _print(out);
index 37c823593ecc9fbe5ac08c84c68c6a2ca0ba0871..4b9b7f5150ff2787043fbabb20edc2139f5150e6 100644 (file)
@@ -330,7 +330,8 @@ public:
     } else {
       state = s;
     }
-    take_waiting(SimpleLock::WAIT_ALL, waiters);
+    if (is_stable())
+      take_waiting(SimpleLock::WAIT_ALL, waiters);
   }
 
   bool is_stable() const {
@@ -373,8 +374,10 @@ public:
         ++p)
       more()->gather_set.insert(p->first);
   }
-  bool is_gathering() { return have_more() && !more()->gather_set.empty(); }
-  bool is_gathering(int i) {
+  bool is_gathering() const {
+    return have_more() && !more()->gather_set.empty();
+  }
+  bool is_gathering(int i) const {
     return have_more() && more()->gather_set.count(i);
   }
   void clear_gather() {
index d48cd67a4285a507073966a9c0d75ffefe9862d1..9cbee3a7bf6c7b5abb1c2d9e57cd7727b0c6075e 100644 (file)
@@ -221,12 +221,10 @@ public:
   void add_strong_inode(vinodeno_t i, int n, int cw, int dl, int nl, int dftl) {
     strong_inodes[i] = inode_strong(n, cw, dl, nl, dftl);
   }
-  void add_inode_locks(CInode *in, __u32 nonce) {
+  void add_inode_locks(CInode *in, __u32 nonce, bufferlist& bl) {
     ::encode(in->inode.ino, inode_locks);
     ::encode(in->last, inode_locks);
     ::encode(nonce, inode_locks);
-    bufferlist bl;
-    in->_encode_locks_state_for_replica(bl);
     ::encode(bl, inode_locks);
   }
   void add_inode_base(CInode *in) {