git.apps.os.sepia.ceph.com Git - ceph-ci.git/commitdiff
mds: don't rdlock locks in replica object while auth mds is recovering
author    Yan, Zheng <zyan@redhat.com>
          Wed, 18 Oct 2017 12:58:15 +0000 (20:58 +0800)
committer Yan, Zheng <zyan@redhat.com>
          Thu, 19 Oct 2017 01:32:48 +0000 (09:32 +0800)
The auth mds may take xlock on a lock and change the object when
replaying unsafe requests. To guarantee that new requests and replayed
unsafe requests (on the auth mds) get processed in the proper order, we
shouldn't rdlock locks in a replica object while the auth mds of the
object is recovering.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
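
For readers following the change, here is a minimal, self-contained sketch of
the rule the commit message describes. It is an illustration only: the names
(ReplicaLock, can_rdlock_replica, cluster_degraded, is_replay) are hypothetical
stand-ins, not the actual MDS classes touched in the diff below.

// Hedged sketch of the rdlock gate described above (hypothetical names).
struct ReplicaLock {
  // set when the object was replicated from an auth MDS that is recovering
  bool need_recover = false;
};

// Returns true if the rdlock may be taken now. A false return means the
// caller should drop its locks and retry after the cluster has recovered,
// so that the recovering auth MDS can replay its unsafe requests first.
bool can_rdlock_replica(ReplicaLock& lock, bool cluster_degraded, bool is_replay)
{
  if (lock.need_recover) {
    if (cluster_degraded) {
      if (!is_replay)
        return false;             // delay the new request
    } else {
      lock.need_recover = false;  // auth MDS finished recovery; flag is stale
    }
  }
  return true;                    // safe to take the rdlock
}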
src/mds/CDentry.cc
src/mds/CDentry.h
src/mds/CInode.cc
src/mds/CInode.h
src/mds/Locker.cc
src/mds/MDCache.cc
src/mds/MDCache.h
src/mds/ScatterLock.h
src/mds/SimpleLock.h

index ecd4a7095911536237051682c4e6929bc54cd3fe..7d24734931589cdfcf9e01f454adff85fa6927f4 100644 (file)
@@ -402,15 +402,18 @@ void CDentry::decode_replica(bufferlist::iterator& p, bool is_new)
 
   inodeno_t rino;
   unsigned char rdtype;
-  __s32 ls;
   ::decode(rino, p);
   ::decode(rdtype, p);
-  ::decode(ls, p);
+  lock.decode_state(p, is_new);
+
+  bool need_recover;
+  ::decode(need_recover, p);
 
   if (is_new) {
     if (rino)
       dir->link_remote_inode(this, rino, rdtype);
-    lock.set_state(ls);
+    if (need_recover)
+      lock.mark_need_recover();
   }
 }
 
index e9416104ad15f1fc81723729cfa940f922414231..d5c628b699a0ccf3df2f174a3270963563b466d3 100644 (file)
@@ -244,7 +244,7 @@ public:
   void clear_new() { state_clear(STATE_NEW); }
   
   // -- replication
-  void encode_replica(mds_rank_t mds, bufferlist& bl) {
+  void encode_replica(mds_rank_t mds, bufferlist& bl, bool need_recover) {
     if (!is_replicated())
       lock.replicate_relax();
 
@@ -253,8 +253,8 @@ public:
     ::encode(first, bl);
     ::encode(linkage.remote_ino, bl);
     ::encode(linkage.remote_d_type, bl);
-    __s32 ls = lock.get_replica_state();
-    ::encode(ls, bl);
+    lock.encode_state_for_replica(bl);
+    ::encode(need_recover, bl);
   }
   void decode_replica(bufferlist::iterator& p, bool is_new);
 
index d07da8a7aba55956721a597ecfada67aefcdd832..860cbf9ca72dc7c346ea84718e9076cd4e8a8dd6 100644 (file)
@@ -3553,7 +3553,7 @@ void CInode::_decode_locks_full(bufferlist::iterator& p)
   want_loner_cap = loner_cap;  // for now, we'll eval() shortly.
 }
 
-void CInode::_encode_locks_state_for_replica(bufferlist& bl)
+void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
 {
   authlock.encode_state_for_replica(bl);
   linklock.encode_state_for_replica(bl);
@@ -3564,7 +3564,9 @@ void CInode::_encode_locks_state_for_replica(bufferlist& bl)
   snaplock.encode_state_for_replica(bl);
   flocklock.encode_state_for_replica(bl);
   policylock.encode_state_for_replica(bl);
+  ::encode(need_recover, bl);
 }
+
 void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
 {
   authlock.encode_state_for_replica(bl);
@@ -3577,6 +3579,7 @@ void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
   flocklock.encode_state_for_replica(bl);
   policylock.encode_state_for_replica(bl);
 }
+
 void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new)
 {
   authlock.decode_state(p, is_new);
@@ -3588,19 +3591,35 @@ void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new)
   snaplock.decode_state(p, is_new);
   flocklock.decode_state(p, is_new);
   policylock.decode_state(p, is_new);
+
+  bool need_recover;
+  ::decode(need_recover, p);
+  if (need_recover && is_new) {
+    // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
+    // and change the object when replaying unsafe requests.
+    authlock.mark_need_recover();
+    linklock.mark_need_recover();
+    dirfragtreelock.mark_need_recover();
+    filelock.mark_need_recover();
+    nestlock.mark_need_recover();
+    xattrlock.mark_need_recover();
+    snaplock.mark_need_recover();
+    flocklock.mark_need_recover();
+    policylock.mark_need_recover();
+  }
 }
 void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters,
-                                 list<SimpleLock*>& eval_locks)
-{
-  authlock.decode_state_rejoin(p, waiters);
-  linklock.decode_state_rejoin(p, waiters);
-  dirfragtreelock.decode_state_rejoin(p, waiters);
-  filelock.decode_state_rejoin(p, waiters);
-  nestlock.decode_state_rejoin(p, waiters);
-  xattrlock.decode_state_rejoin(p, waiters);
-  snaplock.decode_state_rejoin(p, waiters);
-  flocklock.decode_state_rejoin(p, waiters);
-  policylock.decode_state_rejoin(p, waiters);
+                                 list<SimpleLock*>& eval_locks, bool survivor)
+{
+  authlock.decode_state_rejoin(p, waiters, survivor);
+  linklock.decode_state_rejoin(p, waiters, survivor);
+  dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
+  filelock.decode_state_rejoin(p, waiters, survivor);
+  nestlock.decode_state_rejoin(p, waiters, survivor);
+  xattrlock.decode_state_rejoin(p, waiters, survivor);
+  snaplock.decode_state_rejoin(p, waiters, survivor);
+  flocklock.decode_state_rejoin(p, waiters, survivor);
+  policylock.decode_state_rejoin(p, waiters, survivor);
 
   if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
     eval_locks.push_back(&dirfragtreelock);
index d7e4a66b7fca398c3fb2507ddafe96fa0803e3d2..d0a49d1feb56d52ac7ae396a0f2d0d947bb33a39 100644 (file)
@@ -785,7 +785,7 @@ public:
   void encode_store(bufferlist& bl, uint64_t features);
   void decode_store(bufferlist::iterator& bl);
 
-  void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features) {
+  void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features, bool need_recover) {
     assert(is_auth());
     
     // relax locks?
@@ -796,7 +796,7 @@ public:
     ::encode(nonce, bl);
     
     _encode_base(bl, features);
-    _encode_locks_state_for_replica(bl);
+    _encode_locks_state_for_replica(bl, need_recover);
   }
   void decode_replica(bufferlist::iterator& p, bool is_new) {
     __u32 nonce;
@@ -824,11 +824,11 @@ public:
   void _decode_base(bufferlist::iterator& p);
   void _encode_locks_full(bufferlist& bl);
   void _decode_locks_full(bufferlist::iterator& p);
-  void _encode_locks_state_for_replica(bufferlist& bl);
+  void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover);
   void _encode_locks_state_for_rejoin(bufferlist& bl, int rep);
   void _decode_locks_state(bufferlist::iterator& p, bool is_new);
   void _decode_locks_rejoin(bufferlist::iterator& p, std::list<MDSInternalContextBase*>& waiters,
-                           std::list<SimpleLock*>& eval_locks);
+                           std::list<SimpleLock*>& eval_locks, bool survivor);
 
   // -- import/export --
   void encode_export(bufferlist& bl);
index ecb274b0bbdebe55e9d0b6dfeabdf721905676e5..2ecec27f8764b09e4a08a00eac24f33cc3b46bb3 100644 (file)
@@ -547,33 +547,20 @@ bool Locker::acquire_locks(MDRequestRef& mdr,
       }
     } else {
       assert(mdr->is_master());
-      if ((*p)->is_scatterlock()) {
-       ScatterLock *slock = static_cast<ScatterLock *>(*p);
-       if (slock->is_rejoin_mix()) {
-         // If there is a recovering mds who replcated an object when it failed
-         // and scatterlock in the object was in MIX state, It's possible that
-         // the recovering mds needs to take wrlock on the scatterlock when it
-         // replays unsafe requests. So this mds should delay taking rdlock on
-         // the scatterlock until the recovering mds finishes replaying unsafe.
-         // Otherwise unsafe requests may get replayed after current request.
-         //
-         // For example:
-         // The recovering mds is auth mds of a dirfrag, this mds is auth mds
-         // of correspinding inode. when 'rm -rf' the direcotry, this mds should
-         // delay the rmdir request until the recovering mds has replayed unlink
-         // requests.
-         if (mds->is_cluster_degraded()) {
-           if (!mdr->is_replay()) {
-             drop_locks(mdr.get());
-             mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
-             dout(10) << " rejoin mix scatterlock " << *slock << " " << *(*p)->get_parent()
-                      << ", waiting for cluster recovered" << dendl;
-             marker.message = "rejoin mix scatterlock, waiting for cluster recovered";
-             return false;
-           }
-         } else {
-           slock->clear_rejoin_mix();
+      if ((*p)->needs_recover()) {
+       if (mds->is_cluster_degraded()) {
+         if (!mdr->is_replay()) {
+           // see comments in SimpleLock::set_state_rejoin() and
+           // ScatterLock::encode_state_for_rejoin()
+           drop_locks(mdr.get());
+           mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
+           dout(10) << " rejoin recovering " << **p << " " << *(*p)->get_parent()
+                    << ", waiting for cluster recovered" << dendl;
+           marker.message = "rejoin recovering lock, waiting for cluster recovered";
+           return false;
          }
+       } else {
+         (*p)->clear_need_recover();
        }
       }
 
index 17068a64ca2deb9c2033d97634feb950e0bd0f07..99d6940986d2e9a11a23d2075c4022a17ba8de3a 100644 (file)
@@ -4881,6 +4881,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
   dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
   mds_rank_t from = mds_rank_t(ack->get_source().num());
 
+  assert(mds->get_state() >= MDSMap::STATE_REJOIN);
+  bool survivor = !mds->is_rejoin();
+
   // for sending cache expire message
   set<CInode*> isolated_inodes;
   set<CInode*> refragged_inodes;
@@ -4990,7 +4993,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
       }
 
       dn->set_replica_nonce(q->second.nonce);
-      dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
+      dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
       dn->state_clear(CDentry::STATE_REJOINING);
       dout(10) << " got " << *dn << dendl;
     }
@@ -5053,7 +5056,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
     assert(in);
     in->set_replica_nonce(nonce);
     bufferlist::iterator q = lockbl.begin();
-    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks);
+    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
     in->state_clear(CInode::STATE_REJOINING);
     dout(10) << " got inode locks " << *in << dendl;
   }
@@ -5097,7 +5100,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
   // done?
   assert(rejoin_ack_gather.count(from));
   rejoin_ack_gather.erase(from);
-  if (mds->is_rejoin()) {
+  if (!survivor) {
 
     if (rejoin_gather.empty()) {
       // eval unstable scatter locks after all wrlocks are rejoined.
@@ -10156,6 +10159,29 @@ void MDCache::handle_discover_reply(MDiscoverReply *m)
 // ----------------------------
 // REPLICAS
 
+
+void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
+{
+  dirfrag_t df = dir->dirfrag();
+  ::encode(df, bl);
+  dir->encode_replica(to, bl);
+}
+
+void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
+{
+  ::encode(dn->name, bl);
+  ::encode(dn->last, bl);
+  dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
+}
+
+void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
+                             uint64_t features)
+{
+  ::encode(in->inode.ino, bl);  // bleh, minor asymmetry here
+  ::encode(in->last, bl);
+  in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
+}
+
 CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
                               list<MDSInternalContextBase*>& finished)
 {
index a2ab48751a0b92ac4d1b329c3574c9aa685c84d6..1be07705680a92a530cb652b977f38ba8bb905a8 100644 (file)
@@ -1046,22 +1046,10 @@ protected:
   friend class C_MDC_Join;
 
 public:
-  void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl) {
-    dirfrag_t df = dir->dirfrag();
-    ::encode(df, bl);
-    dir->encode_replica(to, bl);
-  }
-  void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) {
-    ::encode(dn->name, bl);
-    ::encode(dn->last, bl);
-    dn->encode_replica(to, bl);
-  }
+  void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
+  void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
   void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
-                      uint64_t features) {
-    ::encode(in->inode.ino, bl);  // bleh, minor assymetry here
-    ::encode(in->last, bl);
-    in->encode_replica(to, bl, features);
-  }
+                      uint64_t features);
   
   CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list<MDSInternalContextBase*>& finished);
   CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished);
index 872428229aad85ac55530cd83132757cbddcb45f..ab963986bbc2a891f6ef0e97eec47a217e5880d4 100644 (file)
@@ -43,7 +43,6 @@ class ScatterLock : public SimpleLock {
     DIRTY            = 1 << 10,
     FLUSHING         = 1 << 11,
     FLUSHED          = 1 << 12,
-    REJOIN_MIX       = 1 << 13,
   };
 
 public:
@@ -125,9 +124,6 @@ public:
   bool is_dirty_or_flushing() const {
     return is_dirty() || is_flushing();
   }
-  bool is_rejoin_mix() const {
-    return state_flags & REJOIN_MIX;
-  }
 
   void mark_dirty() { 
     if (!is_dirty()) {
@@ -159,9 +155,6 @@ public:
   void clear_flushed() override {
     state_flags &= ~FLUSHED;
   }
-  void clear_rejoin_mix() {
-    state_flags &= ~REJOIN_MIX;
-  }
 
   void infer_state_from_strong_rejoin(int rstate, bool locktoo) {
     if (rstate == LOCK_MIX || 
@@ -182,14 +175,26 @@ public:
        s = LOCK_MIX_LOCK;
     }
 
+    // If there is a recovering mds that replicated an object when it failed
+    // and the scatterlock in the object was in MIX state, it's possible that
+    // the recovering mds needs to take wrlock on the scatterlock when it
+    // replays unsafe requests. So this mds should delay taking rdlock on
+    // the scatterlock until the recovering mds finishes replaying its unsafe
+    // requests. Otherwise unsafe requests may get replayed after the current
+    // request.
+    //
+    // For example: the recovering mds is the auth mds of a dirfrag and this
+    // mds is the auth mds of the corresponding inode. When 'rm -rf' removes
+    // the directory, this mds should delay the rmdir request until the
+    // recovering mds has replayed the unlink requests.
     if (s == LOCK_MIX || s == LOCK_MIX_LOCK || s == LOCK_MIX_SYNC)
-      state_flags |= REJOIN_MIX;
+      mark_need_recover();
 
     ::encode(s, bl);
   }
 
-  void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters) {
-    SimpleLock::decode_state_rejoin(p, waiters);
+  void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters, bool survivor) {
+    SimpleLock::decode_state_rejoin(p, waiters, survivor);
     if (is_flushing()) {
       set_dirty();
       clear_flushing();
index f97714310b5bf3a6d690ec1d7c0148eedf14d0e1..6e60d2a7015ff0addfbd83ded4ddda862f404e8f 100644 (file)
@@ -168,7 +168,8 @@ protected:
   __s16 state_flags;
 
   enum {
-    LEASED   = 1 << 0,
+    LEASED             = 1 << 0,
+    NEED_RECOVER       = 1 << 1,
   };
 
 private:
@@ -325,13 +326,17 @@ public:
     //assert(!is_stable() || gather_set.size() == 0);  // gather should be empty in stable states.
     return s;
   }
-  void set_state_rejoin(int s, list<MDSInternalContextBase*>& waiters) {
-    if (!is_stable() && get_parent()->is_auth()) {
-      state = s;
-      get_parent()->auth_unpin(this);
-    } else {
-      state = s;
-    }
+  void set_state_rejoin(int s, list<MDSInternalContextBase*>& waiters, bool survivor) {
+    assert(!get_parent()->is_auth());
+
+    // If the lock in the replica object was not in SYNC state when the auth mds of the object
+    // failed, the auth mds may take xlock on the lock and change the object when replaying
+    // unsafe requests.
+    if (!survivor || state != LOCK_SYNC)
+      mark_need_recover();
+
+    state = s;
+
     if (is_stable())
       take_waiting(SimpleLock::WAIT_ALL, waiters);
   }
@@ -545,6 +550,16 @@ public:
     return is_xlocked() || is_rdlocked() || is_wrlocked() || is_leased();
   }
 
+  bool needs_recover() const {
+    return state_flags & NEED_RECOVER;
+  }
+  void mark_need_recover() {
+    state_flags |= NEED_RECOVER;
+  }
+  void clear_need_recover() {
+    state_flags &= ~NEED_RECOVER;
+  }
+
   // encode/decode
   void encode(bufferlist& bl) const {
     ENCODE_START(2, 2, bl);
@@ -574,10 +589,10 @@ public:
     if (is_new)
       state = s;
   }
-  void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters) {
+  void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters, bool survivor) {
     __s16 s;
     ::decode(s, p);
-    set_state_rejoin(s, waiters);
+    set_state_rejoin(s, waiters, survivor);
   }
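
To summarize the SimpleLock.h changes outside of diff form, the following
standalone sketch shows the NEED_RECOVER flag lifecycle under the same logic.
RejoinLock and its constants are illustrative only; in the real code the flag
is consulted in Locker::acquire_locks() and cleared once the cluster is no
longer degraded.

// Hedged, standalone sketch of the NEED_RECOVER lifecycle (illustrative names).
#include <cstdint>

struct RejoinLock {
  static constexpr int      LOCK_SYNC    = 1;       // stand-in for the real lock-state enum
  static constexpr uint16_t NEED_RECOVER = 1 << 1;  // mirrors the new SimpleLock flag

  int      state       = LOCK_SYNC;
  uint16_t state_flags = 0;

  bool needs_recover() const { return state_flags & NEED_RECOVER; }
  void mark_need_recover()   { state_flags |= NEED_RECOVER; }
  void clear_need_recover()  { state_flags &= static_cast<uint16_t>(~NEED_RECOVER); }

  // Replica side of rejoin: a rejoining MDS marks every lock, while a survivor
  // only marks locks that were not in SYNC when the auth MDS failed, since the
  // recovering auth may xlock those while replaying unsafe requests.
  void set_state_rejoin(int s, bool survivor) {
    if (!survivor || state != LOCK_SYNC)
      mark_need_recover();
    state = s;
  }
};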