]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: don't rdlock locks in replica object while auth mds is recovering
authorYan, Zheng <zyan@redhat.com>
Wed, 18 Oct 2017 12:58:15 +0000 (20:58 +0800)
committerShinobu Kinjo <shinobu@redhat.com>
Fri, 10 Nov 2017 07:21:24 +0000 (02:21 -0500)
Auth mds may take xlock on the lock and change the object when replaying
unsafe requests. To guarantee new requests and replayed unsafe requests
(on auth mds) get processed in proper order, we shouldn't rdlock locks in
replica object while auth mds of the object is recovering

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
(cherry picked from commit 0afbc0338e1b9f32340eaa74899d8d43ac8608fe)

src/mds/CDentry.cc
src/mds/CDentry.h
src/mds/CInode.cc
src/mds/CInode.h
src/mds/Locker.cc
src/mds/MDCache.cc
src/mds/MDCache.h
src/mds/ScatterLock.h
src/mds/SimpleLock.h

index ecd4a7095911536237051682c4e6929bc54cd3fe..7d24734931589cdfcf9e01f454adff85fa6927f4 100644 (file)
@@ -402,15 +402,18 @@ void CDentry::decode_replica(bufferlist::iterator& p, bool is_new)
 
   inodeno_t rino;
   unsigned char rdtype;
-  __s32 ls;
   ::decode(rino, p);
   ::decode(rdtype, p);
-  ::decode(ls, p);
+  lock.decode_state(p, is_new);
+
+  bool need_recover;
+  ::decode(need_recover, p);
 
   if (is_new) {
     if (rino)
       dir->link_remote_inode(this, rino, rdtype);
-    lock.set_state(ls);
+    if (need_recover)
+      lock.mark_need_recover();
   }
 }
 
index e9416104ad15f1fc81723729cfa940f922414231..d5c628b699a0ccf3df2f174a3270963563b466d3 100644 (file)
@@ -244,7 +244,7 @@ public:
   void clear_new() { state_clear(STATE_NEW); }
   
   // -- replication
-  void encode_replica(mds_rank_t mds, bufferlist& bl) {
+  void encode_replica(mds_rank_t mds, bufferlist& bl, bool need_recover) {
     if (!is_replicated())
       lock.replicate_relax();
 
@@ -253,8 +253,8 @@ public:
     ::encode(first, bl);
     ::encode(linkage.remote_ino, bl);
     ::encode(linkage.remote_d_type, bl);
-    __s32 ls = lock.get_replica_state();
-    ::encode(ls, bl);
+    lock.encode_state_for_replica(bl);
+    ::encode(need_recover, bl);
   }
   void decode_replica(bufferlist::iterator& p, bool is_new);
 
index 633e6477553dda671318f2991d28dc1b2e4a621c..25931d019d6dc3974c494852b5c4b885489f27aa 100644 (file)
@@ -3553,7 +3553,7 @@ void CInode::_decode_locks_full(bufferlist::iterator& p)
   want_loner_cap = loner_cap;  // for now, we'll eval() shortly.
 }
 
-void CInode::_encode_locks_state_for_replica(bufferlist& bl)
+void CInode::_encode_locks_state_for_replica(bufferlist& bl, bool need_recover)
 {
   authlock.encode_state_for_replica(bl);
   linklock.encode_state_for_replica(bl);
@@ -3564,7 +3564,9 @@ void CInode::_encode_locks_state_for_replica(bufferlist& bl)
   snaplock.encode_state_for_replica(bl);
   flocklock.encode_state_for_replica(bl);
   policylock.encode_state_for_replica(bl);
+  ::encode(need_recover, bl);
 }
+
 void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
 {
   authlock.encode_state_for_replica(bl);
@@ -3577,6 +3579,7 @@ void CInode::_encode_locks_state_for_rejoin(bufferlist& bl, int rep)
   flocklock.encode_state_for_replica(bl);
   policylock.encode_state_for_replica(bl);
 }
+
 void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new)
 {
   authlock.decode_state(p, is_new);
@@ -3588,19 +3591,35 @@ void CInode::_decode_locks_state(bufferlist::iterator& p, bool is_new)
   snaplock.decode_state(p, is_new);
   flocklock.decode_state(p, is_new);
   policylock.decode_state(p, is_new);
+
+  bool need_recover;
+  ::decode(need_recover, p);
+  if (need_recover && is_new) {
+    // Auth mds replicated this inode while it's recovering. Auth mds may take xlock on the lock
+    // and change the object when replaying unsafe requests.
+    authlock.mark_need_recover();
+    linklock.mark_need_recover();
+    dirfragtreelock.mark_need_recover();
+    filelock.mark_need_recover();
+    nestlock.mark_need_recover();
+    xattrlock.mark_need_recover();
+    snaplock.mark_need_recover();
+    flocklock.mark_need_recover();
+    policylock.mark_need_recover();
+  }
 }
 void CInode::_decode_locks_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters,
-                                 list<SimpleLock*>& eval_locks)
-{
-  authlock.decode_state_rejoin(p, waiters);
-  linklock.decode_state_rejoin(p, waiters);
-  dirfragtreelock.decode_state_rejoin(p, waiters);
-  filelock.decode_state_rejoin(p, waiters);
-  nestlock.decode_state_rejoin(p, waiters);
-  xattrlock.decode_state_rejoin(p, waiters);
-  snaplock.decode_state_rejoin(p, waiters);
-  flocklock.decode_state_rejoin(p, waiters);
-  policylock.decode_state_rejoin(p, waiters);
+                                 list<SimpleLock*>& eval_locks, bool survivor)
+{
+  authlock.decode_state_rejoin(p, waiters, survivor);
+  linklock.decode_state_rejoin(p, waiters, survivor);
+  dirfragtreelock.decode_state_rejoin(p, waiters, survivor);
+  filelock.decode_state_rejoin(p, waiters, survivor);
+  nestlock.decode_state_rejoin(p, waiters, survivor);
+  xattrlock.decode_state_rejoin(p, waiters, survivor);
+  snaplock.decode_state_rejoin(p, waiters, survivor);
+  flocklock.decode_state_rejoin(p, waiters, survivor);
+  policylock.decode_state_rejoin(p, waiters, survivor);
 
   if (!dirfragtreelock.is_stable() && !dirfragtreelock.is_wrlocked())
     eval_locks.push_back(&dirfragtreelock);
index f1b371678c084a3d6884d5fcc1d7a60b0fd42b9e..464b75cfc2c05c7ef3f849c814b4f4a29e6b0e61 100644 (file)
@@ -772,7 +772,7 @@ public:
   void encode_store(bufferlist& bl, uint64_t features);
   void decode_store(bufferlist::iterator& bl);
 
-  void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features) {
+  void encode_replica(mds_rank_t rep, bufferlist& bl, uint64_t features, bool need_recover) {
     assert(is_auth());
     
     // relax locks?
@@ -783,7 +783,7 @@ public:
     ::encode(nonce, bl);
     
     _encode_base(bl, features);
-    _encode_locks_state_for_replica(bl);
+    _encode_locks_state_for_replica(bl, need_recover);
   }
   void decode_replica(bufferlist::iterator& p, bool is_new) {
     __u32 nonce;
@@ -811,11 +811,11 @@ public:
   void _decode_base(bufferlist::iterator& p);
   void _encode_locks_full(bufferlist& bl);
   void _decode_locks_full(bufferlist::iterator& p);
-  void _encode_locks_state_for_replica(bufferlist& bl);
+  void _encode_locks_state_for_replica(bufferlist& bl, bool need_recover);
   void _encode_locks_state_for_rejoin(bufferlist& bl, int rep);
   void _decode_locks_state(bufferlist::iterator& p, bool is_new);
   void _decode_locks_rejoin(bufferlist::iterator& p, std::list<MDSInternalContextBase*>& waiters,
-                           std::list<SimpleLock*>& eval_locks);
+                           std::list<SimpleLock*>& eval_locks, bool survivor);
 
   // -- import/export --
   void encode_export(bufferlist& bl);
index ccf42a67f9bb609eda2740b1f724e40873534e21..90be7a08f63dc57544a1c524a6736f85bb61797d 100644 (file)
@@ -585,33 +585,20 @@ bool Locker::acquire_locks(MDRequestRef& mdr,
       }
     } else {
       assert(mdr->is_master());
-      if ((*p)->is_scatterlock()) {
-       ScatterLock *slock = static_cast<ScatterLock *>(*p);
-       if (slock->is_rejoin_mix()) {
-         // If there is a recovering mds who replcated an object when it failed
-         // and scatterlock in the object was in MIX state, It's possible that
-         // the recovering mds needs to take wrlock on the scatterlock when it
-         // replays unsafe requests. So this mds should delay taking rdlock on
-         // the scatterlock until the recovering mds finishes replaying unsafe.
-         // Otherwise unsafe requests may get replayed after current request.
-         //
-         // For example:
-         // The recovering mds is auth mds of a dirfrag, this mds is auth mds
-         // of correspinding inode. when 'rm -rf' the direcotry, this mds should
-         // delay the rmdir request until the recovering mds has replayed unlink
-         // requests.
-         if (mds->is_cluster_degraded()) {
-           if (!mdr->is_replay()) {
-             drop_locks(mdr.get());
-             mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
-             dout(10) << " rejoin mix scatterlock " << *slock << " " << *(*p)->get_parent()
-                      << ", waiting for cluster recovered" << dendl;
-             marker.message = "rejoin mix scatterlock, waiting for cluster recovered";
-             return false;
-           }
-         } else {
-           slock->clear_rejoin_mix();
+      if ((*p)->needs_recover()) {
+       if (mds->is_cluster_degraded()) {
+         if (!mdr->is_replay()) {
+           // see comments in SimpleLock::set_state_rejoin() and
+           // ScatterLock::encode_state_for_rejoin()
+           drop_locks(mdr.get());
+           mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
+           dout(10) << " rejoin recovering " << **p << " " << *(*p)->get_parent()
+                    << ", waiting for cluster recovered" << dendl;
+           marker.message = "rejoin recovering lock, waiting for cluster recovered";
+           return false;
          }
+       } else {
+         (*p)->clear_need_recover();
        }
       }
 
index b40833fd1cabbee3ebf15906d4df332f8239e677..074a86bd844ec9de10418d2f64fcba59fecc272f 100644 (file)
@@ -4898,6 +4898,9 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
   dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
   mds_rank_t from = mds_rank_t(ack->get_source().num());
 
+  assert(mds->get_state() >= MDSMap::STATE_REJOIN);
+  bool survivor = !mds->is_rejoin();
+
   // for sending cache expire message
   set<CInode*> isolated_inodes;
   set<CInode*> refragged_inodes;
@@ -5007,7 +5010,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
       }
 
       dn->set_replica_nonce(q->second.nonce);
-      dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters);
+      dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters, survivor);
       dn->state_clear(CDentry::STATE_REJOINING);
       dout(10) << " got " << *dn << dendl;
     }
@@ -5070,7 +5073,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
     assert(in);
     in->set_replica_nonce(nonce);
     bufferlist::iterator q = lockbl.begin();
-    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks);
+    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
     in->state_clear(CInode::STATE_REJOINING);
     dout(10) << " got inode locks " << *in << dendl;
   }
@@ -5114,7 +5117,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack)
   // done?
   assert(rejoin_ack_gather.count(from));
   rejoin_ack_gather.erase(from);
-  if (mds->is_rejoin()) {
+  if (!survivor) {
 
     if (rejoin_gather.empty()) {
       // eval unstable scatter locks after all wrlocks are rejoined.
@@ -10182,6 +10185,29 @@ void MDCache::handle_discover_reply(MDiscoverReply *m)
 // ----------------------------
 // REPLICAS
 
+
+void MDCache::replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
+{
+  dirfrag_t df = dir->dirfrag();
+  ::encode(df, bl);
+  dir->encode_replica(to, bl);
+}
+
+void MDCache::replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
+{
+  ::encode(dn->name, bl);
+  ::encode(dn->last, bl);
+  dn->encode_replica(to, bl, mds->get_state() < MDSMap::STATE_ACTIVE);
+}
+
+void MDCache::replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
+                             uint64_t features)
+{
+  ::encode(in->inode.ino, bl);  // bleh, minor assymetry here
+  ::encode(in->last, bl);
+  in->encode_replica(to, bl, features, mds->get_state() < MDSMap::STATE_ACTIVE);
+}
+
 CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from,
                               list<MDSInternalContextBase*>& finished)
 {
index 61a170bf6fd57b60baae3a613b89c0a9770afdce..099445233aefae8f5f22bec5bc3eccbf4cb3104e 100644 (file)
@@ -1035,22 +1035,10 @@ protected:
   friend class C_MDC_Join;
 
 public:
-  void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl) {
-    dirfrag_t df = dir->dirfrag();
-    ::encode(df, bl);
-    dir->encode_replica(to, bl);
-  }
-  void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) {
-    ::encode(dn->name, bl);
-    ::encode(dn->last, bl);
-    dn->encode_replica(to, bl);
-  }
+  void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
+  void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
   void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
-                      uint64_t features) {
-    ::encode(in->inode.ino, bl);  // bleh, minor assymetry here
-    ::encode(in->last, bl);
-    in->encode_replica(to, bl, features);
-  }
+                      uint64_t features);
   
   CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list<MDSInternalContextBase*>& finished);
   CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished);
index 872428229aad85ac55530cd83132757cbddcb45f..ab963986bbc2a891f6ef0e97eec47a217e5880d4 100644 (file)
@@ -43,7 +43,6 @@ class ScatterLock : public SimpleLock {
     DIRTY            = 1 << 10,
     FLUSHING         = 1 << 11,
     FLUSHED          = 1 << 12,
-    REJOIN_MIX       = 1 << 13,
   };
 
 public:
@@ -125,9 +124,6 @@ public:
   bool is_dirty_or_flushing() const {
     return is_dirty() || is_flushing();
   }
-  bool is_rejoin_mix() const {
-    return state_flags & REJOIN_MIX;
-  }
 
   void mark_dirty() { 
     if (!is_dirty()) {
@@ -159,9 +155,6 @@ public:
   void clear_flushed() override {
     state_flags &= ~FLUSHED;
   }
-  void clear_rejoin_mix() {
-    state_flags &= ~REJOIN_MIX;
-  }
 
   void infer_state_from_strong_rejoin(int rstate, bool locktoo) {
     if (rstate == LOCK_MIX || 
@@ -182,14 +175,26 @@ public:
        s = LOCK_MIX_LOCK;
     }
 
+    // If there is a recovering mds who replcated an object when it failed
+    // and scatterlock in the object was in MIX state, It's possible that
+    // the recovering mds needs to take wrlock on the scatterlock when it
+    // replays unsafe requests. So this mds should delay taking rdlock on
+    // the scatterlock until the recovering mds finishes replaying unsafe.
+    // Otherwise unsafe requests may get replayed after current request.
+    //
+    // For example:
+    // The recovering mds is auth mds of a dirfrag, this mds is auth mds
+    // of correspinding inode. when 'rm -rf' the direcotry, this mds should
+    // delay the rmdir request until the recovering mds has replayed unlink
+    // requests.
     if (s == LOCK_MIX || s == LOCK_MIX_LOCK || s == LOCK_MIX_SYNC)
-      state_flags |= REJOIN_MIX;
+      mark_need_recover();
 
     ::encode(s, bl);
   }
 
-  void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters) {
-    SimpleLock::decode_state_rejoin(p, waiters);
+  void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters, bool survivor) {
+    SimpleLock::decode_state_rejoin(p, waiters, survivor);
     if (is_flushing()) {
       set_dirty();
       clear_flushing();
index f97714310b5bf3a6d690ec1d7c0148eedf14d0e1..6e60d2a7015ff0addfbd83ded4ddda862f404e8f 100644 (file)
@@ -168,7 +168,8 @@ protected:
   __s16 state_flags;
 
   enum {
-    LEASED   = 1 << 0,
+    LEASED             = 1 << 0,
+    NEED_RECOVER       = 1 << 1,
   };
 
 private:
@@ -325,13 +326,17 @@ public:
     //assert(!is_stable() || gather_set.size() == 0);  // gather should be empty in stable states.
     return s;
   }
-  void set_state_rejoin(int s, list<MDSInternalContextBase*>& waiters) {
-    if (!is_stable() && get_parent()->is_auth()) {
-      state = s;
-      get_parent()->auth_unpin(this);
-    } else {
-      state = s;
-    }
+  void set_state_rejoin(int s, list<MDSInternalContextBase*>& waiters, bool survivor) {
+    assert(!get_parent()->is_auth());
+
+    // If lock in the replica object was not in SYNC state when auth mds of the object failed.
+    // Auth mds of the object may take xlock on the lock and change the object when replaying
+    // unsafe requests.
+    if (!survivor || state != LOCK_SYNC)
+      mark_need_recover();
+
+    state = s;
+
     if (is_stable())
       take_waiting(SimpleLock::WAIT_ALL, waiters);
   }
@@ -545,6 +550,16 @@ public:
     return is_xlocked() || is_rdlocked() || is_wrlocked() || is_leased();
   }
 
+  bool needs_recover() const {
+    return state_flags & NEED_RECOVER;
+  }
+  void mark_need_recover() {
+    state_flags |= NEED_RECOVER;
+  }
+  void clear_need_recover() {
+    state_flags &= ~NEED_RECOVER;
+  }
+
   // encode/decode
   void encode(bufferlist& bl) const {
     ENCODE_START(2, 2, bl);
@@ -574,10 +589,10 @@ public:
     if (is_new)
       state = s;
   }
-  void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters) {
+  void decode_state_rejoin(bufferlist::iterator& p, list<MDSInternalContextBase*>& waiters, bool survivor) {
     __s16 s;
     ::decode(s, p);
-    set_state_rejoin(s, waiters);
+    set_state_rejoin(s, waiters, survivor);
   }