mds: don't break order of inter-dependent requests during mds recovery

author Yan, Zheng <zyan@redhat.com>

Wed, 1 Mar 2017 03:57:20 +0000 (11:57 +0800)

committer Yan, Zheng <zyan@redhat.com>

Thu, 30 Mar 2017 01:46:05 +0000 (09:46 +0800)
author Yan, Zheng <zyan@redhat.com>
Wed, 1 Mar 2017 03:57:20 +0000 (11:57 +0800)
committer Yan, Zheng <zyan@redhat.com>
Thu, 30 Mar 2017 01:46:05 +0000 (09:46 +0800)
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc

index 899cd0082c5bc138742025a68d26abeacb3147e4..58c8795b7a849435aed98a3770eed497de78ccf2 100644 (file)
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -345,7 +345,7 @@ bool Locker::acquire_locks(MDRequestRef& mdr,
      
      if (!object->is_auth()) {
        if (!mdr->locks.empty())
-       mds->locker->drop_locks(mdr.get());
+       drop_locks(mdr.get());
        if (object->is_ambiguous_auth()) {
         // wait
         dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl;
@@ -358,7 +358,7 @@ bool Locker::acquire_locks(MDRequestRef& mdr,
      }
      if (!object->can_auth_pin()) {
        // wait
-      mds->locker->drop_locks(mdr.get());
+      drop_locks(mdr.get());
        mdr->drop_local_auth_pins();
        if (auth_pin_nonblock) {
         dout(10) << " can't auth_pin (freezing?) " << *object << ", nonblocking" << dendl;
@@ -548,6 +548,37 @@ bool Locker::acquire_locks(MDRequestRef& mdr,
         dout(10) << " got wrlock on " << **p << " " << *(*p)->get_parent() << dendl;
        }
      } else {
+      assert(mdr->is_master());
+      if ((*p)->is_scatterlock()) {
+       ScatterLock *slock = static_cast<ScatterLock *>(*p);
+       if (slock->is_rejoin_mix()) {
+         // If there is a recovering mds that replicated an object when it failed
+         // and the scatterlock in the object was in MIX state, it's possible that
+         // the recovering mds needs to take wrlock on the scatterlock when it
+         // replays unsafe requests. So this mds should delay taking rdlock on
+         // the scatterlock until the recovering mds finishes replaying them.
+         // Otherwise unsafe requests may get replayed after the current request.
+         //
+         // For example:
+         // The recovering mds is the auth mds of a dirfrag; this mds is the auth
+         // mds of the corresponding inode. On 'rm -rf' of the directory, this mds
+         // should delay the rmdir request until the recovering mds has replayed
+         // the unlink requests.
+         if (mds->is_cluster_degraded()) {
+           if (!mdr->is_replay()) {
+             drop_locks(mdr.get());
+             mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
+             dout(10) << " rejoin mix scatterlock " << *slock << " " << *(*p)->get_parent()
+                      << ", waiting for cluster recovered" << dendl;
+             marker.message = "rejoin mix scatterlock, waiting for cluster recovered";
+             return false;
+           }
+         } else {
+           slock->clear_rejoin_mix();
+         }
+       }
+      }
+
        marker.message = "failed to rdlock, waiting";
        if (!rdlock_start(*p, mdr)) 
         goto out;
diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc

index b98102f48e136ceb29cbf2ab15994226fb7fb561..eb538f9c849e214bb3a92a1fdba2cdeb78457bfc 100644 (file)
--- a/src/mds/MDSRank.cc
+++ b/src/mds/MDSRank.cc
@@ -1576,6 +1576,11 @@ void MDSRankDispatcher::handle_mds_map(
    cluster_degraded = mdsmap->is_degraded();
    if (oldmap->is_degraded() && !cluster_degraded && state >= MDSMap::STATE_ACTIVE) {
      dout(1) << "cluster recovered." << dendl;
+    auto it = waiting_for_active_peer.find(MDS_RANK_NONE);
+    if (it != waiting_for_active_peer.end()) {
+      queue_waiters(it->second);
+      waiting_for_active_peer.erase(it);
+    }
    }
  
    // did someone go active?
diff --git a/src/mds/MDSRank.h b/src/mds/MDSRank.h

index 717dbd268b183c09bda4a3e643574cba6a04b428..8c0802561b69b8bf22fcd717919deb6ae6e852a3 100644 (file)
--- a/src/mds/MDSRank.h
+++ b/src/mds/MDSRank.h
@@ -352,12 +352,17 @@ class MDSRank {
        send_message(m, c.get());
      }
  
-    void wait_for_active(MDSInternalContextBase *c) { 
-      waiting_for_active.push_back(c); 
-    }
      void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) { 
        waiting_for_active_peer[who].push_back(c);
      }
+    void wait_for_cluster_recovered(MDSInternalContextBase *c) {
+      assert(cluster_degraded);
+      waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
+    }
+
+    void wait_for_active(MDSInternalContextBase *c) {
+      waiting_for_active.push_back(c);
+    }
      void wait_for_replay(MDSInternalContextBase *c) { 
        waiting_for_replay.push_back(c); 
      }
diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc

index 15db202cb2855bf2c6c605470fadfb812d81388c..4e26b515896a209123a6dfe86ce460d6923bd40a 100644 (file)
--- a/src/mds/Mutation.cc
+++ b/src/mds/Mutation.cc
@@ -306,12 +306,18 @@ void MDRequestImpl::set_filepath(const filepath& fp)
    assert(!client_request);
    more()->filepath1 = fp;
  }
+
  void MDRequestImpl::set_filepath2(const filepath& fp)
  {
    assert(!client_request);
    more()->filepath2 = fp;
  }
  
+bool MDRequestImpl::is_replay() const
+{
+  return client_request ? client_request->is_replay() : false;
+}
+
  void MDRequestImpl::print(ostream &out) const
  {
    out << "request(" << reqid;
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h

index 56b410fe6a736d6147e9f61ec8496f8044ffc2b6..1d4d25728ff155ef6153f40cec92c8b74a447d6f 100644 (file)
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -332,6 +332,7 @@ struct MDRequestImpl : public MutationImpl {
    const filepath& get_filepath2();
    void set_filepath(const filepath& fp);
    void set_filepath2(const filepath& fp);
+  bool is_replay() const;
  
    void print(ostream &out) const override;
    void dump(Formatter *f) const override;
diff --git a/src/mds/ScatterLock.h b/src/mds/ScatterLock.h

index 62f01d26976f861fff4644bc6584e219f0befd94..618ced921bc3a34aaeb942b329855473d94b4547 100644 (file)
--- a/src/mds/ScatterLock.h
+++ b/src/mds/ScatterLock.h
@@ -58,6 +58,7 @@ class ScatterLock : public SimpleLock {
      DIRTY            = 1 << 2,
      FLUSHING         = 1 << 3,
      FLUSHED          = 1 << 4,
+    REJOIN_MIX      = 1 << 5, // no rdlock until the recovering mds becomes active
    };
  
  public:
@@ -147,6 +148,9 @@ public:
    bool is_dirty_or_flushing() const {
      return have_more() ? (is_dirty() || is_flushing()) : false;
    }
+  bool is_rejoin_mix() const {
+    return have_more() ? _more->state_flags & REJOIN_MIX : false;
+  }
  
    void mark_dirty() { 
      if (!is_dirty()) {
@@ -182,6 +186,13 @@ public:
      }
    }
  
+  void clear_rejoin_mix() {
+    if (have_more()) {
+      _more->state_flags &= ~REJOIN_MIX;
+      try_clear_more();
+    }
+  }
+
    void set_last_scatter(utime_t t) { more()->last_scatter = t; }
    utime_t get_last_scatter() {
      return more()->last_scatter;
@@ -190,14 +201,13 @@ public:
    void infer_state_from_strong_rejoin(int rstate, bool locktoo) {
      if (rstate == LOCK_MIX || 
         rstate == LOCK_MIX_LOCK || // replica still has wrlocks?
-       rstate == LOCK_MIX_SYNC || // "
-       rstate == LOCK_MIX_TSYN)  // "
+       rstate == LOCK_MIX_SYNC)
        state = LOCK_MIX;
      else if (locktoo && rstate == LOCK_LOCK)
        state = LOCK_LOCK;
    }
  
-  void encode_state_for_rejoin(bufferlist& bl, int rep) const {
+  void encode_state_for_rejoin(bufferlist& bl, int rep) {
      __s16 s = get_replica_state();
      if (is_gathering(rep)) {
        // the recovering mds may hold rejoined wrlocks
@@ -206,6 +216,10 @@ public:
        else
         s = LOCK_MIX_LOCK;
      }
+
+    if (s == LOCK_MIX || s == LOCK_MIX_LOCK || s == LOCK_MIX_SYNC)
+      more()->state_flags |= REJOIN_MIX;
+
      ::encode(s, bl);
    }
  
diff --git a/src/mds/Server.cc b/src/mds/Server.cc

index 4ecc7085bd701afc37f37e4cb8b2bdcc1f3f4e2c..86f1d5c85835c291712aa2388641a8ca9838b3c9 100644 (file)
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -6120,7 +6120,7 @@ bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
      CDir *dir = *p;
      const fnode_t *pf = dir->get_projected_fnode();
      if (pf->fragstat.size()) {
-      dout(10) << "dir_is_nonempty_unlocked dirstat has "
+      dout(10) << "dir_is_nonempty dirstat has "
                << pf->fragstat.size() << " items " << *dir << dendl;
        return true;
      }
author	Yan, Zheng <zyan@redhat.com>
	Wed, 1 Mar 2017 03:57:20 +0000 (11:57 +0800)
committer	Yan, Zheng <zyan@redhat.com>
	Thu, 30 Mar 2017 01:46:05 +0000 (09:46 +0800)
src/mds/Locker.cc		patch \| blob \| history
src/mds/MDSRank.cc		patch \| blob \| history
src/mds/MDSRank.h		patch \| blob \| history
src/mds/Mutation.cc		patch \| blob \| history
src/mds/Mutation.h		patch \| blob \| history
src/mds/ScatterLock.h		patch \| blob \| history
src/mds/Server.cc		patch \| blob \| history