]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: fix cross-authorty rename race
authorYan, Zheng <zheng.z.yan@intel.com>
Sat, 15 Jun 2013 10:26:04 +0000 (18:26 +0800)
committerYan, Zheng <zheng.z.yan@intel.com>
Mon, 17 Jun 2013 11:14:06 +0000 (19:14 +0800)
When doing cross-authorty rename, we need to make sure bystanders
have received all messages sent by inode's original auth MDS,
then change inode's authorty. Otherwise lock messages sent by the
original/new auth MDS can arrive bystanders out of order. The fix
is: inode's original auth MDS sends notify messages to bystanders,
performs slave rename after receiving all bystanders' notify acks.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
src/mds/MDCache.cc
src/mds/Server.cc
src/mds/Server.h
src/messages/MMDSSlaveRequest.h

index b9b154d91b3a2acac231bf6931f907c05f42cdb1..2b0029f90b6121c3f473860d9427d6004dc8f9bb 100644 (file)
@@ -2651,6 +2651,15 @@ void MDCache::handle_mds_failure(int who)
     if (p->second->slave_to_mds == who) {
       if (p->second->slave_did_prepare()) {
        dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
+       if (!p->second->more()->waiting_on_slave.empty()) {
+         assert(p->second->more()->srcdn_auth_mds == mds->get_nodeid());
+         // will rollback, no need to wait
+         if (p->second->slave_request) {
+           p->second->slave_request->put();
+           p->second->slave_request = 0;
+         }
+         p->second->more()->waiting_on_slave.clear();
+       }
       } else {
        dout(10) << " slave request " << *p->second << " has no prepare, finishing up" << dendl;
        if (p->second->slave_request)
@@ -2660,12 +2669,22 @@ void MDCache::handle_mds_failure(int who)
       }
     }
 
-    if (p->second->is_slave() &&
-       p->second->slave_did_prepare() && p->second->more()->srcdn_auth_mds == who &&
-       mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
-      // rename srcdn's auth mds failed, resolve even I'm a survivor.
-      dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
-      add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
+    if (p->second->is_slave() && p->second->slave_did_prepare()) {
+      if (p->second->more()->waiting_on_slave.count(who)) {
+       assert(p->second->more()->srcdn_auth_mds == mds->get_nodeid());
+       dout(10) << " slave request " << *p->second << " no longer need rename notity ack from mds."
+                << who << dendl;
+       p->second->more()->waiting_on_slave.erase(who);
+       if (p->second->more()->waiting_on_slave.empty())
+         mds->queue_waiter(new C_MDS_RetryRequest(this, p->second));
+      }
+
+      if (p->second->more()->srcdn_auth_mds == who &&
+         mds->mdsmap->is_clientreplay_or_active_or_stopping(p->second->slave_to_mds)) {
+       // rename srcdn's auth mds failed, resolve even I'm a survivor.
+       dout(10) << " slave request " << *p->second << " uncommitted, will resolve shortly" << dendl;
+       add_ambiguous_slave_update(p->first, p->second->slave_to_mds);
+      }
     }
     
     // failed node is slave?
index c3162e7fe2cf6b24c9bcfb49d8a56b2b6681878f..00bf0181e5f46c700648d0af21a9d92b91f4e484 100644 (file)
@@ -1280,6 +1280,16 @@ void Server::handle_slave_request(MMDSSlaveRequest *m)
   if (m->is_reply())
     return handle_slave_request_reply(m);
 
+  // the purpose of rename notify is enforcing causal message ordering. making sure
+  // bystanders have received all messages from rename srcdn's auth MDS.
+  if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
+    MMDSSlaveRequest *reply = new MMDSSlaveRequest(m->get_reqid(), m->get_attempt(),
+                                                  MMDSSlaveRequest::OP_RENAMENOTIFYACK);
+    mds->send_message(reply, m->get_connection());
+    m->put();
+    return;
+  }
+
   CDentry *straydn = NULL;
   if (m->stray.length() > 0) {
     straydn = mdcache->add_replica_stray(m->stray, from);
@@ -1432,6 +1442,10 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m)
     handle_slave_rename_prep_ack(mdr, m);
     break;
 
+  case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
+    handle_slave_rename_notify_ack(mdr, m);
+    break;
+
   default:
     assert(0);
   }
@@ -6560,6 +6574,9 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
 
   // am i srcdn auth?
   if (srcdn->is_auth()) {
+    set<int> srcdnrep;
+    srcdn->list_replicas(srcdnrep);
+
     bool reply_witness = false;
     if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
       // freeze?
@@ -6594,12 +6611,19 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
       if (mdr->slave_request->witnesses.size() > 1) {
        dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
        reply_witness = true;
+       for (set<int>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
+         if (*p == mdr->slave_to_mds ||
+             !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p))
+           continue;
+         MMDSSlaveRequest *notify = new MMDSSlaveRequest(mdr->reqid, mdr->attempt,
+                                                         MMDSSlaveRequest::OP_RENAMENOTIFY);
+         mds->send_message_mds(notify, *p);
+         mdr->more()->waiting_on_slave.insert(*p);
+       }
       }
     }
 
     // is witness list sufficient?
-    set<int> srcdnrep;
-    srcdn->list_replicas(srcdnrep);
     for (set<int>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
       if (*p == mdr->slave_to_mds ||
          mdr->slave_request->witnesses.count(*p)) continue;
@@ -6619,6 +6643,11 @@ void Server::handle_slave_rename_prep(MDRequest *mdr)
       return;  
     }
     dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
+    if (!mdr->more()->waiting_on_slave.empty()) {
+      dout(10) << " still waiting for rename notify acks from "
+              << mdr->more()->waiting_on_slave << dendl;
+      return;
+    }
   } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
     // set ambiguous auth for srci on witnesses
     mdr->set_ambiguous_auth(srcdnl->get_inode());
@@ -7187,6 +7216,24 @@ void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
     dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
 }
 
+void Server::handle_slave_rename_notify_ack(MDRequest *mdr, MMDSSlaveRequest *ack)
+{
+  dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
+          << ack->get_source() << dendl;
+  assert(mdr->is_slave());
+  int from = ack->get_source().num();
+
+  if (mdr->more()->waiting_on_slave.count(from)) {
+    mdr->more()->waiting_on_slave.erase(from);
+
+    if (mdr->more()->waiting_on_slave.empty()) {
+      if (mdr->slave_request)
+       dispatch_slave_request(mdr);
+    } else 
+      dout(10) << " still waiting for rename notify acks from "
+              << mdr->more()->waiting_on_slave << dendl;
+  }
+}
 
 
 
index 35a405b58ebaae7f8dfc6f89ed0ec0046e04a8b3..269b286b4316241bd97687cf941b211f28701472 100644 (file)
@@ -242,6 +242,7 @@ public:
   // slaving
   void handle_slave_rename_prep(MDRequest *mdr);
   void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m);
+  void handle_slave_rename_notify_ack(MDRequest *mdr, MMDSSlaveRequest *m);
   void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
   void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
   void do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr, bool finish_mdr=false);
index 35af81d928c27517b84751b0648aa523c75cdc2c..f8f30b273af4fdc658acb6068960c6e522b4023d 100644 (file)
@@ -43,6 +43,9 @@ class MMDSSlaveRequest : public Message {
 
   static const int OP_DROPLOCKS        = 11;
 
+  static const int OP_RENAMENOTIFY = 12;
+  static const int OP_RENAMENOTIFYACK = -12;
+
   static const int OP_FINISH = 17;  
   static const int OP_COMMITTED = -18;  
 
@@ -77,6 +80,9 @@ class MMDSSlaveRequest : public Message {
 
     case OP_DROPLOCKS: return "drop_locks";
 
+    case OP_RENAMENOTIFY: return "reame_notify";
+    case OP_RENAMENOTIFYACK: return "rename_notify_ack";
+
     case OP_ABORT: return "abort";
       //case OP_COMMIT: return "commit";