]> git-server-git.apps.pok.os.sepia.ceph.com Git - ceph.git/commitdiff
mds: freeze tree deadlock detection.
authorYan, Zheng <zheng.z.yan@intel.com>
Wed, 23 Oct 2013 01:15:58 +0000 (09:15 +0800)
committerYan, Zheng <zheng.z.yan@intel.com>
Mon, 16 Dec 2013 04:15:22 +0000 (12:15 +0800)
There are two situations that result in a freeze tree deadlock:

 - mds.0 authpins an item in subtree A
 - mds.0 sends request to mds.1 to authpin an item in subtree B
 - mds.0 freezes subtree A
 - mds.1 authpins an item in subtree B
 - mds.1 sends request to mds.0 to authpin an item in subtree A
 - mds.1 freezes subtree B
 - mds.1 receives the remote authpin request from mds.0
   (wait because subtree B is freezing)
 - mds.0 receives the remote authpin request from mds.1
   (wait because subtree A is freezing)

 - client request authpins items in subtree B
 - freeze subtree B
 - import subtree A which is parent of subtree B
   (authpins parent inode of subtree B, see CDir::set_dir_auth())
 - freeze subtree A
 - client request tries authpinning items in subtree A
   (wait because subtree A is freezing)

Enforcing an authpinning order would avoid the deadlock, but it is very
expensive. The deadlock is rare, so I think deadlock detection is
more suitable for this case.

This patch introduces freeze tree deadlock detection. We record the
start time of each tree freeze. If we fail to freeze the tree within a
given duration, we cancel the freeze.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
src/common/config_opts.h
src/mds/MDS.cc
src/mds/Migrator.cc
src/mds/Migrator.h
src/mds/Server.cc

index 9a9509e88352c2970f9e6e3fabae0e1a087188dc..a518d5cff46b0afd7a7874bf2cb3f53e74459215 100644 (file)
@@ -291,6 +291,7 @@ OPTION(mds_beacon_grace, OPT_FLOAT, 15)
 OPTION(mds_enforce_unique_name, OPT_BOOL, true)
 OPTION(mds_blacklist_interval, OPT_FLOAT, 24.0*60.0)  // how long to blacklist failed nodes
 OPTION(mds_session_timeout, OPT_FLOAT, 60)    // cap bits and leases time out if client idle
+OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30)    // how long an export may stay freezing before it is considered for cancellation (freeze deadlock detection)
 OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session
 OPTION(mds_reconnect_timeout, OPT_FLOAT, 45)  // seconds to wait for clients during mds restart
              //  make it (mds_session_timeout - mds_beacon_grace)
index 83722274981d9c1b8b43246dbbea7b3103dc5bf0..14080789000e1effa14f87179eb045d96ab1add3 100644 (file)
@@ -588,6 +588,7 @@ void MDS::tick()
   
   if (is_active()) {
     balancer->tick();
+    mdcache->migrator->find_stale_export_freeze();
     if (snapserver)
       snapserver->check_osd_map(false);
   }
index 0647448c40c7f4b8abdf4b384b31635f1134a6fc..69ab14b76707eb44d1d514c81e6b6f30eeca91c5 100644 (file)
@@ -182,8 +182,148 @@ void Migrator::export_empty_import(CDir *dir)
   export_dir( dir, dest );
 }
 
+void Migrator::find_stale_export_freeze()
+{
+  utime_t now = ceph_clock_now(g_ceph_context);
+  utime_t cutoff = now;
+  cutoff -= g_conf->mds_freeze_tree_timeout;
+  // Deadlock detection, run from MDS::tick(): cancel exports whose tree freeze
+  // started before 'cutoff' and that match one of the patterns described below.
+  /*
+   * We could have situations like:
+   *
+   * - mds.0 authpins an item in subtree A
+   * - mds.0 sends request to mds.1 to authpin an item in subtree B
+   * - mds.0 freezes subtree A
+   * - mds.1 authpins an item in subtree B
+   * - mds.1 sends request to mds.0 to authpin an item in subtree A
+   * - mds.1 freezes subtree B
+   * - mds.1 receives the remote authpin request from mds.0
+   *   (wait because subtree B is freezing)
+   * - mds.0 receives the remote authpin request from mds.1
+   *   (wait because subtree A is freezing)
+   *
+   * or:
+   * - client request authpins items in subtree B
+   * - freeze subtree B
+   * - import subtree A which is parent of subtree B
+   *   (authpins parent inode of subtree B, see CDir::set_dir_auth())
+   * - freeze subtree A
+   * - client request tries authpinning items in subtree A
+   *   (wait because subtree A is freezing)
+   */
+  for (set<pair<utime_t,CDir*> >::iterator p = export_freezing_dirs.begin();
+       p != export_freezing_dirs.end(); ) {
+    if (p->first >= cutoff)  // entries sort by start time, so all later ones are newer too
+      break;
+    CDir *dir = p->second;
+    ++p;  // step past this entry first: export_try_cancel() may erase it via export_freeze_finish()
+    if (export_freezing_state[dir].num_waiters > 0 ||  // a remote authpin request is waiting on this freeze
+       (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {  // or the parent dir is freezing as well
+      assert(get_export_state(dir) == EXPORT_DISCOVERING ||
+            get_export_state(dir) == EXPORT_FREEZING);
+      export_try_cancel(dir);
+    }
+  }
+}
+
+void Migrator::export_try_cancel(CDir *dir)  // abort dir's export; the cleanup performed depends on its current export state
+{
+  int state = export_state[dir];  // captured up front: the cases below erase export_state[dir]
+  switch (state) {
+  case EXPORT_DISCOVERING:
+    dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
+    dir->unfreeze_tree();  // cancel the freeze
+    dir->auth_unpin(this);
+    export_state.erase(dir); // clean up
+    export_unlock(dir);
+    export_locks.erase(dir);
+    export_freeze_finish(dir);  // drop the freeze deadlock-detection record
+    dir->state_clear(CDir::STATE_EXPORTING);
+    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
+      mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
+    break;
+
+  case EXPORT_FREEZING:
+    dout(10) << "export state=freezing : canceling freeze" << dendl;
+    dir->unfreeze_tree();  // cancel the freeze
+    export_state.erase(dir); // clean up
+    export_freeze_finish(dir);  // drop the freeze deadlock-detection record
+    dir->state_clear(CDir::STATE_EXPORTING);
+    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
+      mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
+    break;
+    // NOTE(review): the cases below never call export_freeze_finish(); presumably export_frozen() already did -- confirm
+    // NOTE: state order reversal, warning comes after prepping
+  case EXPORT_WARNING:
+    dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
+    // fall-thru
+
+  case EXPORT_PREPPING:
+    if (state != EXPORT_WARNING)
+      dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
+    {
+      // unpin bounds
+      set<CDir*> bounds;
+      cache->get_subtree_bounds(dir, bounds);
+      for (set<CDir*>::iterator q = bounds.begin();
+          q != bounds.end();
+          ++q) {
+        CDir *bd = *q;
+        bd->put(CDir::PIN_EXPORTBOUND);
+        bd->state_clear(CDir::STATE_EXPORTBOUND);
+      }
+      // notify bystanders
+      if (state == EXPORT_WARNING)
+        export_notify_abort(dir, bounds);
+    }
+    dir->unfreeze_tree();
+    export_state.erase(dir); // clean up
+    cache->adjust_subtree_auth(dir, mds->get_nodeid());
+    cache->try_subtree_merge(dir);  // NOTE: this may journal subtree_map as side effect
+    export_unlock(dir);
+    export_locks.erase(dir);
+    dir->state_clear(CDir::STATE_EXPORTING);
+    if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
+      mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
+    break;
+
+  case EXPORT_EXPORTING:
+    dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
+    export_reverse(dir);
+    export_state.erase(dir); // clean up
+    export_locks.erase(dir);
+    dir->state_clear(CDir::STATE_EXPORTING);
+    break;
+
+  case EXPORT_LOGGINGFINISH:
+  case EXPORT_NOTIFYING:
+    dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
+    // leave export_state, don't clean up now.
+    break;
+
+  default:
+    assert(0);
+  }
+
+  // finish clean-up?  (only when the case above erased export_state[dir])
+  if (export_state.count(dir) == 0) {
+    export_peer.erase(dir);
+    export_warning_ack_waiting.erase(dir);
+    export_notify_ack_waiting.erase(dir);
 
+    // wake up any waiters
+    mds->queue_waiters(export_finish_waiters[dir]);
+    export_finish_waiters.erase(dir);
+
+    // send pending import_maps?  (these need to go out when all exports have finished.)
+    cache->maybe_send_pending_resolves();
 
+    cache->show_subtrees();
+
+    maybe_do_queued_export();
+  }
+}
 
 // ==========================================================
 // mds failure handling
@@ -228,98 +368,7 @@ void Migrator::handle_mds_failure_or_stop(int who)
       // the guy i'm exporting to failed, or we're just freezing.
       dout(10) << "cleaning up export state (" << p->second << ")" << get_export_statename(p->second)
               << " of " << *dir << dendl;
-      
-      switch (p->second) {
-      case EXPORT_DISCOVERING:
-       dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
-       dir->unfreeze_tree();  // cancel the freeze
-       dir->auth_unpin(this);
-       export_state.erase(dir); // clean up
-       export_unlock(dir);
-       export_locks.erase(dir);
-       dir->state_clear(CDir::STATE_EXPORTING);
-       if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
-         mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
-       break;
-       
-      case EXPORT_FREEZING:
-       dout(10) << "export state=freezing : canceling freeze" << dendl;
-       dir->unfreeze_tree();  // cancel the freeze
-       export_state.erase(dir); // clean up
-       dir->state_clear(CDir::STATE_EXPORTING);
-       if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
-         mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
-       break;
-
-       // NOTE: state order reversal, warning comes after prepping
-      case EXPORT_WARNING:
-       dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
-       // fall-thru
-
-      case EXPORT_PREPPING:
-       if (p->second != EXPORT_WARNING) 
-         dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
-       {
-         // unpin bounds
-         set<CDir*> bounds;
-         cache->get_subtree_bounds(dir, bounds);
-         for (set<CDir*>::iterator q = bounds.begin();
-              q != bounds.end();
-              ++q) {
-           CDir *bd = *q;
-           bd->put(CDir::PIN_EXPORTBOUND);
-           bd->state_clear(CDir::STATE_EXPORTBOUND);
-         }
-         // notify bystanders
-         if (p->second == EXPORT_WARNING)
-           export_notify_abort(dir, bounds);
-       }
-       dir->unfreeze_tree();
-       export_state.erase(dir); // clean up
-       cache->adjust_subtree_auth(dir, mds->get_nodeid());
-       cache->try_subtree_merge(dir);  // NOTE: this may journal subtree_map as side effect
-       export_unlock(dir);
-       export_locks.erase(dir);
-       dir->state_clear(CDir::STATE_EXPORTING);
-       if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them.
-         mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]);
-       break;
-       
-      case EXPORT_EXPORTING:
-       dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
-       export_reverse(dir);
-       export_state.erase(dir); // clean up
-       export_locks.erase(dir);
-       dir->state_clear(CDir::STATE_EXPORTING);
-       break;
-
-      case EXPORT_LOGGINGFINISH:
-      case EXPORT_NOTIFYING:
-       dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
-       // leave export_state, don't clean up now.
-       break;
-
-      default:
-       assert(0);
-      }
-
-      // finish clean-up?
-      if (export_state.count(dir) == 0) {
-       export_peer.erase(dir);
-       export_warning_ack_waiting.erase(dir);
-       export_notify_ack_waiting.erase(dir);
-       
-       // wake up any waiters
-       mds->queue_waiters(export_finish_waiters[dir]);
-       export_finish_waiters.erase(dir);
-       
-       // send pending import_maps?  (these need to go out when all exports have finished.)
-       cache->maybe_send_pending_resolves();
-
-       cache->show_subtrees();
-
-       maybe_do_queued_export();       
-      }
+      export_try_cancel(dir);
     } else {
       // bystander failed.
       if (export_warning_ack_waiting.count(dir) &&
@@ -688,6 +737,10 @@ void Migrator::export_dir(CDir *dir, int dest)
   assert(g_conf->mds_kill_export_at != 2);
 
   // start the freeze, but hold it up with an auth_pin.
+  utime_t now = ceph_clock_now(g_ceph_context);
+  export_freezing_dirs.insert(make_pair(now, dir));
+  export_freezing_state[dir].start_time = now;
+
   dir->auth_pin(this);
   dir->freeze_tree();
   assert(dir->is_freezing_tree());
@@ -732,6 +785,8 @@ void Migrator::export_frozen(CDir *dir)
   assert(dir->is_frozen());
   assert(dir->get_cum_auth_pins() == 0);
 
+  export_freeze_finish(dir);
+
   int dest = export_peer[dir];
   CInode *diri = dir->inode;
 
index afe2e6cd65a097bd1440b62a372d927a4528cc57..033e6eb00be132773367af9b8960e6f4225b3cce 100644 (file)
@@ -91,6 +91,15 @@ protected:
   
   list< pair<dirfrag_t,int> >  export_queue;
 
+  // for deadlock detection: bookkeeping for exports whose tree freeze has not finished yet
+  struct freezing_state_t {
+    utime_t start_time;        // when export_dir() started the freeze
+    int num_waiters;           // number of remote authpin waiters
+    freezing_state_t() : num_waiters(0) {}
+  };
+  map<CDir*,freezing_state_t > export_freezing_state;  // per-dir freeze record
+  set<pair<utime_t,CDir*> >    export_freezing_dirs;   // (start_time, dir) pairs, scanned from the front by find_stale_export_freeze()
+
   // -- imports --
 public:
   const static int IMPORT_DISCOVERING   = 1; // waiting for prep
@@ -182,6 +191,13 @@ public:
     assert(export_state[dir] == EXPORT_NOTIFYING);
     return (export_notify_ack_waiting[dir].count(who) == 0);
   }
+
+  void export_freeze_inc_num_waiters(CDir *dir) {  // note one more remote authpin request waiting on dir's freeze
+    assert(is_exporting(dir));
+    export_freezing_state[dir].num_waiters++;  // read by find_stale_export_freeze() when deciding whether to cancel
+  }
+  void find_stale_export_freeze();
+
   // -- misc --
   void handle_mds_failure_or_stop(int who);
 
@@ -227,6 +243,7 @@ public:
   void handle_export_prep_ack(MExportDirPrepAck *m);
   void export_go(CDir *dir);
   void export_go_synced(CDir *dir);
+  void export_try_cancel(CDir *dir);
   void export_reverse(CDir *dir);
   void export_notify_abort(CDir *dir, set<CDir*>& bounds);
   void handle_export_ack(MExportDirAck *m);
@@ -237,6 +254,11 @@ public:
 
   void handle_export_caps_ack(MExportCapsAck *m);
 
+  void export_freeze_finish(CDir *dir) {  // forget dir's freeze-tracking record (called from export_frozen() and export_try_cancel())
+    utime_t start = export_freezing_state[dir].start_time;  // needed to rebuild the (start_time, dir) set key
+    export_freezing_dirs.erase(make_pair(start, dir));
+    export_freezing_state.erase(dir);
+  }
 
   friend class C_MDC_ExportFreeze;
   friend class C_MDS_ExportFinishLogged;
index 904d54f89bc9c8091c295aa0d2f0caba8d730f29..ee9aae137f05c9aef1091226e0a9c5727ab676c7 100644 (file)
@@ -1612,6 +1612,22 @@ void Server::handle_slave_auth_pin(MDRequest *mdr)
        dout(10) << " waiting for authpinnable on " << **p << dendl;
        (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
        mdr->drop_local_auth_pins();
+
+       CDir *dir = NULL;
+       if (CInode *in = dynamic_cast<CInode*>(*p)) {
+         if (!in->is_root())
+           dir = in->get_parent_dir();
+       } else if (CDentry *dn = dynamic_cast<CDentry*>(*p)) {
+         dir = dn->get_dir();
+       } else {
+         assert(0);
+       }
+       if (dir && dir->is_freezing_tree()) {
+         while (!dir->is_freezing_tree_root())
+           dir = dir->get_parent_dir();
+         mdcache->migrator->export_freeze_inc_num_waiters(dir);
+       }
+
        return;
       }
     }