From: Yan, Zheng
Date: Wed, 15 Jan 2014 08:49:23 +0000 (+0800)
Subject: mds: improve freeze tree deadlock detection
X-Git-Tag: v0.78~165^2~16
X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9a47913d2062aec65da36d2ace30afb03f79f63f;p=ceph.git

mds: improve freeze tree deadlock detection

Current code uses the start time of freezing tree to detect deadlock.
It is better to check how long the auth pin count of freezing tree
stays unchanged to decide if there is potential deadlock.

[review: find_stale_export_freeze() checks stat.state rather than
 p->second.state, because p has already been advanced past the current
 entry at that point and may equal export_state.end()]

Signed-off-by: Yan, Zheng
---

diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 7edffb93b1b7..50d9931a368c 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -210,16 +210,23 @@ void Migrator::find_stale_export_freeze()
    *   - client request tries authpinning items in subtree A
    *     (wait because subtree A is freezing)
    */
-  for (set<pair<utime_t,CDir*> >::iterator p = export_freezing_dirs.begin();
-       p != export_freezing_dirs.end(); ) {
-    if (p->first >= cutoff)
-      break;
-    CDir *dir = p->second;
+  for (map<CDir*,export_state_t>::iterator p = export_state.begin();
+       p != export_state.end(); ) {
+    CDir* dir = p->first;
+    export_state_t& stat = p->second;  // p is advanced next; use stat/dir from here on
     ++p;
-    if (export_freezing_state[dir].num_waiters > 0 ||
+    if (stat.state != EXPORT_DISCOVERING &&
+        stat.state != EXPORT_FREEZING)
+      continue;
+    if (stat.last_cum_auth_pins != dir->get_cum_auth_pins()) {
+      stat.last_cum_auth_pins = dir->get_cum_auth_pins();
+      stat.last_cum_auth_pins_change = now;
+      continue;
+    }
+    if (stat.last_cum_auth_pins_change >= cutoff)
+      continue;
+    if (stat.num_remote_waiters > 0 ||
         (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
-      assert(get_export_state(dir) == EXPORT_DISCOVERING ||
-             get_export_state(dir) == EXPORT_FREEZING);
       export_try_cancel(dir);
     }
   }
@@ -240,13 +247,11 @@ void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
     dir->auth_unpin(this);
     dir->state_clear(CDir::STATE_EXPORTING);
     break;
-
   case EXPORT_DISCOVERING:
     dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
     it->second.state = EXPORT_CANCELLED;
     dir->unfreeze_tree();  // cancel the freeze
     dir->auth_unpin(this);
-    export_freeze_finish(dir);
     dir->state_clear(CDir::STATE_EXPORTING);
     if (notify_peer && mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) // tell them.
       mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
@@ -256,7 +261,6 @@ void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
     dout(10) << "export state=freezing : canceling freeze" << dendl;
     it->second.state = EXPORT_CANCELLED;
     dir->unfreeze_tree();  // cancel the freeze
-    export_freeze_finish(dir);
     dir->state_clear(CDir::STATE_EXPORTING);
     if (notify_peer && mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) // tell them.
       mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
@@ -810,10 +814,9 @@ void Migrator::dispatch_export_dir(MDRequest *mdr)
   mds->send_message_mds(discover, it->second.peer);
   assert(g_conf->mds_kill_export_at != 2);
 
-  utime_t now = ceph_clock_now(g_ceph_context);
-  export_freezing_dirs.insert(make_pair(now, dir));
-  export_freezing_state[dir].start_time = now;
+  it->second.last_cum_auth_pins_change = ceph_clock_now(g_ceph_context);
 
+  // start the freeze, but hold it up with an auth_pin.
   dir->freeze_tree();
   assert(dir->is_freezing_tree());
   dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir));
@@ -894,8 +897,6 @@ void Migrator::export_frozen(CDir *dir)
   assert(it != export_state.end());
   assert(it->second.state == EXPORT_FREEZING);
 
-  export_freeze_finish(dir);
-
   CInode *diri = dir->get_inode();
 
   // ok, try to grab all my locks.
diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h
index eb09135df1ea..0bf35f08f35b 100644
--- a/src/mds/Migrator.h
+++ b/src/mds/Migrator.h
@@ -92,21 +92,17 @@ protected:
     map > peer_imported;
     list waiting_for_finish;
     Mutation *mut;
-    export_state_t() : mut(NULL) {}
+    // for freeze tree deadlock detection
+    utime_t last_cum_auth_pins_change;
+    int last_cum_auth_pins;
+    int num_remote_waiters; // number of remote authpin waiters
+    export_state_t() : mut(NULL), last_cum_auth_pins(0), num_remote_waiters(0) {}
   };
 
   map<CDir*,export_state_t> export_state;
 
   list > export_queue;
 
-  // for deadlock detection
-  struct freezing_state_t {
-    utime_t start_time;
-    int num_waiters; // number of remote authpin waiters
-    freezing_state_t() : num_waiters(0) {}
-  };
-  map<CDir*,freezing_state_t> export_freezing_state;
-  set<pair<utime_t,CDir*> > export_freezing_dirs;
 
   // -- imports --
 public:
@@ -118,6 +114,7 @@ public:
   const static int IMPORT_ACKING = 6; // logged EImportStart, sent ack, waiting for finish
   const static int IMPORT_FINISHING = 7; // sent cap imports, waiting for finish
   const static int IMPORT_ABORTING = 8; // notifying bystanders of an abort before unfreezing
+
   static const char *get_import_statename(int s) {
     switch (s) {
     case IMPORT_DISCOVERING: return "discovering";
@@ -216,8 +213,9 @@ public:
   }
 
   void export_freeze_inc_num_waiters(CDir *dir) {
-    assert(is_exporting(dir));
-    export_freezing_state[dir].num_waiters++;
+    map<CDir*,export_state_t>::iterator it = export_state.find(dir);
+    assert(it != export_state.end());
+    it->second.num_remote_waiters++;
   }
   void find_stale_export_freeze();
 
@@ -286,12 +284,6 @@ public:
   void handle_export_notify_ack(MExportDirNotifyAck *m);
   void export_finish(CDir *dir);
 
-  void export_freeze_finish(CDir *dir) {
-    utime_t start = export_freezing_state[dir].start_time;
-    export_freezing_dirs.erase(make_pair(start, dir));
-    export_freezing_state.erase(dir);
-  }
-
   friend class C_MDC_ExportFreeze;
   friend class C_MDS_ExportFinishLogged;
   friend class C_M_ExportGo;