From: Yan, Zheng Date: Wed, 23 Oct 2013 01:15:58 +0000 (+0800) Subject: mds: freeze tree deadlock detection. X-Git-Tag: v0.75~93^2~36 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=ab93aa59bfa8459b5c060870613b2d64e3090264;p=ceph.git mds: freeze tree deadlock detection. There are two situations that result in a freeze tree deadlock. Situation 1: - mds.0 authpins an item in subtree A - mds.0 sends request to mds.1 to authpin an item in subtree B - mds.0 freezes subtree A - mds.1 authpins an item in subtree B - mds.1 sends request to mds.0 to authpin an item in subtree A - mds.1 freezes subtree B - mds.1 receives the remote authpin request from mds.0 (wait because subtree B is freezing) - mds.0 receives the remote authpin request from mds.1 (wait because subtree A is freezing) Situation 2: - client request authpins items in subtree B - freeze subtree B - import subtree A, which is the parent of subtree B (authpins parent inode of subtree B, see CDir::set_dir_auth()) - freeze subtree A - client request tries authpinning items in subtree A (wait because subtree A is freezing) Enforcing an authpinning order can avoid the deadlock, but it's very expensive. The deadlock is rare, so I think deadlock detection is more suitable for this case. This patch introduces freeze tree deadlock detection. We record the start time of freezing a tree. If we fail to freeze the tree within a given duration, we cancel the process of freezing the tree. 
Signed-off-by: Yan, Zheng --- diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 9a9509e88352..a518d5cff46b 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -291,6 +291,7 @@ OPTION(mds_beacon_grace, OPT_FLOAT, 15) OPTION(mds_enforce_unique_name, OPT_BOOL, true) OPTION(mds_blacklist_interval, OPT_FLOAT, 24.0*60.0) // how long to blacklist failed nodes OPTION(mds_session_timeout, OPT_FLOAT, 60) // cap bits and leases time out if client idle +OPTION(mds_freeze_tree_timeout, OPT_FLOAT, 30) // how long an export's tree freeze may take before it is treated as possibly deadlocked and canceled OPTION(mds_session_autoclose, OPT_FLOAT, 300) // autoclose idle session OPTION(mds_reconnect_timeout, OPT_FLOAT, 45) // seconds to wait for clients during mds restart // make it (mds_session_timeout - mds_beacon_grace) diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 83722274981d..14080789000e 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -588,6 +588,7 @@ void MDS::tick() if (is_active()) { balancer->tick(); + mdcache->migrator->find_stale_export_freeze(); if (snapserver) snapserver->check_osd_map(false); } diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 0647448c40c7..69ab14b76707 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -182,8 +182,148 @@ void Migrator::export_empty_import(CDir *dir) export_dir( dir, dest ); } +void Migrator::find_stale_export_freeze() +{ + utime_t now = ceph_clock_now(g_ceph_context); + utime_t cutoff = now; + cutoff -= g_conf->mds_freeze_tree_timeout; + + + /* + * We could have situations like: + * + * - mds.0 authpins an item in subtree A + * - mds.0 sends request to mds.1 to authpin an item in subtree B + * - mds.0 freezes subtree A + * - mds.1 authpins an item in subtree B + * - mds.1 sends request to mds.0 to authpin an item in subtree A + * - mds.1 freezes subtree B + * - mds.1 receives the remote authpin request from mds.0 + * (wait because subtree B is freezing) + * - mds.0 receives the remote authpin request from mds.1 + * 
(wait because subtree A is freezing) + * + * + * - client request authpins items in subtree B + * - freeze subtree B + * - import subtree A which is parent of subtree B + * (authpins parent inode of subtree B, see CDir::set_dir_auth()) + * - freeze subtree A + * - client request tries authpinning items in subtree A + * (wait because subtree A is freezing) + */ + for (set >::iterator p = export_freezing_dirs.begin(); + p != export_freezing_dirs.end(); ) { + if (p->first >= cutoff) + break; + CDir *dir = p->second; + ++p; + if (export_freezing_state[dir].num_waiters > 0 || + (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) { + assert(get_export_state(dir) == EXPORT_DISCOVERING || + get_export_state(dir) == EXPORT_FREEZING); + export_try_cancel(dir); + } + } +} + +void Migrator::export_try_cancel(CDir *dir) +{ + int state = export_state[dir]; + switch (state) { + case EXPORT_DISCOVERING: + dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl; + dir->unfreeze_tree(); // cancel the freeze + dir->auth_unpin(this); + export_state.erase(dir); // clean up + export_unlock(dir); + export_locks.erase(dir); + export_freeze_finish(dir); + dir->state_clear(CDir::STATE_EXPORTING); + if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them. + mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]); + break; + + case EXPORT_FREEZING: + dout(10) << "export state=freezing : canceling freeze" << dendl; + dir->unfreeze_tree(); // cancel the freeze + export_state.erase(dir); // clean up + export_freeze_finish(dir); + dir->state_clear(CDir::STATE_EXPORTING); + if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them. 
+ mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]); + break; + + // NOTE: state order reversal, warning comes after prepping + case EXPORT_WARNING: + dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl; + // fall-thru + + case EXPORT_PREPPING: + if (state != EXPORT_WARNING) + dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl; + { + // unpin bounds + set bounds; + cache->get_subtree_bounds(dir, bounds); + for (set::iterator q = bounds.begin(); + q != bounds.end(); + ++q) { + CDir *bd = *q; + bd->put(CDir::PIN_EXPORTBOUND); + bd->state_clear(CDir::STATE_EXPORTBOUND); + } + // notify bystanders + if (state == EXPORT_WARNING) + export_notify_abort(dir, bounds); + } + dir->unfreeze_tree(); + export_state.erase(dir); // clean up + cache->adjust_subtree_auth(dir, mds->get_nodeid()); + cache->try_subtree_merge(dir); // NOTE: this may journal subtree_map as side effect + export_unlock(dir); + export_locks.erase(dir); + dir->state_clear(CDir::STATE_EXPORTING); + if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them. + mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]); + break; + + case EXPORT_EXPORTING: + dout(10) << "export state=exporting : reversing, and unfreezing" << dendl; + export_reverse(dir); + export_state.erase(dir); // clean up + export_locks.erase(dir); + dir->state_clear(CDir::STATE_EXPORTING); + break; + + case EXPORT_LOGGINGFINISH: + case EXPORT_NOTIFYING: + dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl; + // leave export_state, don't clean up now. + break; + + default: + assert(0); + } + + // finish clean-up? 
+ if (export_state.count(dir) == 0) { + export_peer.erase(dir); + export_warning_ack_waiting.erase(dir); + export_notify_ack_waiting.erase(dir); + // wake up any waiters + mds->queue_waiters(export_finish_waiters[dir]); + export_finish_waiters.erase(dir); + + // send pending import_maps? (these need to go out when all exports have finished.) + cache->maybe_send_pending_resolves(); + cache->show_subtrees(); + + maybe_do_queued_export(); + } +} // ========================================================== // mds failure handling @@ -228,98 +368,7 @@ void Migrator::handle_mds_failure_or_stop(int who) // the guy i'm exporting to failed, or we're just freezing. dout(10) << "cleaning up export state (" << p->second << ")" << get_export_statename(p->second) << " of " << *dir << dendl; - - switch (p->second) { - case EXPORT_DISCOVERING: - dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl; - dir->unfreeze_tree(); // cancel the freeze - dir->auth_unpin(this); - export_state.erase(dir); // clean up - export_unlock(dir); - export_locks.erase(dir); - dir->state_clear(CDir::STATE_EXPORTING); - if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]); - break; - - case EXPORT_FREEZING: - dout(10) << "export state=freezing : canceling freeze" << dendl; - dir->unfreeze_tree(); // cancel the freeze - export_state.erase(dir); // clean up - dir->state_clear(CDir::STATE_EXPORTING); - if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them. 
- mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]); - break; - - // NOTE: state order reversal, warning comes after prepping - case EXPORT_WARNING: - dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl; - // fall-thru - - case EXPORT_PREPPING: - if (p->second != EXPORT_WARNING) - dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl; - { - // unpin bounds - set bounds; - cache->get_subtree_bounds(dir, bounds); - for (set::iterator q = bounds.begin(); - q != bounds.end(); - ++q) { - CDir *bd = *q; - bd->put(CDir::PIN_EXPORTBOUND); - bd->state_clear(CDir::STATE_EXPORTBOUND); - } - // notify bystanders - if (p->second == EXPORT_WARNING) - export_notify_abort(dir, bounds); - } - dir->unfreeze_tree(); - export_state.erase(dir); // clean up - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - cache->try_subtree_merge(dir); // NOTE: this may journal subtree_map as side effect - export_unlock(dir); - export_locks.erase(dir); - dir->state_clear(CDir::STATE_EXPORTING); - if (mds->mdsmap->is_clientreplay_or_active_or_stopping(export_peer[dir])) // tell them. - mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir]); - break; - - case EXPORT_EXPORTING: - dout(10) << "export state=exporting : reversing, and unfreezing" << dendl; - export_reverse(dir); - export_state.erase(dir); // clean up - export_locks.erase(dir); - dir->state_clear(CDir::STATE_EXPORTING); - break; - - case EXPORT_LOGGINGFINISH: - case EXPORT_NOTIFYING: - dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl; - // leave export_state, don't clean up now. - break; - - default: - assert(0); - } - - // finish clean-up? 
- if (export_state.count(dir) == 0) { - export_peer.erase(dir); - export_warning_ack_waiting.erase(dir); - export_notify_ack_waiting.erase(dir); - - // wake up any waiters - mds->queue_waiters(export_finish_waiters[dir]); - export_finish_waiters.erase(dir); - - // send pending import_maps? (these need to go out when all exports have finished.) - cache->maybe_send_pending_resolves(); - - cache->show_subtrees(); - - maybe_do_queued_export(); - } + export_try_cancel(dir); } else { // bystander failed. if (export_warning_ack_waiting.count(dir) && @@ -688,6 +737,10 @@ void Migrator::export_dir(CDir *dir, int dest) assert(g_conf->mds_kill_export_at != 2); // start the freeze, but hold it up with an auth_pin. + utime_t now = ceph_clock_now(g_ceph_context); + export_freezing_dirs.insert(make_pair(now, dir)); + export_freezing_state[dir].start_time = now; + dir->auth_pin(this); dir->freeze_tree(); assert(dir->is_freezing_tree()); @@ -732,6 +785,8 @@ void Migrator::export_frozen(CDir *dir) assert(dir->is_frozen()); assert(dir->get_cum_auth_pins() == 0); + export_freeze_finish(dir); + int dest = export_peer[dir]; CInode *diri = dir->inode; diff --git a/src/mds/Migrator.h b/src/mds/Migrator.h index afe2e6cd65a0..033e6eb00be1 100644 --- a/src/mds/Migrator.h +++ b/src/mds/Migrator.h @@ -91,6 +91,15 @@ protected: list< pair > export_queue; + // for deadlock detection + struct freezing_state_t { + utime_t start_time; + int num_waiters; // number of remote authpin waiters + freezing_state_t() : num_waiters(0) {} + }; + map export_freezing_state; + set > export_freezing_dirs; + // -- imports -- public: const static int IMPORT_DISCOVERING = 1; // waiting for prep @@ -182,6 +191,13 @@ public: assert(export_state[dir] == EXPORT_NOTIFYING); return (export_notify_ack_waiting[dir].count(who) == 0); } + + void export_freeze_inc_num_waiters(CDir *dir) { + assert(is_exporting(dir)); + export_freezing_state[dir].num_waiters++; + } + void find_stale_export_freeze(); + // -- misc -- void 
handle_mds_failure_or_stop(int who); @@ -227,6 +243,7 @@ public: void handle_export_prep_ack(MExportDirPrepAck *m); void export_go(CDir *dir); void export_go_synced(CDir *dir); + void export_try_cancel(CDir *dir); void export_reverse(CDir *dir); void export_notify_abort(CDir *dir, set& bounds); void handle_export_ack(MExportDirAck *m); @@ -237,6 +254,11 @@ public: void handle_export_caps_ack(MExportCapsAck *m); + void export_freeze_finish(CDir *dir) { + utime_t start = export_freezing_state[dir].start_time; + export_freezing_dirs.erase(make_pair(start, dir)); + export_freezing_state.erase(dir); + } friend class C_MDC_ExportFreeze; friend class C_MDS_ExportFinishLogged; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 904d54f89bc9..ee9aae137f05 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1612,6 +1612,22 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) dout(10) << " waiting for authpinnable on " << **p << dendl; (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); mdr->drop_local_auth_pins(); + + CDir *dir = NULL; + if (CInode *in = dynamic_cast(*p)) { + if (!in->is_root()) + dir = in->get_parent_dir(); + } else if (CDentry *dn = dynamic_cast(*p)) { + dir = dn->get_dir(); + } else { + assert(0); + } + if (dir && dir->is_freezing_tree()) { + while (!dir->is_freezing_tree_root()) + dir = dir->get_parent_dir(); + mdcache->migrator->export_freeze_inc_num_waiters(dir); + } + return; } }