From: Yan, Zheng Date: Thu, 16 Jan 2014 00:15:08 +0000 (+0800) Subject: mds: freeze dir deadlock detection X-Git-Tag: v0.78~165^2~15 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=9df6861b31bbdb7f2076a9588a7deb5d83a74acc;p=ceph.git mds: freeze dir deadlock detection freezing dir and freezing tree have the same deadlock cases. This patch adds freeze dir deadlock detection, which imitates commit ab93aa59 (mds: freeze tree deadlock detection) Signed-off-by: Yan, Zheng --- diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 215caa02e02f..30713d1cb47d 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -11057,13 +11057,12 @@ void MDCache::adjust_dir_fragments(CInode *diri, class C_MDC_FragmentFrozen : public Context { MDCache *mdcache; - list dirs; - frag_t basefrag; - int by; + dirfrag_t basedirfrag; public: - C_MDC_FragmentFrozen(MDCache *m, list d, frag_t bf, int b) : mdcache(m), dirs(d), basefrag(bf), by(b) {} + C_MDC_FragmentFrozen(MDCache *m, dirfrag_t df) : + mdcache(m), basedirfrag(df) {} virtual void finish(int r) { - mdcache->fragment_frozen(dirs, basefrag, by); + mdcache->fragment_frozen(basedirfrag, r); } }; @@ -11115,8 +11114,13 @@ void MDCache::split_dir(CDir *dir, int bits) if (!can_fragment(diri, dirs)) return; - C_GatherBuilder gather(g_ceph_context, - new C_MDC_FragmentFrozen(this, dirs, dir->get_frag(), bits)); + assert(fragments.count(dir->dirfrag()) == 0); + fragment_info_t& info = fragments[dir->dirfrag()]; + info.dirs.push_back(dir); + info.bits = bits; + info.last_cum_auth_pins_change = ceph_clock_now(g_ceph_context); + + C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentFrozen(this, dir->dirfrag())); fragment_freeze_dirs(dirs, gather); gather.activate(); @@ -11146,8 +11150,15 @@ void MDCache::merge_dir(CInode *diri, frag_t frag) int bits = first->get_frag().bits() - frag.bits(); dout(10) << " we are merginb by " << bits << " bits" << dendl; + dirfrag_t df(diri->ino(), frag); + 
assert(fragments.count(df) == 0); + fragment_info_t& info = fragments[df]; + info.dirs = dirs; + info.bits = -bits; + info.last_cum_auth_pins_change = ceph_clock_now(g_ceph_context); + C_GatherBuilder gather(g_ceph_context, - new C_MDC_FragmentFrozen(this, dirs, frag, -bits)); + new C_MDC_FragmentFrozen(this, dirfrag_t(diri->ino(), frag))); fragment_freeze_dirs(dirs, gather); gather.activate(); @@ -11246,6 +11257,70 @@ void MDCache::fragment_unmark_unfreeze_dirs(list& dirs) } } +void MDCache::fragment_freeze_inc_num_waiters(CDir *dir) +{ + map::iterator p; + for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); + p != fragments.end() && p->first.ino == dir->ino(); + ++p) { + if (p->first.frag.contains(dir->get_frag())) { + p->second.num_remote_waiters++; + return; + } + } + assert(0); +} + +void MDCache::find_stale_fragment_freeze() +{ + dout(10) << "find_stale_fragment_freeze" << dendl; + // see comment in Migrator::find_stale_export_freeze() + utime_t now = ceph_clock_now(g_ceph_context); + utime_t cutoff = now; + cutoff -= g_conf->mds_freeze_tree_timeout; + + for (map::iterator p = fragments.begin(); + p != fragments.end(); ) { + dirfrag_t df = p->first; + fragment_info_t& info = p->second; + ++p; + if (info.dirs_frozen) + continue; + CDir *dir; + int total_auth_pins = 0; + for (list::iterator q = info.dirs.begin(); + q != info.dirs.end(); + ++q) { + dir = *q; + if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { + total_auth_pins = -1; + break; + } + if (dir->is_frozen_dir()) + continue; + total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins(); + } + if (total_auth_pins < 0) + continue; + if (info.last_cum_auth_pins != total_auth_pins) { + info.last_cum_auth_pins = total_auth_pins; + info.last_cum_auth_pins_change = now; + continue; + } + if (info.last_cum_auth_pins_change >= cutoff) + continue; + dir = info.dirs.front(); + if (info.num_remote_waiters > 0 || + (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) { + dout(10) << " 
cancel fragmenting " << df << " bit " << info.bits << dendl; + list dirs; + dirs.swap(info.dirs); + fragments.erase(df); + fragment_unmark_unfreeze_dirs(dirs); + } + } +} + class C_MDC_FragmentPrep : public Context { MDCache *mdcache; MDRequest *mdr; @@ -11271,8 +11346,8 @@ class C_MDC_FragmentCommit : public Context { dirfrag_t basedirfrag; list resultfrags; public: - C_MDC_FragmentCommit(MDCache *m, inodeno_t ino, frag_t f, list& l) : - mdcache(m), basedirfrag(ino, f) { + C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list& l) : + mdcache(m), basedirfrag(df) { resultfrags.swap(l); } virtual void finish(int r) { @@ -11295,36 +11370,37 @@ public: } }; -void MDCache::fragment_frozen(list& dirs, frag_t basefrag, int bits) +void MDCache::fragment_frozen(dirfrag_t basedirfrag, int r) { - dout(10) << "fragment_frozen " << dirs << " " << basefrag << " by " << bits - << " on " << dirs.front()->get_inode() << dendl; + map::iterator it = fragments.find(basedirfrag); + if (r < 0) { + dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl; + assert(it == fragments.end()); + return; + } + assert(it != fragments.end()); + fragment_info_t& info = it->second; - if (bits > 0) - assert(dirs.size() == 1); - else if (bits < 0) - assert(dirs.size() > 1); - else - assert(0); + dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits + << " on " << info.dirs.front()->get_inode() << dendl; - MDRequest *mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR); - fragment_info_t &info = fragment_requests[mdr->reqid]; - info.basefrag = basefrag; - info.bits = bits; - info.dirs = dirs; + info.dirs_frozen = true; + MDRequest *mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR); + mdr->more()->fragment_base = basedirfrag; dispatch_fragment_dir(mdr); } void MDCache::dispatch_fragment_dir(MDRequest *mdr) { - map::iterator it = fragment_requests.find(mdr->reqid); - assert(it != fragment_requests.end()); - fragment_info_t &info = it->second; + dirfrag_t 
basedirfrag = mdr->more()->fragment_base; + map::iterator it = fragments.find(basedirfrag); + assert(it != fragments.end()); + fragment_info_t& info = it->second; CInode *diri = info.dirs.front()->get_inode(); - dout(10) << "dispatch_fragment_dir " << info.resultfrags << " " - << info.basefrag << " bits " << info.bits << " on " << *diri << dendl; + dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits + << " on " << *diri << dendl; // avoid freeze dir deadlock if (!mdr->is_auth_pinned(diri)) { @@ -11336,7 +11412,7 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr) else mds->balancer->queue_merge(info.dirs.front()); fragment_unmark_unfreeze_dirs(info.dirs); - fragment_requests.erase(mdr->reqid); + fragments.erase(it); request_finish(mdr); return; } @@ -11352,8 +11428,8 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr) return; mdr->ls = mds->mdlog->get_current_segment(); - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, diri->ino(), - info.basefrag, info.bits); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, + basedirfrag.ino, basedirfrag.frag, info.bits); mds->mdlog->start_entry(le); for (list::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) { @@ -11365,7 +11441,7 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr) // refragment list waiters; - adjust_dir_fragments(diri, info.dirs, info.basefrag, info.bits, + adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits, info.resultfrags, waiters, false); if (g_conf->mds_debug_frag) diri->verify_dirfrags(); @@ -11398,20 +11474,21 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr) mut->add_updated_lock(&diri->nestlock); */ - add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, le->orig_frags, mdr->ls); + add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls); mds->mdlog->submit_entry(le, new C_MDC_FragmentPrep(this, mdr)); mds->mdlog->flush(); } void MDCache::_fragment_logged(MDRequest 
*mdr) { - map::iterator it = fragment_requests.find(mdr->reqid); - assert(it != fragment_requests.end()); + dirfrag_t basedirfrag = mdr->more()->fragment_base; + map::iterator it = fragments.find(basedirfrag); + assert(it != fragments.end()); fragment_info_t &info = it->second; CInode *diri = info.resultfrags.front()->get_inode(); - dout(10) << "fragment_logged " << info.resultfrags << " " << info.basefrag - << " bits " << info.bits << " on " << *diri << dendl; + dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits + << " on " << *diri << dendl; // store resulting frags C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr)); @@ -11433,13 +11510,14 @@ void MDCache::_fragment_logged(MDRequest *mdr) void MDCache::_fragment_stored(MDRequest *mdr) { - map::iterator it = fragment_requests.find(mdr->reqid); - assert(it != fragment_requests.end()); + dirfrag_t basedirfrag = mdr->more()->fragment_base; + map::iterator it = fragments.find(basedirfrag); + assert(it != fragments.end()); fragment_info_t &info = it->second; CInode *diri = info.resultfrags.front()->get_inode(); - dout(10) << "fragment_stored " << info.resultfrags << " " << info.basefrag - << " bits " << info.bits << " on " << *diri << dendl; + dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits + << " on " << *diri << dendl; // tell peers CDir *first = *info.resultfrags.begin(); @@ -11451,7 +11529,7 @@ void MDCache::_fragment_stored(MDRequest *mdr) rejoin_gather.count(p->first))) continue; - MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), info.basefrag, info.bits); + MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag.ino, basedirfrag.frag, info.bits); // freshly replicate new dirs to peers for (list::iterator q = info.resultfrags.begin(); @@ -11487,11 +11565,11 @@ void MDCache::_fragment_stored(MDRequest *mdr) // journal commit EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, - diri->ino(), info.basefrag, 
info.bits); - mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, diri->ino(), info.basefrag, + basedirfrag, info.bits); + mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, info.resultfrags)); - fragment_requests.erase(it); + fragments.erase(it); request_finish(mdr); } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 34b9e8c9ca3d..1c2f8487ec32 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -950,12 +950,18 @@ private: map uncommitted_fragments; struct fragment_info_t { - frag_t basefrag; int bits; list dirs; list resultfrags; + MDRequest *mdr; + // for deadlock detection; read by find_stale_fragment_freeze(), so every field must start initialized + bool dirs_frozen; + utime_t last_cum_auth_pins_change; + int last_cum_auth_pins; + int num_remote_waiters; // number of remote authpin waiters + fragment_info_t() : bits(0), mdr(NULL), dirs_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {} }; - map fragment_requests; + map fragments; void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, list& frags, list& waiters, bool replay); @@ -971,7 +977,7 @@ private: bool can_fragment(CInode *diri, list& dirs); void fragment_freeze_dirs(list& dirs, C_GatherBuilder &gather); void fragment_mark_and_complete(list& dirs); - void fragment_frozen(list& dirs, frag_t basefrag, int bits); + void fragment_frozen(dirfrag_t basedirfrag, int r); void fragment_unmark_unfreeze_dirs(list& dirs); void dispatch_fragment_dir(MDRequest *mdr); void _fragment_logged(MDRequest *mdr); @@ -1002,6 +1008,9 @@ public: void merge_dir(CInode *diri, frag_t fg); void rollback_uncommitted_fragments(); + void find_stale_fragment_freeze(); + void fragment_freeze_inc_num_waiters(CDir *dir); + // -- updates -- //int send_inode_updates(CInode *in); //void handle_inode_update(MInodeUpdate *m); diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc index 1f9fc9154a1a..7ff1110f247f 100644 --- a/src/mds/MDS.cc +++ b/src/mds/MDS.cc @@ -607,6 +607,7 @@ void MDS::tick() if (is_active()) { balancer->tick(); + mdcache->find_stale_fragment_freeze(); 
mdcache->migrator->find_stale_export_freeze(); if (snapserver) snapserver->check_osd_map(false); diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index bd0901490321..10ed53d28f91 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -237,8 +237,9 @@ struct MDRequest : public Mutation { list waiting_for_finish; - // export + // export & fragment CDir* export_dir; + dirfrag_t fragment_base; More() : srcdn_auth_mds(-1), diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 7c6e3bdf4b00..a6fa9f0eeb3c 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1659,12 +1659,15 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) } else { assert(0); } - if (dir && dir->is_freezing_tree()) { - while (!dir->is_freezing_tree_root()) - dir = dir->get_parent_dir(); - mdcache->migrator->export_freeze_inc_num_waiters(dir); + if (dir) { + if (dir->is_freezing_dir()) + mdcache->fragment_freeze_inc_num_waiters(dir); + if (dir->is_freezing_tree()) { + while (!dir->is_freezing_tree_root()) + dir = dir->get_parent_dir(); + mdcache->migrator->export_freeze_inc_num_waiters(dir); + } } - return; } }