From: Sage Weil Date: Tue, 10 Jun 2008 23:48:16 +0000 (-0700) Subject: mds: some initial dirfrag cleanups X-Git-Tag: v0.3~134 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=c58ea2ee84eb218e059925d627976399397dae2c;p=ceph.git mds: some initial dirfrag cleanups --- diff --git a/src/TODO b/src/TODO index 7b936be8e90..fd777c55ef7 100644 --- a/src/TODO +++ b/src/TODO @@ -62,6 +62,8 @@ mds - dir frags - fix replay (dont want dir frozen, pins, etc.?) - fix accounting + - block while acquiring the dft dirlock, since we may need to request a scatter? + - make sure non-dirty dirs journal/replay properly - proper handling of cache expire messages during rejoin phase? -> i think cache expires are fine; the rejoin_ack handler just has to behave if rejoining items go missing diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index c61205d5c4f..b23fbdc8a7e 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -692,6 +692,10 @@ void CInode::clear_dirty_scattered(int type) xlist_dirty_dirfrag_dir.remove_myself(); break; + case CEPH_LOCK_IDFT: + xlist_dirty_dirfrag_dirfragtree.remove_myself(); + break; + default: assert(0); } diff --git a/src/mds/CInode.h b/src/mds/CInode.h index e6bc6143d58..7a15fb27b74 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -222,6 +222,7 @@ protected: public: xlist::item xlist_open_file; xlist::item xlist_dirty_dirfrag_dir; + xlist::item xlist_dirty_dirfrag_dirfragtree; xlist::item xlist_purging_inode; private: @@ -258,6 +259,7 @@ private: replica_caps_wanted(0), xlist_dirty(this), xlist_open_file(this), xlist_dirty_dirfrag_dir(this), + xlist_dirty_dirfrag_dirfragtree(this), xlist_purging_inode(this), auth_pins(0), nested_auth_pins(0), nested_anchors(0), diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index d7f98b5ff92..d45c5e61c24 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2034,8 +2034,10 @@ bool Locker::scatter_wrlock_try(ScatterLock *lock, Mutation *mut, bool initiate) // can wrlock? if (lock->can_wrlock()) { lock->get_wrlock(); - mut->wrlocks.insert(lock); - mut->locks.insert(lock); + if (mut) { + mut->wrlocks.insert(lock); + mut->locks.insert(lock); + } return true; } diff --git a/src/mds/Locker.h b/src/mds/Locker.h index 931580773d6..9c0be82858f 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -141,9 +141,13 @@ protected: void scatter_tempsync(ScatterLock *lock); bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mut); void scatter_rdlock_finish(ScatterLock *lock, Mutation *mut); +public: bool scatter_wrlock_try(ScatterLock *lock, Mutation *mut, bool initiate); +protected: bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mut); +public: void scatter_wrlock_finish(ScatterLock *lock, Mutation *mut); +protected: bool scatter_xlock_start(ScatterLock *lock, MDRequest *mut); void scatter_xlock_finish(ScatterLock *lock, Mutation *mut); diff --git a/src/mds/LogSegment.h b/src/mds/LogSegment.h index 106973efe8e..0619a13f810 100644 --- a/src/mds/LogSegment.h +++ b/src/mds/LogSegment.h @@ -42,6 +42,7 @@ class LogSegment { xlist open_files; xlist dirty_dirfrag_dir; + xlist dirty_dirfrag_dirfragtree; xlist slave_updates; diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 6cf7187969b..fd9d4982192 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6203,16 +6203,15 @@ void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, dout(10) << "adjust_dir_fragments " << basefrag << " " << bits << " on " << *diri << dendl; + // adjust fragtree // yuck. we may have discovered the inode while it was being fragmented. if (!diri->dirfragtree.is_leaf(basefrag)) diri->dirfragtree.force_to_leaf(basefrag); - - CDir *base = diri->get_or_open_dirfrag(this, basefrag); - - // adjust fragtree diri->dirfragtree.split(basefrag, bits); dout(10) << " new fragtree is " << diri->dirfragtree << dendl; + CDir *base = diri->get_or_open_dirfrag(this, basefrag); + if (bits > 0) { if (base) { CDir *baseparent = base->get_parent_dir(); @@ -6221,17 +6220,21 @@ void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, // did i change the subtree map? if (base->is_subtree_root()) { - // am i a bound? + // new frags are now separate subtrees + for (list::iterator p = resultfrags.begin(); + p != resultfrags.end(); + ++p) + subtrees[*p].clear(); // new frag is now its own subtree + + // was i a bound? if (baseparent) { CDir *parent = get_subtree_root(baseparent); assert(subtrees[parent].count(base)); subtrees[parent].erase(base); for (list::iterator p = resultfrags.begin(); p != resultfrags.end(); - ++p) { + ++p) subtrees[parent].insert(*p); - subtrees[*p].clear(); // new frag is now its own subtree - } } // adjust my bounds. @@ -6298,6 +6301,12 @@ void MDCache::split_dir(CDir *dir, int bits) return; } + // wrlock + if (!mds->locker->scatter_wrlock_try(&dir->inode->dirfragtreelock, 0, false)) { + dout(7) << "can't wrlock dirfragtree on " << *dir->inode << dendl; + return; + } + list startfrags; startfrags.push_back(dir); @@ -6324,7 +6333,7 @@ void MDCache::fragment_freeze(CInode *diri, list& frags, frag_t basefrag, p != frags.end(); ++p) { CDir *dir = *p; - dir->auth_pin(); // this will block the freeze + dir->auth_pin(); // this will block the freeze, until mark_and_complete dir->freeze_dir(); assert(dir->is_freezing_dir()); dir->add_waiter(CDir::WAIT_FROZEN, gather->new_sub()); @@ -6429,20 +6438,16 @@ class C_MDC_FragmentLogged : public Context { frag_t basefrag; int bits; list resultfrags; - vector pvs; - LogSegment *ls; + Mutation *mut; public: C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b, - list& rf, vector& p, - LogSegment *s) : - mdcache(m), diri(di), basefrag(bf), bits(b), ls(s) { + list& rf, Mutation *mu) : + mdcache(m), diri(di), basefrag(bf), bits(b), mut(mu) { resultfrags.swap(rf); - pvs.swap(p); } virtual void finish(int r) { mdcache->fragment_logged(diri, basefrag, bits, - resultfrags, pvs, - ls); + resultfrags, mut); } }; @@ -6452,64 +6457,57 @@ void MDCache::fragment_stored(CInode *diri, frag_t basefrag, int bits, dout(10) << "fragment_stored " << basefrag << " by " << bits << " on " << *diri << dendl; + Mutation *mut = new Mutation; + mut->ls = mds->mdlog->get_current_segment(); EFragment *le = new EFragment(mds->mdlog, diri->ino(), basefrag, bits); + le->metablob.add_dir_context(*resultfrags.begin()); - set peers; - vector pvs; + mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); + mut->ls->dirty_dirfrag_dirfragtree.push_back(&diri->xlist_dirty_dirfrag_dirfragtree); + mut->add_updated_scatterlock(&diri->dirfragtreelock); + + // journal new dirfrag fragstats for each new fragment. + // mark complete. but not dirty. for (list::iterator p = resultfrags.begin(); p != resultfrags.end(); p++) { CDir *dir = *p; dout(10) << " result frag " << *dir << dendl; - - if (p == resultfrags.begin()) { - le->metablob.add_dir_context(dir); - // note peers - // only do this once: all frags have identical replica_maps. - if (peers.empty()) - for (map::iterator p = dir->replica_map.begin(); - p != dir->replica_map.end(); - ++p) - peers.insert(p->first); - } - - pvs.push_back(dir->pre_dirty()); - le->metablob.add_dir(dir, true); + le->metablob.add_dir(dir, true).mark_complete(); // mark complete } mds->mdlog->submit_entry(le, new C_MDC_FragmentLogged(this, diri, basefrag, bits, - resultfrags, pvs, mds->mdlog->get_current_segment())); + resultfrags, mut)); +} + +void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, + list& resultfrags, + Mutation *mut) +{ + dout(10) << "fragment_logged " << basefrag << " bits " << bits + << " on " << *diri << dendl; - // announcelist& resultfrags, - for (set::iterator p = peers.begin(); - p != peers.end(); - ++p) { + // tell peers + CDir *first = *resultfrags.begin(); + for (map::iterator p = first->replica_map.begin(); + p != first->replica_map.end(); + p++) { MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), basefrag, bits); if (bits < 0) { - // freshly replicate basedir to peer on merge + // freshly replicate new basedir to peer on merge CDir *base = resultfrags.front(); - CDirDiscover *basedis = base->replicate_to(*p); + CDirDiscover *basedis = base->replicate_to(p->first); basedis->encode(notify->basebl); delete basedis; } - mds->send_message_mds(notify, *p); - } + mds->send_message_mds(notify, p->first); + } -} + mut->apply(); // mark scatterlocks, mainly. -void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, - vector& pvs, - LogSegment *ls) -{ - dout(10) << "fragment_logged " << basefrag << " bits " << bits - << " on " << *diri << dendl; - - - // dirty resulting frags + // unfreeze resulting frags set peers; - vector::iterator pv = pvs.begin(); for (list::iterator p = resultfrags.begin(); p != resultfrags.end(); p++) { @@ -6518,8 +6516,6 @@ void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, // dirty, unpin, unfreeze dir->state_clear(CDir::STATE_FRAGMENTING); - dir->mark_dirty(*pv, ls); - pv++; for (CDir::map_t::iterator p = dir->items.begin(); p != dir->items.end(); @@ -6531,6 +6527,8 @@ void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, dir->unfreeze_dir(); } + + mds->locker->scatter_wrlock_finish(&diri->dirfragtreelock, 0); } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 9c474921418..51e993b19f6 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -800,7 +800,7 @@ private: void fragment_mark_and_complete(CInode *diri, list& startfrags, frag_t basefrag, int bits); void fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits); void fragment_stored(CInode *diri, frag_t basefrag, int bits, list& resultfrags); - void fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, vector& pvs, LogSegment *ls); + void fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, Mutation *mut); friend class C_MDC_FragmentGo; friend class C_MDC_FragmentMarking; friend class C_MDC_FragmentStored; diff --git a/src/mds/journal.cc b/src/mds/journal.cc index f5a992240d5..3f76b403b1e 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -115,13 +115,19 @@ C_Gather *LogSegment::try_to_expire(MDS *mds) mds->mdcache->wait_for_uncommitted_master(*p, gather->new_sub()); } - // dirty non-auth mtimes + // nudge scatterlocks for (xlist::iterator p = dirty_dirfrag_dir.begin(); !p.end(); ++p) { CInode *in = *p; dout(10) << "try_to_expire waiting for dirlock flush on " << *in << dendl; if (!gather) gather = new C_Gather; mds->locker->scatter_nudge(&in->dirlock, gather->new_sub()); } + for (xlist::iterator p = dirty_dirfrag_dirfragtree.begin(); !p.end(); ++p) { + CInode *in = *p; + dout(10) << "try_to_expire waiting for dirfragtreelock flush on " << *in << dendl; + if (!gather) gather = new C_Gather; + mds->locker->scatter_nudge(&in->dirfragtreelock, gather->new_sub()); + } // open files if (!open_files.empty()) {