From 1c6ffab1440ccf02b28f1b6f44a4bf676f5559c9 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 20 Jul 2007 23:42:58 +0000 Subject: [PATCH] more frag/split work git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1540 29311d96-e01e-0410-9327-a35deaab8ce9 --- trunk/ceph/TODO | 5 +- trunk/ceph/mds/CDir.cc | 152 +++++++++++++++++++++++++++++-------- trunk/ceph/mds/CDir.h | 45 +++++++---- trunk/ceph/mds/CInode.cc | 111 ++------------------------- trunk/ceph/mds/CInode.h | 4 +- trunk/ceph/mds/Migrator.cc | 143 +++++++++------------------------- trunk/ceph/mds/Migrator.h | 2 +- trunk/ceph/mds/journal.cc | 2 +- 8 files changed, 199 insertions(+), 265 deletions(-) diff --git a/trunk/ceph/TODO b/trunk/ceph/TODO index a4a61ea8da65f..9b380f43ae842 100644 --- a/trunk/ceph/TODO +++ b/trunk/ceph/TODO @@ -56,9 +56,8 @@ sage mds - the split/merge plan: -*** - should get_dirfrags(frag_t) return partial matches? bc we might have the two frags listed separately even tho they've merged.. +/ - fragset_t to describe bounds; we need to tolerate concurrent merge/splits - - fragset_t to describe bounds; we need to tolerate concurrent merge/splits / - fragtree_t / - get_leaves(fg, ls) needs to be smarter / - force_to_leaf() @@ -68,6 +67,8 @@ sage mds / - STICKY dir state and pin? make sure it's kept across import/export/fragment / - pull _bound maps out of Migrator; they are redundant (trust the subtree map!) + - handle_resolve needs to infer splits/merges + - auth journals and applies update in the request update pipeline - dirfragtree is lazily consistent. no lock. bcast by primary when it updates. 
diff --git a/trunk/ceph/mds/CDir.cc b/trunk/ceph/mds/CDir.cc index 8c1f83db9aa81..72ab9d7bcf74f 100644 --- a/trunk/ceph/mds/CDir.cc +++ b/trunk/ceph/mds/CDir.cc @@ -326,6 +326,29 @@ void CDir::unlink_inode( CDentry *dn ) //assert(nnull == null_items.size()); } +void CDir::try_remove_unlinked_dn(CDentry *dn) +{ + assert(dn->dir == this); + + if (dn->is_new() && dn->is_dirty() && + dn->get_num_ref() == 1) { + dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << endl; + dn->mark_clean(); + remove_dentry(dn); + + if (version == projected_version && + committing_version == committed_version && + num_dirty == 0) { + dout(10) << "try_remove_unlinked_dn committed_equivalent now " << version + << " vs committed " << committed_version + << endl; + committed_version_equivalent = committed_version; + } + } +} + + + void CDir::unlink_inode_work( CDentry *dn ) { CInode *in = dn->inode; @@ -356,6 +379,28 @@ void CDir::unlink_inode_work( CDentry *dn ) nitems--; // adjust dir size } +void CDir::remove_null_dentries() { + dout(12) << "remove_null_dentries " << *this << endl; + + list dns; + for (CDir_map_t::iterator it = items.begin(); + it != items.end(); + it++) { + if (it->second->is_null()) + dns.push_back(it->second); + } + + for (list::iterator it = dns.begin(); + it != dns.end(); + it++) { + CDentry *dn = *it; + remove_dentry(dn); + } + //assert(null_items.empty()); + assert(nnull == 0); + assert(nnull + nitems == items.size()); +} + void CDir::steal_dentry(CDentry *dn) { @@ -388,6 +433,7 @@ void CDir::purge_stolen(list& waiters) if (state_test(STATE_EXPORT)) put(PIN_EXPORT); if (state_test(STATE_IMPORTBOUND)) put(PIN_IMPORTBOUND); if (state_test(STATE_EXPORTBOUND)) put(PIN_EXPORTBOUND); + if (state_test(STATE_FROZENDIR)) put(PIN_FROZEN); if (auth_pins > 0) put(PIN_AUTHPIN); @@ -396,51 +442,91 @@ void CDir::purge_stolen(list& waiters) assert(get_num_ref() == 0); } +void CDir::init_fragment_pins() +{ + if (state_test(STATE_DIRTY)) get(PIN_DIRTY); + if 
(state_test(STATE_FROZENDIR)) get(PIN_FROZEN); + if (state_test(STATE_EXPORT)) get(PIN_EXPORT); + if (state_test(STATE_EXPORTBOUND)) get(PIN_EXPORTBOUND); + if (state_test(STATE_IMPORTBOUND)) get(PIN_IMPORTBOUND); + if (state_test(STATE_STICKY)) get(PIN_STICKY); +} -void CDir::remove_null_dentries() { - dout(12) << "remove_null_dentries " << *this << endl; +void CDir::split(int bits, list& subs, list& waiters) +{ + dout(10) << "split by " << bits << " bits" << endl; + + assert(is_complete()); - list dns; - for (CDir_map_t::iterator it = items.begin(); - it != items.end(); - it++) { - if (it->second->is_null()) - dns.push_back(it->second); + list frags; + frag.split(bits, frags); + + vector subfrags(1 << bits); + + // create subfrag dirs + for (list::iterator p = frags.begin(); p != frags.end(); ++p) { + CDir *f = new CDir(inode, *p, cache, true); + f->state_set(state & MASK_STATE_FRAGMENT_KEPT); + f->init_fragment_pins(); + f->set_version(get_version()); + f->replica_map = replica_map; + dout(10) << " subfrag " << *p << " " << *f << endl; + subfrags.push_back(f); + inode->add_dirfrag(f); } + assert(subfrags.size() == frags.size()); - for (list::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - remove_dentry(dn); + // repartition dentries + while (!items.empty()) { + map::iterator p = items.begin(); + + CDentry *dn = p->second; + frag_t subfrag = inode->pick_dirfrag(p->first); + int n = subfrag.value() >> frag.bits(); + dout(15) << " subfrag " << subfrag << " n=" << n << " for " << p->first << endl; + CDir *f = subfrags[n]; + f->steal_dentry(dn); } - //assert(null_items.empty()); - assert(nnull == 0); - assert(nnull + nitems == items.size()); -} + purge_stolen(waiters); + inode->close_dirfrag(frag); // self deletion, watch out. 
+} -void CDir::try_remove_unlinked_dn(CDentry *dn) +void CDir::merge(int bits, list& waiters) { - assert(dn->dir == this); + dout(10) << "merge by " << bits << " bits" << endl; + + list frags; + frag.split(bits, frags); - if (dn->is_new() && dn->is_dirty() && - dn->get_num_ref() == 1) { - dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << endl; - dn->mark_clean(); - remove_dentry(dn); + for (list::iterator p = frags.begin(); p != frags.end(); ++p) { + CDir *dir = inode->get_or_open_dirfrag(cache, *p); + assert(dir->is_complete()); + dout(10) << " subfrag " << *p << " " << *dir << endl; + + // steal dentries + while (!dir->items.empty()) + steal_dentry(dir->items.begin()->second); + + // merge replica map + for (map::iterator p = dir->replica_map.begin(); + p != dir->replica_map.end(); + ++p) + replica_map[p->first] = MAX(replica_map[p->first], p->second); + + // merge state + state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT); - if (version == projected_version && - committing_version == committed_version && - num_dirty == 0) { - dout(10) << "try_remove_unlinked_dn committed_equivalent now " << version - << " vs committed " << committed_version - << endl; - committed_version_equivalent = committed_version; - } + dir->purge_stolen(waiters); + inode->close_dirfrag(dir->get_frag()); } + + init_fragment_pins(); } - + + + + diff --git a/trunk/ceph/mds/CDir.h b/trunk/ceph/mds/CDir.h index fff398f5b8932..df3412040508f 100644 --- a/trunk/ceph/mds/CDir.h +++ b/trunk/ceph/mds/CDir.h @@ -109,22 +109,28 @@ class CDir : public MDSCacheObject { // these state bits are preserved by an import/export // ...except if the directory is hashed, in which case none of them are! 
static const unsigned MASK_STATE_EXPORTED = - STATE_COMPLETE|STATE_DIRTY; + (STATE_COMPLETE|STATE_DIRTY); static const unsigned MASK_STATE_IMPORT_KEPT = - //STATE_IMPORT| - STATE_EXPORT - |STATE_IMPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_STICKY; + (STATE_EXPORT + |STATE_IMPORTING + |STATE_IMPORTBOUND|STATE_EXPORTBOUND + |STATE_FROZENTREE + |STATE_STICKY); static const unsigned MASK_STATE_EXPORT_KEPT = - STATE_EXPORTING - |STATE_IMPORTBOUND|STATE_EXPORTBOUND - |STATE_FROZENTREE - |STATE_FROZENDIR - |STATE_EXPORT - |STATE_STICKY; - + (STATE_EXPORTING + |STATE_IMPORTBOUND|STATE_EXPORTBOUND + |STATE_FROZENTREE + |STATE_FROZENDIR + |STATE_EXPORT + |STATE_STICKY); + static const unsigned MASK_STATE_FRAGMENT_KEPT = + (STATE_DIRTY | + STATE_COMPLETE | + STATE_FROZENDIR | + STATE_EXPORT | + STATE_EXPORTBOUND | + STATE_IMPORTBOUND | + STATE_STICKY); // -- rep spec -- static const int REP_NONE = 0; @@ -230,7 +236,6 @@ protected: return num_dirty; } - void try_remove_unlinked_dn(CDentry *dn); // -- dentries and inodes -- public: @@ -248,12 +253,20 @@ protected: void link_inode( CDentry *dn, inodeno_t ino ); void link_inode( CDentry *dn, CInode *in ); void unlink_inode( CDentry *dn ); + void try_remove_unlinked_dn(CDentry *dn); private: void link_inode_work( CDentry *dn, CInode *in ); void unlink_inode_work( CDentry *dn ); + void remove_null_dentries(); + +public: + void split(int bits, list& subs, list& waiters); + void merge(int bits, list& waiters); +private: void steal_dentry(CDentry *dn); // from another dir. used by merge/split. 
void purge_stolen(list& waiters); - void remove_null_dentries(); + void init_fragment_pins(); + // -- authority -- /* diff --git a/trunk/ceph/mds/CInode.cc b/trunk/ceph/mds/CInode.cc index 402d50e679dfa..e72c8770ee7ac 100644 --- a/trunk/ceph/mds/CInode.cc +++ b/trunk/ceph/mds/CInode.cc @@ -275,102 +275,23 @@ void CInode::put_stickydirs() } -void CInode::fragment_dir(frag_t basefrag, int bits) -{ - dout(10) << "fragment_dir " << basefrag << " by " << bits << endl; +void CInode::fragment_dir(frag_t basefrag, int bits, list& subs, list& waiters) +{ + dout(10) << "fragment_dir " << bits << endl; + CDir *base = get_or_open_dirfrag(mdcache, basefrag); - list frags; - basefrag.split(bits, frags); - - vector subfrags(1 << bits); - - list waiters; - + dirfragtree.split(basefrag, bits); if (bits > 0) { - // split. - // update fragtree - dirfragtree.split(basefrag, bits); - - // create subfrag dirs - for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *f = new CDir(this, *p, mdcache, true); - - // propogate flags - f->state_set(base->get_state() & - (CDir::STATE_DIRTY | - CDir::STATE_COMPLETE | - CDir::STATE_FROZENDIR | - CDir::STATE_EXPORT | - CDir::STATE_EXPORTBOUND | - CDir::STATE_IMPORTBOUND | - CDir::STATE_STICKY | - 0)); - if (f->state_test(CDir::STATE_DIRTY)) f->get(CDir::PIN_DIRTY); - if (f->state_test(CDir::STATE_FROZENDIR)) f->get(CDir::PIN_FROZEN); - if (f->state_test(CDir::STATE_EXPORT)) f->get(CDir::PIN_EXPORT); - if (f->state_test(CDir::STATE_EXPORTBOUND)) f->get(CDir::PIN_EXPORTBOUND); - if (f->state_test(CDir::STATE_IMPORTBOUND)) f->get(CDir::PIN_IMPORTBOUND); - if (f->state_test(CDir::STATE_STICKY)) f->get(CDir::PIN_STICKY); - - f->set_version(base->get_version()); - - // dup replica map - f->replica_map = base->replica_map; - - dout(10) << " subfrag " << *p << " " << *f << endl; - subfrags.push_back(f); - add_dirfrag(f); - } - assert(subfrags.size() == frags.size()); - - // repartition dentries - while (!base->items.empty()) { - 
map::iterator p = base->items.begin(); - - CDentry *dn = p->second; - frag_t frag = base->inode->pick_dirfrag(p->first); - int n = frag.value() >> basefrag.bits(); - dout(15) << " subfrag " << frag << " n=" << n << " for " << p->first << endl; - CDir *f = dirfrags[n]; - - f->steal_dentry(dn); - } - - // empty. - base->purge_stolen(waiters); - close_dirfrag(basefrag); + base->split(bits, subs, waiters); } else { - // merge. - dirfragtree.merge(basefrag, bits); - - // enumerate subfrags - for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *dir = get_or_open_dirfrag(mdcache, *p); - dout(10) << " subfrag " << *p << " " << *dir << endl; - - // steal dentries - while (!dir->items.empty()) - base->steal_dentry(dir->items.begin()->second); - - // merge replica map - for (map::iterator p = dir->replica_map.begin(); - p != dir->replica_map.end(); - ++p) - base->replica_map[p->first] = MAX(base->replica_map[p->first], p->second); - - dir->purge_stolen(waiters); - close_dirfrag(dir->dirfrag().frag); - } + base->merge(bits, waiters); } - - mdcache->mds->queue_waiters(waiters); } - // pins void CInode::first_get() @@ -383,25 +304,9 @@ void CInode::first_get() void CInode::last_put() { // unpin my dentry? 
- if (parent) { + if (parent) parent->put(CDentry::PIN_INODEPIN); - } - //if (num_parents == 0 && get_num_ref() == 0) - //mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection -} - -/* -void CInode::get_parent() -{ - num_parents++; -} -void CInode::put_parent() -{ - num_parents--; - if (num_parents == 0 && get_num_ref() == 0) - mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection } -*/ void CInode::add_remote_parent(CDentry *p) { diff --git a/trunk/ceph/mds/CInode.h b/trunk/ceph/mds/CInode.h index 56e16e8346f5c..d32236e326753 100644 --- a/trunk/ceph/mds/CInode.h +++ b/trunk/ceph/mds/CInode.h @@ -138,7 +138,6 @@ class CInode : public MDSCacheObject { inode_t *project_inode(); void pop_and_dirty_projected_inode(); - // -- cache infrastructure -- private: @@ -167,8 +166,7 @@ public: void get_stickydirs(); void put_stickydirs(); - void fragment_dir(frag_t base, int bits); - + void fragment_dir(frag_t basefrag, int bits, list& subs, list& waiters); protected: // parent dentries in cache diff --git a/trunk/ceph/mds/Migrator.cc b/trunk/ceph/mds/Migrator.cc index 0e47de3432421..796ba73e31e9f 100644 --- a/trunk/ceph/mds/Migrator.cc +++ b/trunk/ceph/mds/Migrator.cc @@ -2048,19 +2048,16 @@ void Migrator::fragment_dir(CDir *dir, int bits) dout(7) << "cluster degraded, no fragmenting for now" << endl; return; } - if (dir->inode->is_root()) { dout(7) << "i won't fragment root" << endl; //assert(0); return; } - if (dir->is_frozen() || dir->is_freezing()) { dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." 
<< endl; return; } - if (dir->state_test(CDir::STATE_FRAGMENTING)) { dout(7) << "already fragmenting" << endl; return; @@ -2075,13 +2072,14 @@ void Migrator::fragment_dir(CDir *dir, int bits) class C_MDC_FragmentLogged : public Context { Migrator *mig; - CDir *dir; + list dirs; int bits; + version_t pv; public: - C_MDC_FragmentLogged(Migrator *m, CDir *d, int b) : mig(m), dir(d), bits(b) {} + C_MDC_FragmentLogged(Migrator *m, list& dls, int b, version_t v) : mig(m), dirs(dls), bits(b), pv(v) {} virtual void finish(int r) { if (r >= 0) - mig->fragment_logged(dir, bits); + mig->fragment_logged(dirs, bits, pv); } }; @@ -2089,120 +2087,53 @@ void Migrator::fragment_frozen(CDir *dir, int bits) { dout(7) << "fragment_frozen " << *dir << " bits " << bits << endl; - // xlock CInode *diri = dir->get_inode(); - if (!diri->dirfragtreelock.is_stable()) { - dout(10) << "fragment_frozen waiting for stable" << endl; - diri->dirfragtreelock.add_waiter(SimpleLock::WAIT_STABLE, - new C_MDC_FragmentFreeze(this, dir, bits)); - return; - } - - //if (diri->dirfragtreelock.get_state() != LOCK_LOCK) - //mds->locker->simple_lock(&diri->dirfragtreelock); - - if (diri->dirfragtreelock.get_state() != LOCK_LOCK) { - dout(10) << "fragment_frozen waiting for lock" << endl; - diri->dirfragtreelock.add_waiter(SimpleLock::WAIT_STABLE, - new C_MDC_FragmentFreeze(this, dir, bits)); - } - - // lock. do a manual xlock. - diri->dirfragtreelock.get_xlock((MDRequest*)1); - // journal it. 
EFragment *le = new EFragment(dir->ino(), dir->get_frag(), bits); + list subfrags; + list waiters; + version_t pv = dir->pre_dirty(); + diri->fragment_dir(dir->get_frag(), bits, subfrags, waiters); + // predirty and journal content - le->metablob.add_dir_context(dir); - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); + for (list::iterator p = subfrags.begin(); + p != subfrags.end(); ++p) { - p->second->pre_dirty(); - le->metablob.add_dentry(p->second, true); + CDir *subfrag = *p; + le->metablob.add_dir_context(subfrag); + for (map::iterator q = subfrag->items.begin(); + q != subfrag->items.end(); + ++q) { + CDentry *dn = q->second; + dn->set_projected_version(pv); + le->metablob.add_dentry(dn, true); + } } - + // go mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_MDC_FragmentLogged(this, dir, bits)); + mds->mdlog->wait_for_sync(new C_MDC_FragmentLogged(this, subfrags, bits, pv)); } -void Migrator::fragment_logged(CDir *dir, int bits) +void Migrator::fragment_logged(list& dirs, int bits, version_t pv) { - dout(10) << "fragment_logged " << *dir << " bits " << bits << endl; - - CInode *diri = dir->get_inode(); - diri->fragment_dir(dir->get_frag(), bits); - - // dirty everything + CInode *diri = dirs.front()->get_inode(); + dout(10) << "fragment_logged " << diri->ino() << " bits " << bits << " pv " << pv << endl; - - // create fragments - - frag_t startfrag = dir->get_frag(); - list frags; - startfrag.split(bits, frags); - - vector dirfrags(1 << bits); - for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *f = new CDir(diri, *p, cache, true); - - // propogate flags - f->state_set(dir->get_state() & - (CDir::STATE_DIRTY | - CDir::STATE_COMPLETE | - CDir::STATE_FROZENDIR)); - f->set_version(dir->get_version()); - f->pre_dirty(); - - dout(10) << " new frag " << *p << " " << *f << endl; - dirfrags.push_back(f); - diri->add_dirfrag(f); - } - assert(dirfrags.size() == frags.size()); - - // update dirfragtree - 
dir->inode->dirfragtree.split(startfrag, bits); - dout(10) << "new inode dirfragtree is " << dir->inode->dirfragtree << endl; - - // partition dentries - while (!dir->items.empty()) { - map::iterator p = dir->items.begin(); - - CDentry *dn = p->second; - frag_t frag = dir->inode->pick_dirfrag(p->first); - int n = frag.value() >> startfrag.bits(); - dout(15) << "frag " << frag << " n=" << n << " for " << p->first << endl; - CDir *f = dirfrags[n]; - - CDentry *newdn; - if (dn->is_primary()) { - CInode *in = dn->get_inode(); - dir->unlink_inode(dn); - newdn = f->add_dentry(dn->name, in); - } - else if (dn->is_remote()) { - inodeno_t ino = dn->get_remote_ino(); - newdn = f->add_dentry(dn->name, dn->get_remote_ino()); - } - else if (dn->is_null()) { - newdn = f->add_dentry(dn->name); - } - else - assert(0); - - dout(15) << " new dn " << *newdn << endl; - - dir->remove_dentry(dn); - } - - - + for (list::iterator p = dirs.begin(); + p != dirs.end(); + p++) { + CDir *dir = *p; + dout(10) << " subfrag " << *dir << endl; + // dirty everything + for (map::iterator p = dir->items.begin(); + p != dir->items.end(); + ++p) + p->second->mark_dirty(pv); - // remove old dir - diri->close_dirfrag(startfrag); - - + dir->unfreeze_dir(); + } } diff --git a/trunk/ceph/mds/Migrator.h b/trunk/ceph/mds/Migrator.h index 60db3d89a4832..25851635bced4 100644 --- a/trunk/ceph/mds/Migrator.h +++ b/trunk/ceph/mds/Migrator.h @@ -251,7 +251,7 @@ protected: void fragment_dir(CDir *dir, int byn); void fragment_frozen(CDir *dir, int byn); friend class C_MDC_FragmentFreeze; - void fragment_logged(CDir *dir, int bits); + void fragment_logged(list& dirs, int bits, version_t pv); friend class C_MDC_FragmentLogged; void handle_fragment_notify(MFragmentDirNotify *m); diff --git a/trunk/ceph/mds/journal.cc b/trunk/ceph/mds/journal.cc index 196c16f81156c..7770afed866fa 100644 --- a/trunk/ceph/mds/journal.cc +++ b/trunk/ceph/mds/journal.cc @@ -898,7 +898,7 @@ void EFragment::replay(MDS *mds) CInode *in = 
mds->mdcache->get_inode(ino); assert(in); - in->fragment_dir(basefrag, bits); + //in->fragment_dir(basefrag, bits); metablob.replay(mds); } -- 2.39.5