From 18d349698211b5ba84fdac6a26354e8444fb691e Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 24 Jul 2007 19:51:01 +0000 Subject: [PATCH] split appears to not crash git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1545 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/mds/TODO | 5 +- branches/sage/mds/client/SyntheticClient.cc | 3 +- branches/sage/mds/include/frag.h | 2 +- branches/sage/mds/mds/CDentry.h | 1 + branches/sage/mds/mds/CDir.cc | 73 ++++++------ branches/sage/mds/mds/CDir.h | 9 +- branches/sage/mds/mds/LogEvent.h | 5 +- branches/sage/mds/mds/MDCache.cc | 118 +++++++++++++------- branches/sage/mds/mds/MDCache.h | 5 +- branches/sage/mds/mds/mdstypes.h | 4 +- 10 files changed, 134 insertions(+), 91 deletions(-) diff --git a/branches/sage/mds/TODO b/branches/sage/mds/TODO index d8ac975257123..c0883355c1bc6 100644 --- a/branches/sage/mds/TODO +++ b/branches/sage/mds/TODO @@ -68,9 +68,8 @@ sage mds / - STICKY dir state and pin? make sure it's kept across import/export/fragment / - pull _bound maps out of Migrator; they are redundant (trust the subtree map!) -** fix fetch/commit to avoid carrying a CDir* pointer around - - handle_resolve needs to infer splits/merges + - rejoin, too! - auth journals and applies update in the request update pipeline - dirfragtree is lazily consistent. no lock. bcast by primary when it updates. @@ -84,6 +83,8 @@ sage mds - CDentry objects will be moved to the new frag(s) - Server etc. must take care not to carry CDir pointers around; they're unstable! + - what about flushing the old dirfrag storage off disk...? + - journal epoch, or something similar - reduce size of EMetaBlob by skipping context when inode was already journaled since the last diff --git a/branches/sage/mds/client/SyntheticClient.cc b/branches/sage/mds/client/SyntheticClient.cc index a496480d7328e..63df511183748 100644 --- a/branches/sage/mds/client/SyntheticClient.cc +++ b/branches/sage/mds/client/SyntheticClient.cc @@ -1723,7 +1723,8 @@ void SyntheticClient::foo() int c = rand() % s; char src[80]; sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - int fd = client->open(src, O_RDONLY); + //int fd = + client->open(src, O_RDONLY); } return; diff --git a/branches/sage/mds/include/frag.h b/branches/sage/mds/include/frag.h index e89fc63f17811..b6a1c19c22080 100644 --- a/branches/sage/mds/include/frag.h +++ b/branches/sage/mds/include/frag.h @@ -134,7 +134,7 @@ class frag_t { } }; -inline ostream& operator<<(ostream& out, frag_t& hb) +inline ostream& operator<<(ostream& out, frag_t hb) { return out << hex << hb.value() << dec << "/" << hb.bits(); } diff --git a/branches/sage/mds/mds/CDentry.h b/branches/sage/mds/mds/CDentry.h index 61b02d4b65ce8..2b73be54e9fe4 100644 --- a/branches/sage/mds/mds/CDentry.h +++ b/branches/sage/mds/mds/CDentry.h @@ -47,6 +47,7 @@ class CDentry : public MDSCacheObject, public LRUObject { public: // -- state -- static const int STATE_NEW = 1; + static const int STATE_FRAGMENTING = 2; // -- pins -- static const int PIN_INODEPIN = 1; // linked inode is pinned diff --git a/branches/sage/mds/mds/CDir.cc b/branches/sage/mds/mds/CDir.cc index 1ea69e2803748..dc6bf975d8579 100644 --- a/branches/sage/mds/mds/CDir.cc +++ b/branches/sage/mds/mds/CDir.cc @@ -13,6 +13,7 @@ */ +#include "include/types.h" #include "CDir.h" #include "CDentry.h" @@ -40,7 +41,7 @@ ostream& operator<<(ostream& out, CDir& dir) string path; dir.get_inode()->make_path(path); out << "[dir " << dir.ino(); - if (!dir.frag.is_root()) out << "%" << dir.frag; + if (!dir.frag.is_root()) out << "_" << dir.frag; out << " " << path << "/"; if (dir.is_auth()) { out << " auth"; @@ -101,13 +102,13 @@ void CDir::print(ostream& out) #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << dirfrag() << ") " +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") " //#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache." << *this << " " ostream& CDir::print_db_line_prefix(ostream& out) { - return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << get_inode()->inode.ino << ") "; + return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "; } @@ -329,9 +330,15 @@ void CDir::unlink_inode( CDentry *dn ) void CDir::try_remove_unlinked_dn(CDentry *dn) { assert(dn->dir == this); + assert(dn->is_null()); + assert(dn->is_dirty()); - if (dn->is_new() && dn->is_dirty() && - dn->get_num_ref() == 1) { + // no pins (besides dirty)? + if (dn->get_num_ref() != 1) + return; + + // was the dn new? or is the dir complete (i.e. we don't need negatives)? + if (dn->is_new() || is_complete()) { dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << endl; dn->mark_clean(); remove_dentry(dn); @@ -433,26 +440,27 @@ void CDir::steal_dentry(CDentry *dn) void CDir::purge_stolen(list& waiters) { + // take waiters _before_ unfreeze... + take_waiting(WAIT_ANY, waiters); + + assert(is_frozen_dir()); + unfreeze_dir(); + nnull = nitems = 0; if (is_dirty()) mark_clean(); - if (state_test(STATE_EXPORT)) put(PIN_EXPORT); if (state_test(STATE_IMPORTBOUND)) put(PIN_IMPORTBOUND); if (state_test(STATE_EXPORTBOUND)) put(PIN_EXPORTBOUND); - if (state_test(STATE_FROZENDIR)) put(PIN_FROZEN); if (auth_pins > 0) put(PIN_AUTHPIN); - take_waiting(WAIT_ANY, waiters); - assert(get_num_ref() == 0); } void CDir::init_fragment_pins() { if (state_test(STATE_DIRTY)) get(PIN_DIRTY); - if (state_test(STATE_FROZENDIR)) get(PIN_FROZEN); if (state_test(STATE_EXPORT)) get(PIN_EXPORT); if (state_test(STATE_EXPORTBOUND)) get(PIN_EXPORTBOUND); if (state_test(STATE_IMPORTBOUND)) get(PIN_IMPORTBOUND); @@ -479,6 +487,7 @@ void CDir::split(int bits, list& subs, list& waiters) f->version = version; f->projected_version = projected_version; f->replica_map = replica_map; + f->freeze_dir(0); dout(10) << " subfrag " << *p << " " << *f << endl; subfrags[n++] = f; subs.push_back(f); @@ -497,7 +506,6 @@ void CDir::split(int bits, list& subs, list& waiters) f->steal_dentry(dn); } - put(PIN_FRAGMENTING); purge_stolen(waiters); inode->close_dirfrag(frag); // selft deletion, watch out. } @@ -527,7 +535,6 @@ void CDir::merge(int bits, list& waiters) // merge state state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT); - dir->put(PIN_FRAGMENTING); dir->purge_stolen(waiters); inode->close_dirfrag(dir->get_frag()); } @@ -708,13 +715,19 @@ class C_Dir_Fetch : public Context { } }; -void CDir::fetch(Context *c) +void CDir::fetch(Context *c, bool ignore_authpinnability) { dout(10) << "fetch on " << *this << endl; assert(is_auth()); assert(!is_complete()); + if (!can_auth_pin() && !ignore_authpinnability) { + dout(7) << "fetch waiting for authpinnable" << endl; + add_waiter(WAIT_AUTHPINNABLE, c); + return; + } + if (c) add_waiter(WAIT_COMPLETE, c); // already fetching? @@ -723,6 +736,7 @@ void CDir::fetch(Context *c) return; } + auth_pin(); state_set(CDir::STATE_FETCHING); if (cache->mds->logger) cache->mds->logger->inc("fdir"); @@ -739,29 +753,19 @@ void CDir::fetch(Context *c) void CDir::_fetched(bufferlist &bl) { - dout(10) << "_fetched " << 0 << "~" << bl.length() - << " on " << *this + dout(10) << "_fetched " << bl.length() + << " bytes for " << *this << endl; - // give up? - if (!is_auth() || is_frozen()) { - dout(10) << "_fetched canceling (!auth or frozen)" << endl; - //ondisk_bl.clear(); - //ondisk_size = 0; - - // kick waiters? - state_clear(CDir::STATE_FETCHING); - finish_waiting(WAIT_COMPLETE, -1); - return; - } + assert(is_auth()); + assert(!is_frozen()); // decode. int len = bl.length(); int off = 0; - version_t got_version; + version_t got_version; - bl.copy(off, sizeof(got_version), (char*)&got_version); - off += sizeof(got_version); + ::_decode(got_version, bl, off); dout(10) << "_fetched version " << got_version << ", " << len << " bytes" @@ -917,8 +921,8 @@ void CDir::_fetched(bufferlist &bl) /** * commit * - * @param want min version i want committed - * @param c callback for completion + * @param want - min version i want committed + * @param c - callback for completion */ void CDir::commit(version_t want, Context *c) { @@ -927,7 +931,7 @@ void CDir::commit(version_t want, Context *c) // preconditions assert(want <= version || version == 0); // can't commit the future - assert(committed_version < want); // the caller is stupid + assert(want > committed_version); // the caller is stupid assert(is_auth()); assert(can_auth_pin()); @@ -1008,9 +1012,10 @@ void CDir::_commit(version_t want) if (cache->mds->logger) cache->mds->logger->inc("cdir"); - // encode dentries + // encode bufferlist bl; - bl.append((char*)&version, sizeof(version)); + + ::_encode(version, bl); for (CDir_map_t::iterator it = items.begin(); it != items.end(); diff --git a/branches/sage/mds/mds/CDir.h b/branches/sage/mds/mds/CDir.h index 5925191008af0..c28a67980f4d0 100644 --- a/branches/sage/mds/mds/CDir.h +++ b/branches/sage/mds/mds/CDir.h @@ -60,7 +60,6 @@ class CDir : public MDSCacheObject { static const int PIN_DNWAITER = 1; static const int PIN_CHILD = 2; static const int PIN_FROZEN = 3; - static const int PIN_FRAGMENTING = 4; static const int PIN_EXPORT = 5; static const int PIN_IMPORTING = 7; static const int PIN_EXPORTING = 8; @@ -72,7 +71,6 @@ class CDir : public MDSCacheObject { case PIN_DNWAITER: return "dnwaiter"; case PIN_CHILD: return "child"; case PIN_FROZEN: return "frozen"; - case PIN_FRAGMENTING: return "fragmenting"; case PIN_EXPORT: return "export"; case PIN_EXPORTING: return "exporting"; case PIN_IMPORTING: return "importing"; @@ -125,7 +123,6 @@ class CDir : public MDSCacheObject { static const unsigned MASK_STATE_FRAGMENT_KEPT = (STATE_DIRTY | STATE_COMPLETE | - STATE_FROZENDIR | STATE_EXPORT | STATE_EXPORTBOUND | STATE_IMPORTBOUND | @@ -326,7 +323,7 @@ private: // -- fetch -- object_t get_ondisk_object() { return object_t(ino(), frag); } - void fetch(Context *c); + void fetch(Context *c, bool ignore_authpinnability=false); void _fetched(bufferlist &bl); // -- commit -- @@ -431,8 +428,8 @@ public: if (auth_pins > 0) return false; - // if not subtree root, inode must not be frozen. - if (!is_subtree_root() && inode->is_frozen()) + // if not subtree root, inode must not be frozen (tree--frozen_dir is okay). + if (!is_subtree_root() && inode->is_frozen() && !inode->is_frozen_dir()) return false; return true; diff --git a/branches/sage/mds/mds/LogEvent.h b/branches/sage/mds/mds/LogEvent.h index dca883e9f386a..fb2ccf2664fb2 100644 --- a/branches/sage/mds/mds/LogEvent.h +++ b/branches/sage/mds/mds/LogEvent.h @@ -49,10 +49,12 @@ class LogEvent { private: int _type; off_t _start_off,_end_off; + friend class MDLog; public: - LogEvent(int t) : _type(t), _start_off(0), _end_off(0) { } + LogEvent(int t) : + _type(t), _start_off(0), _end_off(0) { } virtual ~LogEvent() { } int get_type() { return _type; } @@ -69,7 +71,6 @@ class LogEvent { out << "event(" << _type << ")"; } - /*** live journal ***/ /* obsolete() - is this entry committed to primary store, such that diff --git a/branches/sage/mds/mds/MDCache.cc b/branches/sage/mds/mds/MDCache.cc index ef0d06e8cc275..96b11cd085e72 100644 --- a/branches/sage/mds/mds/MDCache.cc +++ b/branches/sage/mds/mds/MDCache.cc @@ -1286,11 +1286,13 @@ void MDCache::handle_resolve(MMDSResolve *m) for (map >::iterator pi = m->subtrees.begin(); pi != m->subtrees.end(); ++pi) { - CDir *im = get_dirfrag(pi->first); - if (im) { - adjust_bounded_subtree_auth(im, pi->second, from); - try_subtree_merge(im); - } + CInode *diri = get_inode(pi->first.ino); + if (!diri) continue; + diri->dirfragtree.force_to_leaf(pi->first.frag); + CDir *dir = diri->get_dirfrag(pi->first.frag); + if (!dir) continue; + adjust_bounded_subtree_auth(dir, pi->second, from); + try_subtree_merge(dir); } // am i a surviving ambiguous importer? @@ -5422,7 +5424,19 @@ void MDCache::_refragment_dir(CInode *diri, frag_t basefrag, int bits, } } - +class C_MDC_FragmentGo : public Context { + MDCache *mdcache; + CInode *diri; + list dirs; + frag_t basefrag; + int bits; +public: + C_MDC_FragmentGo(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : + mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } + virtual void finish(int r) { + mdcache->fragment_go(diri, dirs, basefrag, bits); + } +}; void MDCache::split_dir(CDir *dir, int bits) { @@ -5447,17 +5461,31 @@ void MDCache::split_dir(CDir *dir, int bits) return; } - dir->auth_pin(); - dir->state_set(CDir::STATE_FRAGMENTING); - dir->get(CDir::PIN_FRAGMENTING); - - // make complete list startfrags; startfrags.push_back(dir); - + + dir->state_set(CDir::STATE_FRAGMENTING); + + fragment_freeze(dir->get_inode(), startfrags, dir->get_frag(), bits); fragment_mark_and_complete(dir->get_inode(), startfrags, dir->get_frag(), bits); } +/* + * initial the freeze, blocking with an auth_pin. + */ +void MDCache::fragment_freeze(CInode *diri, list& frags, frag_t basefrag, int bits) +{ + C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits)); + + for (list::iterator p = frags.begin(); + p != frags.end(); + ++p) { + CDir *dir = *p; + dir->auth_pin(); // this will block the freeze + dir->freeze_dir(gather->new_sub()); + } +} + class C_MDC_FragmentMarking : public Context { MDCache *mdcache; CInode *diri; @@ -5479,29 +5507,33 @@ void MDCache::fragment_mark_and_complete(CInode *diri, dout(10) << "fragment_mark_and_complete " << basefrag << " by " << bits << " on " << *diri << endl; - int waiting = 0; + C_Gather *gather = 0; + for (list::iterator p = startfrags.begin(); p != startfrags.end(); ++p) { CDir *dir = *p; - if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) { - dout(15) << " marked " << *dir << endl; - } else if (dir->is_complete()) { + + if (!dir->is_complete()) { + dout(15) << " fetching incomplete " << *dir << endl; + if (!gather) gather = new C_Gather(new C_MDC_FragmentMarking(this, diri, startfrags, basefrag, bits)); + dir->fetch(gather->new_sub(), + true); // ignore authpinnability + } + else if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { dout(15) << " marking " << *dir << endl; for (map::iterator p = dir->items.begin(); p != dir->items.end(); - ++p) + ++p) { p->second->get(CDentry::PIN_FRAGMENTING); + p->second->state_set(CDentry::STATE_FRAGMENTING); + } dir->state_set(CDir::STATE_DNPINNEDFRAG); - } else { - dout(15) << " fetching incomplete " << *dir << endl; - dir->fetch(new C_MDC_FragmentMarking(this, diri, startfrags, basefrag, bits)); - waiting++; + dir->auth_unpin(); // allow our freeze to complete + } + else { + dout(15) << " marked " << *dir << endl; } - } - - if (!waiting) { - fragment_go(diri, startfrags, basefrag, bits); } } @@ -5512,18 +5544,17 @@ class C_MDC_FragmentLogged : public Context { frag_t basefrag; int bits; list resultfrags; - version_t maxpv; vector pvs; public: C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b, - list& rf, version_t mpv, vector& p) : - mdcache(m), diri(di), basefrag(bf), bits(b), maxpv(mpv) { + list& rf, vector& p) : + mdcache(m), diri(di), basefrag(bf), bits(b) { resultfrags.swap(rf); pvs.swap(p); } virtual void finish(int r) { mdcache->fragment_logged(diri, basefrag, bits, - resultfrags, maxpv, pvs); + resultfrags, pvs); } }; @@ -5565,25 +5596,25 @@ void MDCache::fragment_go(CInode *diri, list& startfrags, frag_t basefrag dir->state_set(CDir::STATE_FRAGMENTING); - // add new dirfrag + // new dirfrag + pvs.push_back(dir->pre_dirty()); le->metablob.add_dir(dir, true); - // add all the dentries, partitioned. - pvs.push_back(dir->pre_dirty()); + // all the dentries for (map::iterator p = dir->items.begin(); p != dir->items.end(); ++p) { - pvs.push_back(p->second->pre_dirty()); - le->metablob.add_dentry(p->second, true); + if (p->second->state_test(CDentry::STATE_FRAGMENTING)) { + pvs.push_back(p->second->pre_dirty()); + le->metablob.add_dentry(p->second, true); + } } } - version_t maxpv = 0; - if (!pvs.empty()) maxpv = pvs.back(); // journal mds->mdlog->submit_entry(le, new C_MDC_FragmentLogged(this, diri, basefrag, bits, - resultfrags, maxpv, pvs)); + resultfrags, pvs)); // announcelist& resultfrags, for (set::iterator p = peers.begin(); @@ -5604,7 +5635,7 @@ void MDCache::fragment_go(CInode *diri, list& startfrags, frag_t basefrag void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, - version_t maxpv, vector& pvs) + vector& pvs) { dout(10) << "fragment_logged " << basefrag << " bits " << bits << " on " << *diri << endl; @@ -5628,11 +5659,14 @@ void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, p != dir->items.end(); ++p) { CDentry *dn = p->second; - if (dn->version >= maxpv) continue; // skip it; created after the frag event - dn->put(CDentry::PIN_FRAGMENTING); - dn->mark_dirty(*pv); - pv++; + if (dn->state_test(CDentry::STATE_FRAGMENTING)) { + dn->put(CDentry::PIN_FRAGMENTING); + dn->mark_dirty(*pv); + pv++; + } } + + dir->unfreeze_dir(); } } diff --git a/branches/sage/mds/mds/MDCache.h b/branches/sage/mds/mds/MDCache.h index 525cc89615208..d4099185dba3b 100644 --- a/branches/sage/mds/mds/MDCache.h +++ b/branches/sage/mds/mds/MDCache.h @@ -605,12 +605,15 @@ public: void split_dir(CDir *dir, int byn); private: + void fragment_freeze(CInode *diri, list& startfrags, + frag_t basefrag, int bits); void fragment_mark_and_complete(CInode *diri, list& startfrags, frag_t basefrag, int bits); void fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits); void fragment_logged(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, version_t maxpv, vector& pvs); + list& resultfrags, vector& pvs); + friend class C_MDC_FragmentGo; friend class C_MDC_FragmentMarking; friend class C_MDC_FragmentLogged; diff --git a/branches/sage/mds/mds/mdstypes.h b/branches/sage/mds/mds/mdstypes.h index 1f91a58eb6ea6..abf4db5a5d309 100644 --- a/branches/sage/mds/mds/mdstypes.h +++ b/branches/sage/mds/mds/mdstypes.h @@ -114,8 +114,8 @@ struct dirfrag_t { dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { } }; -inline ostream& operator<<(ostream& out, const dirfrag_t& df) { - return out << df.ino << "#" << df.frag; +inline ostream& operator<<(ostream& out, const dirfrag_t df) { + return out << df.ino << "_" << df.frag; } inline bool operator<(dirfrag_t l, dirfrag_t r) { if (l.ino < r.ino) return true; -- 2.39.5