From 035ee45c1d4db74cfd1e0f3ae09cc3c214f60d40 Mon Sep 17 00:00:00 2001 From: sageweil Date: Mon, 16 Jul 2007 23:40:41 +0000 Subject: [PATCH] cdentry new, cdir committed_version_equivalent, some rejoin cleanup, inode purge bugfix git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1507 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 48 +++------------------------ branches/sage/cephmds2/mds/CDentry.cc | 19 ++++++++--- branches/sage/cephmds2/mds/CDentry.h | 3 ++ branches/sage/cephmds2/mds/CDir.cc | 34 ++++++++++++++++++- branches/sage/cephmds2/mds/CDir.h | 22 ++++++++---- branches/sage/cephmds2/mds/MDCache.cc | 37 +++++++++++++++------ branches/sage/cephmds2/mds/MDCache.h | 1 + branches/sage/cephmds2/mds/MDS.cc | 4 +++ branches/sage/cephmds2/mds/Server.cc | 10 ++++++ branches/sage/cephmds2/mds/journal.cc | 11 ++++-- 10 files changed, 121 insertions(+), 68 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 1be506037d234..366babd418b1f 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -61,38 +61,13 @@ sage doc sage mds +- CDir should avoid a store in response to temporary files (create+unlink) + - count+flag new dentries + - add last_committed_equivalent? + - hmm, should we move ESubtreeMap out of the journal? that would avoid all the icky weirdness in shutdown, with periodic logging, etc. -- fix rejoin -/ - validate dentry<->inode connectivity -/ - clean up remove_gather() crap - - add_strong_* should take the cache object? -/ - all replicated scatterlocks should start out in scatter state. -/ - parallel_fetch - - missing/full - - carefully document rejoin - - cases - - confounding factors - - - - - - -- for open file caps: - - a survivor will issue strong caps_watned, etc. if the rejoiner doesn't have it, they can request via missing/full. - - a recovering node: - - mark all non-auth caps stale - - advertise non-auth open inodes/paths/Capability::Exports in rejoins. 
- - in _weak_rejoin, traverse list, and if a path is mine, add claim cap (or add to parallel_fetch list, etc.) - -/- fix rename.. don't journal on witnesses unless we have to. -- fix unlink.. journal on witnesses if the file is open. - -- unlink needs to journal on witnesses (probably), since unlinked inodes may be in those journals - -> hmm, no, rejoin needs to be more robust, and validate namespace changes. - - extend/clean up filepath to allow paths relative to an ino - fix path_traverse - fix reconnect/rejoin open file weirdness @@ -102,19 +77,12 @@ sage mds - need to export stray crap to another mds.. - verify stray is empty on shutdown -- dir complete flag on migration.. does it go into the EMetaBlob too? can it be safely dropped? - -- journal+recovery - - file capabilities i/o - dirfrag split/merge - client readdir for dirfrags - consistency points/snapshots - dentry versions vs dirfrags... - statfs? -- finish multistage rejoin -- trim_on_rejoin - - more testing of failures + thrashing. - is export prep dir open deadlock properly fixed by forge_replica_dir()? - failures during recovery stages (resolve, rejoin)... make sure rejoin still works! @@ -124,11 +92,7 @@ sage mds we break commit()'s preconditions when it fetches an incomplete dir. - detect and deal with client failure - -- recovering open files - - recovery will either have inode (from EOpen), or will provide path+cap to reassert open state. - - path+cap window will require some fetching of metadata from disk before doing the rejoin - - failures during migration.. what about client stale/reap stuff and misplaced WR caps? + - failure during reconnect vs clientmap. although probably the whole thing needs a larger overhaul... - inode.max_size - inode.allocated_size @@ -139,8 +103,6 @@ sage mds - osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. 
- EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) -- dir version/committed/etc versus migration, log expires. - - DOCUMENT. - fix rmdir empty exported dirfrag race - export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race. diff --git a/branches/sage/cephmds2/mds/CDentry.cc b/branches/sage/cephmds2/mds/CDentry.cc index 604b0dfe74d50..2db36a7a187d8 100644 --- a/branches/sage/cephmds2/mds/CDentry.cc +++ b/branches/sage/cephmds2/mds/CDentry.cc @@ -63,6 +63,8 @@ ostream& operator<<(ostream& out, CDentry& dn) out << " inode=" << dn.get_inode(); + if (dn.is_new()) out << " state=new"; + if (dn.get_num_ref()) { out << " |"; dn.print_pin_set(out); @@ -127,6 +129,7 @@ void CDentry::_mark_dirty() // state+pin if (!state_test(STATE_DIRTY)) { state_set(STATE_DIRTY); + dir->inc_num_dirty(); get(PIN_DIRTY); } } @@ -144,19 +147,27 @@ void CDentry::mark_dirty(version_t pv) dir->mark_dirty(pv); } -void CDentry::mark_clean() { + +void CDentry::mark_clean() +{ dout(10) << " mark_clean " << *this << endl; assert(is_dirty()); assert(version <= dir->get_version()); - // this happens on export. 
- //assert(version <= dir->get_last_committed_version()); - // state+pin state_clear(STATE_DIRTY); + dir->dec_num_dirty(); put(PIN_DIRTY); + + if (state_test(STATE_NEW)) + state_clear(STATE_NEW); } +void CDentry::mark_new() +{ + dout(10) << " mark_new " << *this << endl; + state_set(STATE_NEW); +} void CDentry::make_path(string& s) { diff --git a/branches/sage/cephmds2/mds/CDentry.h b/branches/sage/cephmds2/mds/CDentry.h index fe7cb7393252a..96eac0a44f32d 100644 --- a/branches/sage/cephmds2/mds/CDentry.h +++ b/branches/sage/cephmds2/mds/CDentry.h @@ -46,6 +46,7 @@ bool operator<(const CDentry& l, const CDentry& r); class CDentry : public MDSCacheObject, public LRUObject { public: // -- state -- + static const int STATE_NEW = 1; // -- pins -- static const int PIN_INODEPIN = 1; // linked inode is pinned @@ -176,6 +177,8 @@ public: void mark_dirty(version_t projected_dirv); void mark_clean(); + void mark_new(); + bool is_new() { return state_test(STATE_NEW); } // -- replication CDentryDiscover *replicate_to(int rep); diff --git a/branches/sage/cephmds2/mds/CDir.cc b/branches/sage/cephmds2/mds/CDir.cc index 0c2b8d4ac3ef6..7d646f414873c 100644 --- a/branches/sage/cephmds2/mds/CDir.cc +++ b/branches/sage/cephmds2/mds/CDir.cc @@ -51,6 +51,7 @@ ostream& operator<<(ostream& out, CDir& dir) out << " v=" << dir.get_version(); out << " cv=" << dir.get_committing_version(); out << "/" << dir.get_committed_version(); + out << "/" << dir.get_committed_version_equivalent(); } else { out << " rep@" << dir.authority(); if (dir.get_replica_nonce() > 1) @@ -78,6 +79,9 @@ ostream& operator<<(ostream& out, CDir& dir) if (dir.state_test(CDir::STATE_IMPORTBOUND)) out << "|importbound"; out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); + if (dir.get_num_dirty()) + out << " dirty=" << dir.get_num_dirty(); + if (dir.get_num_ref()) { out << " |"; @@ -119,6 +123,8 @@ CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) nitems = 0; nnull = 0; + num_dirty = 0; + state = 
STATE_INITIAL; projected_version = version = 0; @@ -199,7 +205,7 @@ CDentry* CDir::add_dentry( const string& dname, CInode *in) //assert(null_items.count(dn->name) == 0); items[dn->name] = dn; - + if (in) { link_inode_work( dn, in ); } else { @@ -238,6 +244,10 @@ void CDir::remove_dentry(CDentry *dn) assert(items.count(dn->name) == 1); items.erase(dn->name); + // adjust dirty counter? + if (dn->state_test(CDentry::STATE_DIRTY)) + num_dirty--; + cache->lru.lru_remove(dn); delete dn; @@ -369,6 +379,28 @@ void CDir::remove_null_dentries() { } +void CDir::try_remove_unlinked_dn(CDentry *dn) +{ + assert(dn->dir == this); + + if (dn->is_new() && dn->is_dirty() && + dn->get_num_ref() == 1) { + dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << endl; + dn->mark_clean(); + remove_dentry(dn); + + if (version == projected_version && + committing_version == committed_version && + num_dirty == 0) { + dout(10) << "try_remove_unlinked_dn committed_equivalent now " << version + << " vs committed " << committed_version + << endl; + committed_version_equivalent = committed_version; + } + } +} + + CDirDiscover *CDir::replicate_to(int mds) diff --git a/branches/sage/cephmds2/mds/CDir.h b/branches/sage/cephmds2/mds/CDir.h index 4babf35f5b4e7..6fef0fda4364b 100644 --- a/branches/sage/cephmds2/mds/CDir.h +++ b/branches/sage/cephmds2/mds/CDir.h @@ -157,16 +157,19 @@ class CDir : public MDSCacheObject { return dirfrag() < ((const CDir*)r)->dirfrag(); } - protected: +protected: // contents CDir_map_t items; // non-null AND null size_t nitems; // # non-null size_t nnull; // # null + int num_dirty; + // state version_t version; version_t committing_version; version_t committed_version; + version_t committed_version_equivalent; // in case of, e.g., temporary file version_t projected_version; // lock nesting, freeze @@ -211,13 +214,17 @@ class CDir : public MDSCacheObject { } size_t get_nitems() { return nitems; } size_t get_nnull() { return nnull; } - - /* - float 
get_popularity() { - return popularity[0].get(); - } - */ + void inc_num_dirty() { num_dirty++; } + void dec_num_dirty() { + assert(num_dirty > 0); + num_dirty--; + } + int get_num_dirty() { + return num_dirty; + } + + void try_remove_unlinked_dn(CDentry *dn); // -- dentries and inodes -- public: @@ -318,6 +325,7 @@ class CDir : public MDSCacheObject { version_t get_projected_version() { return projected_version; } version_t get_committing_version() { return committing_version; } version_t get_committed_version() { return committed_version; } + version_t get_committed_version_equivalent() { return committed_version_equivalent; } void set_committed_version(version_t v) { committed_version = v; } version_t pre_dirty(version_t min=0); diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 28fd0fab9041d..0a8685a9df40d 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -202,7 +202,7 @@ CInode *MDCache::create_root_inode() root->inode.nlink = 1; root->inode.layout = g_OSD_MDDirLayout; - root->force_auth = pair(mds->get_nodeid(), CDIR_AUTH_UNKNOWN); + root->force_auth = pair(0, CDIR_AUTH_UNKNOWN); set_root( root ); add_inode( root ); @@ -616,6 +616,13 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair::iterator p = subtrees[dir].begin(); + p != subtrees[dir].end(); + ++p) + adjust_export_state(*p); + // bound should now match. verify_subtree_bounds(dir, bounds); @@ -2461,6 +2468,12 @@ void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *full) +/** + * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes + * + * FIXME: wait, can this actually happen? a survivor should generate cache trim + * messages that clean these guys up... 
+ */ void MDCache::rejoin_trim_undef_inodes() { dout(10) << "rejoin_trim_undef_inodes" << endl; @@ -2506,6 +2519,7 @@ void MDCache::rejoin_trim_undef_inodes() } } + assert(rejoin_undef_inodes.empty()); // hmm: this shouldn't ever happen, actually! rejoin_undef_inodes.clear(); } @@ -2736,16 +2750,19 @@ void MDCache::purge_inode(inode_t *inode, off_t newsize) purging[inode->ino][newsize] = *inode; assert(inode->size > newsize); + _do_purge_inode(inode, newsize); +} +void MDCache::_do_purge_inode(inode_t *inode, off_t newsize) +{ // remove - mds->filer->remove(*inode, newsize, inode->size, - 0, new C_MDC_PurgeFinish(this, inode->ino, newsize)); - - /*} else { + if (inode->size > 0) { + mds->filer->remove(*inode, newsize, inode->size, + 0, new C_MDC_PurgeFinish(this, inode->ino, newsize)); + } else { // no need, empty file, just log it purge_inode_finish(inode->ino, newsize); } - */ } void MDCache::purge_inode_finish(inodeno_t ino, off_t newsize) @@ -2800,8 +2817,7 @@ void MDCache::start_recovered_purges() dout(10) << "start_recovered_purges " << p->first << " size " << q->second.size << " to " << q->first << endl; - mds->filer->remove(q->second, q->first, q->second.size, - 0, new C_MDC_PurgeFinish(this, p->first, q->first)); + _do_purge_inode(&q->second, q->first); } } } @@ -4598,7 +4614,7 @@ void MDCache::_purge_stray(CDentry *dn) // log removal version_t pdv = dn->pre_dirty(); - EUpdate *le = new EUpdate; + EUpdate *le = new EUpdate("purge_stray"); le->metablob.add_dir_context(dn->dir); le->metablob.add_null_dentry(dn, true); le->metablob.add_inode_truncate(dn->inode->inode, 0); @@ -4616,8 +4632,7 @@ void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv) dn->dir->remove_dentry(dn); // purge+remove inode - if (in->inode.size > 0) - purge_inode(&in->inode, 0); + purge_inode(&in->inode, 0); remove_inode(in); } diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index 5bcf897e50d34..fcff21976645e 100644 --- 
a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -476,6 +476,7 @@ public: public: // inode purging void purge_inode(inode_t *inode, off_t newsize); + void _do_purge_inode(inode_t *inode, off_t newsize); void purge_inode_finish(inodeno_t ino, off_t newsize); void purge_inode_finish_2(inodeno_t ino, off_t newsize); bool is_purging(inodeno_t ino, off_t newsize) { diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc index 36a755249aac6..df0f0b88fa092 100644 --- a/branches/sage/cephmds2/mds/MDS.cc +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -923,6 +923,9 @@ void MDS::reconnect_start() void MDS::reconnect_done() { dout(1) << "reconnect_done" << endl; + set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state + + /* if (mdsmap->get_num_in_mds() == 1 && mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { // just me! @@ -933,6 +936,7 @@ void MDS::reconnect_done() } else { set_want_state(MDSMap::STATE_REJOIN); // move to rejoin state } + */ } void MDS::rejoin_joint_start() diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index 69a7c837bdc93..808d1337e53cd 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -923,6 +923,7 @@ CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dn // create dn = dir->add_dentry(dname, 0); + dn->mark_new(); dout(10) << "prepare_null_dentry added " << *dn << endl; return dn; @@ -2385,6 +2386,9 @@ void Server::_unlink_local_finish(MDRequest *mdr, // clean up? if (straydn) mdcache->eval_stray(straydn); + + // removing a new dn? + dn->dir->try_remove_unlinked_dn(dn); } @@ -2486,6 +2490,9 @@ void Server::_unlink_remote_finish(MDRequest *mdr, // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref + + // removing a new dn? 
+ dn->dir->try_remove_unlinked_dn(dn); } @@ -3102,6 +3109,9 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen // update subtree map? if (destdn->is_primary() && destdn->inode->is_dir()) mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir); + + // removing a new dn? + srcdn->dir->try_remove_unlinked_dn(srcdn); } diff --git a/branches/sage/cephmds2/mds/journal.cc b/branches/sage/cephmds2/mds/journal.cc index ca05279b8abe2..e169cee1f51b7 100644 --- a/branches/sage/cephmds2/mds/journal.cc +++ b/branches/sage/cephmds2/mds/journal.cc @@ -545,14 +545,21 @@ void ESession::replay(MDS *mds) if (mds->clientmap.get_version() >= cmapv) { dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() << " >= " << cmapv << ", noop" << endl; + + // hrm, this isn't very pretty. + if (!open) + mds->clientmap.trim_completed_requests(client_inst.name.num(), 0); + } else { dout(10) << "ESession.replay clientmap " << mds->clientmap.get_version() << " < " << cmapv << endl; assert(mds->clientmap.get_version() + 1 == cmapv); - if (open) + if (open) { mds->clientmap.open_session(client_inst); - else + } else { mds->clientmap.close_session(client_inst.name.num()); + mds->clientmap.trim_completed_requests(client_inst.name.num(), 0); + } mds->clientmap.reset_projected(); // make it follow version. } } -- 2.39.5