From 60d059e3eb082962db3a571141ca6f5e41d5e291 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 5 Oct 2007 00:06:41 +0000 Subject: [PATCH] big cleanup of rename operation. * simplified rename slave ops * WAIT_PTRWAITER for save contexts with pointers (used in Locker.cc) * CInode specifc freezing and aubmiguous_authing * fixed some problems with discover duping, dir_auth_hint git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1885 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/mds/TODO | 41 ++- branches/sage/mds/include/Context.h | 10 +- branches/sage/mds/mds/CInode.cc | 68 +++- branches/sage/mds/mds/CInode.h | 33 +- branches/sage/mds/mds/Locker.cc | 10 +- branches/sage/mds/mds/MDCache.cc | 180 +++++++--- branches/sage/mds/mds/MDCache.h | 42 ++- branches/sage/mds/mds/Migrator.cc | 21 +- branches/sage/mds/mds/Migrator.h | 4 +- branches/sage/mds/mds/Server.cc | 323 ++++++++++-------- branches/sage/mds/mds/Server.h | 4 +- branches/sage/mds/mds/mdstypes.h | 2 + branches/sage/mds/messages/MMDSSlaveRequest.h | 14 +- 13 files changed, 472 insertions(+), 280 deletions(-) diff --git a/branches/sage/mds/TODO b/branches/sage/mds/TODO index 644bbec8e882b..256a7cc134f8a 100644 --- a/branches/sage/mds/TODO +++ b/branches/sage/mds/TODO @@ -54,32 +54,27 @@ mdsmon fix rename if (0) { - who cares if dir items are dirtied out of order? - - need to make sure slave_prepare isn't trimmed before teh commit commits + - continue to delay _apply + - need to make sure slave_prepare isn't trimmed before the commit commits - put xlist_item in MDRequest, add to LogSegment for duration of delay before change is applied to the cache - dirtying has to be attached to teh _original_ logsegment.. 
mdr->ls } else { - witnesses - - rename_apply after _prepare - - srci auth do versionlock on inode to avoid mention in journal prior to commit|rollback +/ - rename_apply immediately after _prepare (preserve journal pipelining) +/ - srci auth do versionlock on inode to avoid mention in journal prior to commit|rollback +/ - this is to avoid some event between the prepare and rollback with incorrect linkage - add in-memory rollback in the _commit (on failure) +/ - do separate get_srci_replicas slave op first, then wrap inode export into actual prepare last, so that srci auth can apply immediately. } -- srci auth needs to freeze inode before exporting it - - hrm, what about misdirected lock requests.. singleauth won't capture a rename export! -- make sure handle_slave_rename_prep can only happen once - (e.g. if witness journals then fails, and initiator re-requests prepare) +/- srci auth needs to freeze inode before exporting it +/ *** hrm, what about misdirected lock requests.. singleauth won't capture a rename export! +/ -> inode ambiguous_auth bit+waiter; set on prepare, clear on mdr finish. -fix link/unlink -- prepare should be sure that the handle_slave_link or whatever only happens _once_ on the slave - -mds segments -- reverse_export is fuggered by the log segment business. need to delay auth/clean step until export is confirmed (i.e. make second pass over the subtree?). retest migrator! pay close attention to cache_expire checks on auth ordering... ,.dkjasdfjdajfdkjl - - upshot: once it's done, it'll be much cleaner! - mds bugs -- fix server unlink .. needs to use slave_requests to clean up any failures during the resolve stage +???- fix server unlink .. needs to use slave_requests to clean up any failures during the resolve stage - fix purge_stray bug - try_remove_unlinked_dn thing - emetablob playback with bad linkage.. from sloppy unlink? hmm @@ -89,6 +84,9 @@ mds bugs - verify once-per-segment jouranl context is working... 
+mds cleanup +- fix freeze_* interface to pull waiters _outside_ +- get rid of replicate objects for replicate_to .. encode to bufferlists directly mds - extend/clean up filepath to allow paths relative to an ino @@ -100,24 +98,23 @@ mds - need to export stray crap to another mds.. - verify stray is empty on shutdown +- real chdir (directory "open") + - relative metadata ops + - consistency points/snapshots - dentry versions vs dirfrags... - - detect and deal with client failure - failure during reconnect vs clientmap. although probalby the whole thing needs a larger overhaul... - inode.max_size - inode.allocated_size - -- real chdir (directory "open") - - relative metadata ops - + - osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. - EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) -- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in teh current log epoch in CDir... +- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in the current log epoch in CDir... - fix rmdir empty exported dirfrag race - export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race. @@ -128,7 +125,7 @@ mds - need to move state from replicas to auth. simplelock doesn't currently support that. - ScatterLock or something? hrm. -- FIXME how to journal root and stray inode content? +- FIXME how to journal/store root and stray inode content? - in particular, i care about dirfragtree.. get it on rejoin? - and dir sizes, if i add that... also on rejoin? 
diff --git a/branches/sage/mds/include/Context.h b/branches/sage/mds/include/Context.h index 231683bc2fe70..e5c74de6cb6e5 100644 --- a/branches/sage/mds/include/Context.h +++ b/branches/sage/mds/include/Context.h @@ -70,17 +70,17 @@ public: * C_Contexts - set of Contexts */ class C_Contexts : public Context { - std::list clist; - public: + std::list contexts; + void add(Context* c) { - clist.push_back(c); + contexts.push_back(c); } void take(std::list& ls) { - clist.splice(clist.end(), ls); + contexts.splice(contexts.end(), ls); } void finish(int r) { - finish_contexts(clist, r); + finish_contexts(contexts, r); } }; diff --git a/branches/sage/mds/mds/CInode.cc b/branches/sage/mds/mds/CInode.cc index 26f0cb3ea570d..d447d6f0d1f36 100644 --- a/branches/sage/mds/mds/CInode.cc +++ b/branches/sage/mds/mds/CInode.cc @@ -64,6 +64,11 @@ ostream& operator<<(ostream& out, CInode& in) out << " v" << in.get_version(); + if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) + out << " AMBIGAUTH"; + if (in.is_freezing_inode()) out << " FREEZING"; + if (in.is_frozen_inode()) out << " FROZEN"; + // locks out << " " << in.authlock; out << " " << in.linklock; @@ -400,7 +405,7 @@ void CInode::_mark_dirty(LogSegment *ls) if (!state_test(STATE_DIRTY)) { state_set(STATE_DIRTY); get(PIN_DIRTY); - assert(ls); + //assert(ls); // not true for wonky srci import on rename. 
} // move myself to this segment's dirty list @@ -582,38 +587,73 @@ void CInode::decode_lock_state(int type, bufferlist& bl) bool CInode::is_frozen() { - if (parent && parent->dir->is_frozen()) - return true; + if (is_frozen_inode()) return true; + if (parent && parent->dir->is_frozen()) return true; return false; } bool CInode::is_frozen_dir() { - if (parent && parent->dir->is_frozen_dir()) - return true; + if (parent && parent->dir->is_frozen_dir()) return true; return false; } bool CInode::is_freezing() { - if (parent && parent->dir->is_freezing()) - return true; + if (is_freezing_inode()) return true; + if (parent && parent->dir->is_freezing()) return true; return false; } void CInode::add_waiter(int tag, Context *c) { // wait on the directory? - if (tag & (WAIT_UNFREEZE|WAIT_SINGLEAUTH)) { + // make sure its not the inode that is explicitly ambiguous|freezing|frozen + if ((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH) || + (tag & WAIT_UNFREEZE) && !state_test(STATE_FROZEN|STATE_FREEZING)) { parent->dir->add_waiter(tag, c); return; } MDSCacheObject::add_waiter(tag, c); } +bool CInode::freeze_inode(int auth_pin_allowance) +{ + assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins + assert(auth_pins >= auth_pin_allowance); + if (auth_pins > auth_pin_allowance) { + dout(10) << "freeze_inode - waiting for auth_pins to drop" << dendl; + auth_pin_freeze_allowance = auth_pin_allowance; /* remember the target pin count; auth_unpin() compares auth_pins against it to complete the freeze */ + get(PIN_FREEZING); + state_set(STATE_FREEZING); + return false; + } + + dout(10) << "freeze_inode - frozen" << dendl; + assert(auth_pins == auth_pin_allowance); + get(PIN_FROZEN); + state_set(STATE_FROZEN); + return true; +} + +void CInode::unfreeze_inode(list& finished) +{ + dout(10) << "unfreeze_inode" << dendl; + if (state_test(STATE_FREEZING)) { + state_clear(STATE_FREEZING); + put(PIN_FREEZING); + } else if (state_test(STATE_FROZEN)) { + state_clear(STATE_FROZEN); + put(PIN_FROZEN); + } else + assert(0); + 
take_waiting(WAIT_UNFREEZE, finished); +} + // auth_pins bool CInode::can_auth_pin() { + if (is_freezing_inode() || is_frozen_inode()) return false; if (parent) return parent->can_auth_pin(); return true; @@ -644,9 +684,19 @@ void CInode::auth_unpin() << dendl; assert(auth_pins >= 0); - + if (parent) parent->adjust_nested_auth_pins( -1 ); + + if (is_freezing_inode() && + auth_pins == auth_pin_freeze_allowance) { + dout(10) << "auth_unpin freezing!" << dendl; + get(PIN_FROZEN); + put(PIN_FREEZING); + state_clear(STATE_FREEZING); + state_set(STATE_FROZEN); + finish_waiting(WAIT_FROZEN); + } } void CInode::adjust_nested_auth_pins(int a) diff --git a/branches/sage/mds/mds/CInode.h b/branches/sage/mds/mds/CInode.h index 75541b2130926..cce2b43058f56 100644 --- a/branches/sage/mds/mds/CInode.h +++ b/branches/sage/mds/mds/CInode.h @@ -65,7 +65,9 @@ class CInode : public MDSCacheObject { static const int PIN_BATCHOPENJOURNAL = 9; static const int PIN_SCATTERED = 10; static const int PIN_STICKYDIRS = 11; - static const int PIN_PURGING = -12; + static const int PIN_PURGING = -12; + static const int PIN_FREEZING = 13; + static const int PIN_FROZEN = 14; const char *pin_name(int p) { switch (p) { @@ -79,6 +81,8 @@ class CInode : public MDSCacheObject { case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; case PIN_SCATTERED: return "scattered"; case PIN_STICKYDIRS: return "stickydirs"; + case PIN_FREEZING: return "freezing"; + case PIN_FROZEN: return "frozen"; default: return generic_pin_name(p); } } @@ -89,6 +93,9 @@ class CInode : public MDSCacheObject { static const int STATE_UNANCHORING = (1<<4); static const int STATE_OPENINGDIR = (1<<5); static const int STATE_REJOINUNDEF = (1<<6); // inode contents undefined. 
+ static const int STATE_FREEZING = (1<<7); + static const int STATE_FROZEN = (1<<8); + static const int STATE_AMBIGUOUSAUTH = (1<<9); // -- waiters -- //static const int WAIT_SLAVEAGREE = (1<<0); @@ -96,6 +103,8 @@ class CInode : public MDSCacheObject { static const int WAIT_ANCHORED = (1<<2); static const int WAIT_UNANCHORED = (1<<3); static const int WAIT_CAPS = (1<<4); + static const int WAIT_FROZEN = (1<<5); + static const int WAIT_UNFREEZE = (1<<6); static const int WAIT_AUTHLOCK_OFFSET = 5; static const int WAIT_LINKLOCK_OFFSET = 5 + SimpleLock::WAIT_BITS; @@ -196,6 +205,7 @@ private: // auth pin int auth_pins; int nested_auth_pins; + int auth_pin_freeze_allowance; public: inode_load_vec_t pop; @@ -248,6 +258,12 @@ private: bool is_root() { return inode.ino == MDS_INO_ROOT; } bool is_stray() { return MDS_INO_IS_STRAY(inode.ino); } + // note: this overloads MDSCacheObject + bool is_ambiguous_auth() { + return state_test(STATE_AMBIGUOUSAUTH) || + MDSCacheObject::is_ambiguous_auth(); + } + inodeno_t ino() const { return inode.ino; } inode_t& get_inode() { return inode; } @@ -358,15 +374,17 @@ public: client_caps = cl; } */ - void take_client_caps(map& cl) { + void clear_client_caps() { if (!client_caps.empty()) put(PIN_CAPS); + client_caps.clear(); + } + void export_client_caps(map& cl) { for (map::iterator it = client_caps.begin(); it != client_caps.end(); it++) { cl[it->first] = it->second.make_export(); } - client_caps.clear(); } void merge_client_caps(map& cl, set& new_client_caps) { if (client_caps.empty() && !cl.empty()) @@ -445,10 +463,15 @@ public: // -- freeze -- + bool is_freezing_inode() { return state_test(STATE_FREEZING); } + bool is_frozen_inode() { return state_test(STATE_FROZEN); } bool is_frozen(); bool is_frozen_dir(); bool is_freezing(); + bool freeze_inode(int auth_pin_allowance=0); + void unfreeze_inode(list& finished); + // -- reference counting -- void bad_put(int by) { @@ -618,9 +641,7 @@ public: st.pop = in->pop; in->pop.zero(now); - // 
steal WRITER caps from inode - in->take_client_caps(cap_map); - //remaining_issued = in->get_caps_issued(); + in->export_client_caps(cap_map); } inodeno_t get_ino() { return st.inode.ino; } diff --git a/branches/sage/mds/mds/Locker.cc b/branches/sage/mds/mds/Locker.cc index f0a29ec51e1c0..ad5d021b612d7 100644 --- a/branches/sage/mds/mds/Locker.cc +++ b/branches/sage/mds/mds/Locker.cc @@ -566,6 +566,7 @@ class C_MDL_RequestInodeFileCaps : public Context { public: C_MDL_RequestInodeFileCaps(Locker *l, CInode *i) : locker(l), in(i) {} void finish(int r) { + in->put(CInode::PIN_PTRWAITER); if (!in->is_auth()) locker->request_inode_file_caps(in); } @@ -607,6 +608,7 @@ void Locker::request_inode_file_caps(CInode *in) // wait for single auth if (in->is_ambiguous_auth()) { + in->get(CInode::PIN_PTRWAITER); in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDL_RequestInodeFileCaps(this, in)); return; @@ -1413,6 +1415,7 @@ class C_Locker_ScatterEval : public Context { public: C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {} void finish(int r) { + lock->get_parent()->put(CInode::PIN_PTRWAITER); locker->try_scatter_eval(lock); } }; @@ -1423,8 +1426,9 @@ void Locker::try_scatter_eval(ScatterLock *lock) // unstable and ambiguous auth? 
if (!lock->is_stable() && lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; + dout(7) << "try_scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + lock->get_parent()->get(CInode::PIN_PTRWAITER); lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock)); return; } @@ -1437,6 +1441,7 @@ void Locker::try_scatter_eval(ScatterLock *lock) if (!lock->get_parent()->can_auth_pin()) { dout(7) << "try_scatter_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + lock->get_parent()->get(CInode::PIN_PTRWAITER); lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_ScatterEval(this, lock)); return; } @@ -2137,6 +2142,7 @@ class C_Locker_FileEval : public Context { public: C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {} void finish(int r) { + lock->get_parent()->put(CInode::PIN_PTRWAITER); locker->try_file_eval(lock); } }; @@ -2150,6 +2156,7 @@ void Locker::try_file_eval(FileLock *lock) in->is_ambiguous_auth()) { dout(7) << "try_file_eval not stable and ambiguous auth, waiting on " << *in << dendl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + in->get(CInode::PIN_PTRWAITER); in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock)); return; } @@ -2162,6 +2169,7 @@ void Locker::try_file_eval(FileLock *lock) if (!lock->get_parent()->can_auth_pin()) { dout(7) << "try_file_eval can't auth_pin, waiting on " << *in << dendl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + in->get(CInode::PIN_PTRWAITER); in->add_waiter(CInode::WAIT_UNFREEZE, new C_Locker_FileEval(this, lock)); return; } diff --git a/branches/sage/mds/mds/MDCache.cc 
b/branches/sage/mds/mds/MDCache.cc index 0dee3045bafcb..646f719327ba9 100644 --- a/branches/sage/mds/mds/MDCache.cc +++ b/branches/sage/mds/mds/MDCache.cc @@ -167,6 +167,9 @@ void MDCache::add_inode(CInode *in) // add to lru, inode map assert(inode_map.count(in->ino()) == 0); // should be no dup inos! inode_map[ in->ino() ] = in; + + if (in->ino() < MDS_INO_BASE) + base_inodes.insert(in); } void MDCache::remove_inode(CInode *o) @@ -183,11 +186,16 @@ void MDCache::remove_inode(CInode *o) // remove from inode map inode_map.erase(o->ino()); + if (o->ino() < MDS_INO_BASE) { + assert(base_inodes.count(o)); + base_inodes.erase(o); + + if (o == root) root = 0; + if (o == stray) stray = 0; + } + // delete it delete o; - - if (o == root) root = 0; - if (o == stray) stray = 0; } @@ -2693,6 +2701,7 @@ void MDCache::set_root(CInode *in) { assert(root == 0); root = in; + base_inodes.insert(in); } @@ -4324,6 +4333,12 @@ void MDCache::request_cleanup(MDRequest *mdr) // drop (local) auth pins mdr->drop_local_auth_pins(); + // drop stickydirs + for (set::iterator p = mdr->stickydirs.begin(); + p != mdr->stickydirs.end(); + ++p) + (*p)->put_stickydirs(); + // drop cache pins for (set::iterator it = mdr->pins.begin(); it != mdr->pins.end(); @@ -4725,13 +4740,15 @@ void MDCache::discover_dir_frag(CInode *base, dout(7) << "discover_dir_frag " << base->ino() << " " << approx_fg << " from mds" << from << dendl; - filepath want_path; - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - true); // need the base dir open - dis->set_base_dir_frag(approx_fg); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); + if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative + filepath want_path; + MDiscover *dis = new MDiscover(mds->get_nodeid(), + base->ino(), + want_path, + true); // need the base dir open + dis->set_base_dir_frag(approx_fg); + mds->send_message_mds(dis, from, MDS_PORT_CACHE); + } // register + wait if (onfinish) @@ 
-4757,12 +4774,14 @@ void MDCache::discover_path(CInode *base, return; } - MDiscover *dis = new MDiscover(mds->get_nodeid(), - base->ino(), - want_path, - true, // we want the base dir; we are relative to ino. - want_xlocked); - mds->send_message_mds(dis, from, MDS_PORT_CACHE); + if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative + MDiscover *dis = new MDiscover(mds->get_nodeid(), + base->ino(), + want_path, + true, // we want the base dir; we are relative to ino. + want_xlocked); + mds->send_message_mds(dis, from, MDS_PORT_CACHE); + } // register + wait if (onfinish) base->add_waiter(CInode::WAIT_DIR, onfinish); @@ -4915,18 +4934,19 @@ void MDCache::handle_discover(MDiscover *dis) << " don't have base ino " << dis->get_base_ino() << ", dropping" << dendl; delete reply; + assert(0); // hmm: when does this happen? return; } if (dis->wants_base_dir()) { dout(7) << "handle_discover mds" << dis->get_asker() - << " has " << *cur << " wants basedir+" << dis->get_want().get_path() + << " has " << *cur << dendl; } else { dout(7) << "handle_discover mds" << dis->get_asker() - << " has " << *cur << " wants " << dis->get_want().get_path() + << " has " << *cur << dendl; } } @@ -4963,19 +4983,35 @@ void MDCache::handle_discover(MDiscover *dis) if ((!curdir && !cur->is_auth()) || (curdir && !curdir->is_auth())) { - // set hint - if (curdir) { - dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl; - reply->set_dir_auth_hint(curdir->authority().first); - } else { - dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " - << *cur << dendl; - reply->set_dir_auth_hint(cur->authority().first); + + /* before: + * ONLY set flag if empty!! + * otherwise requester will wake up waiter(s) _and_ continue with discover, + * resulting in duplicate discovers in flight, + * which can wreak havoc when discovering rename srcdn (which may move) + */ + + if (reply->is_empty()) { + // only hint if empty. 
+ // someday this could be better, but right now the waiter logic isn't smart enough. + + // hint + if (curdir) { + dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl; + reply->set_dir_auth_hint(curdir->authority().first); + } else { + dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " + << *cur << dendl; + reply->set_dir_auth_hint(cur->authority().first); + } + + // note error dentry, if any + // NOTE: important, as it allows requester to issue an equivalent discover + // to whomever we hint at. + if (dis->get_want().depth() > i) + reply->set_error_dentry(dis->get_dentry(i)); } - - // set error dentry, if there is one - if (dis->get_want().depth() > i) - reply->set_error_dentry(dis->get_dentry(i)); + break; } @@ -5068,6 +5104,20 @@ void MDCache::handle_discover(MDiscover *dis) } } + // frozen inode? + if (dn->is_primary() && + dn->inode->is_frozen()) { + if (reply->is_empty()) { + dout(7) << *dn->inode << " is frozen, empty reply, waiting" << dendl; + dn->inode->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); + delete reply; + return; + } else { + dout(7) << *dn->inode << " is frozen, non-empty reply, stopping" << dendl; + break; + } + } + // add dentry reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); dout(7) << "handle_discover added dentry " << *dn << dendl; @@ -5223,17 +5273,36 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) m->get_dir_auth_hint() != mds->get_nodeid()) { dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl; - // try again - frag_t fg; - if (m->get_error_dentry().length()) - fg = cur->pick_dirfrag(m->get_error_dentry()); - else - fg = m->get_base_dir_frag(); - discover_dir_frag(cur, fg, 0, m->get_dir_auth_hint()); - } - else if (m->is_flag_error_dir()) { - // dir error at the end there? - dout(7) << " flag_error on dir " << *cur << dendl; + // try again? 
+ if (m->get_error_dentry().length()) { + // wanted a dentry + frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); + CDir *dir = cur->get_dirfrag(fg); + if (dir) { + // don't actaully need the hint, now + if (dir->lookup(m->get_error_dentry()) == 0 && + dir->is_waiting_for_dentry(m->get_error_dentry())) + discover_path(dir, m->get_error_dentry(), 0, m->get_wanted_xlocked()); + else + dout(7) << " doing nothing, have dir but nobody is waiting on dentry " + << m->get_error_dentry() << dendl; + } else { + if (cur->is_waiter_for(CInode::WAIT_DIR)) + discover_path(cur, m->get_error_dentry(), 0, m->get_wanted_xlocked(), + m->get_dir_auth_hint()); + else + dout(7) << " doing nothing, nobody is waiting for dir" << dendl; + } + } else { + // wanted just the dir + frag_t fg = m->get_base_dir_frag(); + if (cur->get_dirfrag(fg) == 0 && cur->is_waiter_for(CInode::WAIT_DIR)) + discover_dir_frag(cur, fg, 0, m->get_dir_auth_hint()); + else + dout(7) << " doing nothing, nobody is waiting for dir" << dendl; + } + } else if (m->is_flag_error_dir()) { + dout(7) << " flag_error_dir on " << *cur << dendl; //assert(!cur->is_dir()); // this assert might be racey if dir auth != inode auth? 
cur->take_waiting(CInode::WAIT_DIR, error); } @@ -5975,10 +6044,12 @@ void MDCache::show_subtrees(int dbl) } // root frags - list rootfrags; - if (root) root->get_dirfrags(rootfrags); - if (stray) stray->get_dirfrags(rootfrags); - dout(15) << "rootfrags " << rootfrags << dendl; + list basefrags; + for (set::iterator p = base_inodes.begin(); + p != base_inodes.end(); + ++p) + (*p)->get_dirfrags(basefrags); + dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl; // queue stuff list > q; @@ -5986,9 +6057,11 @@ void MDCache::show_subtrees(int dbl) set seen; // calc max depth - for (list::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p) + for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) q.push_back(pair(*p, 0)); + set subtrees_seen; + int depth = 0; while (!q.empty()) { CDir *dir = q.front().first; @@ -5997,6 +6070,8 @@ void MDCache::show_subtrees(int dbl) if (subtrees.count(dir) == 0) continue; + subtrees_seen.insert(dir); + if (d > depth) depth = d; // sanity check @@ -6018,7 +6093,7 @@ void MDCache::show_subtrees(int dbl) // print tree - for (list::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p) + for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) q.push_back(pair(*p, 0)); while (!q.empty()) { @@ -6073,6 +6148,17 @@ void MDCache::show_subtrees(int dbl) q.push_front(pair(*p, d+2)); } } + + // verify there isn't stray crap in subtree map + int lost = 0; + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + if (subtrees_seen.count(p->first)) continue; + dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl; + lost++; + } + assert(lost == 0); } diff --git a/branches/sage/mds/mds/MDCache.h b/branches/sage/mds/mds/MDCache.h index 56fd38faec1a1..d738b8ccd4962 100644 --- a/branches/sage/mds/mds/MDCache.h +++ b/branches/sage/mds/mds/MDCache.h @@ -101,6 +101,7 @@ struct MDRequest { set stickydirs; // auth pins + set< MDSCacheObject* > remote_auth_pins; set< 
MDSCacheObject* > auth_pins; // held locks @@ -126,8 +127,8 @@ struct MDRequest { version_t dst_reanchor_atid; // dst->stray bufferlist inode_import; version_t inode_import_v; - CInode *inode_export; // inode we're exporting, if any - CDentry *srcdn; // srcdn, if auth, on slave + //CInode *inode_export; // inode we're exporting, if any + //CDentry *srcdn; // srcdn, if auth, on slave // called when slave commits Context *slave_commit; @@ -140,7 +141,7 @@ struct MDRequest { ls(0), done_locking(false), committing(false), aborted(false), src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - inode_export(0), srcdn(0), + //inode_export(0), srcdn(0), slave_commit(0) { } MDRequest(metareqid_t ri, MClientRequest *req) : reqid(ri), client_request(req), ref(0), @@ -148,7 +149,7 @@ struct MDRequest { ls(0), done_locking(false), committing(false), aborted(false), src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - inode_export(0), srcdn(0), + //inode_export(0), srcdn(0), slave_commit(0) { } MDRequest(metareqid_t ri, int by) : reqid(ri), client_request(0), ref(0), @@ -156,7 +157,7 @@ struct MDRequest { ls(0), done_locking(false), committing(false), aborted(false), src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - inode_export(0), srcdn(0), + //inode_export(0), srcdn(0), slave_commit(0) { } bool is_master() { return slave_to_mds < 0; } @@ -180,7 +181,7 @@ struct MDRequest { // auth pins bool is_auth_pinned(MDSCacheObject *object) { - return auth_pins.count(object); + return auth_pins.count(object) || remote_auth_pins.count(object); } void auth_pin(MDSCacheObject *object) { if (!is_auth_pinned(object)) { @@ -188,15 +189,17 @@ struct MDRequest { auth_pins.insert(object); } } + void auth_unpin(MDSCacheObject *object) { + assert(is_auth_pinned(object)); + object->auth_unpin(); + auth_pins.erase(object); + } void drop_local_auth_pins() { - set::iterator it = auth_pins.begin(); - while (it != auth_pins.end()) { - if ((*it)->is_auth()) { - 
(*it)->auth_unpin(); - auth_pins.erase(it++); - } else { - it++; - } + for (set::iterator it = auth_pins.begin(); + it != auth_pins.end(); + it++) { + assert((*it)->is_auth()); + (*it)->auth_unpin(); } auth_pins.clear(); } @@ -219,12 +222,13 @@ class MDCache { MDS *mds; // -- my cache -- - LRU lru; // dentry lru for expiring items from cache + LRU lru; // dentry lru for expiring items from cache protected: - hash_map inode_map; // map of inodes by ino - CInode *root; // root inode - CInode *stray; // my stray dir + hash_map inode_map; // map of inodes by ino + CInode *root; // root inode + CInode *stray; // my stray dir + set base_inodes; // inodes < MDS_INO_BASE (root, stray, etc.) // -- discover -- // waiters @@ -380,6 +384,7 @@ public: ESubtreeMap *create_subtree_map(); + protected: // [rejoin] set rejoin_gather; // nodes from whom i need a rejoin @@ -612,6 +617,7 @@ protected: CDir* forge_replica_dir(CInode *diri, frag_t fg, int from); CDentry *add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished); +public: // for Server::handle_slave_rename_prep CInode *add_replica_inode(CInodeDiscover& dis, CDentry *dn, list& finished); public: diff --git a/branches/sage/mds/mds/Migrator.cc b/branches/sage/mds/mds/Migrator.cc index 7819bec5f3a67..7588a414030b3 100644 --- a/branches/sage/mds/mds/Migrator.cc +++ b/branches/sage/mds/mds/Migrator.cc @@ -130,11 +130,13 @@ void Migrator::export_empty_import(CDir *dir) } // is it really empty? + /* who cares! nothing cached, so clearly unimportant. export it! if (!dir->is_complete()) { dout(7) << "not complete, fetching." 
<< dendl; dir->fetch(new C_MDC_EmptyImport(this,dir)); return; } + */ int dest = dir->inode->authority().first; @@ -840,7 +842,7 @@ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, exported_client_map[it->first] = mds->clientmap.get_inst(it->first); } -void Migrator::finish_export_inode(CInode *in, C_Contexts *fin) +void Migrator::finish_export_inode(CInode *in, list& finished) { dout(12) << "finish_export_inode " << *in << dendl; @@ -858,6 +860,7 @@ void Migrator::finish_export_inode(CInode *in, C_Contexts *fin) entity_inst_t inst = mds->clientmap.get_inst(it->first); mds->send_message_client_maybe_open(m, inst); } + in->clear_client_caps(); // relax locks? if (!in->is_replicated()) @@ -882,9 +885,7 @@ void Migrator::finish_export_inode(CInode *in, C_Contexts *fin) in->replica_nonce = CInode::EXPORT_NONCE; // waiters - list waiters; - in->take_waiting(CInode::WAIT_ANY, waiters); - fin->take(waiters); + in->take_waiting(CInode::WAIT_ANY, finished); // *** other state too? @@ -978,7 +979,7 @@ int Migrator::encode_export_dir(list& dirstatelist, return num_exported; } -void Migrator::finish_export_dir(CDir *dir, C_Contexts *fin, utime_t now) +void Migrator::finish_export_dir(CDir *dir, list& finished, utime_t now) { dout(10) << "finish_export_dir " << *dir << dendl; @@ -997,9 +998,7 @@ void Migrator::finish_export_dir(CDir *dir, C_Contexts *fin, utime_t now) dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. // suck up all waiters - list waiting; - dir->take_waiting(CDir::WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); + dir->take_waiting(CDir::WAIT_ANY, finished); // all dir waiters // pop dir->pop_auth_subtree_nested -= dir->pop_auth_subtree; @@ -1018,7 +1017,7 @@ void Migrator::finish_export_dir(CDir *dir, C_Contexts *fin, utime_t now) // inode? if (dn->is_primary()) { - finish_export_inode(in, fin); + finish_export_inode(in, finished); // subdirs? 
in->get_nested_dirfrags(subdirs); @@ -1027,7 +1026,7 @@ void Migrator::finish_export_dir(CDir *dir, C_Contexts *fin, utime_t now) // subdirs for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - finish_export_dir(*it, fin, now); + finish_export_dir(*it, finished, now); } class C_MDS_ExportFinishLogged : public Context { @@ -1241,7 +1240,7 @@ void Migrator::export_finish(CDir *dir) // finish export (adjust local cache state) C_Contexts *fin = new C_Contexts; - finish_export_dir(dir, fin, g_clock.now()); + finish_export_dir(dir, fin->contexts, g_clock.now()); dir->add_waiter(CDir::WAIT_UNFREEZE, fin); // unfreeze diff --git a/branches/sage/mds/mds/Migrator.h b/branches/sage/mds/mds/Migrator.h index ccfe2666d66ab..812cdfa26f97c 100644 --- a/branches/sage/mds/mds/Migrator.h +++ b/branches/sage/mds/mds/Migrator.h @@ -186,12 +186,12 @@ public: void encode_export_inode(CInode *in, bufferlist& enc_state, map& exported_client_map, utime_t now); - void finish_export_inode(CInode *in, C_Contexts *fin); + void finish_export_inode(CInode *in, list& finished); int encode_export_dir(list& dirstatelist, CDir *dir, map& exported_client_map, utime_t now); - void finish_export_dir(CDir *dir, class C_Contexts *fin, utime_t now); + void finish_export_dir(CDir *dir, list& finished, utime_t now); void add_export_finish_waiter(CDir *dir, Context *c) { export_finish_waiters[dir].push_back(c); diff --git a/branches/sage/mds/mds/Server.cc b/branches/sage/mds/mds/Server.cc index 0f015c429e4f7..4df9347319371 100644 --- a/branches/sage/mds/mds/Server.cc +++ b/branches/sage/mds/mds/Server.cc @@ -650,13 +650,6 @@ void Server::handle_slave_request(MMDSSlaveRequest *m) } break; - case MMDSSlaveRequest::OP_RENAMEGETINODEACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_rename_get_inode_ack(mdr, m); - } - break; - default: assert(0); } @@ -771,10 +764,6 @@ void Server::dispatch_slave_request(MDRequest *mdr) handle_slave_rename_prep(mdr); break; - case 
MMDSSlaveRequest::OP_RENAMEGETINODE: - handle_slave_rename_get_inode(mdr); - break; - case MMDSSlaveRequest::OP_FINISH: // finish off request. mdcache->request_finish(mdr); @@ -874,19 +863,19 @@ void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack) assert(object); // we pinned it dout(10) << " remote has pinned " << *object << dendl; if (!mdr->is_auth_pinned(object)) - mdr->auth_pins.insert(object); + mdr->remote_auth_pins.insert(object); pinned.insert(object); } // removed auth pins? - set::iterator p = mdr->auth_pins.begin(); - while (p != mdr->auth_pins.end()) { + set::iterator p = mdr->remote_auth_pins.begin(); + while (p != mdr->remote_auth_pins.end()) { if ((*p)->authority().first == from && pinned.count(*p) == 0) { dout(10) << " remote has unpinned " << **p << dendl; set::iterator o = p; ++p; - mdr->auth_pins.erase(o); + mdr->remote_auth_pins.erase(o); } else { ++p; } @@ -2097,10 +2086,8 @@ void Server::handle_slave_link_prep(MDRequest *mdr) } } - - inode_t *pi = dn->inode->project_inode(); - // update journaled target inode + inode_t *pi = dn->inode->project_inode(); bool inc; if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { inc = true; @@ -2766,6 +2753,13 @@ void Server::handle_client_rename(MDRequest *mdr) xlocks.insert(&destdn->lock); wrlocks.insert(&destdn->dir->inode->dirlock); + // xlock versionlock on srci if remote? + // this ensures it gets safely remotely auth_pinned, avoiding deadlock; + // strictly speaking, having the slave node freeze the inode is + // otherwise sufficient for avoiding conflicts with inode locks, etc. 
+ if (!srcdn->is_auth() && srcdn->is_primary()) + xlocks.insert(&srcdn->inode->versionlock); + // xlock oldin (for nlink--) if (oldin) xlocks.insert(&oldin->linklock); @@ -2826,62 +2820,34 @@ void Server::handle_client_rename(MDRequest *mdr) else witnesses.insert(srcdn->authority().first); destdn->list_replicas(witnesses); + dout(10) << " witnesses " << witnesses << ", have " << mdr->witnessed << dendl; + // do srcdn auth last + int last = -1; + if (!srcdn->is_auth()) + last = srcdn->authority().first; + for (set::iterator p = witnesses.begin(); p != witnesses.end(); ++p) { + if (*p == last) continue; // do it last! if (mdr->witnessed.count(*p)) { dout(10) << " already witnessed by mds" << *p << dendl; + } else if (mdr->waiting_on_slave.count(*p)) { + dout(10) << " already waiting on witness mds" << *p << dendl; } else { - dout(10) << " not yet witnessed by mds" << *p << ", sending prepare" << dendl; - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP); - srcdn->make_path(req->srcdnpath); - destdn->make_path(req->destdnpath); - req->now = mdr->now; - - if (straydn) { - CInodeDiscover *indis = straydn->dir->inode->replicate_to(*p); - CDirDiscover *dirdis = straydn->dir->replicate_to(*p); - CDentryDiscover *dndis = straydn->replicate_to(*p); - indis->_encode(req->stray); - dirdis->_encode(req->stray); - dndis->_encode(req->stray); - delete indis; - delete dirdis; - delete dndis; - } - - mds->send_message_mds(req, *p, MDS_PORT_SERVER); - - assert(mdr->waiting_on_slave.count(*p) == 0); - mdr->waiting_on_slave.insert(*p); + _rename_prepare_witness(mdr, *p, srcdn, destdn, straydn); } } if (!mdr->waiting_on_slave.empty()) return; // we're waiting for a witness. - // -- inode migration? 
-- - if (!srcdn->is_auth() && - srcdn->is_primary()) { - if (mdr->inode_import.length() == 0) { - // get inode - int auth = srcdn->authority().first; - dout(10) << " requesting inode export from srcdn auth mds" << auth << dendl; - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEGETINODE); - srcdn->make_path(req->srcdnpath); - mds->send_message_mds(req, auth, MDS_PORT_SERVER); - - assert(mdr->waiting_on_slave.count(auth) == 0); - mdr->waiting_on_slave.insert(auth); - return; - } else { - dout(10) << " already (just!) got inode export from srcdn auth" << dendl; - /*int off = 0; - mdcache->migrator->decode_import_inode(destdn, mdr->inode_import, off, - srcdn->authority().first); - srcdn->inode->force_auth.first = srcdn->authority().first; - */ - } + if (last >= 0 && + mdr->witnessed.count(last) == 0 && + mdr->waiting_on_slave.count(last) == 0) { + dout(10) << " preparing last witness (srcdn auth)" << dendl; + _rename_prepare_witness(mdr, last, srcdn, destdn, straydn); + return; } // -- prepare anchor updates -- @@ -2973,6 +2939,36 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe // helpers +void Server::_rename_prepare_witness(MDRequest *mdr, int who, CDentry *srcdn, CDentry *destdn, CDentry *straydn) +{ + dout(10) << "_rename_prepare_witness mds" << who << dendl; + MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP); + srcdn->make_path(req->srcdnpath); + destdn->make_path(req->destdnpath); + req->now = mdr->now; + + if (straydn) { + CInodeDiscover *indis = straydn->dir->inode->replicate_to(who); + CDirDiscover *dirdis = straydn->dir->replicate_to(who); + CDentryDiscover *dndis = straydn->replicate_to(who); + indis->_encode(req->stray); + dirdis->_encode(req->stray); + dndis->_encode(req->stray); + delete indis; + delete dirdis; + delete dndis; + } + + // srcdn auth will verify our current witness list is sufficient + req->witnesses = mdr->witnessed; + + 
mds->send_message_mds(req, who, MDS_PORT_SERVER); + + assert(mdr->waiting_on_slave.count(who) == 0); + mdr->waiting_on_slave.insert(who); +} + + void Server::_rename_prepare(MDRequest *mdr, EMetaBlob *metablob, CDentry *srcdn, CDentry *destdn, CDentry *straydn) @@ -3289,6 +3285,51 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) mdr->now = mdr->slave_request->now; + // set up commit waiter (early, to clean up any freezing etc we do) + if (!mdr->slave_commit) + mdr->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn); + + // am i srcdn auth? + if (srcdn->is_auth()) { + if (srcdn->is_primary() && + !srcdn->inode->is_freezing_inode() && + !srcdn->inode->is_frozen_inode()) { + // srci auth. + // set ambiguous auth. + srcdn->inode->state_set(CInode::STATE_AMBIGUOUSAUTH); + + // freeze? + // we need this to + // - avoid conflicting lock state changes + // - avoid concurrent updates to the inode + // (this could also be accomplished with the versionlock) + int allowance = 1; // for the versionlock and possible linklock xlock (both are tied to mdr) + dout(10) << " freezing srci " << *srcdn->inode << " with allowance " << allowance << dendl; + if (!srcdn->inode->freeze_inode(allowance)) { + srcdn->inode->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + } + + // is witness list sufficient? 
+ set srcdnrep; + srcdn->list_replicas(srcdnrep); + for (set::iterator p = srcdnrep.begin(); + p != srcdnrep.end(); + ++p) { + if (*p == mdr->slave_to_mds || + mdr->slave_request->witnesses.count(*p)) continue; + dout(10) << " witness list insufficient; providing srcdn replica list" << dendl; + MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); + reply->witnesses.swap(srcdnrep); + mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); + delete mdr->slave_request; + mdr->slave_request = 0; + return; + } + dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl; + } + // journal it? if (srcdn->is_auth() || (destdn->inode && destdn->inode->is_auth()) || @@ -3310,21 +3351,35 @@ void Server::_logged_slave_rename(MDRequest *mdr, { dout(10) << "_logged_slave_rename " << *mdr << dendl; - // ack + // prepare ack MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); - if (srcdn->is_auth()) { - // share the replica list, so that they can all witness the rename. - srcdn->list_replicas(reply->srcdn_replicas); + + // export srci? + if (srcdn->is_auth() && srcdn->is_primary()) { + list finished; + map exported_client_map; + bufferlist inodebl; + mdcache->migrator->encode_export_inode(srcdn->inode, inodebl, + exported_client_map, + mdr->now); + mdcache->migrator->finish_export_inode(srcdn->inode, finished); + mds->queue_waiters(finished); // this includes SINGLEAUTH waiters. 
+ ::_encode(exported_client_map, reply->inode_export); + reply->inode_export.claim_append(inodebl); + reply->inode_export_v = srcdn->inode->inode.version; + + // remove mdr auth pin + mdr->auth_unpin(srcdn->inode); + assert(!srcdn->inode->is_auth_pinned()); + + dout(10) << " exported srci " << *srcdn->inode << dendl; + } - // note srcdn, we'll get asked for inode momentarily - mdr->srcdn = srcdn; - } + // apply + _rename_apply(mdr, srcdn, destdn, straydn); mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - // set up commit waiter - mdr->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn); - // bump popularity //if (srcdn->is_auth()) //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); @@ -3341,50 +3396,74 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, { dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl; - mdr->ls = mdlog->get_current_segment(); + // unfreeze+singleauth inode + // hmm, do i really need to delay this? 
+ if (srcdn->is_auth() && destdn->is_primary()) { + dout(10) << " unfreezing exported inode " << *destdn->inode << dendl; + list finished; + + // singleauth + assert(destdn->inode->state_test(CInode::STATE_AMBIGUOUSAUTH)); + destdn->inode->state_clear(CInode::STATE_AMBIGUOUSAUTH); + destdn->inode->take_waiting(CInode::WAIT_SINGLEAUTH, finished); + + // unfreeze + assert(destdn->inode->is_frozen_inode() || + destdn->inode->is_freezing_inode()); + destdn->inode->unfreeze_inode(finished); + + mds->queue_waiters(finished); + } + + ESlaveUpdate *le; if (r == 0) { - // finish the inode export - if (mdr->inode_export) { - C_Contexts *fin = new C_Contexts; - mdcache->migrator->finish_export_inode(mdr->inode_export, fin); - mds->queue_waiter(fin); - } - - // commit - _rename_apply(mdr, srcdn, destdn, straydn); - // write a commit to the journal le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); + } else { // abort le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); + + // -- rollback in memory -- + + // *** WRITE ME *** + assert(0); + } + + + mdlog->submit_entry(le); } -void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) +void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack) { dout(10) << "handle_slave_rename_prep_ack " << *mdr - << " witnessed by " << m->get_source() - << " " << *m << dendl; - int from = m->get_source().num(); + << " witnessed by " << ack->get_source() + << " " << *ack << dendl; + int from = ack->get_source().num(); // note slave mdr->slaves.insert(from); - // witnessed! + // witnessed? or add extra witnesses? assert(mdr->witnessed.count(from) == 0); - mdr->witnessed.insert(from); - - - // add extra witnesses? 
- if (!m->srcdn_replicas.empty()) { - dout(10) << " extra witnesses (srcdn replicas) are " << m->srcdn_replicas << dendl; - mdr->extra_witnesses = m->srcdn_replicas; + if (ack->witnesses.empty()) { + mdr->witnessed.insert(from); + } else { + dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl; + mdr->extra_witnesses.swap(ack->witnesses); mdr->extra_witnesses.erase(mds->get_nodeid()); // not me! } + // srci import? + if (ack->inode_export.length()) { + dout(10) << " got srci import" << dendl; + mdr->inode_import.claim(ack->inode_export); + mdr->inode_import_v = ack->inode_export_v; + } + // remove from waiting list assert(mdr->waiting_on_slave.count(from)); mdr->waiting_on_slave.erase(from); @@ -3397,60 +3476,6 @@ void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) -void Server::handle_slave_rename_get_inode(MDRequest *mdr) -{ - dout(10) << "handle_slave_rename_get_inode " << *mdr << dendl; - - assert(mdr->srcdn); - assert(mdr->srcdn->is_auth()); - assert(mdr->srcdn->is_primary()); - - // reply - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEGETINODEACK); - dout(10) << " replying with inode export info " << *mdr->srcdn->inode << dendl; - - map exported_client_map; - bufferlist inodebl; - mdcache->migrator->encode_export_inode(mdr->srcdn->inode, inodebl, - exported_client_map, - mdr->now); - ::_encode(exported_client_map, reply->inode_export); - reply->inode_export.claim_append(inodebl); - reply->inode_export_v = mdr->srcdn->inode->inode.version; - - // take note of inode; we'll need to finish the export later! - mdr->inode_export = mdr->srcdn->inode; - - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - - // clean up. 
- delete mdr->slave_request; - mdr->slave_request = 0; -} - -void Server::handle_slave_rename_get_inode_ack(MDRequest *mdr, MMDSSlaveRequest *m) -{ - dout(10) << "handle_slave_rename_get_inode_ack " << *mdr - << " " << *m << dendl; - int from = m->get_source().num(); - - assert(m->inode_export.length()); - dout(10) << " got inode export, saving in " << *mdr << dendl; - mdr->inode_import.claim(m->inode_export); - mdr->inode_import_v = m->inode_export_v; - - assert(mdr->waiting_on_slave.count(from)); - mdr->waiting_on_slave.erase(from); - - if (mdr->waiting_on_slave.empty()) - dispatch_client_request(mdr); // go again! - else - dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << dendl; -} - - - - // =================================== diff --git a/branches/sage/mds/mds/Server.h b/branches/sage/mds/mds/Server.h index 2b95711bcaf1f..281fd13ca2593 100644 --- a/branches/sage/mds/mds/Server.h +++ b/branches/sage/mds/mds/Server.h @@ -163,6 +163,8 @@ public: CDentry *srcdn, CDentry *destdn, CDentry *straydn); // helpers + void _rename_prepare_witness(MDRequest *mdr, int who, + CDentry *srcdn, CDentry *destdn, CDentry *straydn); void _rename_prepare(MDRequest *mdr, EMetaBlob *metablob, CDentry *srcdn, CDentry *destdn, CDentry *straydn); @@ -173,8 +175,6 @@ public: void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void handle_slave_rename_get_inode(MDRequest *mdr); - void handle_slave_rename_get_inode_ack(MDRequest *mdr, MMDSSlaveRequest *m); }; diff --git a/branches/sage/mds/mds/mdstypes.h b/branches/sage/mds/mds/mdstypes.h index 3b4048dc8080e..a3112223f1028 100644 --- a/branches/sage/mds/mds/mdstypes.h +++ b/branches/sage/mds/mds/mdstypes.h @@ -386,6 +386,7 @@ class MDSCacheObject { const static int PIN_WAITER = 1004; const static int 
PIN_DIRTYSCATTERED = 1005; // make this neg if we start using multiple scatterlocks? static const int PIN_AUTHPIN = 1006; + static const int PIN_PTRWAITER = -1007; const char *generic_pin_name(int p) { switch (p) { @@ -396,6 +397,7 @@ class MDSCacheObject { case PIN_WAITER: return "waiter"; case PIN_DIRTYSCATTERED: return "dirtyscattered"; case PIN_AUTHPIN: return "authpin"; + case PIN_PTRWAITER: return "ptrwaiter"; default: assert(0); return 0; } } diff --git a/branches/sage/mds/messages/MMDSSlaveRequest.h b/branches/sage/mds/messages/MMDSSlaveRequest.h index 25ec82a86256f..5ef65223ec1c9 100644 --- a/branches/sage/mds/messages/MMDSSlaveRequest.h +++ b/branches/sage/mds/messages/MMDSSlaveRequest.h @@ -35,9 +35,6 @@ class MMDSSlaveRequest : public Message { static const int OP_RENAMEPREP = 7; static const int OP_RENAMEPREPACK = -7; - static const int OP_RENAMEGETINODE = 8; - static const int OP_RENAMEGETINODEACK = -8; - static const int OP_FINISH = 17; static const int OP_ABORT = 20; // used for recovery only @@ -58,8 +55,6 @@ class MMDSSlaveRequest : public Message { case OP_RENAMEPREP: return "rename_prep"; case OP_RENAMEPREPACK: return "rename_prep_ack"; - case OP_RENAMEGETINODE: return "rename_get_inode"; - case OP_RENAMEGETINODEACK: return "rename_get_inode_ack"; case OP_FINISH: return "finish"; // commit case OP_ABORT: return "abort"; @@ -84,9 +79,10 @@ class MMDSSlaveRequest : public Message { // for rename prep string srcdnpath; string destdnpath; - set srcdn_replicas; + set witnesses; bufferlist inode_export; version_t inode_export_v; + bufferlist srci_replica; utime_t now; bufferlist stray; // stray dir + dentry @@ -116,10 +112,11 @@ public: ::_encode_complex(authpins, payload); ::_encode(srcdnpath, payload); ::_encode(destdnpath, payload); - ::_encode(srcdn_replicas, payload); + ::_encode(witnesses, payload); ::_encode(now, payload); ::_encode(inode_export, payload); ::_encode(inode_export_v, payload); + ::_encode(srci_replica, payload); ::_encode(stray, 
payload); } void decode_payload() { @@ -131,10 +128,11 @@ public: ::_decode_complex(authpins, p); ::_decode_simple(srcdnpath, p); ::_decode_simple(destdnpath, p); - ::_decode_simple(srcdn_replicas, p); + ::_decode_simple(witnesses, p); ::_decode_simple(now, p); ::_decode_simple(inode_export, p); ::_decode_simple(inode_export_v, p); + ::_decode_simple(srci_replica, p); ::_decode_simple(stray, p); } -- 2.39.5