From 7b3497ecb70422980b71c2ed7910f0d326d3f9cd Mon Sep 17 00:00:00 2001 From: sageweil Date: Tue, 19 Jun 2007 23:51:52 +0000 Subject: [PATCH] * rename now generic, and works locally so far git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1430 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 5 +- .../sage/cephmds2/client/SyntheticClient.cc | 2 +- branches/sage/cephmds2/mds/CInode.cc | 8 +- branches/sage/cephmds2/mds/CInode.h | 17 - branches/sage/cephmds2/mds/Locker.cc | 77 +- branches/sage/cephmds2/mds/MDCache.cc | 52 +- branches/sage/cephmds2/mds/MDCache.h | 60 +- branches/sage/cephmds2/mds/Server.cc | 808 ++++++++---------- branches/sage/cephmds2/mds/Server.h | 36 +- .../sage/cephmds2/mds/events/ESlaveUpdate.h | 4 + .../sage/cephmds2/messages/MDiscoverReply.h | 8 +- .../sage/cephmds2/messages/MMDSSlaveRequest.h | 27 +- 12 files changed, 503 insertions(+), 601 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 8b996064351bf..0fb779285443a 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -47,9 +47,8 @@ sage doc sage mds -- dn unlock should take a CInodeDiscover payload with it? -- general issue of lock messages vs trimming... - +- slave request cleanup on failure + - flag request, and discard on re-dispatch? (cuz it'll be waiting on random stuff) - dirlock-protected mtime updates vs migration, journaling, recovery - rename fun diff --git a/branches/sage/cephmds2/client/SyntheticClient.cc b/branches/sage/cephmds2/client/SyntheticClient.cc index 7cbcf6300eb02..429142bcd538d 100644 --- a/branches/sage/cephmds2/client/SyntheticClient.cc +++ b/branches/sage/cephmds2/client/SyntheticClient.cc @@ -1360,7 +1360,7 @@ void SyntheticClient::make_dir_mess(const char *basedir, int n) void SyntheticClient::foo() { - if (1) { + if (0) { // rename fun for (int i=0; i<100; i++) { int s = 5; diff --git a/branches/sage/cephmds2/mds/CInode.cc b/branches/sage/cephmds2/mds/CInode.cc index 09a8ca1773b28..610cafe2fd3af 100644 --- a/branches/sage/cephmds2/mds/CInode.cc +++ b/branches/sage/cephmds2/mds/CInode.cc @@ -27,7 +27,7 @@ #include "messages/MLock.h" #include -#include +#include #include "config.h" #undef dout @@ -293,9 +293,9 @@ void CInode::make_anchor_trace(vector& trace) void CInode::name_stray_dentry(string& dname) { - stringstream ss; - ss << inode.ino; - ss >> dname; + char s[20]; + sprintf(s, "%ld", inode.ino.val); + dname = s; } diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h index 18f6cba34387a..1310b9c236706 100644 --- a/branches/sage/cephmds2/mds/CInode.h +++ b/branches/sage/cephmds2/mds/CInode.h @@ -233,8 +233,6 @@ protected: void mark_clean(); - - CInodeDiscover* replicate_to(int rep); @@ -382,17 +380,6 @@ public: // -- reference counting -- - - /* these can be pinned any # of times, and are - linked to an active_request, so they're automatically cleaned - up when a request is finished. pin at will! */ - void request_pin_get() { - get(PIN_REQUEST); - } - void request_pin_put() { - put(PIN_REQUEST); - } - void bad_put(int by) { dout(7) << " bad put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; assert(ref_set.count(by) == 1); @@ -407,10 +394,6 @@ public: // -- hierarchy stuff -- -private: - //void get_parent(); - //void put_parent(); - public: void set_primary_parent(CDentry *p) { assert(parent == 0); diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc index 3de229d1b8052..d4f2f7f208b73 100644 --- a/branches/sage/cephmds2/mds/Locker.cc +++ b/branches/sage/cephmds2/mds/Locker.cc @@ -160,7 +160,7 @@ bool Locker::acquire_locks(MDRequest *mdr, // wait object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); mdcache->request_drop_locks(mdr); - mdr->drop_auth_pins(); + mdr->drop_local_auth_pins(); return false; } mustpin_remote[object->authority().first].insert(object); @@ -170,7 +170,7 @@ bool Locker::acquire_locks(MDRequest *mdr, // wait object->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); mdcache->request_drop_locks(mdr); - mdr->drop_auth_pins(); + mdr->drop_local_auth_pins(); return false; } } @@ -181,7 +181,7 @@ bool Locker::acquire_locks(MDRequest *mdr, ++p) { MDSCacheObject *object = (*p)->get_parent(); if (mdr->is_auth_pinned(object)) { - dout(10) << " auth_pinned " << *object << endl; + dout(10) << " already auth_pinned " << *object << endl; } else if (object->is_auth()) { dout(10) << " auth_pinning " << *object << endl; mdr->auth_pin(object); @@ -212,7 +212,7 @@ bool Locker::acquire_locks(MDRequest *mdr, for (set::iterator p = rdlocks.begin(); p != rdlocks.end(); ++p) { - dout(20) << " will rdlock " << **p << " " << *(*p)->get_parent() << endl; + dout(20) << " must rdlock " << **p << " " << *(*p)->get_parent() << endl; sorted.insert(*p); } @@ -226,64 +226,61 @@ bool Locker::acquire_locks(MDRequest *mdr, // already locked? if (existing != mdr->locks.end() && *existing == *p) { // right kind? - SimpleLock *had = *existing; + SimpleLock *have = *existing; + existing++; if (xlocks.count(*p) && mdr->xlocks.count(*p)) { - dout(10) << "acquire_locks already xlocked " << *had << " " << *had->get_parent() << endl; - existing++; - continue; + dout(10) << " already xlocked " << *have << " " << *have->get_parent() << endl; } - if (wrlocks.count(*p) && mdr->wrlocks.count(*p)) { - dout(10) << "acquire_locks already wrlocked " << *had << " " << *had->get_parent() << endl; - existing++; - continue; + else if (wrlocks.count(*p) && mdr->wrlocks.count(*p)) { + dout(10) << " already wrlocked " << *have << " " << *have->get_parent() << endl; } - if (rdlocks.count(*p) && mdr->rdlocks.count(*p)) { - dout(10) << "acquire_locks already rdlocked " << *had << " " << *had->get_parent() << endl; - existing++; - continue; + else if (rdlocks.count(*p) && mdr->rdlocks.count(*p)) { + dout(10) << " already rdlocked " << *have << " " << *have->get_parent() << endl; } + else assert(0); + continue; } // hose any stray locks while (existing != mdr->locks.end()) { - SimpleLock *had = *existing; + SimpleLock *stray = *existing; existing++; - dout(10) << "acquire_locks unlocking out-of-order " << **existing - << " " << *(*existing)->get_parent() << endl; - if (mdr->xlocks.count(had)) - xlock_finish(had, mdr); - else if (mdr->wrlocks.count(had)) - wrlock_finish(had, mdr); + dout(10) << " unlocking out-of-order " << *stray << " " << *stray->get_parent() << endl; + if (mdr->xlocks.count(stray)) + xlock_finish(stray, mdr); + else if (mdr->wrlocks.count(stray)) + wrlock_finish(stray, mdr); else - rdlock_finish(had, mdr); + rdlock_finish(stray, mdr); } // lock if (xlocks.count(*p)) { if (!xlock_start(*p, mdr)) return false; - dout(10) << "acquire_locks got xlock on " << **p << " " << *(*p)->get_parent() << endl; + dout(10) << " got xlock on " << **p << " " << *(*p)->get_parent() << endl; } else if (wrlocks.count(*p)) { if (!wrlock_start(*p, mdr)) return false; - dout(10) << "acquire_locks got wrlock on " << **p << " " << *(*p)->get_parent() << endl; + dout(10) << " got wrlock on " << **p << " " << *(*p)->get_parent() << endl; } else { if (!rdlock_start(*p, mdr)) return false; - dout(10) << "acquire_locks got rdlock on " << **p << " " << *(*p)->get_parent() << endl; + dout(10) << " got rdlock on " << **p << " " << *(*p)->get_parent() << endl; } } // any extra unneeded locks? while (existing != mdr->locks.end()) { - dout(10) << "acquire_locks unlocking " << *existing - << " " << *(*existing)->get_parent() << endl; - if (mdr->xlocks.count(*existing)) - xlock_finish(*existing, mdr); - else if (mdr->wrlocks.count(*existing)) - wrlock_finish(*existing, mdr); + SimpleLock *stray = *existing; + existing++; + dout(10) << " unlocking extra " << *stray << " " << *stray->get_parent() << endl; + if (mdr->xlocks.count(stray)) + xlock_finish(stray, mdr); + else if (mdr->wrlocks.count(stray)) + wrlock_finish(stray, mdr); else - rdlock_finish(*existing, mdr); + rdlock_finish(stray, mdr); } return true; @@ -791,20 +788,24 @@ void Locker::handle_lock(MLock *m) assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); SimpleLock *lock = get_lock(m->get_lock_type(), m->get_object_info()); - assert(lock); + if (!lock) { + dout(10) << "don't have object " << m->get_object_info() << ", must have trimmed, dropping" << endl; + delete m; + return; + } - switch (m->get_lock_type()) { + switch (lock->get_type()) { case LOCK_OTYPE_DN: case LOCK_OTYPE_IAUTH: case LOCK_OTYPE_ILINK: case LOCK_OTYPE_IDIRFRAGTREE: handle_simple_lock(lock, m); break; - + case LOCK_OTYPE_IFILE: handle_file_lock((FileLock*)lock, m); break; - + case LOCK_OTYPE_IDIR: handle_scatter_lock((ScatterLock*)lock, m); break; diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 7da0a1f8831e7..c0319c32d6878 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -95,6 +95,7 @@ MDCache::MDCache(MDS *m) migrator = new Migrator(mds, this); // renamer = new Renamer(mds, this); root = NULL; + stray = NULL; lru.lru_set_max(g_conf.mds_cache_size); lru.lru_set_midpoint(g_conf.mds_cache_mid); @@ -289,11 +290,11 @@ void MDCache::open_foreign_stray(int who, Context *c) // discover filepath want; - MDiscover *req = new MDiscover(who, + MDiscover *req = new MDiscover(mds->get_nodeid(), ino, want, - false); - mds->send_message_mds(req, 0, MDS_PORT_CACHE); + false); // there _is_ no base dir for the stray inode + mds->send_message_mds(req, who, MDS_PORT_CACHE); // wait waiting_for_stray[ino].push_back(c); @@ -3511,7 +3512,6 @@ void MDCache::request_drop_locks(MDRequest *mdr) assert(mdr->trace.empty()); } - void MDCache::request_cleanup(MDRequest *mdr) { dout(15) << "request_cleanup " << *mdr << endl; @@ -3524,8 +3524,8 @@ void MDCache::request_cleanup(MDRequest *mdr) // drop locks request_drop_locks(mdr); - // drop auth pins - mdr->drop_auth_pins(); + // drop (local) auth pins + mdr->drop_local_auth_pins(); // drop cache pins for (set::iterator it = mdr->pins.begin(); @@ -3999,8 +3999,17 @@ void MDCache::handle_discover(MDiscover *dis) dout(10) << "added root " << *root << endl; cur = root; + } + else if (dis->get_base_ino() == MDS_INO_STRAY(whoami)) { + // wants root + dout(7) << "handle_discover from mds" << dis->get_asker() << " wants stray + " << dis->get_want().get_path() << endl; - } else { + reply->add_inode( stray->replicate_to( dis->get_asker() ) ); + dout(10) << "added stray " << *stray << endl; + + cur = stray; + } + else { // there's a base inode cur = get_inode(dis->get_base_ino()); @@ -4051,7 +4060,7 @@ void MDCache::handle_discover(MDiscover *dis) } else { // requester explicity specified the frag fg = dis->get_base_dir_frag(); - assert(dis->wants_base_dir() || dis->get_base_ino() == MDS_INO_ROOT); + assert(dis->wants_base_dir() || dis->get_base_ino() < MDS_INO_BASE); } CDir *curdir = cur->get_dirfrag(fg); @@ -4186,16 +4195,10 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) if (cur) { dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl; - } else { - if (!m->has_root()) { - dout(7) << "discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl; - delete m; - return; - } - + } + else if (m->get_base_ino() == MDS_INO_ROOT) { // it's the root inode. assert(!root); - assert(m->get_base_ino() == MDS_INO_ROOT); assert(!m->has_base_dentry()); assert(!m->has_base_dir()); @@ -4213,13 +4216,26 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) // take root waiters finished.swap(waiting_for_root); } + else if (MDS_INO_IS_STRAY(m->get_base_ino())) { + dout(7) << "discover_reply stray + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl; + + // add in root + cur = new CInode(this, false); + m->get_inode(0).update_inode(cur); // that thar 0 is an array index (the 0th inode in the reply). + add_inode( cur ); + dout(7) << "discover_reply got stray " << *cur << endl; + + // take waiters + finished.swap(waiting_for_stray[cur->ino()]); + waiting_for_stray.erase(cur->ino()); + } // fyi if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << endl; if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << endl; dout(10) << "depth = " << m->get_depth() << ", has base_dir/base_dn/root = " - << m->has_base_dir() << " / " << m->has_base_dentry() << " / " << m->has_root() + << m->has_base_dir() << " / " << m->has_base_dentry() << " / " << m->has_base_inode() << ", num dirs/dentries/inodes = " << m->get_num_dirs() << " / " << m->get_num_dentries() << " / " << m->get_num_inodes() << endl; @@ -4228,7 +4244,7 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) // indexese follow each ([[dir] dentry] inode) // can start, end with any type. - for (int i=m->has_root(); iget_depth(); i++) { + for (int i=m->has_base_inode(); iget_depth(); i++) { dout(10) << "discover_reply i=" << i << " cur " << *cur << endl; frag_t fg; diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index 3d9301966bb44..3678d7ecae9d4 100644 --- a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -58,6 +58,13 @@ class MMDSSlaveRequest; //typedef const char* pchar; +struct PVList { + map ls; + + version_t add(MDSCacheObject* o, version_t v) { + return ls[o] = v; + } +}; /** active_request_t * state we track for requests we are currently processing. @@ -95,35 +102,36 @@ struct MDRequest { map< inodeno_t, inode_t > projected_inode; - // remote dn pins - map< CDentry*, set > remote_dn_pinning; // [master] dn -> mds set it's pinning on - map< CDentry*, set > remote_dn_pins; // [master] dn -> mds set it's pinned on - bool waiting_on_remote_dn_pin; - int waiting_on_remote_auth_pin; // which mds? + // for rename + set extra_witnesses; // replica list from srcdn auth + set witnessed; // nodes who have journaled a RenamePrepare + utime_t now; + int waiting_on_remote_witness; + map pvmap; + // --------------------------------------------------- MDRequest() : client_request(0), ref(0), slave_request(0), slave_to_mds(-1), - waiting_on_remote_dn_pin(false), - waiting_on_remote_auth_pin(-1) { } + waiting_on_remote_auth_pin(-1), + waiting_on_remote_witness(-1) { } MDRequest(metareqid_t ri, MClientRequest *req) : reqid(ri), client_request(req), ref(0), slave_request(0), slave_to_mds(-1), - waiting_on_remote_dn_pin(false), - waiting_on_remote_auth_pin(-1) { } + waiting_on_remote_auth_pin(-1), + waiting_on_remote_witness(-1) { } MDRequest(metareqid_t ri, int by) : reqid(ri), client_request(0), ref(0), slave_request(0), slave_to_mds(by), - waiting_on_remote_dn_pin(false), - waiting_on_remote_auth_pin(-1) { } + waiting_on_remote_auth_pin(-1), + waiting_on_remote_witness(-1) { } - bool is_slave() { - return slave_to_mds >= 0; - } + bool is_master() { return slave_to_mds < 0; } + bool is_slave() { return slave_to_mds >= 0; } // pin items in cache void pin(MDSCacheObject *o) { @@ -143,22 +151,18 @@ struct MDRequest { auth_pins.insert(object); } } - void drop_auth_pins() { - for (set::iterator it = auth_pins.begin(); - it != auth_pins.end(); - it++) - (*it)->auth_unpin(); + void drop_local_auth_pins() { + set::iterator it = auth_pins.begin(); + while (it != auth_pins.end()) { + if ((*it)->is_auth()) { + (*it)->auth_unpin(); + auth_pins.erase(it++); + } else { + it++; + } + } auth_pins.clear(); } - - bool is_remote_pinning_dn(CDentry *dn, int who) { - return remote_dn_pinning.count(dn) && - remote_dn_pinning[dn].count(who); - } - bool is_remote_pinned_dn(CDentry *dn, int who) { - return remote_dn_pins.count(dn) && - remote_dn_pins[dn].count(who); - } }; inline ostream& operator<<(ostream& out, MDRequest &mdr) diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index 894973a356abf..01667b165a9c3 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -37,6 +37,7 @@ #include "events/EString.h" #include "events/EUpdate.h" +#include "events/ESlaveUpdate.h" #include "events/ESession.h" #include "events/EOpen.h" @@ -530,6 +531,7 @@ void Server::handle_slave_request(MMDSSlaveRequest *m) } break; + /* case MMDSSlaveRequest::OP_PINDNACK: { if (!mdcache->have_request(m->get_reqid())) @@ -559,6 +561,7 @@ void Server::handle_slave_request(MMDSSlaveRequest *m) } } break; + */ case MMDSSlaveRequest::OP_AUTHPINACK: { @@ -567,6 +570,13 @@ void Server::handle_slave_request(MMDSSlaveRequest *m) } break; + case MMDSSlaveRequest::OP_RENAMEPREPACK: + { + MDRequest *mdr = mdcache->request_get(m->get_reqid()); + handle_slave_rename_prep_ack(mdr, m); + } + break; + default: assert(0); } @@ -650,37 +660,11 @@ void Server::dispatch_slave_request(MDRequest *mdr) break; case MMDSSlaveRequest::OP_AUTHPIN: - { - handle_slave_auth_pin(mdr); - } + handle_slave_auth_pin(mdr); break; - case MMDSSlaveRequest::OP_PINDN: - case MMDSSlaveRequest::OP_UNPINDN: - // get the CDentry* - { - filepath path(mdr->slave_request->get_dnpath()); - dout(10) << "dnpath " << path << endl; - vector trace; - int r = mdcache->path_traverse(mdr, 0, path, trace, false, mdr->slave_request, - new C_MDS_RetryRequest(mdcache, mdr), - MDS_TRAVERSE_DISCOVERXLOCK, false, true); - if (r < 0) return; - - // pin final cdentry in cache - CDentry *dn = trace[trace.size()-1]; - dout(10) << "discovered and pinning " << *dn << endl; - mdr->pin(dn); - - // ack - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_PINDNACK); - dn->set_object_info(reply->get_object_info()); - mds->send_message_mds(reply, mdr->slave_request->get_source().num(), MDS_PORT_SERVER); - - // done. - delete mdr->slave_request; - mdr->slave_request = 0; - } + case MMDSSlaveRequest::OP_RENAMEPREP: + handle_slave_rename_prep(mdr); break; case MMDSSlaveRequest::OP_FINISH: @@ -729,7 +713,7 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) // wait dout(10) << " waiting for authpinnable on " << **p << endl; (*p)->add_waiter(CDir::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); - mdr->drop_auth_pins(); + mdr->drop_local_auth_pins(); return; } } @@ -737,7 +721,7 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) // auth pin! if (fail) { - mdr->drop_auth_pins(); // just in case + mdr->drop_local_auth_pins(); // just in case } else { for (list::iterator p = objects.begin(); p != objects.end(); @@ -2258,7 +2242,8 @@ void Server::_unlink_local_finish(MDRequest *mdr, // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref - + + // clean up? if (straydn) mdcache->eval_stray(straydn); } @@ -2338,80 +2323,48 @@ bool Server::_verify_rmdir(MDRequest *mdr, CInode *in) +// ====================================================== -// RENAME - -class C_MDS_RenameTraverseDst : public Context { +class C_MDS_rename_anchor : public Context { Server *server; - MDRequest *mdr; - CInode *srci; - CDir *srcdir; - CDentry *srcdn; - filepath destpath; public: - vector trace; + LogEvent *le; + C_MDS_rename_finish *fin; + version_t atid1; + version_t atid2; - C_MDS_RenameTraverseDst(Server *server, - MDRequest *r, - CDentry *srcdn, - filepath& destpath) { - this->server = server; - this->mdr = r; - this->srcdn = srcdn; - this->destpath = destpath; + C_MDS_rename_anchor(Server *s) : server(s), le(0), fin(0), atid1(0), atid2(0) { } + void finish(int r) { + server->_rename_reanchored(le, fin, atid1, atid2); } +}; + + +class C_MDS_rename_finish : public Context { + MDS *mds; + MDRequest *mdr; + CDentry *srcdn; + CDentry *destdn; + CDentry *straydn; +public: + version_t atid1; + version_t atid2; + C_MDS_rename_finish(MDS *m, MDRequest *r, + CDentry *sdn, CDentry *ddn, CDentry *stdn) : + mds(m), mdr(r), + srcdn(sdn), destdn(ddn), straydn(stdn), + atid1(0), atid2(0) { } void finish(int r) { - server->handle_client_rename_2(mdr, - srcdn, destpath, - trace, r); + assert(r == 0); + mds->server->_rename_finish(mdr, srcdn, destdn, straydn, + atid1, atid2); } }; /** handle_client_rename * - * NOTE: caller did not path_pin the ref (srcdir) inode, as it normally does. - * - - weirdness iwith rename: - - ref inode is what was originally srcdiri, but that may change by the time - the rename actually happens. for all practical purpose, ref is useless except - for C_MDS_RetryRequest - */ - -bool Server::_rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MDRequest *mdr) -{ - // xlocked? - if (dn && !dn->lock.can_rdlock(mdr)) { - dout(10) << "_rename_open_dn waiting on " << *dn << endl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - if (mustexist && - ((dn && dn->is_null()) || - (!dn && dir->is_complete()))) { - dout(10) << "_rename_open_dn dn dne in " << *dir << endl; - reply_request(mdr, -ENOENT); - return false; - } - - if (!dn && !dir->is_complete()) { - dout(10) << "_rename_open_dn readding incomplete dir" << endl; - dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - assert(dn && !dn->is_null()); - - dout(10) << "_rename_open_dn dn is " << *dn << endl; - CInode *in = mdcache->get_dentry_inode(dn, mdr); - if (!in) return false; - dout(10) << "_rename_open_dn inode is " << *in << endl; - - return true; -} - void Server::handle_client_rename(MDRequest *mdr) { MClientRequest *req = mdr->client_request; @@ -2443,9 +2396,9 @@ void Server::handle_client_rename(MDRequest *mdr) return; } CDentry *srcdn = srctrace[srctrace.size()-1]; - dout(10) << "srcdn is " << *srcdn << endl; + dout(10) << " srcdn " << *srcdn << endl; CInode *srci = mdcache->get_dentry_inode(srcdn, mdr); - dout(10) << "srci is " << *srci << endl; + dout(10) << " srci " << *srci << endl; // -- some sanity checks -- // src == dest? @@ -2477,10 +2430,10 @@ void Server::handle_client_rename(MDRequest *mdr) CInode *oldin = 0; if (destdn && !destdn->is_null()) { - dout(10) << "dest dn exists " << *destdn << endl; + //dout(10) << "dest dn exists " << *destdn << endl; oldin = mdcache->get_dentry_inode(destdn, mdr); if (!oldin) return; - dout(10) << "oldin " << *oldin << endl; + dout(10) << " oldin " << *oldin << endl; // mv /some/thing /to/some/existing_other_thing if (oldin->is_dir() && !srci->is_dir()) { @@ -2502,7 +2455,7 @@ void Server::handle_client_rename(MDRequest *mdr) if (!destdn) return; } - dout(10) << "destdn " << *destdn << endl; + dout(10) << " destdn " << *destdn << endl; // -- locks -- @@ -2520,174 +2473,166 @@ void Server::handle_client_rename(MDRequest *mdr) xlocks.insert(&destdn->lock); wrlocks.insert(&destdn->dir->inode->dirlock); - // xlock oldin + // xlock oldin (for nlink--) if (oldin) xlocks.insert(&oldin->linklock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; - - // ok go! - if (srcdn->is_auth() && destdn->is_auth()) - _rename_local(mdr, srcdn, destdn); - else { - // _rename_remote(mdr, srcdn, destdn); - reply_request(mdr, -EXDEV); - return; - } -} - + // -- declare now -- + if (mdr->now == utime_t()) + mdr->now = g_clock.real_now(); + // -- prepare witnesses -- + set witnesses = mdr->extra_witnesses; + srcdn->list_replicas(witnesses); + destdn->list_replicas(witnesses); -class C_MDS_rename_local_finish : public Context { - MDS *mds; - MDRequest *mdr; - CDentry *srcdn; - CDentry *destdn; - CDentry *straydn; - version_t ipv; - version_t straypv; - version_t destpv; - version_t srcpv; - version_t ddirpv, sdirpv; - utime_t ictime; -public: - version_t atid1; - version_t atid2; - C_MDS_rename_local_finish(MDS *m, MDRequest *r, - CDentry *sdn, CDentry *ddn, CDentry *stdn, - version_t v, version_t ddirpv_, version_t sdirpv_, utime_t ct) : - mds(m), mdr(r), - srcdn(sdn), destdn(ddn), straydn(stdn), - ipv(v), - straypv(straydn ? straydn->get_projected_version():0), - destpv(destdn->get_projected_version()), - srcpv(srcdn->get_projected_version()), - ddirpv(ddirpv_), sdirpv(sdirpv_), - ictime(ct), - atid1(0), atid2(0) { } - void finish(int r) { - assert(r == 0); - mds->server->_rename_local_finish(mdr, srcdn, destdn, straydn, - srcpv, destpv, straypv, ipv, ddirpv, sdirpv, ictime, - atid1, atid2); + for (set::iterator p = witnesses.begin(); + p != witnesses.end(); + ++p) { + if (mdr->witnessed.count(*p)) { + dout(10) << " already witnessed by mds" << *p << endl; + } else { + dout(10) << " not yet witnessed by mds" << *p << ", sending prepare" << endl; + MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP); + srcdn->make_path(req->srcdnpath); + destdn->make_path(req->destdnpath); + req->now = mdr->now; + mds->send_message_mds(req, *p, MDS_PORT_SERVER); + mdr->waiting_on_remote_witness = *p; + return; + } } -}; -class C_MDS_rename_local_anchor : public Context { - Server *server; -public: - LogEvent *le; - C_MDS_rename_local_finish *fin; - version_t atid1; - version_t atid2; + // -- prepare journal entry -- + EUpdate *le = new EUpdate("rename"); + le->metablob.add_client_req(mdr->reqid); - C_MDS_rename_local_anchor(Server *s) : server(s), le(0), fin(0), atid1(0), atid2(0) { } - void finish(int r) { - server->_rename_local_reanchored(le, fin, atid1, atid2); - } -}; + CDentry *straydn = _rename_prepare(mdr, &le->metablob, srcdn, destdn); -bool Server::_rename_pin_dn_on_replicas(MDRequest *mdr, CDentry *dn, inodeno_t reltoino, set& ls) -{ - dout(10) << "_rename_pin_dn_on_replicas " << ls << " " << *dn << endl; + // -- prepare anchor updates -- + C_MDS_rename_anchor *anchorfin = 0; + C_Gather *anchorgather = 0; - bool ok = true; + bool linkmerge = (srcdn->inode == destdn->inode && + (srcdn->is_primary() || destdn->is_primary())); - for (set::iterator p = ls.begin(); - p != ls.end(); - ++p) { - if (mdr->is_remote_pinned_dn(dn, *p)) { - dout(10) << "_rename_pin_dn_on_replicas already pinned on mds" << *p << " " << *dn << endl; - continue; + if (!linkmerge) { + if (srcdn->is_primary() && srcdn->inode->is_anchored() && + srcdn->dir != destdn->dir) { + dout(10) << "reanchoring src->dst " << *srcdn->inode << endl; + vector trace; + destdn->make_anchor_trace(trace, srcdn->inode); + + anchorfin = new C_MDS_rename_anchor(this); + anchorgather = new C_Gather(anchorfin); + mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &anchorfin->atid1, + anchorgather->new_sub()); } - if (mdr->is_remote_pinning_dn(dn, *p)) { - ok = false; - dout(10) << "_rename_pin_dn_on_replicas already pinning on mds" << *p << " " << *dn << endl; - continue; + if (destdn->is_primary() && + destdn->inode->is_anchored()) { + dout(10) << "reanchoring dst->stray " << *destdn->inode << endl; + vector trace; + straydn->make_anchor_trace(trace, destdn->inode); + + if (!anchorfin) { + anchorfin = new C_MDS_rename_anchor(this); + anchorgather = new C_Gather(anchorfin); + } + mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &anchorfin->atid2, + anchorgather->new_sub()); } + } - // remote pin. - dout(10) << "_rename_pin_dn_on_replicas pinning on mds" << *p << " " << *dn << endl; - ok = false; - mdr->remote_dn_pinning[dn].insert(*p); + // -- commit locally -- + C_MDS_rename_finish *fin = new C_MDS_rename_finish(mds, mdr, srcdn, destdn, straydn); - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_PINDN); - string dnpath; - dn->make_path(dnpath, reltoino); - r->set_dnpath(dnpath, reltoino); - mds->send_message_mds(r, *p, MDS_PORT_SERVER); - } + journal_opens(); // journal pending opens, just in case - return ok; + if (anchorfin) { + // doing anchor update prepare first + anchorfin->fin = fin; + anchorfin->le = le; + } else { + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); + } } -void Server::_rename_local(MDRequest *mdr, - CDentry *srcdn, - CDentry *destdn) +void Server::_rename_reanchored(LogEvent *le, C_MDS_rename_finish *fin, + version_t atid1, version_t atid2) { - dout(10) << "_rename_local " << *srcdn << " to " << *destdn << endl; + dout(10) << "_rename_reanchored, logging " << *le << endl; + + // note anchor transaction ids + fin->atid1 = atid1; + fin->atid2 = atid2; - // propagate dest dir to any witnesses - inodeno_t baseino = mdr->client_request->get_cwd_ino(); - set need_to_pin; - srcdn->list_replicas(need_to_pin); - destdn->list_replicas(need_to_pin); - if (!_rename_pin_dn_on_replicas(mdr, srcdn, baseino, need_to_pin) || - !_rename_pin_dn_on_replicas(mdr, destdn, baseino, need_to_pin)) { - mdr->waiting_on_remote_dn_pin = true; - return; // note: no explicit waiter, so we can || these checks, yay. - } + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); +} - // let's go. - EUpdate *le = new EUpdate("rename_local"); - le->metablob.add_client_req(mdr->reqid); +void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn, + version_t atid1, version_t atid2) +{ + dout(10) << "_rename_finish " << *mdr << endl; - utime_t now = g_clock.real_now(); + // apply + _rename_apply(mdr, srcdn, destdn, straydn); + + // commit anchor updates? + if (atid1) mds->anchorclient->commit(atid1); + if (atid2) mds->anchorclient->commit(atid2); - CDentry *straydn = 0; - inode_t *pi = 0; - version_t ipv = 0; + // reply + MClientReply *reply = new MClientReply(mdr->client_request, 0); + reply_request(mdr, reply, destdn->dir->get_inode()); // FIXME: imprecise ref - C_MDS_rename_local_anchor *anchorfin = 0; - C_Gather *anchorgather = 0; + // clean up? + if (straydn) + mdcache->eval_stray(straydn); +} + + + +// helpers + +CDentry *Server::_rename_prepare(MDRequest *mdr, + EMetaBlob *metablob, + CDentry *srcdn, CDentry *destdn) +{ + dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << endl; // primary+remote link merge? bool linkmerge = (srcdn->inode == destdn->inode && (srcdn->is_primary() || destdn->is_primary())); - // dir mtimes - version_t ddirpv = predirty_dn_diri(destdn, &le->metablob, now); - version_t sdirpv = 0; - if (destdn->dir != srcdn->dir) - sdirpv = predirty_dn_diri(srcdn, &le->metablob, now); - + inode_t *pi = 0; // inode getting nlink-- + version_t ipv; // it's version + CDentry *straydn = 0; + if (linkmerge) { dout(10) << "will merge remote+primary links" << endl; // destdn -> primary - le->metablob.add_dir_context(destdn->dir); - ipv = destdn->pre_dirty(destdn->inode->inode.version); - pi = le->metablob.add_primary_dentry(destdn, true, destdn->inode); + metablob->add_dir_context(destdn->dir); + if (destdn->is_auth()) + ipv = mdr->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version); + pi = metablob->add_primary_dentry(destdn, true, destdn->inode); // do src dentry - le->metablob.add_dir_context(srcdn->dir); - srcdn->pre_dirty(); - le->metablob.add_null_dentry(srcdn, true); - - // anchor update? - if (srcdn->is_primary() && srcdn->inode->is_anchored() && - srcdn->dir != destdn->dir) { - dout(10) << "reanchoring src->dst " << *srcdn->inode << endl; - vector trace; - destdn->make_anchor_trace(trace, srcdn->inode); - anchorfin = new C_MDS_rename_local_anchor(this); - mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &anchorfin->atid1, anchorfin); - } + metablob->add_dir_context(srcdn->dir); + if (srcdn->is_auth()) + mdr->pvmap[srcdn] = srcdn->pre_dirty(); + metablob->add_null_dentry(srcdn, true); } else { + // move to stray? if (destdn->is_primary()) { // primary. @@ -2698,29 +2643,21 @@ void Server::_rename_local(MDRequest *mdr, CDir *straydir = mdcache->get_stray()->get_or_open_dirfrag(mdcache, fg); straydn = straydir->add_dentry(straydname, 0); dout(10) << "straydn is " << *straydn << endl; - - // renanchor? - if (destdn->inode->is_anchored()) { - dout(10) << "reanchoring dst->stray " << *destdn->inode << endl; - vector trace; - straydn->make_anchor_trace(trace, destdn->inode); - anchorfin = new C_MDS_rename_local_anchor(this); - anchorgather = new C_Gather(anchorfin); - mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &anchorfin->atid1, - anchorgather->new_sub()); - } + mdr->pin(straydn); // link-- inode, move to stray dir. - le->metablob.add_dir_context(straydn->dir); - ipv = straydn->pre_dirty(destdn->inode->inode.version); - pi = le->metablob.add_primary_dentry(straydn, true, destdn->inode); + metablob->add_dir_context(straydn->dir); + if (straydn->is_auth()) + ipv = mdr->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version); + pi = metablob->add_primary_dentry(straydn, true, destdn->inode); } else if (destdn->is_remote()) { // remote. // nlink-- targeti - le->metablob.add_dir_context(destdn->inode->get_parent_dir()); - ipv = destdn->inode->pre_dirty(); - pi = le->metablob.add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary + metablob->add_dir_context(destdn->inode->get_parent_dir()); + if (destdn->is_auth()) + ipv = mdr->pvmap[destdn->inode] = destdn->inode->pre_dirty(); + pi = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary dout(10) << "remote targeti (nlink--) is " << *destdn->inode << endl; } else { @@ -2728,84 +2665,43 @@ void Server::_rename_local(MDRequest *mdr, } // add dest dentry - le->metablob.add_dir_context(destdn->dir); + metablob->add_dir_context(destdn->dir); if (srcdn->is_primary()) { dout(10) << "src is a primary dentry" << endl; - destdn->pre_dirty(srcdn->inode->inode.version); - le->metablob.add_primary_dentry(destdn, true, srcdn->inode); - - if (srcdn->inode->is_anchored()) { - dout(10) << "reanchoring src->dst " << *srcdn->inode << endl; - vector trace; - destdn->make_anchor_trace(trace, srcdn->inode); - if (!anchorfin) anchorfin = new C_MDS_rename_local_anchor(this); - if (!anchorgather) anchorgather = new C_Gather(anchorfin); - mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &anchorfin->atid2, - anchorgather->new_sub()); - - } + if (destdn->is_auth()) + mdr->pvmap[destdn] = destdn->pre_dirty(srcdn->inode->inode.version); + metablob->add_primary_dentry(destdn, true, srcdn->inode); + } else { assert(srcdn->is_remote()); dout(10) << "src is a remote dentry" << endl; - destdn->pre_dirty(); - le->metablob.add_remote_dentry(destdn, true, srcdn->get_remote_ino()); + if (destdn->is_auth()) + mdr->pvmap[destdn] = destdn->pre_dirty(); + metablob->add_remote_dentry(destdn, true, srcdn->get_remote_ino()); } // remove src dentry - le->metablob.add_dir_context(srcdn->dir); - srcdn->pre_dirty(); - le->metablob.add_null_dentry(srcdn, true); + metablob->add_dir_context(srcdn->dir); + if (srcdn->is_auth()) + mdr->pvmap[srcdn] = srcdn->pre_dirty(); + metablob->add_null_dentry(srcdn, true); } if (pi) { // update journaled target inode pi->nlink--; - pi->ctime = now; + pi->ctime = mdr->now; pi->version = ipv; } - C_MDS_rename_local_finish *fin = new C_MDS_rename_local_finish(mds, mdr, - srcdn, destdn, straydn, - ipv, ddirpv, sdirpv, now); - - journal_opens(); // journal pending opens, just in case - - if (anchorfin) { - // doing anchor update prepare first - anchorfin->fin = fin; - anchorfin->le = le; - } else { - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - } -} - - -void Server::_rename_local_reanchored(LogEvent *le, C_MDS_rename_local_finish *fin, - version_t atid1, version_t atid2) -{ - dout(10) << "_rename_local_reanchored, logging " << *le << endl; - - // note anchor transaction ids - fin->atid1 = atid1; - fin->atid2 = atid2; - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + return straydn; } -void Server::_rename_local_finish(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn, - version_t srcpv, version_t destpv, version_t straypv, version_t ipv, - version_t ddirpv, version_t sdirpv, - utime_t ictime, - version_t atid1, version_t atid2) +void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) { - MClientRequest *req = mdr->client_request; - dout(10) << "_rename_local_finish " << *req << endl; + dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << endl; + dout(10) << " pvs " << mdr->pvmap << endl; CInode *oldin = destdn->inode; @@ -2814,24 +2710,26 @@ void Server::_rename_local_finish(MDRequest *mdr, (srcdn->is_primary() || destdn->is_primary())); // dir mtimes + /* dirty_dn_diri(destdn, ddirpv, ictime); if (destdn->dir != srcdn->dir) dirty_dn_diri(srcdn, sdirpv, ictime); - - if (linkmerge) { - assert(ipv); + */ + if (linkmerge) { if (destdn->is_primary()) { dout(10) << "merging remote onto primary link" << endl; // nlink-- in place destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = ictime; - destdn->inode->mark_dirty(destpv); + destdn->inode->inode.ctime = mdr->now; + if (destdn->inode->is_auth()) + destdn->inode->mark_dirty(mdr->pvmap[destdn]); // unlink srcdn srcdn->dir->unlink_inode(srcdn); - srcdn->mark_dirty(srcpv); + if (srcdn->is_auth()) + srcdn->mark_dirty(mdr->pvmap[srcdn]); } else { dout(10) << "merging primary onto remote link" << endl; assert(srcdn->is_primary()); @@ -2843,30 +2741,49 @@ void Server::_rename_local_finish(MDRequest *mdr, // nlink-- destdn->inode->inode.nlink--; - destdn->inode->inode.ctime = ictime; - destdn->inode->mark_dirty(destpv); + destdn->inode->inode.ctime = mdr->now; + if (destdn->inode->is_auth()) + destdn->inode->mark_dirty(mdr->pvmap[destdn]); // mark src dirty - srcdn->mark_dirty(srcpv); + if (srcdn->is_auth()) + srcdn->mark_dirty(mdr->pvmap[srcdn]); } } else { + // straydn? + if (destdn->is_primary() && !straydn) { + string straydname; + destdn->inode->name_stray_dentry(straydname); + frag_t fg = mdcache->get_stray()->pick_dirfrag(straydname); + CDir *straydir = mdcache->get_stray()->get_dirfrag(fg); + straydn = straydir->lookup(straydname); + } + // unlink destdn? if (!destdn->is_null()) destdn->dir->unlink_inode(destdn); - + if (straydn) { - // relink oldin to stray dir + dout(10) << "straydn is " << *straydn << endl; + + // relink oldin to stray dir. destdn was primary. assert(oldin); straydn->dir->link_inode(straydn, oldin); - assert(straypv == ipv); + //assert(straypv == ipv); + + // nlink-- in stray dir. + oldin->inode.nlink--; + oldin->inode.ctime = mdr->now; + if (oldin->is_auth()) + oldin->mark_dirty(mdr->pvmap[straydn]); } - - if (oldin) { - // nlink-- + else if (oldin) { + // nlink-- remote. destdn was remote. oldin->inode.nlink--; - oldin->inode.ctime = ictime; - oldin->mark_dirty(ipv); + oldin->inode.ctime = mdr->now; + if (oldin->is_auth()) + oldin->mark_dirty(mdr->pvmap[oldin]); } CInode *in = srcdn->inode; @@ -2878,187 +2795,140 @@ void Server::_rename_local_finish(MDRequest *mdr, srcdn->dir->unlink_inode(srcdn); destdn->dir->link_inode(destdn, in); } - destdn->mark_dirty(destpv); - srcdn->mark_dirty(srcpv); + if (destdn->is_auth()) + destdn->mark_dirty(mdr->pvmap[destdn]); + if (srcdn->is_auth()) + srcdn->mark_dirty(mdr->pvmap[srcdn]); } - // commit anchor updates? - if (atid1) mds->anchorclient->commit(atid1); - if (atid2) mds->anchorclient->commit(atid2); - // update subtree map? if (destdn->inode->is_dir()) mdcache->adjust_subtree_after_rename(destdn->inode, srcdn->dir); +} - // share news with replicas - // *** - // reply - MClientReply *reply = new MClientReply(req, 0); - reply_request(mdr, reply, destdn->dir->get_inode()); // FIXME: imprecise ref - // clean up? - if (straydn) - mdcache->eval_stray(straydn); -} +// ------------ +// SLAVE +class C_MDS_SlaveRenamePrep : public Context { + Server *server; + MDRequest *mdr; + CDentry *srcdn; +public: + C_MDS_SlaveRenamePrep(Server *s, MDRequest *m, CDentry *d) : server(s), mdr(m), srcdn(d) {} + void finish(int r) { + server->_logged_slave_rename_prep(mdr, srcdn); + } +}; -/* -void Server::handle_client_rename_local(MClientRequest *req, - CInode *ref, - const string& srcpath, - CInode *srcdiri, - CDentry *srcdn, - const string& destpath, - CDir *destdir, - CDentry *destdn, - const string& destname) +void Server::handle_slave_rename_prep(MDRequest *mdr) { -*/ - //bool everybody = false; - //if (true || srcdn->inode->is_dir()) { - /* overkill warning: lock w/ everyone for simplicity. FIXME someday! along with the foreign rename crap! - i could limit this to cases where something beneath me is exported. - could possibly limit the list. (maybe.) - Underlying constraint is that, regardless of the order i do the xlocks, and whatever - imports/exports might happen in the process, the destdir _must_ exist on any node - importing something beneath me when rename finishes, or else mayhem ensues when - their import is dangling in the cache. - */ - /* - having made a proper mess of this on the first pass, here is my plan: + dout(10) << "handle_slave_rename_prep " << *mdr + << " " << mdr->slave_request->srcdnpath + << " to " << mdr->slave_request->destdnpath + << endl; + + // discover destdn + filepath destpath(mdr->slave_request->destdnpath); + dout(10) << " dest " << destpath << endl; + vector trace; + int r = mdcache->path_traverse(mdr, 0, destpath, trace, false, mdr->slave_request, + new C_MDS_RetryRequest(mdcache, mdr), + MDS_TRAVERSE_DISCOVERXLOCK, false, true); + if (r > 0) return; + assert(r == 0); // we shouldn't get an error here! - - xlocks of src, dest are done in lex order - - xlock is optional.. if you have the dentry, lock it, if not, don't. - - if you discover an xlocked dentry, you get the xlock. - - possible trouble: - - you have an import beneath the source, and don't have the dest dir. - - when the actual rename happens, you discover the dest - - actually, do this on any open dir, so we don't detach whole swaths - of our cache. + CDentry *destdn = trace[trace.size()-1]; + dout(10) << " destdn " << *destdn << endl; + mdr->pin(destdn); - notes: - - xlocks are initiated from authority, as are discover_replies, so replicas are - guaranteed to either not have dentry, or to have it xlocked. - - - - foreign xlocks are eventually unraveled by the initiator on success or failure. - - todo to make this work: - - hose bool everybody param crap - /- make handle_lock_dn not discover, clean up cases - /- put dest path in MRenameNotify - /- make rename_notify discover if its a dir - / - this will catch nested imports too, obviously - /- notify goes to merged list on local rename - /- notify goes to everybody on a foreign rename - /- handle_notify needs to gracefully ignore spurious notifies - */ - //dout(7) << "handle_client_rename_local: overkill? doing xlocks with _all_ nodes" << endl; - //everybody = true; - //} -/* - bool srclocal = srcdn->dir->dentry_authority(srcdn->name).first == mds->get_nodeid(); - bool destlocal = destdir->dentry_authority(destname).first == mds->get_nodeid(); - - dout(7) << "handle_client_rename_local: src local=" << srclocal << " " << *srcdn << endl; - if (destdn) { - dout(7) << "handle_client_rename_local: dest local=" << destlocal << " " << *destdn << endl; - } else { - dout(7) << "handle_client_rename_local: dest local=" << destlocal << " dn dne yet" << endl; - } - - // lock source and dest dentries, in lexicographic order. - bool dosrc = srcpath < destpath; - for (int i=0; i<2; i++) { - if (dosrc) { - - // src - if (srclocal) { - if (!srcdn->is_xlockedbyme(req) && - !mds->locker->dentry_xlock_start(srcdn, req, ref)) - return; - } else { - if (!srcdn || srcdn->xlockedby != req) { - mds->locker->dentry_xlock_request(srcdn->dir, srcdn->name, false, req, new C_MDS_RetryRequest(mds, req, ref)); - return; - } - } - dout(7) << "handle_client_rename_local: srcdn is xlock " << *srcdn << endl; + // discover srcdn + filepath srcpath(mdr->slave_request->srcdnpath); + dout(10) << " src " << srcpath << endl; + r = mdcache->path_traverse(mdr, 0, srcpath, trace, false, mdr->slave_request, + new C_MDS_RetryRequest(mdcache, mdr), + MDS_TRAVERSE_DISCOVERXLOCK, false, true); + if (r > 0) return; + assert(r == 0); // we shouldn't get an error here! - } else { - - if (destlocal) { - // dest - if (!destdn) destdn = destdir->add_dentry(destname); - if (!destdn->is_xlockedbyme(req) && - !mds->locker->dentry_xlock_start(destdn, req, ref)) { - if (destdn->is_clean() && destdn->is_null() && destdn->is_sync()) destdir->remove_dentry(destdn); - return; - } - } else { - if (!destdn || destdn->xlockedby != req) { - // NOTE: require that my xlocked item be a leaf/file, NOT a dir. in case - // my traverse and determination of dest vs dest/srcfilename was out of date. - mds->locker->dentry_xlock_request(destdir, destname, true, req, new C_MDS_RetryRequest(mds, req, ref)); - return; - } - } - dout(7) << "handle_client_rename_local: destdn is xlock " << *destdn << endl; + CDentry *srcdn = trace[trace.size()-1]; + dout(10) << " srcdn " << *srcdn << endl; + mdr->pin(srcdn); + // open destdn stray? + CDentry *straydn = 0; + if (destdn->is_primary()) { + CInode *dstray = mdcache->get_inode(MDS_INO_STRAY(mdr->slave_to_mds)); + if (!dstray) { + mdcache->open_foreign_stray(mdr->slave_to_mds, new C_MDS_RetryRequest(mdcache, mdr)); + return; } - dosrc = !dosrc; + string straydname; + destdn->inode->name_stray_dentry(straydname); + frag_t fg = dstray->pick_dirfrag(straydname); + CDir *straydir = dstray->get_dirfrag(fg); + if (!straydir) { + mdcache->open_remote_dir(dstray, fg, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + + straydn = straydir->add_dentry(straydname, 0); + dout(10) << " straydn is " << *straydn << endl; } + // journal it + ESlaveUpdate *le = new ESlaveUpdate("rename_prep", mdr->reqid, ESlaveUpdate::OP_PREPARE); - // final check: verify if dest exists that src is a file + mdr->now = mdr->slave_request->now; + _rename_prepare(mdr, &le->metablob, srcdn, destdn); - // FIXME: is this necessary? + mds->mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn)); +} - if (destdn->inode) { - if (destdn->inode->is_dir()) { - dout(7) << "handle_client_rename_local failing, dest exists and is a dir: " << *destdn->inode << endl; - assert(0); - reply_request(req, -EINVAL); - return; - } - if (srcdn->inode->is_dir()) { - dout(7) << "handle_client_rename_local failing, dest exists and src is a dir: " << *destdn->inode << endl; - assert(0); - reply_request(req, -EINVAL); - return; - } - } else { - // if destdn->inode is null, then we know it's a non-existent dest, - // why? because if it's local, it dne. and if it's remote, we xlocked with - // REQXLOCKC, which will only allow you to lock a file. - // so we know dest is a file, or non-existent - if (!destlocal) { - if (srcdn->inode->is_dir()) { - // help: maybe the dest exists and is a file? ..... FIXME - } else { - // we're fine, src is file, dest is file|dne - } - } - } - - mds->balancer->hit_dir(srcdn->dir, META_POP_DWR); - mds->balancer->hit_dir(destdn->dir, META_POP_DWR); +void Server::_logged_slave_rename_prep(MDRequest *mdr, CDentry *srcdn) +{ + dout(10) << "_logged_slave_rename_prep " << *mdr << endl; - // we're golden. - // everything is xlocked by us, we rule, etc. - MClientReply *reply = new MClientReply(req, 0); - mdcache->renamer->file_rename( srcdn, destdn, - new C_MDS_CommitRequest(this, req, reply, srcdn->inode, - new EString("file rename fixme")) ); + // ack + MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); + if (srcdn->is_auth()) + srcdn->list_replicas(reply->srcdn_replicas); + mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); + + // done. + delete mdr->slave_request; + mdr->slave_request = 0; } +void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) +{ + dout(10) << "handle_slave_rename_prep_ack " << *mdr + << " witnessed by " << m->get_source() + << " " << *m << endl; + int from = m->get_source().num(); + + // witnessed! + assert(mdr->witnessed.count(from) == 0); + mdr->witnessed.insert(from); + + assert(mdr->waiting_on_remote_witness == from); + mdr->waiting_on_remote_witness = -1; + + // add extra witnesses? + if (!m->srcdn_replicas.empty()) { + dout(10) << " extra witnesses (srcdn replicas) are " << m->srcdn_replicas << endl; + mdr->extra_witnesses = m->srcdn_replicas; + mdr->extra_witnesses.erase(mds->get_nodeid()); // not me! + } + + dispatch_client_request(mdr); // go again! +} -*/ diff --git a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h index 47ec010220c30..d881d8d8e2439 100644 --- a/branches/sage/cephmds2/mds/Server.h +++ b/branches/sage/cephmds2/mds/Server.h @@ -18,9 +18,10 @@ #include "MDS.h" class LogEvent; -class C_MDS_rename_local_finish; +class C_MDS_rename_finish; class MDRequest; - +class EMetaBlob; +class PVList; class MMDSSlaveRequest; class Server { @@ -133,22 +134,23 @@ public: void _unlink_remote(MDRequest *mdr, CDentry *dn); // rename - bool _rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MDRequest *mdr); void handle_client_rename(MDRequest *mdr); - void handle_client_rename_2(MDRequest *mdr, - CDentry *srcdn, - filepath& destpath, - vector& trace, - int r); - bool _rename_pin_dn_on_replicas(MDRequest *mdr, CDentry *dn, inodeno_t baseino, set& ls); - void _rename_local(MDRequest *mdr, CDentry *srcdn, CDentry *destdn); - void _rename_local_reanchored(LogEvent *le, C_MDS_rename_local_finish *fin, - version_t atid1, version_t atid2); - void _rename_local_finish(MDRequest *mdr, - CDentry *srcdn, CDentry *destdn, CDentry *straydn, - version_t srcpv, version_t destpv, version_t straypv, version_t ipv, - version_t ddirpv, version_t sdirpv, utime_t ictime, - version_t atid1, version_t atid2); + void _rename_reanchored(LogEvent *le, C_MDS_rename_finish *fin, + version_t atid1, version_t atid2); + void _rename_finish(MDRequest *mdr, + CDentry *srcdn, CDentry *destdn, CDentry *straydn, + version_t atid1, version_t atid2); + + // helpers + CDentry *_rename_prepare(MDRequest *mdr, + EMetaBlob *metablob, + CDentry *srcdn, CDentry *destdn); + void _rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); + + // slaving + void handle_slave_rename_prep(MDRequest *mdr); + void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); + void _logged_slave_rename_prep(MDRequest *mdr, CDentry *srcdn); }; diff --git a/branches/sage/cephmds2/mds/events/ESlaveUpdate.h b/branches/sage/cephmds2/mds/events/ESlaveUpdate.h index ef2bb6c436682..6550d2838297f 100644 --- a/branches/sage/cephmds2/mds/events/ESlaveUpdate.h +++ b/branches/sage/cephmds2/mds/events/ESlaveUpdate.h @@ -20,6 +20,10 @@ class ESlaveUpdate : public LogEvent { public: + const static int OP_PREPARE = 1; + const static int OP_COMMIT = 2; + const static int OP_ABORT = 3; + string type; metareqid_t reqid; int op; // prepare, commit, abort diff --git a/branches/sage/cephmds2/messages/MDiscoverReply.h b/branches/sage/cephmds2/messages/MDiscoverReply.h index f8ce2f76c8683..4367e8177a052 100644 --- a/branches/sage/cephmds2/messages/MDiscoverReply.h +++ b/branches/sage/cephmds2/messages/MDiscoverReply.h @@ -101,11 +101,9 @@ class MDiscoverReply : public Message { dirs.size() + no_base_dir )); // dn/inode + dirs } - bool has_base_dir() { return !no_base_dir && dirs.size(); } - bool has_base_dentry() { return !no_base_dentry && dentries.size(); } - bool has_root() { - return (base_ino == MDS_INO_ROOT && no_base_dir && no_base_dentry); - } + bool has_base_dir() { return !no_base_dir && dirs.size(); } + bool has_base_dentry() { return !no_base_dentry && dentries.size(); } + bool has_base_inode() { return no_base_dir && no_base_dentry; } const string& get_path() { return path; } diff --git a/branches/sage/cephmds2/messages/MMDSSlaveRequest.h b/branches/sage/cephmds2/messages/MMDSSlaveRequest.h index dfa8bd6795283..b2a305130bc7a 100644 --- a/branches/sage/cephmds2/messages/MMDSSlaveRequest.h +++ b/branches/sage/cephmds2/messages/MMDSSlaveRequest.h @@ -28,10 +28,15 @@ class MMDSSlaveRequest : public Message { static const int OP_UNXLOCK = 2; static const int OP_AUTHPIN = 3; static const int OP_AUTHPINACK = -3; + static const int OP_PINDN = 5; static const int OP_PINDNACK = -5; static const int OP_UNPINDN = 6; - static const int OP_FINISH = 7; + + static const int OP_RENAMEPREP = 7; + static const int OP_RENAMEPREPACK = -7; + + static const int OP_FINISH = 17; const static char *get_opname(int o) { switch (o) { @@ -40,9 +45,14 @@ class MMDSSlaveRequest : public Message { case OP_UNXLOCK: return "unxlock"; case OP_AUTHPIN: return "authpin"; case OP_AUTHPINACK: return "authpin_ack"; + + case OP_RENAMEPREP: return "rename_prep"; + case OP_RENAMEPREPACK: return "rename_prep_ack"; + case OP_PINDN: return "pin_dn"; case OP_PINDNACK: return "pin_dn_ack"; case OP_UNPINDN: return "unpin_dn"; + case OP_FINISH: return "finish"; default: assert(0); return 0; } @@ -64,6 +74,13 @@ class MMDSSlaveRequest : public Message { list authpins; public: + // for rename prep + string srcdnpath; + string destdnpath; + set srcdn_replicas; + utime_t now; + +public: metareqid_t get_reqid() { return reqid; } int get_op() { return op; } bool is_reply() { return op < 0; } @@ -94,6 +111,10 @@ class MMDSSlaveRequest : public Message { ::_encode(dnpath, payload); ::_encode(dnpathbase, payload); ::_encode_complex(authpins, payload); + ::_encode(srcdnpath, payload); + ::_encode(destdnpath, payload); + ::_encode(srcdn_replicas, payload); + ::_encode(now, payload); } void decode_payload() { int off = 0; @@ -104,6 +125,10 @@ class MMDSSlaveRequest : public Message { ::_decode(dnpath, payload, off); ::_decode(dnpathbase, payload, off); ::_decode_complex(authpins, payload, off); + ::_decode(srcdnpath, payload, off); + ::_decode(destdnpath, payload, off); + ::_decode(srcdn_replicas, payload, off); + ::_decode(now, payload, off); } char *get_type_name() { return "slave_request"; } -- 2.39.5