From 59ccbb7b50dac8940c294da6ec66f75f6eee897a Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 8 Jun 2007 00:41:15 +0000 Subject: [PATCH] * scatterlock rewrite (untested) * slave mdrequest, remote xlock, etc. rewrite * fixed request forwarding bug git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1411 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/Makefile | 2 +- branches/sage/cephmds2/TODO | 13 + .../sage/cephmds2/client/SyntheticClient.cc | 19 + branches/sage/cephmds2/mds/CDentry.cc | 5 +- branches/sage/cephmds2/mds/CDentry.h | 2 +- branches/sage/cephmds2/mds/CInode.cc | 118 ++-- branches/sage/cephmds2/mds/CInode.h | 6 +- branches/sage/cephmds2/mds/Locker.cc | 559 ++++++++++++------ branches/sage/cephmds2/mds/Locker.h | 4 + branches/sage/cephmds2/mds/MDCache.cc | 83 ++- branches/sage/cephmds2/mds/MDCache.h | 45 +- branches/sage/cephmds2/mds/MDLog.cc | 7 +- branches/sage/cephmds2/mds/MDS.cc | 2 +- branches/sage/cephmds2/mds/Migrator.cc | 16 - branches/sage/cephmds2/mds/ScatterLock.h | 84 ++- branches/sage/cephmds2/mds/Server.cc | 187 ++++-- branches/sage/cephmds2/mds/Server.h | 7 +- branches/sage/cephmds2/mds/SimpleLock.h | 19 +- branches/sage/cephmds2/mds/mdstypes.h | 20 +- branches/sage/cephmds2/messages/MLock.h | 91 ++- .../sage/cephmds2/messages/MMDSSlaveRequest.h | 99 ++++ branches/sage/cephmds2/msg/Message.cc | 6 + branches/sage/cephmds2/msg/Message.h | 19 +- 23 files changed, 977 insertions(+), 436 deletions(-) create mode 100644 branches/sage/cephmds2/messages/MMDSSlaveRequest.h diff --git a/branches/sage/cephmds2/Makefile b/branches/sage/cephmds2/Makefile index ccc3b1eec9491..49cfc23c92adb 100644 --- a/branches/sage/cephmds2/Makefile +++ b/branches/sage/cephmds2/Makefile @@ -12,7 +12,7 @@ CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -DDARWI LDINC = ar -rc else # For linux -CFLAGS = -g -fPIC -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE +CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE LDINC = ld -i -o endif diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 17c6321345fb9..06e6b1a2e302d 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -46,6 +46,19 @@ sage doc sage mds +- dirlock-protected mtime updates vs migration, journaling, recovery +- rename fun + - remote pinning + - is there race with discover? + - generic pin_witnesses func we can use for local and remote renames + - fix up local rename + - journal renames on witnesses +- then? + - ESlaveUpdate + - remote link + - remote rename + - remote unlink + - journal+recovery - local rename - how to notify replicas... diff --git a/branches/sage/cephmds2/client/SyntheticClient.cc b/branches/sage/cephmds2/client/SyntheticClient.cc index 5df66828c7bb7..7cbcf6300eb02 100644 --- a/branches/sage/cephmds2/client/SyntheticClient.cc +++ b/branches/sage/cephmds2/client/SyntheticClient.cc @@ -1360,6 +1360,25 @@ void SyntheticClient::make_dir_mess(const char *basedir, int n) void SyntheticClient::foo() { + if (1) { + // rename fun + for (int i=0; i<100; i++) { + int s = 5; + int a = rand() % s; + int b = rand() % s; + int c = rand() % s; + int d = rand() % s; + int e = rand() % s; + int f = rand() % s; + char src[80]; + char dst[80]; + sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); + sprintf(dst, "syn.0.0/dir.%d/dir.%d/file.%d", d, e, f); + client->rename(src, dst); + } + return; + } + // link fun client->mknod("one", 0755); client->mknod("two", 0755); diff --git a/branches/sage/cephmds2/mds/CDentry.cc b/branches/sage/cephmds2/mds/CDentry.cc index e8efed9749ac7..e9bb5663d5a2c 100644 --- a/branches/sage/cephmds2/mds/CDentry.cc +++ b/branches/sage/cephmds2/mds/CDentry.cc @@ -198,9 +198,10 @@ CDentryDiscover *CDentry::replicate_to(int who) // ---------------------------- // locking -void CDentry::set_mlock_info(MLock *m) +void CDentry::set_object_info(MDSCacheObjectInfo &info) { - m->set_dn(dir->dirfrag(), name); + info.dirfrag = dir->dirfrag(); + info.dname = name; } void CDentry::encode_lock_state(int type, bufferlist& bl) diff --git a/branches/sage/cephmds2/mds/CDentry.h b/branches/sage/cephmds2/mds/CDentry.h index e6bd7159dc4f3..c5885382d254f 100644 --- a/branches/sage/cephmds2/mds/CDentry.h +++ b/branches/sage/cephmds2/mds/CDentry.h @@ -219,7 +219,7 @@ public: assert(type == LOCK_OTYPE_DN); return &lock; } - void set_mlock_info(MLock *m); + void set_object_info(MDSCacheObjectInfo &info); void encode_lock_state(int type, bufferlist& bl); void decode_lock_state(int type, bufferlist& bl); diff --git a/branches/sage/cephmds2/mds/CInode.cc b/branches/sage/cephmds2/mds/CInode.cc index 0b614f7b1a3cc..aee8ee38d4bcc 100644 --- a/branches/sage/cephmds2/mds/CInode.cc +++ b/branches/sage/cephmds2/mds/CInode.cc @@ -137,6 +137,70 @@ void CInode::get_subtree_dirfrags(list& ls) ls.push_back(p->second); } + +CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) +{ + assert(is_dir()); + + // have it? + CDir *dir = get_dirfrag(fg); + if (dir) return dir; + + // create it. + assert(is_auth()); + dir = dirfrags[fg] = new CDir(this, fg, mdcache, true); + return dir; +} + +CDir *CInode::add_dirfrag(CDir *dir) +{ + assert(dirfrags.count(dir->dirfrag().frag) == 0); + dirfrags[dir->dirfrag().frag] = dir; + return dir; +} + +void CInode::close_dirfrag(frag_t fg) +{ + dout(14) << "close_dirfrag " << fg << endl; + assert(dirfrags.count(fg)); + + CDir *dir = dirfrags[fg]; + dir->remove_null_dentries(); + + // clear dirty flag + if (dir->is_dirty()) + dir->mark_clean(); + + // dump any remaining dentries, for debugging purposes + for (map::iterator p = dir->items.begin(); + p != dir->items.end(); + ++p) + dout(14) << "close_dirfrag LEFTOVER dn " << *p->second << endl; + + assert(dir->get_num_ref() == 0); + delete dir; + dirfrags.erase(fg); +} + +void CInode::close_dirfrags() +{ + while (!dirfrags.empty()) + close_dirfrag(dirfrags.begin()->first); +} + +bool CInode::has_subtree_root_dirfrag() +{ + for (map::iterator p = dirfrags.begin(); + p != dirfrags.end(); + ++p) + if (p->second->is_subtree_root()) + return true; + return false; +} + + + + // pins void CInode::first_get() @@ -198,56 +262,6 @@ CInode *CInode::get_parent_inode() return NULL; } -CDir *CInode::get_or_open_dirfrag(MDCache *mdcache, frag_t fg) -{ - assert(is_dir()); - - // have it? - CDir *dir = get_dirfrag(fg); - if (dir) return dir; - - // create it. - assert(is_auth()); - dir = dirfrags[fg] = new CDir(this, fg, mdcache, true); - return dir; -} - -CDir *CInode::add_dirfrag(CDir *dir) -{ - assert(dirfrags.count(dir->dirfrag().frag) == 0); - dirfrags[dir->dirfrag().frag] = dir; - return dir; -} - -void CInode::close_dirfrag(frag_t fg) -{ - dout(14) << "close_dirfrag " << fg << endl; - assert(dirfrags.count(fg)); - - CDir *dir = dirfrags[fg]; - dir->remove_null_dentries(); - - // clear dirty flag - if (dir->is_dirty()) - dir->mark_clean(); - - // dump any remaining dentries, for debugging purposes - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) - dout(14) << "close_dirfrag LEFTOVER dn " << *p->second << endl; - - assert(dir->get_num_ref() == 0); - delete dir; - dirfrags.erase(fg); -} - -void CInode::close_dirfrags() -{ - while (!dirfrags.empty()) - close_dirfrag(dirfrags.begin()->first); -} - void CInode::make_path(string& s) @@ -339,9 +353,9 @@ void CInode::mark_clean() // ------------------ // locking -void CInode::set_mlock_info(MLock *m) +void CInode::set_object_info(MDSCacheObjectInfo &info) { - m->set_ino(ino()); + info.ino = ino(); } void CInode::encode_lock_state(int type, bufferlist& bl) diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h index bd3ef9f09b9ef..8830a703a15da 100644 --- a/branches/sage/cephmds2/mds/CInode.h +++ b/branches/sage/cephmds2/mds/CInode.h @@ -137,6 +137,7 @@ class CInode : public MDSCacheObject { CDir *add_dirfrag(CDir *dir); void close_dirfrag(frag_t fg); void close_dirfrags(); + bool has_subtree_root_dirfrag(); protected: // parent dentries in cache @@ -260,7 +261,7 @@ public: default: assert(0); } } - void set_mlock_info(MLock *m); + void set_object_info(MDSCacheObjectInfo &info); void encode_lock_state(int type, bufferlist& bl); void decode_lock_state(int type, bufferlist& bl); @@ -575,7 +576,8 @@ public: void update_inode(CInode *in, set& new_client_caps) { // treat scatterlocked mtime special, since replica may have newer info if (in->dirlock.get_state() == LOCK_SCATTER || - in->dirlock.get_state() == LOCK_GSYNCS) + in->dirlock.get_state() == LOCK_GLOCKC || + in->dirlock.get_state() == LOCK_GTEMPSYNCC) st.inode.mtime = MAX(in->inode.mtime, st.inode.mtime); in->inode = st.inode; diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc index 87fb8abaab45f..2daa52f332bc9 100644 --- a/branches/sage/cephmds2/mds/Locker.cc +++ b/branches/sage/cephmds2/mds/Locker.cc @@ -49,6 +49,8 @@ #include "messages/MClientRequest.h" #include "messages/MClientFileCaps.h" +#include "messages/MMDSSlaveRequest.h" + #include #include @@ -712,71 +714,78 @@ ALSO: */ - -void Locker::handle_lock(MLock *m) +SimpleLock *Locker::get_lock(int lock_type, MDSCacheObjectInfo &info) { - // nobody should be talking to us during recovery. - assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); - - switch (m->get_otype()) { + switch (lock_type) { case LOCK_OTYPE_DN: { - CDir *dir = mdcache->get_dirfrag(m->get_dirfrag()); + CDir *dir = mdcache->get_dirfrag(info.dirfrag); CDentry *dn = 0; if (dir) - dn = dir->lookup(m->get_dn()); + dn = dir->lookup(info.dname); if (!dn) { - dout(7) << "dont' have dn " << m->get_dirfrag() << " " << m->get_dn() << endl; - delete m; - return; + dout(7) << "get_lock don't have dn " << info.dirfrag << " " << info.dname << endl; + return 0; } - - handle_simple_lock(&dn->lock, m); + return &dn->lock; } - break; case LOCK_OTYPE_IAUTH: case LOCK_OTYPE_ILINK: case LOCK_OTYPE_IDIRFRAGTREE: case LOCK_OTYPE_IFILE: + case LOCK_OTYPE_IDIR: { - CInode *in = mdcache->get_inode(m->get_ino()); + CInode *in = mdcache->get_inode(info.ino); if (!in) { - dout(7) << "dont' have ino " << m->get_ino() << endl; - delete m; - return; + dout(7) << "get_lock don't have ino " << info.ino << endl; + return 0; } - switch (m->get_otype()) { - case LOCK_OTYPE_IAUTH: - handle_simple_lock(&in->authlock, m); - break; - case LOCK_OTYPE_ILINK: - handle_simple_lock(&in->linklock, m); - break; - case LOCK_OTYPE_IDIRFRAGTREE: - handle_simple_lock(&in->dirfragtreelock, m); - break; - case LOCK_OTYPE_IFILE: - handle_file_lock(&in->filelock, m); - break; + switch (lock_type) { + case LOCK_OTYPE_IAUTH: return &in->authlock; + case LOCK_OTYPE_ILINK: return &in->linklock; + case LOCK_OTYPE_IDIRFRAGTREE: return &in->dirfragtreelock; + case LOCK_OTYPE_IFILE: return &in->filelock; + case LOCK_OTYPE_IDIR: return &in->dirlock; } } + + default: + dout(7) << "get_lock don't know lock_type " << lock_type << endl; + assert(0); + break; + } + + return 0; +} + + +void Locker::handle_lock(MLock *m) +{ + // nobody should be talking to us during recovery. + assert(mds->is_rejoin() || mds->is_active() || mds->is_stopping()); + + SimpleLock *lock = get_lock(m->get_lock_type(), m->get_object_info()); + assert(lock); + + switch (m->get_lock_type()) { + case LOCK_OTYPE_DN: + case LOCK_OTYPE_IAUTH: + case LOCK_OTYPE_ILINK: + case LOCK_OTYPE_IDIRFRAGTREE: + handle_simple_lock(lock, m); + break; + + case LOCK_OTYPE_IFILE: + handle_file_lock((FileLock*)lock, m); break; case LOCK_OTYPE_IDIR: - { - CInode *in = mdcache->get_inode(m->get_ino()); - if (!in) { - dout(7) << "dont' have ino " << m->get_ino() << endl; - delete m; - return; - } - handle_scatter_lock(&in->dirlock, m); - } + handle_scatter_lock((ScatterLock*)lock, m); break; default: - dout(7) << "handle_lock got otype " << m->get_otype() << endl; + dout(7) << "handle_lock got otype " << m->get_lock_type() << endl; assert(0); break; } @@ -829,18 +838,6 @@ void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) break; - case LOCK_AC_REQXLOCKACK: - dout(7) << "handle_simple_lock got remote xlock on " - << *lock << " " << *lock->get_parent() << endl; - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - lock->set_state(LOCK_REMOTEXLOCK); - lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); - } - break; - // -- auth -- case LOCK_AC_LOCKACK: assert(lock->get_state() == LOCK_GLOCKR); @@ -857,44 +854,6 @@ void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) } break; - case LOCK_AC_REQXLOCK: - assert(lock->get_parent()->is_auth()); - { - // register request - MDRequest *mdr = mdcache->request_start(m); - - dout(7) << "handle_simple_lock " << m->get_source() << " " << *mdr << " requesting xlock " - << *lock << " on " << *lock->get_parent() - << endl; - - if (!simple_xlock_start(lock, mdr)) - return; - - // ack - MLock *m = new MLock(lock, LOCK_AC_REQXLOCKACK, mds->get_nodeid()); - mds->send_message_mds(m, mdr->request->get_source().num(), MDS_PORT_LOCKER); - } - return; - - case LOCK_AC_UNXLOCK: - assert(lock->get_parent()->is_auth()); - { - // get request - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - - dout(7) << "handle_simple_lock " << m->get_source() << " " << *mdr << " dropping xlock " - << *lock << " on " << *lock->get_parent() - - << endl; - - simple_xlock_finish(lock, mdr); - - if (mdr->locks.empty()) - mdcache->request_finish(mdr); - - } - return; - } delete m; @@ -923,17 +882,20 @@ void Locker::simple_eval(SimpleLock *lock) } // finished remote xlock? + /* hmm: why did i do this here, and not in simple_xlock_finish()? if (lock->get_state() == LOCK_REMOTEXLOCK && !lock->is_xlocked()) { // tell auth assert(!lock->get_parent()->is_auth()); // should be auth_pinned on the auth dout(7) << "simple_eval releasing remote xlock on " << *lock->get_parent() << endl; int auth = lock->get_parent()->authority().first; - if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) + if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { + MMDSSlaveRequest *slavereq = new MMDSSlaveRequest(); mds->send_message_mds(new MLock(lock, LOCK_AC_UNXLOCK, mds->get_nodeid()), auth, MDS_PORT_LOCKER); lock->set_state(LOCK_LOCK); } + */ // finished gathering? if (lock->get_state() == LOCK_GLOCKR && @@ -1110,27 +1072,27 @@ bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) } } else { // replica - + // this had better not be a remote xlock attempt! + assert(mdr->slave_request); + // wait for single auth if (lock->get_parent()->is_ambiguous_auth()) { lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); return false; } - int auth = lock->get_parent()->authority().first; - - // wait for sync. - // (???????????) - if (lock->get_state() != LOCK_SYNC) { - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } // send lock request - MLock *m = new MLock(lock, LOCK_AC_REQXLOCK, mds->get_nodeid()); - mds->send_message_mds(m, auth, MDS_PORT_LOCKER); - + int auth = lock->get_parent()->authority().first; + mdr->slaves.insert(auth); + MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCK); + r->set_lock_type(lock->get_type()); + lock->get_parent()->set_object_info(r->get_object_info()); + mds->send_message_mds(r, auth, MDS_PORT_LOCKER); + + // wait + // note: this also waits on parent object's SINGLEAUTH bit, in case of migration race lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mdr)); return false; } @@ -1147,6 +1109,19 @@ void Locker::simple_xlock_finish(SimpleLock *lock, MDRequest *mdr) mdr->locks.erase(lock); dout(7) << "simple_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl; + // remote xlock? + if (!lock->get_parent()->is_auth()) { + // tell auth + dout(7) << "simple_xlock_finish releasing remote xlock on " << *lock->get_parent() << endl; + int auth = lock->get_parent()->authority().first; + if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { + MMDSSlaveRequest *slavereq = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNXLOCK); + slavereq->set_lock_type(lock->get_type()); + lock->get_parent()->set_object_info(slavereq->get_object_info()); + mds->send_message_mds(slavereq, auth, MDS_PORT_SERVER); + } + } + // others waiting? lock->finish_waiters(SimpleLock::WAIT_WR, 0); @@ -1215,6 +1190,14 @@ bool Locker::scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr) dout(7) << "scatter_rdlock_start on " << *lock << " on " << *lock->get_parent() << endl; + // read on stable scattered replica? + if (lock->get_state() == LOCK_SCATTER && + !lock->get_parent()->is_auth()) { + dout(7) << "scatter_rdlock_start scatterlock read on a stable scattered replica, fw to auth" << endl; + mdcache->request_forward(mdr, lock->get_parent()->authority().first); + return false; + } + // pre-twiddle? if (lock->get_state() == LOCK_SCATTER && lock->get_parent()->is_auth() && @@ -1231,12 +1214,16 @@ bool Locker::scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr) } // wait for read. - lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); + lock->add_waiter(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); - // initiate sync? - if (lock->get_state() == LOCK_SCATTER && - lock->get_parent()->is_auth()) - scatter_sync(lock); + // initiate sync or tempsync? + if (lock->is_stable() && + lock->get_parent()->is_auth()) { + if (lock->get_parent()->is_replicated()) + scatter_tempsync(lock); + else + scatter_sync(lock); + } return false; } @@ -1261,10 +1248,11 @@ bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) << " on " << *lock->get_parent() << endl; // pre-twiddle? - if (lock->get_state() == LOCK_SYNC && - lock->get_parent()->is_auth() && + if (lock->get_parent()->is_auth() && !lock->get_parent()->is_replicated() && - !lock->is_rdlocked()) + !lock->is_rdlocked() && + !lock->is_xlocked() && + lock->get_state() == LOCK_SYNC) scatter_scatter(lock); // can wrlock? @@ -1276,12 +1264,17 @@ bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) } // wait for write. - lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); - - // initiate scatter? - if (lock->get_state() == LOCK_SYNC && - lock->get_parent()->is_auth()) - scatter_scatter(lock); + lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); + + // initiate scatter or lock? + if (lock->is_stable() && + lock->get_parent()->is_auth()) { + // scatter or lock? + if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) + scatter_scatter(lock); + else + scatter_lock(lock); + } return false; } @@ -1324,63 +1317,104 @@ void Locker::scatter_eval(ScatterLock *lock) if (!lock->get_parent()->is_auth()) { // REPLICA - if (lock->get_state() == LOCK_GSYNCS && + if (lock->get_state() == LOCK_GLOCKC && !lock->is_wrlocked()) { - dout(10) << "scatter_eval no wrlocks, acking sync" << endl; + dout(10) << "scatter_eval no wrlocks, acking lock" << endl; int auth = lock->get_parent()->authority().first; if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { bufferlist data; lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data), + mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), auth, MDS_PORT_LOCKER); } - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_STABLE); // ? + lock->set_state(LOCK_LOCK); } - + } else { // AUTH + + // glocks|glockt -> lock? + if ((lock->get_state() == LOCK_GLOCKS || + lock->get_state() == LOCK_GLOCKT) && + !lock->is_gathering() && + !lock->is_rdlocked()) { + dout(7) << "scatter_eval finished lock gather/un-rdlock on " << *lock + << " on " << *lock->get_parent() << endl; + lock->set_state(LOCK_LOCK); + lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); + } - // gsyncs -> sync? - if (lock->get_state() == LOCK_GSYNCS && + // glockc -> lock? + if (lock->get_state() == LOCK_GLOCKC && !lock->is_gathering() && !lock->is_wrlocked()) { - dout(7) << "scatter_eval finished gather/un-wrlock on " << *lock + dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock + << " on " << *lock->get_parent() << endl; + lock->set_state(LOCK_LOCK); + lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); + } + + // gSyncL -> sync? + if (lock->get_state() == LOCK_GSYNCL && + !lock->is_wrlocked()) { + dout(7) << "scatter_eval finished sync un-wrlock on " << *lock << " on " << *lock->get_parent() << endl; lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_STABLE|ScatterLock::WAIT_RD); + lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); + + if (lock->get_parent()->is_replicated()) { + // encode and bcast + bufferlist data; + lock->encode_locked_state(data); + send_lock_message(lock, LOCK_AC_SYNC, data); + } } - - // gscatters -> scatter? - if (lock->get_state() == LOCK_GSCATTERS && + + // gscattert -> scatter? + if (lock->get_state() == LOCK_GSCATTERT && !lock->is_rdlocked()) { - assert(lock->get_parent()->is_auth()); + dout(7) << "scatter_eval finished scatter un-rdlock on " << *lock + << " on " << *lock->get_parent() << endl; + lock->set_state(LOCK_SCATTER); + lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); + if (lock->get_parent()->is_replicated()) { // encode and bcast bufferlist data; lock->encode_locked_state(data); send_lock_message(lock, LOCK_AC_SCATTER, data); - } - - lock->set_state(LOCK_SCATTER); - lock->get_wrlock(); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); - lock->put_wrlock(); + } } - - // waiting for rd? - if (lock->get_state() == LOCK_SCATTER && - !lock->is_wrlocked() && - lock->is_waiter_for(ScatterLock::WAIT_RD)) { - dout(10) << "scatter_eval no wrlocks, read waiter, syncing" << endl; - scatter_sync(lock); + + // gTempsyncC|gTempsyncL -> tempsync + if ((lock->get_state() == LOCK_GTEMPSYNCC || + lock->get_state() == LOCK_GTEMPSYNCL) && + !lock->is_gathering() && + !lock->is_wrlocked()) { + dout(7) << "scatter_eval finished tempsync gather/un-wrlock on " << *lock + << " on " << *lock->get_parent() << endl; + lock->set_state(LOCK_TEMPSYNC); + lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); } - - // re-scatter? - if (lock->get_state() == LOCK_SYNC && - !lock->is_rdlocked()) { - dout(10) << "scatter_eval no rdlocks, scattering" << endl; - scatter_scatter(lock); + + + // stable -> ??? + if (lock->is_stable()) { + if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) { + // i _should_ be scattered. + if (!lock->is_rdlocked() && + !lock->is_xlocked()) { + dout(10) << "scatter_eval no rdlocks|xlocks, am subtree root inode, scattering" << endl; + scatter_scatter(lock); + } + } else { + // i _should_ be sync. + if (!lock->is_wrlocked() && + !lock->is_xlocked()) { + dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << endl; + scatter_sync(lock); + } + } } } } @@ -1391,54 +1425,195 @@ void Locker::scatter_sync(ScatterLock *lock) dout(10) << "scatter_sync " << *lock << " on " << *lock->get_parent() << endl; assert(lock->get_parent()->is_auth()); + + assert(lock->is_stable()); + + switch (lock->get_state()) { + case LOCK_SYNC: + return; // already sync. + + case LOCK_TEMPSYNC: + break; // just do it. + + case LOCK_LOCK: + if (lock->is_wrlocked() || lock->is_xlocked()) { + lock->set_state(LOCK_GSYNCL); + return; + } + break; // do it. + + case LOCK_SCATTER: + // lock first. this is the slow way, incidentally. + if (lock->get_parent()->is_replicated()) { + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); + } else { + if (!lock->is_wrlocked()) + break; // do it now, we're fine + } + lock->set_state(LOCK_GLOCKC); + return; + + default: + assert(0); + } - if (lock->get_state() == LOCK_SYNC) return; - assert(lock->get_state() == LOCK_SCATTER); - - // bcast + // do sync if (lock->get_parent()->is_replicated()) { - send_lock_message(lock, LOCK_AC_SYNC); - lock->set_state(LOCK_GSYNCS); - lock->init_gather(); - } - else if (lock->is_wrlocked()) { - lock->set_state(LOCK_GSYNCS); - } - else { - lock->set_state(LOCK_SYNC); - lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); + // encode and bcast + bufferlist data; + lock->encode_locked_state(data); + send_lock_message(lock, LOCK_AC_SYNC, data); } -} + lock->set_state(LOCK_SYNC); + lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); +} void Locker::scatter_scatter(ScatterLock *lock) { dout(10) << "scatter_scatter " << *lock << " on " << *lock->get_parent() << endl; assert(lock->get_parent()->is_auth()); + assert(lock->is_stable()); - if (lock->get_state() == LOCK_SCATTER) return; - assert(lock->get_state() == LOCK_SYNC); + switch (lock->get_state()) { + case LOCK_SYNC: + if (lock->is_rdlocked()) { + scatter_lock(lock); // lock first. + return; + } + break; // do it. + + case LOCK_LOCK: + if (lock->is_xlocked()) + return; // sorry + break; // do it. + + case LOCK_SCATTER: + return; // did it. + + case LOCK_TEMPSYNC: + if (lock->is_rdlocked()) { + lock->set_state(LOCK_GSCATTERT); + return; + } + break; // do it + + default: + assert(0); + } + + // do scatter + if (lock->get_parent()->is_replicated()) { + // encode and bcast + bufferlist data; + lock->encode_locked_state(data); + send_lock_message(lock, LOCK_AC_SCATTER, data); + } + lock->set_state(LOCK_SCATTER); + lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); +} + +void Locker::scatter_lock(ScatterLock *lock) +{ + dout(10) << "scatter_lock " << *lock + << " on " << *lock->get_parent() << endl; + assert(lock->get_parent()->is_auth()); + + assert(lock->is_stable()); + + switch (lock->get_state()) { + case LOCK_SYNC: + if (!lock->is_rdlocked() && + !lock->get_parent()->is_replicated()) + break; // do it. - if (lock->is_rdlocked()) { - lock->set_state(LOCK_GSCATTERS); - } else { if (lock->get_parent()->is_replicated()) { - // encode and bcast - bufferlist data; - lock->encode_locked_state(data); - send_lock_message(lock, LOCK_AC_SCATTER, data); + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); } - lock->set_state(LOCK_SCATTER); - lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); + lock->set_state(LOCK_GLOCKS); + return; + + case LOCK_LOCK: + return; // done. + + case LOCK_SCATTER: + if (!lock->is_wrlocked() && + !lock->get_parent()->is_replicated()) + break; // do it. + + if (lock->get_parent()->is_replicated()) { + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); + } + lock->set_state(LOCK_GLOCKC); + return; + + case LOCK_TEMPSYNC: + if (lock->is_rdlocked()) { + lock->set_state(LOCK_GLOCKT); + return; + } + break; // do it. + } + + // do lock + lock->set_state(LOCK_LOCK); + lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); +} + +void Locker::scatter_tempsync(ScatterLock *lock) +{ + dout(10) << "scatter_tempsync " << *lock + << " on " << *lock->get_parent() << endl; + assert(lock->get_parent()->is_auth()); + + assert(lock->is_stable()); + + switch (lock->get_state()) { + case LOCK_SYNC: + break; // do it. + + case LOCK_LOCK: + if (lock->is_wrlocked() || + lock->is_xlocked()) { + lock->set_state(LOCK_GTEMPSYNCL); + return; + } + break; // do it. + + case LOCK_SCATTER: + if (!lock->is_wrlocked() && + !lock->get_parent()->is_replicated()) + break; // do it. + + if (lock->get_parent()->is_replicated()) { + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); + } + lock->set_state(LOCK_GTEMPSYNCC); + return; + + case LOCK_TEMPSYNC: + return; // done } + + // do tempsync + lock->set_state(LOCK_TEMPSYNC); + lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); } + + + void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) { int from = m->get_asker(); + dout(10) << "handle_scatter_lock " << *m << " on " << *lock << " on " << *lock->get_parent() << endl; if (mds->is_rejoin()) { if (lock->get_parent()->is_rejoining()) { @@ -1452,32 +1627,49 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) switch (m->get_action()) { // -- replica -- case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_SCATTER); + assert(lock->get_state() == LOCK_LOCK); + + lock->set_state(LOCK_SYNC); + lock->decode_locked_state(m->get_data()); + lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE); + break; + case LOCK_AC_LOCK: + assert(lock->get_state() == LOCK_SCATTER || + lock->get_state() == LOCK_SYNC); + // wait for wrlocks to close? if (lock->is_wrlocked()) { + assert(lock->get_state() == LOCK_SCATTER); dout(7) << "handle_scatter_lock has wrlocks, waiting on " << *lock << " on " << *lock->get_parent() << endl; - lock->set_state(LOCK_GSYNCS); + lock->set_state(LOCK_GLOCKC); + } else if (lock->is_rdlocked()) { + assert(lock->get_state() == LOCK_SYNC); + dout(7) << "handle_scatter_lock has rdlocks, waiting on " << *lock + << " on " << *lock->get_parent() << endl; + lock->set_state(LOCK_GLOCKS); } else { // encode and reply bufferlist data; lock->encode_locked_state(data); - mds->send_message_mds(new MLock(lock, LOCK_AC_SYNCACK, mds->get_nodeid(), data), + mds->send_message_mds(new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), from, MDS_PORT_LOCKER); + lock->set_state(LOCK_LOCK); } break; case LOCK_AC_SCATTER: - assert(lock->get_state() == LOCK_SYNC); + assert(lock->get_state() == LOCK_LOCK); lock->decode_locked_state(m->get_data()); lock->set_state(LOCK_SCATTER); lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); break; // -- for auth -- - case LOCK_AC_SYNCACK: - assert(lock->get_state() == LOCK_GSYNCS); + case LOCK_AC_LOCKACK: + assert(lock->get_state() == LOCK_GLOCKS || + lock->get_state() == LOCK_GLOCKC); assert(lock->is_gathering(from)); lock->remove_gather(from); lock->decode_locked_state(m->get_data()); @@ -1493,6 +1685,7 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) scatter_eval(lock); } break; + } delete m; diff --git a/branches/sage/cephmds2/mds/Locker.h b/branches/sage/cephmds2/mds/Locker.h index 5f79bbacf4de0..1f72d7467662d 100644 --- a/branches/sage/cephmds2/mds/Locker.h +++ b/branches/sage/cephmds2/mds/Locker.h @@ -56,6 +56,8 @@ private: public: Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} + SimpleLock *get_lock(int lock_type, MDSCacheObjectInfo &info); + void dispatch(Message *m); void handle_lock(MLock *m); @@ -94,7 +96,9 @@ private: void handle_scatter_lock(ScatterLock *lock, MLock *m); void scatter_eval(ScatterLock *lock); void scatter_sync(ScatterLock *lock); + void scatter_lock(ScatterLock *lock); void scatter_scatter(ScatterLock *lock); + void scatter_tempsync(ScatterLock *lock); bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr); void scatter_rdlock_finish(ScatterLock *lock, MDRequest *mdr); bool scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr); diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index a352fc4351df0..08b3d4cae7826 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -69,6 +69,8 @@ #include "messages/MClientRequest.h" #include "messages/MClientFileCaps.h" +#include "messages/MMDSSlaveRequest.h" + #include "IdAllocator.h" #include "common/Timer.h" @@ -369,6 +371,11 @@ void MDCache::adjust_subtree_auth(CDir *dir, pair auth) ++p) adjust_export_state(*p); + // evaluate subtree inode dirlock? + // (we should scatter the dirlock on subtree bounds) + if (dir->inode->is_auth()) + mds->locker->scatter_eval(&dir->inode->dirlock); + show_subtrees(); } @@ -1004,7 +1011,7 @@ void MDCache::handle_mds_failure(int who) for (hash_map::iterator p = active_requests.begin(); p != active_requests.end(); ++p) - if (p->second->by_mds == who) + if (p->second->slave_to_mds == who) ls.push_back(p->second); while (!ls.empty()) { dout(10) << "cleaning up slave request " << *ls.front() << endl; @@ -1664,7 +1671,7 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) // so only _locally_ opened files are significant. // ScatterLock -- adjust accordingly if (p->second.dirlock == LOCK_SCATTER || - p->second.dirlock == LOCK_GSCATTERS) // replica still has rdlocks + p->second.dirlock == LOCK_GLOCKC) // replica still has wrlocks in->dirlock.set_state(LOCK_SCATTER); } } else { @@ -1687,7 +1694,7 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) dout(10) << " inode xlock by " << q->second << " on " << *in << endl; // create slave mdrequest - MDRequest *mdr = request_start(q->second); + MDRequest *mdr = request_start_slave(q->second, from); // auth_pin mdr->auth_pin(in); @@ -1713,7 +1720,7 @@ void MDCache::handle_cache_rejoin_rejoin(MMDSCacheRejoin *m) dout(10) << " dn xlock by " << q->second << " on " << *dn << endl; // create slave mdrequest - MDRequest *mdr = request_start(q->second); + MDRequest *mdr = request_start_slave(q->second, from); // auth_pin mdr->auth_pin(dn->dir); @@ -2720,7 +2727,7 @@ bool MDCache::shutdown_pass() if (!subtrees.empty()) { dout(7) << "still have " << num_subtrees() << " subtrees" << endl; show_subtrees(); - show_cache(); + //show_cache(); return false; } assert(subtrees.empty()); @@ -3191,7 +3198,10 @@ int MDCache::path_traverse(MDRequest *mdr, req->clear_payload(); // reencode! } - mds->forward_message_mds(req, dauth.first, req->get_dest_port()); + if (mdr) + request_forward(mdr, dauth.first, req->get_dest_port()); + else + mds->forward_message_mds(req, dauth.first, req->get_dest_port()); if (mds->logger) mds->logger->inc("cfw"); delete ondelay; @@ -3393,26 +3403,28 @@ void MDCache::make_trace(vector& trace, CInode *in) } -MDRequest *MDCache::request_start(metareqid_t ri) +MDRequest *MDCache::request_start_slave(metareqid_t ri, int by) { - MDRequest *mdr = new MDRequest(ri); + MDRequest *mdr = new MDRequest(ri, 0, by); + assert(active_requests.count(mdr->reqid) == 0); active_requests[mdr->reqid] = mdr; - dout(7) << "request_start " << *mdr << endl; + dout(7) << "request_start_slave " << *mdr << " by mds" << by << endl; return mdr; } MDRequest *MDCache::request_start(MClientRequest *req) { MDRequest *mdr = new MDRequest(req->get_reqid(), req); + assert(active_requests.count(mdr->reqid) == 0); active_requests[mdr->reqid] = mdr; dout(7) << "request_start " << *mdr << endl; return mdr; } -MDRequest *MDCache::request_start(MLock *req) +MDRequest *MDCache::request_start(MMDSSlaveRequest *slavereq) { - MDRequest *mdr = new MDRequest(req->get_reqid(), req); - mdr->by_mds = req->get_source().num(); + MDRequest *mdr = new MDRequest(slavereq->get_reqid(), slavereq, slavereq->get_source().num()); + assert(active_requests.count(mdr->reqid) == 0); active_requests[mdr->reqid] = mdr; dout(7) << "request_start " << *mdr << endl; return mdr; @@ -3429,7 +3441,8 @@ void MDCache::request_finish(MDRequest *mdr) { dout(7) << "request_finish " << *mdr << endl; - delete mdr->request; + delete mdr->client_request; + delete mdr->slave_request; request_cleanup(mdr); if (mds->logger) mds->logger->inc("reply"); @@ -3440,9 +3453,9 @@ void MDCache::request_forward(MDRequest *mdr, int who, int port) { if (!port) port = MDS_PORT_SERVER; - dout(7) << "request_forward to " << who << " req " << *mdr << endl; + dout(7) << "request_forward " << *mdr << " to mds" << who << " req " << *mdr << endl; - mds->forward_message_mds(mdr->request, who, port); + mds->forward_message_mds(mdr->client_request, who, port); request_cleanup(mdr); if (mds->logger) mds->logger->inc("fw"); @@ -3451,20 +3464,12 @@ void MDCache::request_forward(MDRequest *mdr, int who, int port) void MDCache::dispatch_request(MDRequest *mdr) { - assert(mdr->request); - - switch (mdr->request->get_type()) { - case MSG_CLIENT_REQUEST: - mds->server->dispatch_request(mdr); - break; - - case MSG_MDS_LOCK: - mds->locker->handle_lock((MLock*)mdr->request); - break; - - default: - assert(0); // shouldn't get here - } + if (mdr->client_request) { + mds->server->dispatch_client_request(mdr); + } else if (mdr->slave_request) { + mds->server->dispatch_slave_request(mdr); + } else + assert(0); } @@ -3489,6 +3494,7 @@ void MDCache::request_drop_locks(MDRequest *mdr) void MDCache::request_cleanup(MDRequest *mdr) { + dout(15) << "request_cleanup " << *mdr << endl; metareqid_t ri = mdr->reqid; // clear ref, trace @@ -3508,10 +3514,29 @@ void MDCache::request_cleanup(MDRequest *mdr) (*it)->put(MDSCacheObject::PIN_REQUEST); mdr->pins.clear(); + // drop remote dn pins + for (map >::iterator p = mdr->remote_dn_pins.begin(); + p != mdr->remote_dn_pins.end(); + ++p) { + //..... + assert(0); + } + + // slaves + for (set::iterator p = mdr->slaves.begin(); + p != mdr->slaves.end(); + ++p) { + MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_FINISH); + mds->send_message_mds(r, *p, MDS_PORT_SERVER); + } + // remove from map active_requests.erase(mdr->reqid); delete mdr; + + + // log some stats ***** if (mds->logger) { mds->logger->set("c", lru.lru_get_size()); diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index aa58ce7c45f81..1c1967e4893f3 100644 --- a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -51,7 +51,7 @@ class MLock; class Message; class MClientRequest; - +class MMDSSlaveRequest; // MDCache @@ -66,17 +66,25 @@ class MClientRequest; */ struct MDRequest { metareqid_t reqid; - Message *request; // MClientRequest, or MLock - int by_mds; // if MLock, and remote xlock attempt - + + // -- i am a client (master) request + MClientRequest *client_request; // client request (if any) + set slaves; // mds nodes that have slave requests to me (implies client_request) + vector trace; // original path traversal. CInode *ref; // reference inode. if there is only one, and its path is pinned. + + // -- i am a slave request + MMDSSlaveRequest *slave_request; // slave request (if one is pending; implies slave == true) + int slave_to_mds; // this is a slave request. slave_request may or may not be a pending op. + // cache pins (so things don't expire) - set< MDSCacheObject* > pins; + set< MDSCacheObject* > pins; + map< CDentry*, set > remote_dn_pins; // [master] dn -> mds set it's pinned on // auth pins - set< CDir* > dir_auth_pins; + set< CDir* > dir_auth_pins; set< CInode* > inode_auth_pins; // held locks @@ -89,13 +97,20 @@ struct MDRequest { map< inodeno_t, inode_t > projected_inode; + // --------------------------------------------------- - MDRequest() : request(0), by_mds(-1), ref(0) {} - MDRequest(metareqid_t ri, Message *req=0) : reqid(ri), request(req), by_mds(-1), ref(0) {} + MDRequest() : + client_request(0), ref(0), + slave_request(0), slave_to_mds(-1) { } + MDRequest(metareqid_t ri, MClientRequest *req) : + reqid(ri), client_request(req), ref(0), + slave_request(0), slave_to_mds(-1) { } + MDRequest(metareqid_t ri, MMDSSlaveRequest *req, int by) : + reqid(ri), client_request(0), ref(0), + slave_request(req), slave_to_mds(by) { } - // request - MClientRequest *client_request() { - return (MClientRequest*)request; + bool is_slave() { + return slave_to_mds >= 0; } // pin items in cache @@ -139,6 +154,7 @@ inline ostream& operator<<(ostream& out, MDRequest &mdr) { out << "request(" << mdr.reqid; //if (mdr.request) out << " " << *mdr.request; + if (mdr.is_slave()) out << " slave_to mds" << mdr.slave_to_mds; out << ")"; return out; } @@ -223,9 +239,12 @@ protected: hash_map active_requests; public: - MDRequest* request_start(metareqid_t rid); MDRequest* request_start(MClientRequest *req); - MDRequest* request_start(MLock *req); + MDRequest* request_start(MMDSSlaveRequest *slavereq); + MDRequest* request_start_slave(metareqid_t rid, int by); + bool have_request(metareqid_t rid) { + return active_requests.count(rid); + } MDRequest* request_get(metareqid_t rid); void request_pin_ref(MDRequest *r, CInode *ref, vector& trace); void request_finish(MDRequest *mdr); diff --git a/branches/sage/cephmds2/mds/MDLog.cc b/branches/sage/cephmds2/mds/MDLog.cc index ff5428a740dba..4acde2b2d8f10 100644 --- a/branches/sage/cephmds2/mds/MDLog.cc +++ b/branches/sage/cephmds2/mds/MDLog.cc @@ -279,7 +279,12 @@ void MDLog::trim(Context *c) // trim! dout(10) << "trim " << num_events << " events / " << max_events << " max" << endl; - while (num_events > max_events) { + // hack: only trim for a few seconds at a time + utime_t stop = g_clock.now(); + stop += 2.0; + + while (num_events > max_events && + stop > g_clock.now()) { off_t gap = journaler->get_write_pos() - journaler->get_read_pos(); dout(5) << "trim num_events " << num_events << " > max " << max_events diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc index e4b0d21d6959b..565433dc86256 100644 --- a/branches/sage/cephmds2/mds/MDS.cc +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -232,7 +232,7 @@ void MDS::forward_message_mds(Message *req, int mds, int port) if (!creq->is_idempotent()) { delete req; - return; // don't actually forward if non-idempotent + return; // don't actually forward if non-idempotent! client has to do it. } } diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc index 57be41a031a15..35038a1ff646d 100644 --- a/branches/sage/cephmds2/mds/Migrator.cc +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -47,22 +47,6 @@ #include "messages/MExportDirNotifyAck.h" #include "messages/MExportDirFinish.h" -#include "messages/MHashDirDiscover.h" -#include "messages/MHashDirDiscoverAck.h" -#include "messages/MHashDirPrep.h" -#include "messages/MHashDirPrepAck.h" -#include "messages/MHashDir.h" -#include "messages/MHashDirNotify.h" -#include "messages/MHashDirAck.h" - -#include "messages/MUnhashDirPrep.h" -#include "messages/MUnhashDirPrepAck.h" -#include "messages/MUnhashDir.h" -#include "messages/MUnhashDirAck.h" -#include "messages/MUnhashDirNotify.h" -#include "messages/MUnhashDirNotifyAck.h" - - #include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " diff --git a/branches/sage/cephmds2/mds/ScatterLock.h b/branches/sage/cephmds2/mds/ScatterLock.h index 08a57d8ea67b3..e14a7359717fa 100644 --- a/branches/sage/cephmds2/mds/ScatterLock.h +++ b/branches/sage/cephmds2/mds/ScatterLock.h @@ -19,18 +19,43 @@ #include "SimpleLock.h" -// lock state machine states. -#define LOCK_SYNC__ // rdlocks allowed (e.g., for stat) -#define LOCK_GSYNCS -20 // waiting for replicas to gather -#define LOCK_SCATTER 21 // mtime updates on replicas allowed, no reads. -#define LOCK_GSCATTERS 22 // waiting for rdlocks to release +// lock state machine states: +// Sync -- Lock -- sCatter +// Tempsync _/ +// auth repl +#define LOCK_SYNC__ // R . R . rdlocks allowed on auth and replicas +#define LOCK_GLOCKS -20 // r . r . waiting for replicas+rdlocks (auth), or rdlocks to release (replica) + +#define LOCK_GSYNCL__ // . w LOCK on replica. +#define LOCK_LOCK__ // . W . . +#define LOCK_GTEMPSYNCL -21 // . w LOCK on replica. + +#define LOCK_GLOCKC -22 // . w . w waiting for replicas+wrlocks (auth), or wrlocks to release (replica) +#define LOCK_SCATTER 23 // . W . W mtime updates on replicas allowed, no reads. stable here. +#define LOCK_GTEMPSYNCC -24 // . w . w GLOCKC|LOCK on replica + +#define LOCK_GSCATTERT -25 // r . LOCK on replica. +#define LOCK_GLOCKT -26 // r . LOCK on replica. +#define LOCK_TEMPSYNC 27 // R . LOCK on replica. + inline const char *get_scatterlock_state_name(int s) { switch(s) { - case LOCK_SYNC: return "sync"; - case LOCK_GSYNCS: return "gsyncs"; - case LOCK_SCATTER: return "scatter"; - case LOCK_GSCATTERS: return "gscatters"; + case LOCK_SYNC: return "Sync"; + case LOCK_GLOCKS: return "gLockS"; + + case LOCK_GSYNCL: return "gSyncL"; + case LOCK_LOCK: return "Lock"; + case LOCK_GTEMPSYNCL: return "gTempsyncL"; + + case LOCK_GLOCKC: return "gLockC"; + case LOCK_SCATTER: return "sCatter"; + case LOCK_GTEMPSYNCC: return "gTempsyncC"; + + case LOCK_GSCATTERT: return "gsCatterT"; + case LOCK_GLOCKT: return "gLockT"; + case LOCK_TEMPSYNC: return "Tempsync"; + default: assert(0); } } @@ -45,35 +70,56 @@ public: int get_replica_state() { switch (state) { case LOCK_SYNC: - case LOCK_GSYNCS: - case LOCK_GSCATTERS: return LOCK_SYNC; + + case LOCK_GLOCKS: + case LOCK_GSYNCL: + case LOCK_LOCK: + case LOCK_GTEMPSYNCL: + case LOCK_GLOCKC: + return LOCK_LOCK; + case LOCK_SCATTER: return LOCK_SCATTER; + + case LOCK_GTEMPSYNCC: + case LOCK_GSCATTERT: + case LOCK_GLOCKT: + case LOCK_TEMPSYNC: + return LOCK_LOCK; default: assert(0); } } void replicate_relax() { - if (state == LOCK_SYNC && !is_rdlocked()) - state = LOCK_SCATTER; + //if (state == LOCK_SYNC && !is_rdlocked()) + //state = LOCK_SCATTER; } // rdlock bool can_rdlock(MDRequest *mdr) { - return state == LOCK_SYNC; + return state == LOCK_SYNC || state == LOCK_TEMPSYNC; } bool can_rdlock_soon() { - return state == LOCK_SYNC || state == LOCK_GSYNCS; + return state == LOCK_GTEMPSYNCC; + } + + // xlock + bool can_xlock_soon() { + if (parent->is_auth()) + return (state == LOCK_GLOCKC || + state == LOCK_GLOCKS); + else + return false; } // wrlock bool can_wrlock() { - return state == LOCK_SCATTER; + return state == LOCK_SCATTER || state == LOCK_LOCK; } void get_wrlock() { - assert(state == LOCK_SCATTER); + assert(can_wrlock()); ++num_wrlock; } void put_wrlock() { @@ -89,8 +135,8 @@ public: if (!get_gather_set().empty()) out << " g=" << get_gather_set(); if (is_rdlocked()) out << " r=" << get_num_rdlocks(); - //if (l.is_xlocked()) - //out << " x=" << l.get_xlocked_by(); + if (is_xlocked()) + out << " x=" << get_xlocked_by(); if (is_wrlocked()) out << " wr=" << get_num_wrlocks(); out << ")"; diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index 09d7e54552243..05b63e7e92279 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -28,6 +28,8 @@ #include "messages/MClientReply.h" #include "messages/MClientReconnect.h" +#include "messages/MMDSSlaveRequest.h" + #include "messages/MLock.h" #include "messages/MDentryUnlink.h" @@ -78,6 +80,9 @@ void Server::dispatch(Message *m) case MSG_CLIENT_REQUEST: handle_client_request((MClientRequest*)m); return; + case MSG_MDS_SLAVE_REQUEST: + handle_slave_request((MMDSSlaveRequest*)m); + return; } dout(1) << "server unknown message " << m->get_type() << endl; @@ -307,8 +312,7 @@ void Server::reconnect_finish() */ void Server::reply_request(MDRequest *mdr, int r, CInode *tracei) { - MClientRequest *req = mdr->client_request(); - reply_request(mdr, new MClientReply(req, r), tracei); + reply_request(mdr, new MClientReply(mdr->client_request, r), tracei); } @@ -318,7 +322,7 @@ void Server::reply_request(MDRequest *mdr, int r, CInode *tracei) */ void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; dout(10) << "reply_request " << reply->get_result() << " (" << strerror(-reply->get_result()) @@ -421,19 +425,19 @@ void Server::handle_client_request(MClientRequest *req) mdr->pin(ref); } - dispatch_request(mdr); + dispatch_client_request(mdr); return; } -void Server::dispatch_request(MDRequest *mdr) +void Server::dispatch_client_request(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; if (mdr->ref) { - dout(7) << "dispatch_request " << *req << " ref " << *mdr->ref << endl; + dout(7) << "dispatch_client_request " << *req << " ref " << *mdr->ref << endl; } else { - dout(7) << "dispatch_request " << *req << endl; + dout(7) << "dispatch_client_request " << *req << endl; } switch (req->get_op()) { @@ -501,6 +505,116 @@ void Server::dispatch_request(MDRequest *mdr) } +// --------------------------------------- +// SLAVE REQUESTS + +void Server::handle_slave_request(MMDSSlaveRequest *m) +{ + dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << endl; + + // reply? + if (m->is_reply()) { + // yay! + switch (m->get_op()) { + case MMDSSlaveRequest::OP_XLOCKACK: + { + // identify lock, master request + SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(), + m->get_object_info()); + MDRequest *mdr = mdcache->request_get(m->get_reqid()); + mdr->xlocks.insert(lock); + mdr->locks.insert(lock); + lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); + } + delete m; + return; + + default: + assert(0); + } + } else { + // am i a new slave? + MDRequest *mdr; + if (mdcache->have_request(m->get_reqid())) { + mdr = mdcache->request_get(m->get_reqid()); + assert(mdr->slave_request == 0); // only one at a time, please! + mdr->slave_request = m; + } else { + // new. + mdcache->request_start(m); + } + + dispatch_slave_request(mdr); + } +} + +void Server::dispatch_slave_request(MDRequest *mdr) +{ + dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << endl; + + switch (mdr->slave_request->get_op()) { + case MMDSSlaveRequest::OP_XLOCK: + { + // identify object + SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), + mdr->slave_request->get_object_info()); + + if (lock && lock->get_parent()->is_auth()) { + // xlock + if (!mds->locker->xlock_start(lock, mdr)) + return; + + // ack + MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCKACK); + r->set_lock_type(lock->get_type()); + lock->get_parent()->set_object_info(r->get_object_info()); + mds->send_message_mds(r, mdr->slave_request->get_source().num(), MDS_PORT_SERVER); + } else { + if (lock) { + dout(10) << "not auth for remote xlock attempt, dropping on " + << *lock << " on " << *lock->get_parent() << endl; + } else { + dout(10) << "don't have object, dropping" << endl; + assert(0); // can this happen? hmm. + } + } + + // done. + delete mdr->slave_request; + mdr->slave_request = 0; + } + break; + + case MMDSSlaveRequest::OP_UNXLOCK: + { + SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(), + mdr->slave_request->get_object_info()); + assert(lock); + mds->locker->xlock_finish(lock, mdr); + + // done. no ack necessary. + delete mdr->slave_request; + mdr->slave_request = 0; + } + break; + + case MMDSSlaveRequest::OP_PINDN: + case MMDSSlaveRequest::OP_UNPINDN: + // get the CDentry* + + + break; + + case MMDSSlaveRequest::OP_FINISH: + mdcache->request_finish(mdr); + break; + + default: + assert(0); + } +} + + // --------------------------------------- // HELPERS @@ -621,7 +735,7 @@ CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector &trace, file int r = mdcache->path_traverse(mdr, 0, refpath, trace, true, - mdr->request, ondelay, + mdr->client_request, ondelay, MDS_TRAVERSE_FORWARD, true); // is MClientRequest if (r > 0) return 0; // delayed @@ -656,7 +770,7 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) if (mdr->ref) return mdr->ref; - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; // traverse filepath refpath = req->get_filepath(); @@ -749,7 +863,7 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) */ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; vector trace; CDir *dir = traverse_to_auth_dir(mdr, trace, req->get_filepath()); @@ -944,7 +1058,7 @@ void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime) void Server::handle_client_stat(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; CInode *ref = rdlock_path_pin_ref(mdr, false); if (!ref) return; @@ -1003,7 +1117,7 @@ public: in->mark_dirty(pv); // reply - MClientReply *reply = new MClientReply(mdr->client_request(), 0); + MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); mds->server->reply_request(mdr, reply, in); } @@ -1014,7 +1128,7 @@ public: void Server::handle_client_utime(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; CInode *cur = rdlock_path_pin_ref(mdr, true); if (!cur) return; @@ -1069,7 +1183,7 @@ public: in->mark_dirty(pv); // reply - MClientReply *reply = new MClientReply(mdr->client_request(), 0); + MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); mds->server->reply_request(mdr, reply, in); } @@ -1080,7 +1194,7 @@ public: void Server::handle_client_chmod(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; CInode *cur = rdlock_path_pin_ref(mdr, true); if (!cur) return; @@ -1130,7 +1244,7 @@ public: in->mark_dirty(pv); // reply - MClientReply *reply = new MClientReply(mdr->client_request(), 0); + MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); mds->server->reply_request(mdr, reply, in); } @@ -1139,7 +1253,7 @@ public: void Server::handle_client_chown(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; CInode *cur = rdlock_path_pin_ref(mdr, true); if (!cur) return; @@ -1209,7 +1323,7 @@ int Server::encode_dir_contents(CDir *dir, void Server::handle_client_readdir(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; CInode *diri = rdlock_path_pin_ref(mdr, false); if (!diri) return; @@ -1307,7 +1421,7 @@ public: mds->balancer->hit_inode(newi, META_POP_IWR); // reply - MClientReply *reply = new MClientReply(mdr->client_request(), 0); + MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); mds->server->reply_request(mdr, reply, newi); } @@ -1317,7 +1431,7 @@ public: void Server::handle_client_mknod(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); if (!dn) return; @@ -1353,7 +1467,7 @@ void Server::handle_client_mknod(MDRequest *mdr) void Server::handle_client_mkdir(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); if (!dn) return; @@ -1409,7 +1523,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) void Server::handle_client_symlink(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); if (!dn) return; @@ -1445,7 +1559,7 @@ void Server::handle_client_symlink(MDRequest *mdr) void Server::handle_client_link(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; dout(7) << "handle_client_link " << req->get_filepath() << " to " << req->get_sarg() @@ -1616,7 +1730,7 @@ void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, mds->balancer->hit_inode(targeti, META_POP_IWR); // reply - MClientReply *reply = new MClientReply(mdr->client_request(), 0); + MClientReply *reply = new MClientReply(mdr->client_request, 0); reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref } @@ -1704,7 +1818,7 @@ public: void Server::handle_client_unlink(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; // traverse to path vector trace; @@ -1913,7 +2027,7 @@ void Server::_unlink_local_finish(MDRequest *mdr, mds->balancer->hit_dir(dn->dir, META_POP_DWR); // reply - MClientReply *reply = new MClientReply(mdr->client_request(), 0); + MClientReply *reply = new MClientReply(mdr->client_request, 0); reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref if (straydn) @@ -2071,7 +2185,7 @@ bool Server::_rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MDRequest * void Server::handle_client_rename(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; dout(7) << "handle_client_rename " << *req << endl; // traverse to dest dir (not dest) @@ -2271,7 +2385,9 @@ void Server::_rename_local(MDRequest *mdr, // dir mtimes version_t ddirpv = predirty_dn_diri(destdn, &le->metablob, now); - version_t sdirpv = predirty_dn_diri(srcdn, &le->metablob, now); + version_t sdirpv = 0; + if (destdn->dir != srcdn->dir) + sdirpv = predirty_dn_diri(srcdn, &le->metablob, now); if (linkmerge) { dout(10) << "will merge remote+primary links" << endl; @@ -2413,7 +2529,7 @@ void Server::_rename_local_finish(MDRequest *mdr, utime_t ictime, version_t atid1, version_t atid2) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; dout(10) << "_rename_local_finish " << *req << endl; CInode *oldin = destdn->inode; @@ -2424,7 +2540,8 @@ void Server::_rename_local_finish(MDRequest *mdr, // dir mtimes dirty_dn_diri(destdn, ddirpv, ictime); - dirty_dn_diri(srcdn, sdirpv, ictime); + if (destdn->dir != srcdn->dir) + dirty_dn_diri(srcdn, sdirpv, ictime); if (linkmerge) { assert(ipv); @@ -2730,7 +2847,7 @@ public: void Server::handle_client_truncate(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; CInode *cur = rdlock_path_pin_ref(mdr, true); if (!cur) return; @@ -2773,7 +2890,7 @@ void Server::handle_client_truncate(MDRequest *mdr) void Server::handle_client_open(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; int flags = req->args.open.flags; int cmode = req->get_open_file_mode(); @@ -2817,7 +2934,7 @@ void Server::handle_client_open(MDRequest *mdr) void Server::_do_open(MDRequest *mdr, CInode *cur) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; int cmode = req->get_open_file_mode(); // can we issue the caps they want? @@ -3019,7 +3136,7 @@ public: void Server::handle_client_openc(MDRequest *mdr) { - MClientRequest *req = mdr->client_request(); + MClientRequest *req = mdr->client_request; dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; diff --git a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h index ef46c0a56c8d6..0aa67d533c277 100644 --- a/branches/sage/cephmds2/mds/Server.h +++ b/branches/sage/cephmds2/mds/Server.h @@ -21,6 +21,8 @@ class LogEvent; class C_MDS_rename_local_finish; class MDRequest; +class MMDSSlaveRequest; + class Server { MDS *mds; MDCache *mdcache; @@ -54,10 +56,13 @@ public: // -- requests -- void handle_client_request(MClientRequest *m); - void dispatch_request(MDRequest *mdr); + void dispatch_client_request(MDRequest *mdr); void reply_request(MDRequest *mdr, int r = 0, CInode *tracei = 0); void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei); + void handle_slave_request(MMDSSlaveRequest *m); + void dispatch_slave_request(MDRequest *mdr); + // some helpers CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); CDir *traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath); diff --git a/branches/sage/cephmds2/mds/SimpleLock.h b/branches/sage/cephmds2/mds/SimpleLock.h index adb13dfc228c6..513af4161cc5d 100644 --- a/branches/sage/cephmds2/mds/SimpleLock.h +++ b/branches/sage/cephmds2/mds/SimpleLock.h @@ -41,6 +41,7 @@ inline const char *get_lock_type_name(int t) { } // -- lock states -- +// sync <-> lock #define LOCK_UNDEF 0 // auth rep #define LOCK_SYNC 1 // AR R . R . @@ -65,10 +66,10 @@ class SimpleLock { public: static const int WAIT_RD = (1<<0); // to read static const int WAIT_WR = (1<<1); // to write - static const int WAIT_SINGLEAUTH = (1<<2); + static const int WAIT_XLOCK = (1<<2); // to xlock static const int WAIT_STABLE = (1<<3); // for a stable state - static const int WAIT_REMOTEXLOCK = (1<<4); // for a remote xlock - static const int WAIT_BITS = 5; + static const int WAIT_REMOTEXLOCK = (1<<4); // for a remote xlock (*) + static const int WAIT_BITS = 4; protected: // parent (what i lock) @@ -77,11 +78,11 @@ protected: int wait_offset; // lock state - int state; - set<__int32_t> gather_set; // auth + int state; + set gather_set; // auth // local state - int num_rdlock; + int num_rdlock; MDRequest *xlock_by; public: @@ -113,7 +114,11 @@ public: parent->finish_waiting(mask << wait_offset, r); } void add_waiter(int mask, Context *c) { - parent->add_waiter(mask << wait_offset, c); + // (*) REMOTEXLOCK events alsowait on parent's WAIT_SINGLEAUTH. + if (mask & WAIT_REMOTEXLOCK) + parent->add_waiter((mask << wait_offset) | MDSCacheObject::WAIT_SINGLEAUTH, c); + else + parent->add_waiter(mask << wait_offset, c); } bool is_waiter_for(int mask) { return parent->is_waiter_for(mask << wait_offset); diff --git a/branches/sage/cephmds2/mds/mdstypes.h b/branches/sage/cephmds2/mds/mdstypes.h index 74b6f947d631c..560dd76131361 100644 --- a/branches/sage/cephmds2/mds/mdstypes.h +++ b/branches/sage/cephmds2/mds/mdstypes.h @@ -268,6 +268,24 @@ ostream& operator<<(ostream& out, mdsco_db_line_prefix o); // printer ostream& operator<<(ostream& out, MDSCacheObject &o); +class MDSCacheObjectInfo { +public: + inodeno_t ino; + dirfrag_t dirfrag; + string dname; + + void _encode(bufferlist& bl) { + ::_encode(ino, bl); + ::_encode(dirfrag, bl); + ::_encode(dname, bl); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(ino, bl, off); + ::_decode(dirfrag, bl, off); + ::_decode(dname, bl, off); + } +}; + class MDSCacheObject { public: @@ -499,7 +517,7 @@ protected: // locking // noop unless overloaded. virtual SimpleLock* get_lock(int type) { assert(0); } - virtual void set_mlock_info(MLock *m) { assert(0); } + virtual void set_object_info(MDSCacheObjectInfo &info) { assert(0); } virtual void encode_lock_state(int type, bufferlist& bl) { assert(0); } virtual void decode_lock_state(int type, bufferlist& bl) { assert(0); } virtual void finish_lock_waiters(int type, int mask, int r=0) { assert(0); } diff --git a/branches/sage/cephmds2/messages/MLock.h b/branches/sage/cephmds2/messages/MLock.h index 890c536e75310..72ecfdc29bcf2 100644 --- a/branches/sage/cephmds2/messages/MLock.h +++ b/branches/sage/cephmds2/messages/MLock.h @@ -24,9 +24,6 @@ #define LOCK_AC_MIXED -2 #define LOCK_AC_LOCK -3 -#define LOCK_AC_REQXLOCKACK -4 // req dentry xlock -#define LOCK_AC_REQXLOCKNAK -5 // req dentry xlock - #define LOCK_AC_SCATTER -6 // for auth @@ -34,41 +31,47 @@ #define LOCK_AC_MIXEDACK 2 #define LOCK_AC_LOCKACK 3 -#define LOCK_AC_REQREAD 4 -#define LOCK_AC_REQWRITE 5 - -#define LOCK_AC_REQXLOCK 6 -#define LOCK_AC_UNXLOCK 7 -#define LOCK_AC_FINISH 8 +//#define LOCK_AC_REQREAD 4 +//#define LOCK_AC_REQWRITE 5 #define LOCK_AC_FOR_REPLICA(a) ((a) < 0) #define LOCK_AC_FOR_AUTH(a) ((a) > 0) +static const char *get_lock_action_name(int a) { + switch (a) { + case LOCK_AC_SYNC: return "sync"; + case LOCK_AC_MIXED: return "mixed"; + case LOCK_AC_LOCK: return "lock"; + case LOCK_AC_SCATTER: return "scatter"; + case LOCK_AC_SYNCACK: return "syncack"; + case LOCK_AC_MIXEDACK: return "mixedack"; + case LOCK_AC_LOCKACK: return "lockack"; + default: assert(0); + } +} + + class MLock : public Message { int asker; // who is initiating this request int action; // action type - - char otype; // lock object type - inodeno_t ino; // ino ref, or possibly - dirfrag_t dirfrag; - string dn; // dentry name - metareqid_t reqid; // for remote lock requests + + char lock_type; // lock object type + MDSCacheObjectInfo object_info; bufferlist data; // and possibly some data public: - inodeno_t get_ino() { return ino; } - dirfrag_t get_dirfrag() { return dirfrag; } - string& get_dn() { return dn; } bufferlist& get_data() { return data; } int get_asker() { return asker; } int get_action() { return action; } - int get_otype() { return otype; } metareqid_t get_reqid() { return reqid; } + int get_lock_type() { return lock_type; } + MDSCacheObjectInfo &get_object_info() { return object_info; } + MLock() {} MLock(int action, int asker) : Message(MSG_MDS_LOCK) { @@ -77,45 +80,29 @@ class MLock : public Message { } MLock(SimpleLock *lock, int action, int asker) : Message(MSG_MDS_LOCK) { - this->otype = lock->get_type(); - lock->get_parent()->set_mlock_info(this); + this->lock_type = lock->get_type(); + lock->get_parent()->set_object_info(object_info); this->action = action; this->asker = asker; } MLock(SimpleLock *lock, int action, int asker, bufferlist& bl) : Message(MSG_MDS_LOCK) { - this->otype = lock->get_type(); - lock->get_parent()->set_mlock_info(this); + this->lock_type = lock->get_type(); + lock->get_parent()->set_object_info(object_info); this->action = action; this->asker = asker; data.claim(bl); } virtual char *get_type_name() { return "ILock"; } void print(ostream& out) { - out << "lock(a=" << action - << " " << ino - << " " << get_lock_type_name(otype) + out << "lock(a=" << get_lock_action_name(action) + << " " << get_lock_type_name(lock_type) + << " " << object_info.ino + << "/" << object_info.dirfrag + << "/" << object_info.dname << ")"; } - void set_ino(inodeno_t ino, char ot) { - otype = ot; - this->ino = ino; - } - void set_ino(inodeno_t ino) { - this->ino = ino; - } - /* - void set_dirfrag(dirfrag_t df) { - otype = LOCK_OTYPE_DIR; - this->dirfrag = df; - } - */ - void set_dn(dirfrag_t df, const string& dn) { - otype = LOCK_OTYPE_DN; - this->dirfrag = df; - this->dn = dn; - } void set_reqid(metareqid_t ri) { reqid = ri; } void set_data(const bufferlist& data) { this->data = data; @@ -123,23 +110,19 @@ class MLock : public Message { void decode_payload() { int off = 0; - ::_decode(action, payload, off); ::_decode(asker, payload, off); - ::_decode(otype, payload, off); - ::_decode(ino, payload, off); - ::_decode(dirfrag, payload, off); + ::_decode(action, payload, off); ::_decode(reqid, payload, off); - ::_decode(dn, payload, off); + ::_decode(lock_type, payload, off); + object_info._decode(payload, off); ::_decode(data, payload, off); } virtual void encode_payload() { - ::_encode(action, payload); ::_encode(asker, payload); - ::_encode(otype, payload); - ::_encode(ino, payload); - ::_encode(dirfrag, payload); + ::_encode(action, payload); ::_encode(reqid, payload); - ::_encode(dn, payload); + ::_encode(lock_type, payload); + object_info._encode(payload); ::_encode(data, payload); } diff --git a/branches/sage/cephmds2/messages/MMDSSlaveRequest.h b/branches/sage/cephmds2/messages/MMDSSlaveRequest.h new file mode 100644 index 0000000000000..a66538d1c23d8 --- /dev/null +++ b/branches/sage/cephmds2/messages/MMDSSlaveRequest.h @@ -0,0 +1,99 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MMDSSLAVEREQUEST_H +#define __MMDSSLAVEREQUEST_H + +#include "msg/Message.h" +#include "mds/mdstypes.h" + + +class MMDSSlaveRequest : public Message { + public: + static const int OP_XLOCK = 1; + static const int OP_XLOCKACK = -1; + static const int OP_UNXLOCK = 3; + static const int OP_PINDN = 4; + static const int OP_UNPINDN = 5; + static const int OP_FINISH = 6; + + const static char *get_opname(int o) { + switch (o) { + case OP_XLOCK: return "xlock"; + case OP_XLOCKACK: return "xlock_ack"; + case OP_UNXLOCK: return "unxlock"; + case OP_PINDN: return "pin_dn"; + case OP_UNPINDN: return "unpin_dn"; + case OP_FINISH: return "finish"; + default: assert(0); return 0; + } + } + + private: + metareqid_t reqid; + char op; + + // for locking + char lock_type; // lock object type + MDSCacheObjectInfo object_info; + + // for dn pins + string dnpath; + + public: + metareqid_t get_reqid() { return reqid; } + int get_op() { return op; } + bool is_reply() { return op < 0; } + + int get_lock_type() { return lock_type; } + MDSCacheObjectInfo &get_object_info() { return object_info; } + + const string& get_dnpath() { return dnpath; } + + void set_lock_type(int t) { lock_type = t; } + + // ---- + MMDSSlaveRequest() : Message(MSG_MDS_SLAVE_REQUEST) { } + MMDSSlaveRequest(metareqid_t ri, int o) : + Message(MSG_MDS_SLAVE_REQUEST), + reqid(ri), op(o) + { } + + void encode_payload() { + ::_encode(reqid, payload); + ::_encode(op, payload); + ::_encode(lock_type, payload); + object_info._encode(payload); + ::_encode(dnpath, payload); + } + void decode_payload() { + int off = 0; + ::_decode(reqid, payload, off); + ::_decode(op, payload, off); + ::_decode(lock_type, payload, off); + object_info._decode(payload, off); + ::_decode(dnpath, payload, off); + } + + char *get_type_name() { return "slave_request"; } + void print(ostream& out) { + out << "slave_request(" << reqid + << " " << get_opname(op) + << ")"; + } + +}; + +#endif diff --git a/branches/sage/cephmds2/msg/Message.cc b/branches/sage/cephmds2/msg/Message.cc index 39510c47b9bd3..95d03d72133dd 100644 --- a/branches/sage/cephmds2/msg/Message.cc +++ b/branches/sage/cephmds2/msg/Message.cc @@ -47,6 +47,8 @@ using namespace std; #include "messages/MClientReply.h" #include "messages/MClientFileCaps.h" +#include "messages/MMDSSlaveRequest.h" + #include "messages/MMDSGetMap.h" #include "messages/MMDSMap.h" #include "messages/MMDSBeacon.h" @@ -216,6 +218,10 @@ decode_message(msg_envelope_t& env, bufferlist& payload) break; // mds + case MSG_MDS_SLAVE_REQUEST: + m = new MMDSSlaveRequest; + break; + case MSG_MDS_GETMAP: m = new MMDSGetMap(); break; diff --git a/branches/sage/cephmds2/msg/Message.h b/branches/sage/cephmds2/msg/Message.h index c7e76297553ed..e364c79b35e8c 100644 --- a/branches/sage/cephmds2/msg/Message.h +++ b/branches/sage/cephmds2/msg/Message.h @@ -125,24 +125,7 @@ #define MSG_MDS_EXPORTDIRNOTIFYACK 159 #define MSG_MDS_EXPORTDIRFINISH 160 - -#define MSG_MDS_HASHDIRDISCOVER 170 -#define MSG_MDS_HASHDIRDISCOVERACK 171 -#define MSG_MDS_HASHDIRPREP 172 -#define MSG_MDS_HASHDIRPREPACK 173 -#define MSG_MDS_HASHDIR 174 -#define MSG_MDS_HASHDIRACK 175 -#define MSG_MDS_HASHDIRNOTIFY 176 - -#define MSG_MDS_HASHREADDIR 178 -#define MSG_MDS_HASHREADDIRREPLY 179 - -#define MSG_MDS_UNHASHDIRPREP 180 -#define MSG_MDS_UNHASHDIRPREPACK 181 -#define MSG_MDS_UNHASHDIR 182 -#define MSG_MDS_UNHASHDIRACK 183 -#define MSG_MDS_UNHASHDIRNOTIFY 184 -#define MSG_MDS_UNHASHDIRNOTIFYACK 185 +#define MSG_MDS_SLAVE_REQUEST 170 #define MSG_MDS_DENTRYUNLINK 200 -- 2.39.5