From ea784976dfe32912269018dd8822ec3e13660def Mon Sep 17 00:00:00 2001 From: sageweil Date: Wed, 28 Mar 2007 20:53:53 +0000 Subject: [PATCH] * extensive rewrite of locker, server request handling, lock acquisition code * mds: path_traverse cleanup (no more onfinish--we cache negative dentries now) git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1314 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 19 + .../sage/cephmds2/client/SyntheticClient.cc | 3 + branches/sage/cephmds2/mds/CDentry.h | 41 +- branches/sage/cephmds2/mds/CInode.h | 11 +- branches/sage/cephmds2/mds/Lock.h | 8 +- branches/sage/cephmds2/mds/Locker.cc | 521 +++++- branches/sage/cephmds2/mds/Locker.h | 39 +- branches/sage/cephmds2/mds/MDCache.cc | 462 ++--- branches/sage/cephmds2/mds/MDCache.h | 168 +- branches/sage/cephmds2/mds/Migrator.cc | 63 +- branches/sage/cephmds2/mds/Migrator.h | 2 - branches/sage/cephmds2/mds/Server.cc | 1544 ++++++++--------- branches/sage/cephmds2/mds/Server.h | 153 +- .../sage/cephmds2/messages/MClientRequest.h | 9 +- 14 files changed, 1614 insertions(+), 1429 deletions(-) diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index e4f750dbd5190..52339e42b1082 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -44,6 +44,7 @@ mds - discover / - hard link dentries + - open_remote_ino needs major work... - rejoin and replicas that are not in recovered node's cache... fetch storm? @@ -56,6 +57,24 @@ mds - in particular, i care about dirfragtree.. get it on rejoin? - and dir sizes, if i add that... also on rejoin? + +- locking rewrites + - active_requests + - keyed off reqid_t + - allow separate remote/local locks + - facilitate ordered lock acquisition + - redo dispatch_request framework -- ref arg optional (it's in the table) + - or, use Request* directly + - rename Locker primitives + - fix path pinning + +- truncate +- link +- unlink +- mdsreq_t +- clientmap request history + + - mds failure vs clients / - clean up client op redirection - idempotent client ops diff --git a/branches/sage/cephmds2/client/SyntheticClient.cc b/branches/sage/cephmds2/client/SyntheticClient.cc index 3888d88a0e6c6..5304f466667d2 100644 --- a/branches/sage/cephmds2/client/SyntheticClient.cc +++ b/branches/sage/cephmds2/client/SyntheticClient.cc @@ -1334,6 +1334,8 @@ void SyntheticClient::foo() client->unlink("d"); client->rmdir("d"); + /* + // rename fun client->mkdir("dir1", 0755); client->mkdir("dir2", 0755); @@ -1373,6 +1375,7 @@ void SyntheticClient::foo() client->mknod("dir5/asdf", 0644); client->rename("dir3","dir4"); // ok client->rename("dir4","dir5"); // fail + */ } int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int depth, int n) diff --git a/branches/sage/cephmds2/mds/CDentry.h b/branches/sage/cephmds2/mds/CDentry.h index d0780223a5daf..9f5a503df7b84 100644 --- a/branches/sage/cephmds2/mds/CDentry.h +++ b/branches/sage/cephmds2/mds/CDentry.h @@ -28,18 +28,24 @@ using namespace std; class CInode; class CDir; +class MDRequest; #define DN_LOCK_SYNC 0 #define DN_LOCK_PREXLOCK 1 #define DN_LOCK_XLOCK 2 #define DN_LOCK_UNPINNING 3 // waiting for pins to go away .. FIXME REVIEW THIS CODE .. -#define DN_XLOCK_FOREIGN ((Message*)0x1) // not 0, not a valid pointer. +#define DN_XLOCK_FOREIGN ((MDRequest*)0x1) // not 0, not a valid pointer. FIXME FIXME class Message; class CDentryDiscover; class Anchor; +class CDentry; + +// define an ordering +bool operator<(CDentry& l, CDentry& r); + // dentry class CDentry : public MDSCacheObject, public LRUObject { public: @@ -68,6 +74,11 @@ class CDentry : public MDSCacheObject, public LRUObject { static const int EXPORT_NONCE = 1; + struct ptr_lt { + bool operator()(const CDentry* l, const CDentry* r) const { + return *l < *r; + } + }; protected: string name; @@ -79,14 +90,14 @@ class CDentry : public MDSCacheObject, public LRUObject { version_t version; // dir version when last touched. version_t projected_version; // what it will be when i unlock/commit. - // locking + // xlocks int lockstate; - Message *xlockedby; + MDRequest *xlockedby; set gather_set; - // path pins + // rdlocks int npins; - multiset pinset; + multiset pinset; friend class Migrator; friend class Locker; @@ -245,12 +256,12 @@ class CDentry : public MDSCacheObject, public LRUObject { bool is_sync() { return lockstate == DN_LOCK_SYNC; } bool can_read() { return (lockstate == DN_LOCK_SYNC) || (lockstate == DN_LOCK_UNPINNING); } - bool can_read(Message *m) { return is_xlockedbyme(m) || can_read(); } + bool can_read(MDRequest *m) { return is_xlockedbyme(m) || can_read(); } bool is_xlocked() { return lockstate == DN_LOCK_XLOCK; } - Message* get_xlockedby() { return xlockedby; } - bool is_xlockedbyother(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m != xlockedby; } - bool is_xlockedbyme(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m == xlockedby; } - bool is_prexlockbyother(Message *m) { + MDRequest* get_xlockedby() { return xlockedby; } + bool is_xlockedbyother(MDRequest *m) { return (lockstate == DN_LOCK_XLOCK) && m != xlockedby; } + bool is_xlockedbyme(MDRequest *m) { return (lockstate == DN_LOCK_XLOCK) && m == xlockedby; } + bool is_prexlockbyother(MDRequest *m) { return (lockstate == DN_LOCK_PREXLOCK) && m != xlockedby; } @@ -270,21 +281,21 @@ class CDentry : public MDSCacheObject, public LRUObject { void set_lockstate(int s) { lockstate = s; } // path pins - void pin(Message *m) { + void pin(MDRequest *m) { npins++; pinset.insert(m); assert(pinset.size() == (unsigned)npins); } - void unpin(Message *m) { + void unpin(MDRequest *m) { npins--; assert(npins >= 0); assert(pinset.count(m) > 0); pinset.erase(pinset.find(m)); assert(pinset.size() == (unsigned)npins); } - bool is_pinnable(Message *m) { + bool is_pinnable(MDRequest *m) { return (lockstate == DN_LOCK_SYNC) || - (lockstate == DN_LOCK_UNPINNING && pinset.count(m)); + (lockstate == DN_LOCK_UNPINNING && m && pinset.count(m)); } bool is_pinned() { return npins>0; } int num_pins() { return npins; } @@ -294,8 +305,6 @@ class CDentry : public MDSCacheObject, public LRUObject { ostream& operator<<(ostream& out, CDentry& dn); -// define an ordering -bool operator<(CDentry& l, CDentry& r); class CDentryDiscover { diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h index e3bb151d60b17..b51c2bb77e5bf 100644 --- a/branches/sage/cephmds2/mds/CInode.h +++ b/branches/sage/cephmds2/mds/CInode.h @@ -35,7 +35,6 @@ #include using namespace std; - class Context; class CDentry; class CDir; @@ -138,7 +137,6 @@ class CInode : public MDSCacheObject { // misc static const int EXPORT_NONCE = 1; // nonce given to replicas created by export - public: MDCache *mdcache; @@ -230,12 +228,19 @@ protected: bool is_auth() { return state & STATE_AUTH; } void set_auth(bool auth); - inodeno_t ino() { return inode.ino; } + inodeno_t ino() const { return inode.ino; } inode_t& get_inode() { return inode; } CDentry* get_parent_dn() { return parent; } CDir *get_parent_dir(); CInode *get_parent_inode(); + struct ptr_lt { + bool operator()(const CInode* l, const CInode* r) const { + return l->ino() < r->ino(); + } + }; + + // -- misc -- void make_path(string& s); diff --git a/branches/sage/cephmds2/mds/Lock.h b/branches/sage/cephmds2/mds/Lock.h index 0d9dabb61b669..b138a34d5f29f 100644 --- a/branches/sage/cephmds2/mds/Lock.h +++ b/branches/sage/cephmds2/mds/Lock.h @@ -70,7 +70,7 @@ any + statlite(mtime) // -- lock... hard or file -class Message; +class MDRequest; class CLock { protected: @@ -80,7 +80,7 @@ class CLock { // local state int nread; - Message *wrlock_by; + MDRequest *wrlock_by; public: @@ -166,7 +166,7 @@ class CLock { } int get_nread() { return nread; } - void get_write(Message *who) { + void get_write(MDRequest *who) { assert(wrlock_by == 0); wrlock_by = who; } @@ -175,7 +175,7 @@ class CLock { wrlock_by = 0; } bool is_wrlocked() { return wrlock_by ? true:false; } - Message *get_wrlocked_by() { return wrlock_by; } + MDRequest *get_wrlocked_by() { return wrlock_by; } bool is_used() { return (is_wrlocked() || (nread>0)) ? true:false; } diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc index b5e999a6c030a..bd58f37836b8f 100644 --- a/branches/sage/cephmds2/mds/Locker.cc +++ b/branches/sage/cephmds2/mds/Locker.cc @@ -124,6 +124,213 @@ void Locker::send_lock_message(CDentry *dn, int msg) + + + + + + + + +bool Locker::acquire_locks(MDRequest *mdr, + set &dentry_rdlocks, + set &dentry_xlocks, + set &inode_hard_rdlocks, + set &inode_hard_xlocks) +{ + dout(10) << "acquire_locks " << *mdr << endl; + + // (local) AUTH PINS + + // can i auth_pin everything? + for (set::iterator p = dentry_xlocks.begin(); + p != dentry_xlocks.end(); + ++p) { + CDir *dir = (*p)->dir; + if (!dir->is_auth()) continue; + if (!mdr->is_auth_pinned(dir) && + !dir->can_auth_pin()) { + // wait + dir->add_waiter(CDir::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + mdcache->request_drop_locks(mdr); + mdr->drop_auth_pins(); + return false; + } + } + for (set::iterator p = inode_hard_xlocks.begin(); + p != inode_hard_xlocks.end(); + ++p) { + CInode *in = *p; + if (!in->is_auth()) continue; + if (!mdr->is_auth_pinned(in) && + !in->can_auth_pin()) { + in->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + mdcache->request_drop_locks(mdr); + mdr->drop_auth_pins(); + return false; + } + } + + // ok, grab the auth pins + for (set::iterator p = dentry_xlocks.begin(); + p != dentry_xlocks.end(); + ++p) { + CDir *dir = (*p)->dir; + if (!dir->is_auth()) continue; + mdr->auth_pin(dir); + } + for (set::iterator p = inode_hard_xlocks.begin(); + p != inode_hard_xlocks.end(); + ++p) { + CInode *in = *p; + if (!in->is_auth()) continue; + mdr->auth_pin(in); + } + + + // DENTRY LOCKS + { + // sort all the dentries we will lock + set sorted; + for (set::iterator p = dentry_xlocks.begin(); + p != dentry_xlocks.end(); + ++p) { + dout(10) << "will xlock " << **p << endl; + sorted.insert(*p); + } + for (set::iterator p = dentry_rdlocks.begin(); + p != dentry_rdlocks.end(); + ++p) { + dout(10) << "will rdlock " << **p << endl; + sorted.insert(*p); + } + + // acquire dentry locks. make sure they match currently acquired locks. + set::iterator existing = mdr->dentry_locks.begin(); + for (set::iterator p = sorted.begin(); + p != sorted.end(); + ++p) { + + // already locked? + if (existing != mdr->dentry_locks.end() && *existing == *p) { + // right kind? + CDentry *had = *existing; + if (dentry_xlocks.count(*p) == had->is_xlockedbyme(mdr)) { + dout(10) << "acquire_locks already locked " << *had << endl; + existing++; + continue; + } + } + + // hose any stray locks + while (existing != mdr->dentry_locks.end()) { + CDentry *had = *existing; + existing++; + dout(10) << "acquire_locks had " << *had << " locked before " << **p + << ", unlocking" << endl; + if (had->is_xlockedbyme(mdr)) + dentry_xlock_finish(had, mdr); + else + dentry_rdlock_finish(had, mdr); + } + + // lock + if (dentry_xlocks.count(*p)) { + if (!dentry_xlock_start(*p, mdr)) + return false; + dout(10) << "acquire_locks got xlock on " << **p << endl; + } else { + if (!dentry_rdlock_start(*p, mdr)) + return false; + dout(10) << "acquire_locks got rdlock on " << **p << endl; + } + } + + // any extra unneeded locks? + while (existing != mdr->dentry_locks.end()) { + dout(10) << "acquire_locks had " << *existing << " locked, unlocking" << endl; + if ((*existing)->is_xlockedbyme(mdr)) + dentry_xlock_finish(*existing, mdr); + else + dentry_rdlock_finish(*existing, mdr); + } + } + + // INODES + { + // sort all the dentries we will lock + set sorted; + for (set::iterator p = inode_hard_xlocks.begin(); + p != inode_hard_xlocks.end(); + ++p) + sorted.insert(*p); + for (set::iterator p = inode_hard_rdlocks.begin(); + p != inode_hard_rdlocks.end(); + ++p) + sorted.insert(*p); + + // acquire inode locks. make sure they match currently acquired locks. + set::iterator existing = mdr->inode_hard_locks.begin(); + for (set::iterator p = sorted.begin(); + p != sorted.end(); + ++p) { + // already locked? + if (existing != mdr->inode_hard_locks.end() && *existing == *p) { + // right kind? + CInode *had = *existing; + if (inode_hard_xlocks.count(*p) == (had->hardlock.get_wrlocked_by() == mdr)) { + dout(10) << "acquire_locks already locked " << *had << endl; + existing++; + continue; + } + } + + // hose any stray locks + while (existing != mdr->inode_hard_locks.end()) { + CInode *had = *existing; + existing++; + dout(10) << "acquire_locks had " << *had << " locked before " << **p + << ", unlocking" << endl; + if (had->hardlock.get_wrlocked_by() == mdr) + inode_hard_xlock_finish(had, mdr); + else + inode_hard_rdlock_finish(had, mdr); + } + + // lock + if (inode_hard_xlocks.count(*p)) { + if (!inode_hard_xlock_start(*p, mdr)) + return false; + dout(10) << "acquire_locks got xlock on " << **p << endl; + } else { + if (!inode_hard_rdlock_start(*p, mdr)) + return false; + dout(10) << "acquire_locks got rdlock on " << **p << endl; + } + } + + // any extra unneeded locks? + while (existing != mdr->inode_hard_locks.end()) { + dout(10) << "acquire_locks had " << **existing << " locked, unlocking" << endl; + if ((*existing)->hardlock.get_wrlocked_by() == mdr) + inode_hard_xlock_finish(*existing, mdr); + else + inode_hard_rdlock_finish(*existing, mdr); + } + } + + return true; +} + + + + + + + + + + // file i/o ----------------------------------------- __uint64_t Locker::issue_file_data_version(CInode *in) @@ -528,9 +735,9 @@ void Locker::handle_lock(MLock *m) // =============================== // hard inode metadata -bool Locker::inode_hard_read_try(CInode *in, Context *con) +bool Locker::inode_hard_rdlock_try(CInode *in, Context *con) { - dout(7) << "inode_hard_read_try on " << *in << endl; + dout(7) << "inode_hard_rdlock_try on " << *in << endl; // can read? grab ref. if (in->hardlock.can_read(in->is_auth())) @@ -539,18 +746,20 @@ bool Locker::inode_hard_read_try(CInode *in, Context *con) assert(!in->is_auth()); // wait! - dout(7) << "inode_hard_read_try waiting on " << *in << endl; + dout(7) << "inode_hard_rdlock_try waiting on " << *in << endl; in->add_waiter(CInode::WAIT_HARDR, con); return false; } -bool Locker::inode_hard_read_start(CInode *in, MClientRequest *m, CInode *ref) +bool Locker::inode_hard_rdlock_start(CInode *in, MDRequest *mdr) { - dout(7) << "inode_hard_read_start on " << *in << endl; + dout(7) << "inode_hard_rdlock_start on " << *in << endl; // can read? grab ref. if (in->hardlock.can_read(in->is_auth())) { in->hardlock.get_read(); + mdr->inode_hard_rdlocks.insert(in); + mdr->inode_hard_locks.insert(in); return true; } @@ -558,27 +767,29 @@ bool Locker::inode_hard_read_start(CInode *in, MClientRequest *m, CInode *ref) assert(!in->is_auth()); // wait! - dout(7) << "inode_hard_read_start waiting on " << *in << endl; - in->add_waiter(CInode::WAIT_HARDR, new C_MDS_RetryRequest(mds, m, ref)); + dout(7) << "inode_hard_rdlock_start waiting on " << *in << endl; + in->add_waiter(CInode::WAIT_HARDR, new C_MDS_RetryRequest(mdcache, mdr)); return false; } -void Locker::inode_hard_read_finish(CInode *in) +void Locker::inode_hard_rdlock_finish(CInode *in, MDRequest *mdr) { // drop ref assert(in->hardlock.can_read(in->is_auth())); in->hardlock.put_read(); + mdr->inode_hard_rdlocks.erase(in); + mdr->inode_hard_locks.erase(in); - dout(7) << "inode_hard_read_finish on " << *in << endl; + dout(7) << "inode_hard_rdlock_finish on " << *in << endl; //if (in->hardlock.get_nread() == 0) in->finish_waiting(CInode::WAIT_HARDNORD); } -bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m, CInode *ref) +bool Locker::inode_hard_xlock_start(CInode *in, MDRequest *mdr) { - dout(7) << "inode_hard_write_start on " << *in << endl; + dout(7) << "inode_hard_xlock_start on " << *in << endl; // if not replicated, i can twiddle lock at will if (in->is_auth() && @@ -589,7 +800,9 @@ bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m, CInode *ref) // can write? grab ref. if (in->hardlock.can_write(in->is_auth())) { assert(in->is_auth()); - in->hardlock.get_write(m); + in->hardlock.get_write(mdr); + mdr->inode_hard_xlocks.insert(in); + mdr->inode_hard_locks.insert(in); return true; } @@ -603,28 +816,30 @@ bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m, CInode *ref) inode_hard_lock(in); } - dout(7) << "inode_hard_write_start waiting on " << *in << endl; - in->add_waiter(CInode::WAIT_HARDW, new C_MDS_RetryRequest(mds, m, ref)); + dout(7) << "inode_hard_xlock_start waiting on " << *in << endl; + in->add_waiter(CInode::WAIT_HARDW, new C_MDS_RetryRequest(mdcache, mdr)); return false; } else { // replica // fw to auth int auth = in->authority().first; - dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl; + dout(7) << "inode_hard_xlock_start " << *in << " on replica, fw to auth " << auth << endl; assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); + mdcache->request_forward(mdr, auth); return false; } } -void Locker::inode_hard_write_finish(CInode *in) +void Locker::inode_hard_xlock_finish(CInode *in, MDRequest *mdr) { // drop ref //assert(in->hardlock.can_write(in->is_auth())); in->hardlock.put_write(); - dout(7) << "inode_hard_write_finish on " << *in << endl; + mdr->inode_hard_xlocks.erase(in); + mdr->inode_hard_locks.erase(in); + dout(7) << "inode_hard_xlock_finish on " << *in << endl; // others waiting? if (in->is_hardlock_write_wanted()) { @@ -810,8 +1025,8 @@ void Locker::handle_lock_inode_hard(MLock *m) dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl; lock->set_state(LOCK_GLOCKR); in->add_waiter(CInode::WAIT_HARDNORD, - new C_MDS_RetryMessage(mds,m)); - assert(0); // does this ever happen? (if so, fix hard_read_finish, and CInodeExport.update_inode!) + new C_MDS_RetryMessage(mds, m)); + assert(0); // does this ever happen? (if so, fix hard_rdlock_finish, and CInodeExport.update_inode!) return; } else { @@ -850,9 +1065,9 @@ void Locker::handle_lock_inode_hard(MLock *m) // soft inode metadata -bool Locker::inode_file_read_start(CInode *in, MClientRequest *m, CInode *ref) +bool Locker::inode_file_rdlock_start(CInode *in, MDRequest *mdr) { - dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl; + dout(7) << "inode_file_rdlock_start " << *in << " filelock=" << in->filelock << endl; // can read? grab ref. if (in->filelock.can_read(in->is_auth())) { @@ -863,7 +1078,7 @@ bool Locker::inode_file_read_start(CInode *in, MClientRequest *m, CInode *ref) // can't read, and replicated. if (in->filelock.can_read_soon(in->is_auth())) { // wait - dout(7) << "inode_file_read_start can_read_soon " << *in << endl; + dout(7) << "inode_file_rdlock_start can_read_soon " << *in << endl; } else { if (in->is_auth()) { // auth @@ -879,11 +1094,14 @@ bool Locker::inode_file_read_start(CInode *in, MClientRequest *m, CInode *ref) //in->filelock.get_write(); in->finish_waiting(CInode::WAIT_FILERWB|CInode::WAIT_FILESTABLE); //in->filelock.put_write(); + + mdr->inode_file_rdlocks.insert(in); + mdr->inode_file_locks.insert(in); return true; } } else { - dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CInode::WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, ref)); + dout(7) << "inode_file_rdlock_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; + in->add_waiter(CInode::WAIT_FILESTABLE, new C_MDS_RetryRequest(mdcache, mdr)); return false; } } else { @@ -892,35 +1110,37 @@ bool Locker::inode_file_read_start(CInode *in, MClientRequest *m, CInode *ref) // fw to auth int auth = in->authority().first; - dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl; + dout(7) << "inode_file_rdlock_start " << *in << " on replica and async, fw to auth " << auth << endl; assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); + mdcache->request_forward(mdr, auth); return false; } else { // wait until stable - dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CInode::WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, ref)); + dout(7) << "inode_file_rdlock_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; + in->add_waiter(CInode::WAIT_FILESTABLE, new C_MDS_RetryRequest(mdcache, mdr)); return false; } } } // wait - dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CInode::WAIT_FILER, new C_MDS_RetryRequest(mds, m, ref)); + dout(7) << "inode_file_rdlock_start waiting on " << *in << ", filelock=" << in->filelock << endl; + in->add_waiter(CInode::WAIT_FILER, new C_MDS_RetryRequest(mdcache, mdr)); return false; } -void Locker::inode_file_read_finish(CInode *in) +void Locker::inode_file_rdlock_finish(CInode *in, MDRequest *mdr) { // drop ref assert(in->filelock.can_read(in->is_auth())); in->filelock.put_read(); + mdr->inode_file_rdlocks.erase(in); + mdr->inode_file_locks.erase(in); - dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl; + dout(7) << "inode_file_rdlock_finish on " << *in << ", filelock=" << in->filelock << endl; if (in->filelock.get_nread() == 0) { in->finish_waiting(CInode::WAIT_FILENORD); @@ -929,9 +1149,9 @@ void Locker::inode_file_read_finish(CInode *in) } -bool Locker::inode_file_write_start(CInode *in, MClientRequest *m, CInode *ref) +bool Locker::inode_file_xlock_start(CInode *in, MDRequest *mdr) { - dout(7) << "inode_file_write_start on " << *in << endl; + dout(7) << "inode_file_xlock_start on " << *in << endl; // can't write? if (!in->filelock.can_write(in->is_auth())) { @@ -941,8 +1161,8 @@ bool Locker::inode_file_write_start(CInode *in, MClientRequest *m, CInode *ref) // auth if (!in->filelock.can_write_soon(in->is_auth())) { if (!in->filelock.is_stable()) { - dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl; - in->add_waiter(CInode::WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, ref)); + dout(7) << "inode_file_xlock_start on auth, waiting for stable on " << *in << endl; + in->add_waiter(CInode::WAIT_FILESTABLE, new C_MDS_RetryRequest(mdcache, mdr)); return false; } @@ -955,9 +1175,9 @@ bool Locker::inode_file_write_start(CInode *in, MClientRequest *m, CInode *ref) // replica // fw to auth int auth = in->authority().first; - dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl; + dout(7) << "inode_file_xlock_start " << *in << " on replica, fw to auth " << auth << endl; assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); + mdcache->request_forward(mdr, auth); return false; } } @@ -966,22 +1186,26 @@ bool Locker::inode_file_write_start(CInode *in, MClientRequest *m, CInode *ref) if (in->filelock.can_write(in->is_auth())) { // can i auth pin? assert(in->is_auth()); - in->filelock.get_write(m); + in->filelock.get_write(mdr); + mdr->inode_file_locks.insert(in); + mdr->inode_file_xlocks.insert(in); return true; } else { - dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl; - in->add_waiter(CInode::WAIT_FILEW, new C_MDS_RetryRequest(mds, m, ref)); + dout(7) << "inode_file_xlock_start on auth, waiting for write on " << *in << endl; + in->add_waiter(CInode::WAIT_FILEW, new C_MDS_RetryRequest(mdcache, mdr)); return false; } } -void Locker::inode_file_write_finish(CInode *in) +void Locker::inode_file_xlock_finish(CInode *in, MDRequest *mdr) { // drop ref //assert(in->filelock.can_write(in->is_auth())); in->filelock.put_write(); - dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl; + mdr->inode_file_locks.erase(in); + mdr->inode_file_xlocks.erase(in); + dout(7) << "inode_file_xlock_finish on " << *in << ", filelock=" << in->filelock << endl; // drop lock? if (!in->is_filelock_write_wanted()) { @@ -1691,38 +1915,133 @@ void Locker::handle_lock_inode_file(MLock *m) void Locker::handle_lock_dir(MLock *m) { - } // DENTRY -bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) + +// trace helpers + +/** dentry_can_rdlock_trace + * see if we can _anonymously_ rdlock an entire trace. + * if not, and req is specified, wait and retry that message. + */ +bool Locker::dentry_can_rdlock_trace(vector& trace, MClientRequest *req) +{ + // verify dentries are rdlockable. + // we do this because + // - we're being less aggressive about locks acquisition, and + // - we're not acquiring the locks in order! + for (vector::iterator it = trace.begin(); + it != trace.end(); + it++) { + CDentry *dn = *it; + if (!dn->is_pinnable(0)) { + if (req) { + dout(10) << "can_rdlock_trace can't rdlock " << *dn << ", waiting" << endl; + dn->dir->add_waiter(CDir::WAIT_DNPINNABLE, + dn->name, + new C_MDS_RetryMessage(mds, req)); + } else { + dout(10) << "can_rdlock_trace can't rdlock " << *dn << endl; + } + return false; + } + } + return true; +} + +void Locker::dentry_anon_rdlock_trace_start(vector& trace) +{ + // grab dentry rdlocks + for (vector::iterator it = trace.begin(); + it != trace.end(); + it++) + (*it)->pin(0); +} + + + +bool Locker::dentry_rdlock_start(CDentry *dn, MDRequest *mdr) +{ + // verify lockable + if (!dn->is_pinnable(mdr)) { + // wait + dout(10) << "dentry_rdlock_start waiting on " << *dn << endl; + dn->dir->add_waiter(CDir::WAIT_DNPINNABLE, + dn->name, + new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + + // rdlock + dout(10) << "dentry_rdlock_start " << *dn << endl; + dn->pin(mdr); + + mdr->dentry_rdlocks.insert(dn); + mdr->dentry_locks.insert(dn); + + return true; +} + + +void Locker::_dentry_rdlock_finish(CDentry *dn, MDRequest *mdr) +{ + dn->unpin(mdr); + + // did we completely unpin a waiter? + if (dn->lockstate == DN_LOCK_UNPINNING && !dn->get_num_ref()) { + // return state to sync, in case the unpinner flails + dn->lockstate = DN_LOCK_SYNC; + + // run finisher right now to give them a fair shot. + dn->dir->finish_waiting(CDir::WAIT_DNUNPINNED, dn->name); + } +} + +void Locker::dentry_rdlock_finish(CDentry *dn, MDRequest *mdr) +{ + dout(10) << "dentry_rdlock_finish " << *dn << endl; + _dentry_rdlock_finish(dn, mdr); + mdr->dentry_rdlocks.erase(dn); + mdr->dentry_locks.erase(dn); +} + +void Locker::dentry_anon_rdlock_trace_finish(vector& trace) +{ + for (vector::iterator it = trace.begin(); + it != trace.end(); + it++) + _dentry_rdlock_finish(*it, 0); +} + +bool Locker::dentry_xlock_start(CDentry *dn, MDRequest *mdr) { dout(7) << "dentry_xlock_start on " << *dn << endl; // locked? if (dn->lockstate == DN_LOCK_XLOCK) { - if (dn->xlockedby == m) return true; // locked by me! + if (dn->xlockedby == mdr) return true; // locked by me! // not by me, wait dout(7) << "dentry " << *dn << " xlock by someone else" << endl; dn->dir->add_waiter(CDir::WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); + new C_MDS_RetryRequest(mdcache, mdr)); return false; } // prelock? if (dn->lockstate == DN_LOCK_PREXLOCK) { - if (dn->xlockedby == m) { + if (dn->xlockedby == mdr) { dout(7) << "dentry " << *dn << " prexlock by me" << endl; dn->dir->add_waiter(CDir::WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); + new C_MDS_RetryRequest(mdcache, mdr)); } else { dout(7) << "dentry " << *dn << " prexlock by someone else" << endl; dn->dir->add_waiter(CDir::WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); + new C_MDS_RetryRequest(mdcache, mdr)); } return false; } @@ -1738,29 +2057,12 @@ bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) dn->lockstate = DN_LOCK_UNPINNING; dn->dir->add_waiter(CDir::WAIT_DNUNPINNED, dn->name, - new C_MDS_RetryRequest(mds,m,ref)); + new C_MDS_RetryRequest(mdcache, mdr)); return false; } - // pin path up to dentry! (if success, point of no return) - CDentry *pdn = dn->dir->inode->get_parent_dn(); - if (pdn) { - if (mdcache->active_requests[m].traces.count(pdn)) { - dout(7) << "already path pinned parent dentry " << *pdn << endl; - } else { - dout(7) << "pinning parent dentry " << *pdn << endl; - vector trace; - mdcache->make_trace(trace, pdn->inode); - assert(trace.size()); - - if (!mdcache->path_pin(trace, m, new C_MDS_RetryRequest(mds, m, ref))) return false; - - mdcache->active_requests[m].traces[trace[trace.size()-1]] = trace; - } - } - // mine! - dn->xlockedby = m; + dn->xlockedby = mdr; // pin me! dn->get(CDentry::PIN_XLOCK); @@ -1793,16 +2095,17 @@ bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) // wait dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl; dn->dir->add_waiter(CDir::WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mds, m, ref)); + new C_MDS_RetryRequest(mdcache, mdr)); return false; } else { dn->lockstate = DN_LOCK_XLOCK; - mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); + mdr->dentry_xlocks.insert(dn); + mdr->dentry_locks.insert(dn); return true; } } -void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) +void Locker::dentry_xlock_finish(CDentry *dn, MDRequest *mdr, bool quiet) { dout(7) << "dentry_xlock_finish on " << *dn << endl; @@ -1811,8 +2114,8 @@ void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) dout(7) << "this was a foreign xlock" << endl; } else { // remove from request record - assert(mdcache->active_requests[dn->xlockedby].xlocks.count(dn) == 1); - mdcache->active_requests[dn->xlockedby].xlocks.erase(dn); + mdr->dentry_xlocks.erase(dn); + mdr->dentry_locks.erase(dn); } dn->xlockedby = 0; @@ -1821,10 +2124,6 @@ void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) // unpin dn->put(CDentry::PIN_XLOCK); - // unpin parent dir? - // -> no? because we might have xlocked 2 things in this dir. - // instead, we let request_finish clean up the mess. - // tell replicas? if (!quiet) { // tell even if dn is null. @@ -1840,12 +2139,44 @@ void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) } +void Locker::dentry_xlock_downgrade_to_rdlock(CDentry *dn, MDRequest *mdr) +{ + dout(7) << "dentry_xlock_downgrade_to_rdlock on " << *dn << endl; + + assert(dn->xlockedby); + if (dn->xlockedby == DN_XLOCK_FOREIGN) { + dout(7) << "this was a foreign xlock" << endl; + assert(0); // rewrite me + } + + // un-xlock + dn->xlockedby = 0; + dn->lockstate = DN_LOCK_SYNC; + mdr->dentry_xlocks.erase(dn); + dn->put(CDentry::PIN_XLOCK); + + // rdlock + mdr->dentry_rdlocks.insert(dn); + dn->pin(mdr); + + // tell replicas? + if (dn->is_replicated()) { + send_lock_message(dn, LOCK_AC_SYNC); + } + + // kick waiters + list finished; + dn->dir->take_waiting(CDir::WAIT_DNREAD, finished); + mds->queue_finished(finished); +} + + /* * onfinish->finish() will be called with * 0 on successful xlock, * -1 on failure */ - +/* class C_MDC_XlockRequest : public Context { Locker *mdc; CDir *dir; @@ -1908,7 +2239,7 @@ void Locker::dentry_xlock_request(CDir *dir, const string& dname, bool create, dir, dname, req, onfinish)); } - +*/ @@ -1939,15 +2270,18 @@ void Locker::handle_lock_dn(MLock *m) (m->get_action() == LOCK_AC_REQXLOCK || m->get_action() == LOCK_AC_REQXLOCKC)) { dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. dropping on floor (their import will have woken them up)" << endl; - if (mdcache->active_requests.count(m)) + /*if (mdcache->active_requests.count(m)) mdcache->request_finish(m); else delete m; + */ + assert(0); // FIXME REWRITE ME >>>>>>> return; } dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl; + /* ******* REWRITE ME SDFKJDSFDSFJK:SDFJKDFSJKFDSHJKDFSHJKDFS>>>>>>> // forward if (mdcache->active_requests.count(m)) { // xlock requests are requests, use request_* functions! @@ -1960,6 +2294,7 @@ void Locker::handle_lock_dn(MLock *m) // forward normally mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); } + */ return; } @@ -1991,8 +2326,10 @@ void Locker::handle_lock_dn(MLock *m) } // finish request (if we got that far) + /* FIXME F>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> if (mdcache->active_requests.count(m)) mdcache->request_finish(m); + */ delete m; return; @@ -2018,8 +2355,9 @@ void Locker::handle_lock_dn(MLock *m) vector trace; filepath path = m->get_path(); - int r = mdcache->path_traverse(path, trace, true, - m, new C_MDS_RetryMessage(mds,m), + int r = mdcache->path_traverse(0, 0, // FIXME FIXME >>>>>>>>>>>>>>>>>>>>>>>> + path, trace, true, + m, new C_MDS_RetryMessage(mds, m), MDS_TRAVERSE_DISCOVER); assert(r>0); return; @@ -2031,7 +2369,8 @@ void Locker::handle_lock_dn(MLock *m) vector trace; filepath path = m->get_path(); - int r = mdcache->path_traverse(path, trace, true, + int r = mdcache->path_traverse(0, 0, // FIXME >>>>>>>>>>>>>>>>>>>>>>>> + path, trace, true, m, new C_MDS_RetryMessage(mds,m), MDS_TRAVERSE_DISCOVER); assert(r>0); @@ -2123,7 +2462,8 @@ void Locker::handle_lock_dn(MLock *m) if (dn->gather_set.size() == 0) { dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl; dn->lockstate = DN_LOCK_XLOCK; - mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); + mdcache->active_requests[dn->xlockedby->reqid].dentry_xlocks.insert(dn); + mdcache->active_requests[dn->xlockedby->reqid].dentry_locks.insert(dn); dir->finish_waiting(CDir::WAIT_DNLOCK, dname); } break; @@ -2143,14 +2483,18 @@ void Locker::handle_lock_dn(MLock *m) reply->set_path(path); mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); + assert(0); // FIXME + /* // done if (mdcache->active_requests.count(m)) mdcache->request_finish(m); else delete m; + */ return; } + /* REWRITE ME HELP case LOCK_AC_REQXLOCK: if (dn) { dout(7) << "handle_lock_dn reqxlock on " << *dn << endl; @@ -2195,14 +2539,15 @@ void Locker::handle_lock_dn(MLock *m) return; } break; +*/ case LOCK_AC_UNXLOCK: dout(7) << "handle_lock_dn unxlock on " << *dn << endl; { - Message *m = dn->xlockedby; + MDRequest *mdr = dn->xlockedby; // finish request - mdcache->request_finish(m); // this will drop the locks (and unpin paths!) + mdcache->request_finish(mdr); // this will drop the locks (and unpin paths!) return; } break; diff --git a/branches/sage/cephmds2/mds/Locker.h b/branches/sage/cephmds2/mds/Locker.h index 9b7b1b569dec5..f819e13209729 100644 --- a/branches/sage/cephmds2/mds/Locker.h +++ b/branches/sage/cephmds2/mds/Locker.h @@ -59,17 +59,24 @@ private: void send_lock_message(CDentry *dn, int msg); // -- locks -- + bool acquire_locks(MDRequest *mdr, + set &dentry_rdlocks, + set &dentry_xlocks, + set &inode_hard_rdlocks, + set &inode_hard_xlocks); + + // high level interface public: - bool inode_hard_read_try(CInode *in, Context *con); - bool inode_hard_read_start(CInode *in, MClientRequest *m, CInode *ref); - void inode_hard_read_finish(CInode *in); - bool inode_hard_write_start(CInode *in, MClientRequest *m, CInode *ref); - void inode_hard_write_finish(CInode *in); - bool inode_file_read_start(CInode *in, MClientRequest *m, CInode *ref); - void inode_file_read_finish(CInode *in); - bool inode_file_write_start(CInode *in, MClientRequest *m, CInode *ref); - void inode_file_write_finish(CInode *in); + bool inode_hard_rdlock_try(CInode *in, Context *con); + bool inode_hard_rdlock_start(CInode *in, MDRequest *mdr); + void inode_hard_rdlock_finish(CInode *in, MDRequest *mdr); + bool inode_hard_xlock_start(CInode *in, MDRequest *mdr); + void inode_hard_xlock_finish(CInode *in, MDRequest *mdr); + bool inode_file_rdlock_start(CInode *in, MDRequest *mdr); + void inode_file_rdlock_finish(CInode *in, MDRequest *mdr); + bool inode_file_xlock_start(CInode *in, MDRequest *mdr); + void inode_file_xlock_finish(CInode *in, MDRequest *mdr); void inode_hard_eval(CInode *in); void inode_file_eval(CInode *in); @@ -108,10 +115,18 @@ private: void handle_lock_dir(MLock *m); // dentry locks + void _dentry_rdlock_finish(CDentry *dn, MDRequest *mdr); public: - bool dentry_xlock_start(CDentry *dn, - Message *m, CInode *ref); - void dentry_xlock_finish(CDentry *dn, bool quiet=false); + bool dentry_rdlock_start(CDentry *dn, MDRequest *mdr); + void dentry_rdlock_finish(CDentry *dn, MDRequest *mdr); + bool dentry_can_rdlock_trace(vector& trace, MClientRequest *req); + void dentry_anon_rdlock_trace_start(vector& trace); + void dentry_anon_rdlock_trace_finish(vector& trace); + + bool dentry_xlock_start(CDentry *dn, MDRequest *mdr); + void dentry_xlock_finish(CDentry *dn, MDRequest *mdr, bool quiet=false); + //bool dentry_xlock_upgrade_from_rdlock(CDentry *dn, MDRequest *mdr); // from rdlock + void dentry_xlock_downgrade_to_rdlock(CDentry *dn, MDRequest *mdr); // to rdlock void handle_lock_dn(MLock *m); void dentry_xlock_request(CDir *dir, const string& dname, bool create, Message *req, Context *onfinish); diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index ab2ce34f0724f..27e3fb0ec79ae 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -2437,6 +2437,7 @@ void MDCache::dispatch(Message *m) * the context is needed to pass a (failure) result code. */ +/* class C_MDC_TraverseDiscover : public Context { Context *onfinish, *ondelay; public: @@ -2455,14 +2456,16 @@ class C_MDC_TraverseDiscover : public Context { delete ondelay; } }; +*/ -int MDCache::path_traverse(filepath& origpath, +int MDCache::path_traverse(MDRequest *mdr, + CInode *base, // traverse starting from here. + filepath& origpath, vector& trace, bool follow_trailing_symlink, Message *req, Context *ondelay, int onfail, - Context *onfinish, bool is_client_req) // true if req is MClientRequest .. gross, FIXME { set< pair > symlinks_resolved; // keep a list of symlinks we touch to avoid loops @@ -2472,11 +2475,11 @@ int MDCache::path_traverse(filepath& origpath, onfail == MDS_TRAVERSE_DISCOVERXLOCK) noperm = true; // root - CInode *cur = get_root(); + CInode *cur = base; + if (!cur) cur = get_root(); if (cur == NULL) { dout(7) << "traverse: i don't have root" << endl; open_root(ondelay); - if (onfinish) delete onfinish; return 1; } @@ -2494,10 +2497,6 @@ int MDCache::path_traverse(filepath& origpath, if (!cur->is_dir()) { dout(7) << "traverse: " << *cur << " not a dir " << endl; delete ondelay; - if (onfinish) { - onfinish->finish(-ENOTDIR); - delete onfinish; - } return -ENOTDIR; } @@ -2510,7 +2509,6 @@ int MDCache::path_traverse(filepath& origpath, if (cur->is_frozen_dir()) { dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << endl; cur->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, ondelay); - if (onfinish) delete onfinish; return 1; } @@ -2524,7 +2522,6 @@ int MDCache::path_traverse(filepath& origpath, else if (cur->auth_is_ambiguous()) { dout(10) << "traverse: need dir, waiting for single auth on " << *cur << endl; cur->add_waiter(CInode::WAIT_SINGLEAUTH, ondelay); - if (onfinish) delete onfinish; return 1; } else { filepath want = path.postfixpath(depth); @@ -2538,7 +2535,6 @@ int MDCache::path_traverse(filepath& origpath, dir_discovers[cur->ino()].insert(cur->authority().first); } cur->add_waiter(CInode::WAIT_DIR, ondelay); - if (onfinish) delete onfinish; return 1; } } @@ -2557,8 +2553,7 @@ int MDCache::path_traverse(filepath& origpath, */ // must read directory hard data (permissions, x bit) to traverse - if (!noperm && !mds->locker->inode_hard_read_try(cur, ondelay)) { - if (onfinish) delete onfinish; + if (!noperm && !mds->locker->inode_hard_rdlock_try(cur, ondelay)) { return 1; } @@ -2580,7 +2575,7 @@ int MDCache::path_traverse(filepath& origpath, // null and last_bit and xlocked by me? if (dn && dn->is_null() && - dn->is_xlockedbyme(req) && + dn->is_xlockedbyme(mdr) && depth == path.depth()-1) { dout(10) << "traverse: hit (my) xlocked dentry at tail of traverse, succeeding" << endl; trace.push_back(dn); @@ -2589,12 +2584,11 @@ int MDCache::path_traverse(filepath& origpath, if (dn && !dn->is_null()) { // dentry exists. xlocked? - if (!noperm && dn->is_xlockedbyother(req)) { + if (!noperm && dn->is_xlockedbyother(mdr)) { dout(10) << "traverse: xlocked dentry at " << *dn << endl; curdir->add_waiter(CDir::WAIT_DNREAD, path[depth], ondelay); - if (onfinish) delete onfinish; return 1; } @@ -2608,8 +2602,8 @@ int MDCache::path_traverse(filepath& origpath, dn->link_remote(in); } else { dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << endl; - open_remote_ino(dn->get_remote_ino(), req, - ondelay); + assert(0); // REWRITE ME + //open_remote_ino(dn->get_remote_ino(), req, ondelay); return 1; } } @@ -2694,10 +2688,6 @@ int MDCache::path_traverse(filepath& origpath, if (curdir->is_complete()) { // file not found delete ondelay; - if (onfinish) { - onfinish->finish(-ENOENT); - delete onfinish; - } return -ENOENT; } else { @@ -2712,7 +2702,6 @@ int MDCache::path_traverse(filepath& origpath, if (mds->logger) mds->logger->inc("cmiss"); - if (onfinish) delete onfinish; return 1; } } else { @@ -2730,8 +2719,7 @@ int MDCache::path_traverse(filepath& origpath, } else if (curdir->auth_is_ambiguous()) { dout(7) << "traverse: waiting for single auth on " << *curdir << endl; - curdir->add_waiter(CDir::WAIT_SINGLEAUTH, - new C_MDC_TraverseDiscover(onfinish, ondelay)); + curdir->add_waiter(CDir::WAIT_SINGLEAUTH, ondelay); return 1; } else { dout(7) << "traverse: discover " << want << " from " << *curdir << endl; @@ -2746,12 +2734,7 @@ int MDCache::path_traverse(filepath& origpath, } // delay processing of current request. - // delay finish vs ondelay until result of traverse, so that ENOENT can be - // passed to onfinish if necessary - curdir->add_waiter(CDir::WAIT_DENTRY, - path[depth], - new C_MDC_TraverseDiscover(onfinish, ondelay)); - + curdir->add_waiter(CDir::WAIT_DENTRY, path[depth], ondelay); if (mds->logger) mds->logger->inc("cmiss"); return 1; } @@ -2763,7 +2746,6 @@ int MDCache::path_traverse(filepath& origpath, // wait dout(7) << "traverse: waiting for single auth in " << *curdir << endl; curdir->add_waiter(CDir::WAIT_SINGLEAUTH, ondelay); - if (onfinish) delete onfinish; return 1; } else { dout(7) << "traverse: forwarding, not auth for " << *curdir << endl; @@ -2779,17 +2761,12 @@ int MDCache::path_traverse(filepath& origpath, mds->forward_message_mds(req, dauth.first, req->get_dest_port()); if (mds->logger) mds->logger->inc("cfw"); - if (onfinish) delete onfinish; delete ondelay; return 2; } } if (onfail == MDS_TRAVERSE_FAIL) { delete ondelay; - if (onfinish) { - onfinish->finish(-ENOENT); // -ENOENT, but only because i'm not the authority! - delete onfinish; - } return -ENOENT; // not necessarily exactly true.... } } @@ -2799,10 +2776,6 @@ int MDCache::path_traverse(filepath& origpath, // success. delete ondelay; - if (onfinish) { - onfinish->finish(0); - delete onfinish; - } return 0; } @@ -2839,7 +2812,7 @@ void MDCache::open_remote_dir(CInode *diri, frag_t fg, Context *fin) /** get_dentry_inode * will return inode for primary, or link up/open up remote link's inode as necessary. */ -CInode *MDCache::get_dentry_inode(CDentry *dn, MClientRequest *req, CInode *ref) +CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequest *mdr) { assert(!dn->is_null()); @@ -2854,8 +2827,7 @@ CInode *MDCache::get_dentry_inode(CDentry *dn, MClientRequest *req, CInode *ref) return in; } else { dout(10) << "get_dentry_ninode on remote dn, opening inode for " << *dn << endl; - open_remote_ino(dn->get_remote_ino(), req, - new C_MDS_RetryRequest(mds, req, ref)); + open_remote_ino(dn->get_remote_ino(), mdr, new C_MDS_RetryRequest(this, mdr)); return 0; } } @@ -2864,20 +2836,20 @@ CInode *MDCache::get_dentry_inode(CDentry *dn, MClientRequest *req, CInode *ref) class C_MDC_OpenRemoteInoLookup : public Context { MDCache *mdc; inodeno_t ino; - Message *req; + MDRequest *mdr; Context *onfinish; public: vector anchortrace; - C_MDC_OpenRemoteInoLookup(MDCache *mdc, inodeno_t ino, Message *req, Context *onfinish) { + C_MDC_OpenRemoteInoLookup(MDCache *mdc, inodeno_t ino, MDRequest *r, Context *onfinish) { this->mdc = mdc; this->ino = ino; - this->req = req; + this->mdr = r; this->onfinish = onfinish; } void finish(int r) { assert(r == 0); if (r == 0) - mdc->open_remote_ino_2(ino, req, anchortrace, onfinish); + mdc->open_remote_ino_2(ino, mdr, anchortrace, onfinish); else { onfinish->finish(r); delete onfinish; @@ -2886,17 +2858,17 @@ public: }; void MDCache::open_remote_ino(inodeno_t ino, - Message *req, + MDRequest *mdr, Context *onfinish) { dout(7) << "open_remote_ino on " << ino << endl; - C_MDC_OpenRemoteInoLookup *c = new C_MDC_OpenRemoteInoLookup(this, ino, req, onfinish); + C_MDC_OpenRemoteInoLookup *c = new C_MDC_OpenRemoteInoLookup(this, ino, mdr, onfinish); mds->anchorclient->lookup(ino, c->anchortrace, c); } void MDCache::open_remote_ino_2(inodeno_t ino, - Message *req, + MDRequest *mdr, vector& anchortrace, Context *onfinish) { @@ -2927,66 +2899,6 @@ void MDCache::open_remote_ino_2(inodeno_t ino, -// path pins - -bool MDCache::path_pin(vector& trace, - Message *m, - Context *c) -{ - // verify everything is pinnable - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - if (!dn->is_pinnable(m)) { - // wait - if (c) { - dout(10) << "path_pin can't pin " << *dn << ", waiting" << endl; - dn->dir->add_waiter(CDir::WAIT_DNPINNABLE, - dn->name, - c); - } else { - dout(10) << "path_pin can't pin, no waiter, failing." << endl; - } - return false; - } - } - - // pin! - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - (*it)->pin(m); - dout(11) << "path_pinned " << *(*it) << endl; - } - - delete c; - return true; -} - - -void MDCache::path_unpin(vector& trace, - Message *m) -{ - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - dn->unpin(m); - dout(11) << "path_unpinned " << *dn << endl; - - // did we completely unpin a waiter? - if (dn->lockstate == DN_LOCK_UNPINNING && !dn->get_num_ref()) { - // return state to sync, in case the unpinner flails - dn->lockstate = DN_LOCK_SYNC; - - // run finisher right now to give them a fair shot. - dn->dir->finish_waiting(CDir::WAIT_DNUNPINNED, dn->name); - } - } -} - - void MDCache::make_trace(vector& trace, CInode *in) { CInode *parent = in->get_parent_inode(); @@ -3000,130 +2912,89 @@ void MDCache::make_trace(vector& trace, CInode *in) } -bool MDCache::request_start(Message *req, - CInode *ref, - vector& trace) +MDRequest *MDCache::request_start(reqid_t ri) { - assert(active_requests.count(req) == 0); - - // pin path - if (!trace.empty()) - if (!path_pin(trace, req, new C_MDS_RetryMessage(mds,req))) return false; + assert(active_requests.count(ri) == 0); + active_requests[ri].reqid = ri; + MDRequest *mdr = &active_requests[ri]; + dout(7) << "request_start " << *mdr << endl; + return mdr; +} - dout(7) << "request_start " << *req << endl; +MDRequest *MDCache::request_start(MClientRequest *req) +{ + reqid_t ri = req->get_reqid(); + MDRequest *mdr = request_start(ri); + mdr->request = req; + return mdr; +} - // add to map - active_requests[req].ref = ref; - if (trace.size()) active_requests[req].traces[trace[trace.size()-1]] = trace; +void MDCache::request_finish(MDRequest *mdr) +{ + dout(7) << "request_finish " << *mdr << endl; - // request pins - request_pin_inode(req, ref); + delete mdr->request; + request_cleanup(mdr); - if (mds->logger) mds->logger->inc("req"); - - return true; + if (mds->logger) mds->logger->inc("reply"); } -void MDCache::request_pin_inode(Message *req, CInode *in) +void MDCache::request_forward(MDRequest *mdr, int who, int port) { - if (active_requests[req].request_inode_pins.count(in) == 0) { - in->request_pin_get(); - active_requests[req].request_inode_pins.insert(in); - } -} + if (!port) port = MDS_PORT_SERVER; -void MDCache::request_pin_dn(Message *req, CDentry *dn) -{ - if (active_requests[req].request_dn_pins.count(dn) == 0) { - dn->get(CDentry::PIN_REQUEST); - active_requests[req].request_dn_pins.insert(dn); - } -} + dout(7) << "request_forward to " << who << " req " << *mdr << endl; -void MDCache::request_pin_dir(Message *req, CDir *dir) -{ - if (active_requests[req].request_dir_pins.count(dir) == 0) { - dir->request_pin_get(); - active_requests[req].request_dir_pins.insert(dir); - } -} + mds->forward_message_mds(mdr->request, who, port); + request_cleanup(mdr); -void MDCache::request_auth_pin(Message *req, CDir *dir) -{ - if (active_requests[req].dir_auth_pins.count(dir) == 0) { - dir->auth_pin(); - active_requests[req].dir_auth_pins.insert(dir); - } + if (mds->logger) mds->logger->inc("fw"); } -void MDCache::request_auth_pin(Message *req, CInode *in) -{ - if (active_requests[req].inode_auth_pins.count(in) == 0) { - in->auth_pin(); - active_requests[req].inode_auth_pins.insert(in); - } -} -bool MDCache::request_auth_pinned(Message *req, CDir *dir) +void MDCache::dispatch_request(MDRequest *mdr) { - return active_requests[req].dir_auth_pins.count(dir); -} + assert(mdr->request); -bool MDCache::request_auth_pinned(Message *req, CInode *in) -{ - return active_requests[req].inode_auth_pins.count(in); -} + switch (mdr->request->get_type()) { + case MSG_CLIENT_REQUEST: + mds->server->dispatch_request(mdr); + break; -void MDCache::request_drop_auth_pins(Message *req) -{ - // dirs - for (set::iterator p = active_requests[req].dir_auth_pins.begin(); - p != active_requests[req].dir_auth_pins.end(); - ++p) - (*p)->auth_unpin(); - active_requests[req].dir_auth_pins.clear(); + case MSG_MDS_LOCK: + mds->locker->handle_lock_dn((MLock*)mdr->request); + break; - // inodes - for (set::iterator p = active_requests[req].inode_auth_pins.begin(); - p != active_requests[req].inode_auth_pins.end(); - ++p) - (*p)->auth_unpin(); - active_requests[req].inode_auth_pins.clear(); + default: + assert(0); // shouldn't get here + } } -void MDCache::request_cleanup(Message *req) +void MDCache::request_drop_locks(MDRequest *mdr) { - assert(active_requests.count(req) == 1); - - // leftover xlocks? - if (active_requests[req].xlocks.size()) { - set dns = active_requests[req].xlocks; - - for (set::iterator it = dns.begin(); - it != dns.end(); - it++) { - CDentry *dn = *it; - - dout(7) << "request_cleanup leftover xlock " << *dn << endl; - - mds->locker->dentry_xlock_finish(dn); - - // queue finishers - dn->dir->take_waiting(CDir::WAIT_ANY, dn->name, mds->finished_queue); - - // remove clean, null dentry? (from a failed rename or whatever) - if (dn->is_null() && dn->is_sync() && !dn->is_dirty()) { - dn->dir->remove_dentry(dn); - } - } - - assert(active_requests[req].xlocks.empty()); // we just finished finished them - } + // leftover dentry locks + while (!mdr->dentry_xlocks.empty()) + mds->locker->dentry_xlock_finish(*mdr->dentry_xlocks.begin(), mdr); + while (!mdr->dentry_rdlocks.empty()) + mds->locker->dentry_rdlock_finish(*mdr->dentry_rdlocks.begin(), mdr); + + // inode locks + while (!mdr->inode_hard_xlocks.empty()) + mds->locker->inode_hard_xlock_finish(*mdr->inode_hard_xlocks.begin(), mdr); + while (!mdr->inode_hard_rdlocks.empty()) + mds->locker->inode_hard_rdlock_finish(*mdr->inode_hard_rdlocks.begin(), mdr); + + while (!mdr->inode_file_xlocks.empty()) + mds->locker->inode_file_xlock_finish(*mdr->inode_file_xlocks.begin(), mdr); + while (!mdr->inode_file_rdlocks.empty()) + mds->locker->inode_file_rdlock_finish(*mdr->inode_file_rdlocks.begin(), mdr); + + /* // foreign xlocks? if (active_requests[req].foreign_xlocks.size()) { set dns = active_requests[req].foreign_xlocks; @@ -3142,38 +3013,49 @@ void MDCache::request_cleanup(Message *req) mds->send_message_mds(m, dauth, MDS_PORT_CACHE); } } + */ - // unpin paths - for (map< CDentry*, vector >::iterator it = active_requests[req].traces.begin(); - it != active_requests[req].traces.end(); - it++) { - path_unpin(it->second, req); - } - - // request pins - for (set::iterator it = active_requests[req].request_inode_pins.begin(); - it != active_requests[req].request_inode_pins.end(); - it++) { - (*it)->request_pin_put(); - } - for (set::iterator it = active_requests[req].request_dn_pins.begin(); - it != active_requests[req].request_dn_pins.end(); - it++) { - (*it)->put(CDentry::PIN_REQUEST); - } - for (set::iterator it = active_requests[req].request_dir_pins.begin(); - it != active_requests[req].request_dir_pins.end(); - it++) { - (*it)->request_pin_put(); - } + // make sure ref and trace are empty + // if we are doing our own locking, we can't use them! + assert(mdr->ref == 0); + assert(mdr->trace.empty()); +} - // auth pins - request_drop_auth_pins(req); +void MDCache::request_cleanup(MDRequest *mdr) +{ + reqid_t ri = mdr->reqid; + assert(active_requests.count(ri)); - // remove from map - active_requests.erase(req); + // clear ref, trace + mdr->ref = 0; + mdr->trace.clear(); + // drop locks + request_drop_locks(mdr); + + // auth pins + mdr->drop_auth_pins(); + + // drop cache pins + for (set::iterator it = mdr->inode_pins.begin(); + it != mdr->inode_pins.end(); + it++) + (*it)->put(CInode::PIN_REQUEST); + mdr->inode_pins.clear(); + for (set::iterator it = mdr->dentry_pins.begin(); + it != mdr->dentry_pins.end(); + it++) + (*it)->put(CDentry::PIN_REQUEST); + mdr->dentry_pins.clear(); + for (set::iterator it = mdr->dir_pins.begin(); + it != mdr->dir_pins.end(); + it++) + (*it)->put(CDir::PIN_REQUEST); + mdr->dir_pins.clear(); + + // remove from map + active_requests.erase(ri); // log some stats ***** if (mds->logger) { @@ -3208,33 +3090,6 @@ for (int i=0; ilogger) mds->logger->inc("reply"); - - - //dump(); -} - - -void MDCache::request_forward(Message *req, int who, int port) -{ - if (!port) port = MDS_PORT_SERVER; - - dout(7) << "request_forward to " << who << " req " << *req << endl; - - // clean up my state - request_cleanup(req); - - mds->forward_message_mds(req, who, port); - - if (mds->logger) mds->logger->inc("fw"); -} - // -------------------------------------------------------------------- // ANCHORS @@ -3724,45 +3579,53 @@ void MDCache::handle_discover(MDiscover *dis) // lookup dentry CDentry *dn = curdir->lookup( dis->get_dentry(i) ); - if (dn) { - // add dentry - reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); - dout(7) << "added dentry " << *dn << endl; - - if (!dn->is_primary()) break; // stop on null or remote link. - - // add inode - CInode *next = dn->inode; - assert(next->is_auth()); - - reply->add_inode( next->replicate_to( dis->get_asker() ) ); - dout(7) << "added inode " << *next << endl; - - // descend, keep going. - cur = next; - continue; - } + if (!dn) { + // don't have it. + if (!curdir->is_complete()) { + // readdir + dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << endl; + if (reply->is_empty()) { + // fetch and wait + curdir->fetch(new C_MDS_RetryMessage(mds, dis)); + return; + } else { + // initiate fetch, but send what we have so far + curdir->fetch(0); + break; + } + } - // don't have dentry. - if (curdir->is_complete()) { - // set error flag in reply - dout(7) << "dname " << dis->get_dentry(i) << " dne in " << *curdir - << ", flagging error" << endl; - reply->set_flag_error_dn( dis->get_dentry(i) ); - } else { - // readdir - dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << endl; - - if (reply->is_empty()) { - // fetch and wait - curdir->fetch(new C_MDS_RetryMessage(mds, dis)); - return; + if (1) { + // send null dentry + dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in " + << *curdir << endl; + dn = curdir->add_dentry(dis->get_dentry(i), 0); } else { - // fetch, but send what we have so far - curdir->fetch(0); + // set error flag in reply + dout(7) << "dentry " << dis->get_dentry(i) << " dne, flagging error in " + << *curdir << endl; + reply->set_flag_error_dn( dis->get_dentry(i) ); } } - break; + + assert(dn); + + // add dentry + reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); + dout(7) << "added dentry " << *dn << endl; + + if (!dn->is_primary()) break; // stop on null or remote link. + + // add inode + CInode *next = dn->inode; + assert(next->is_auth()); + + reply->add_inode( next->replicate_to( dis->get_asker() ) ); + dout(7) << "added inode " << *next << endl; + + // descend, keep going. + cur = next; + continue; } // how did we do? @@ -4121,7 +3984,8 @@ void MDCache::handle_dir_update(MDirUpdate *m) dout(5) << "trying discover on dir_update for " << path << endl; - int r = path_traverse(path, trace, true, + int r = path_traverse(0, 0, + path, trace, true, m, new C_MDS_RetryMessage(mds, m), MDS_TRAVERSE_DISCOVER); if (r > 0) diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index d4739bcfc5f34..9c5537d66817d 100644 --- a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -29,7 +29,8 @@ #include "CDentry.h" #include "CDir.h" #include "Lock.h" - +#include "include/reqid.h" +#include "include/Context.h" class MDS; class Migrator; @@ -49,7 +50,7 @@ class MDirUpdate; class MDentryUnlink; class MLock; - +class Message; class MClientRequest; @@ -64,25 +65,102 @@ class MClientRequest; * mostly information about locks held, so that we can drop them all * the request is finished or forwarded. see request_*(). */ -typedef struct { - CInode *ref; // reference inode - set< CInode* > request_inode_pins; - set< CDentry* > request_dn_pins; - set< CDir* > request_dir_pins; - map< CDentry*, vector > traces; // path pins held - set< CDentry* > xlocks; // xlocks (local) - set< CDentry* > foreign_xlocks; // xlocks on foreign hosts +struct MDRequest { + reqid_t reqid; + Message *request; // MClientRequest, or MLock + + vector trace; // original path traversal. + CInode *ref; // reference inode. if there is only one, and its path is pinned. + + // cache pins (so things don't expire) + set< CInode* > inode_pins; + set< CDentry* > dentry_pins; + set< CDir* > dir_pins; + + // auth pins set< CDir* > dir_auth_pins; set< CInode* > inode_auth_pins; -} active_request_t; + + // held locks + set< CDentry*, CDentry::ptr_lt > dentry_locks; // sorted list of dentry locks we hold + set< CDentry* > dentry_rdlocks; + set< CDentry* > dentry_xlocks; + + set< CInode*, CInode::ptr_lt > inode_hard_locks; // sorted list of inode locks we hold + set< CInode* > inode_hard_rdlocks; + set< CInode* > inode_hard_xlocks; + + set< CInode*, CInode::ptr_lt > inode_file_locks; // sorted list of inode locks we hold + set< CInode* > inode_file_rdlocks; + set< CInode* > inode_file_xlocks; + + // old + set< CDentry* > xlocks; // xlocks (local) + set< CDentry* > foreign_xlocks; // xlocks on foreign hosts + + MDRequest() : request(0), ref(0) {} + MDRequest(reqid_t ri) : reqid(ri), request(0), ref(0) {} + + // requeest + MClientRequest *client_request() { + return (MClientRequest*)request; + } + + // pin items in cache + void pin(CInode *in) { + if (inode_pins.count(in) == 0) { + in->get(CInode::PIN_REQUEST); + inode_pins.insert(in); + } + } + void pin(CDir *dir) { + if (dir_pins.count(dir) == 0) { + dir->get(CDir::PIN_REQUEST); + dir_pins.insert(dir); + } + } + void pin(CDentry *dn) { + if (dentry_pins.count(dn) == 0) { + dn->get(CDentry::PIN_REQUEST); + dentry_pins.insert(dn); + } + } -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const Message *p) const { - static hash H; - return H((unsigned long)p); + // auth pins + void auth_pin(CInode *in) { + if (inode_auth_pins.count(in)) { + in->auth_pin(); + inode_auth_pins.insert(in); } - }; + } + void auth_pin(CDir *dir) { + if (dir_auth_pins.count(dir)) { + dir->auth_pin(); + dir_auth_pins.insert(dir); + } + } + bool is_auth_pinned(CInode *in) { return inode_auth_pins.count(in); } + bool is_auth_pinned(CDir *dir) { return dir_auth_pins.count(dir); } + void drop_auth_pins() { + for (set::iterator it = inode_auth_pins.begin(); + it != inode_auth_pins.end(); + it++) + (*it)->auth_unpin(); + inode_auth_pins.clear(); + for (set::iterator it = dir_auth_pins.begin(); + it != dir_auth_pins.end(); + it++) + (*it)->auth_unpin(); + dir_auth_pins.clear(); + } +}; + +inline ostream& operator<<(ostream& out, MDRequest &mdr) +{ + out << "request(" << mdr.reqid; + //if (mdr.request) out << " " << *mdr.request; + out << ")"; + return out; } class MDCache { @@ -158,9 +236,24 @@ protected: // -- requests -- - // active MDS requests - hash_map active_requests; +public: + + +protected: + hash_map active_requests; +public: + MDRequest* request_start(reqid_t rid); + MDRequest* request_start(MClientRequest *req); + void request_pin_ref(MDRequest *r, CInode *ref, vector& trace); + void request_finish(MDRequest *mdr); + void request_forward(MDRequest *mdr, int mds, int port=0); + void dispatch_request(MDRequest *mdr); + void request_drop_locks(MDRequest *mdr); + void request_cleanup(MDRequest *r); + + + // inode purging map purging; map > waiting_for_purge; @@ -325,37 +418,21 @@ public: CInode *create_stray_inode(int whose=-1); void open_local_stray(); void open_foreign_stray(int who, Context *c); - int path_traverse(filepath& path, vector& trace, bool follow_trailing_sym, + int path_traverse(MDRequest *mdr, + CInode *base, + filepath& path, vector& trace, bool follow_trailing_sym, Message *req, Context *ondelay, int onfail, - Context *onfinish=0, bool is_client_req = false); void open_remote_dir(CInode *diri, frag_t fg, Context *fin); - CInode *get_dentry_inode(CDentry *dn, MClientRequest *req, CInode *ref); - void open_remote_ino(inodeno_t ino, Message *req, Context *fin); - void open_remote_ino_2(inodeno_t ino, Message *req, + CInode *get_dentry_inode(CDentry *dn, MDRequest *mdr); + void open_remote_ino(inodeno_t ino, MDRequest *mdr, Context *fin); + void open_remote_ino_2(inodeno_t ino, MDRequest *mdr, vector& anchortrace, Context *onfinish); - bool path_pin(vector& trace, Message *m, Context *c); - void path_unpin(vector& trace, Message *m); void make_trace(vector& trace, CInode *in); - bool request_start(Message *req, - CInode *ref, - vector& trace); - void request_cleanup(Message *req); - void request_finish(Message *req); - void request_forward(Message *req, int mds, int port=0); - void request_pin_inode(Message *req, CInode *in); - void request_pin_dn(Message *req, CDentry *dn); - void request_pin_dir(Message *req, CDir *dir); - void request_auth_pin(Message *req, CDir *dir); - void request_auth_pin(Message *req, CInode *in); - bool request_auth_pinned(Message *req, CDir *dir); - bool request_auth_pinned(Message *req, CInode *in); - void request_drop_auth_pins(Message *req); - // -- anchors -- public: void anchor_create(CInode *in, Context *onfinish); @@ -443,5 +520,14 @@ protected: }; +class C_MDS_RetryRequest : public Context { + MDCache *cache; + MDRequest *mdr; + public: + C_MDS_RetryRequest(MDCache *c, MDRequest *r) : cache(c), mdr(r) {} + virtual void finish(int r) { + cache->dispatch_request(mdr); + } +}; #endif diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc index 8ee0b06dae787..686a58215c100 100644 --- a/branches/sage/cephmds2/mds/Migrator.cc +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -250,7 +250,7 @@ void Migrator::handle_mds_failure(int who) // unpin the path vector trace; cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); + mds->locker->dentry_anon_rdlock_trace_finish(trace); // wake up any waiters mds->queue_finished(export_finish_waiters[dir]); @@ -464,10 +464,11 @@ void Migrator::export_dir(CDir *dir, // pin path? vector trace; cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { + if (!mds->locker->dentry_can_rdlock_trace(trace, 0)) { dout(7) << "export_dir couldn't pin path, failing." << endl; return; } + mds->locker->dentry_anon_rdlock_trace_start(trace); // ok, let's go. assert(export_state.count(dir) == 0); @@ -1172,7 +1173,7 @@ void Migrator::export_finish(CDir *dir) dout(7) << "export_finish unpinning path" << endl; vector trace; cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); + mds->locker->dentry_anon_rdlock_trace_finish(trace); // discard delayed expires cache->discard_delayed_expire(dir); @@ -1208,21 +1209,6 @@ void Migrator::export_finish(CDir *dir) // ========================================================== // IMPORT - -class C_MDC_ExportDirDiscover : public Context { - Migrator *mig; - MExportDirDiscover *m; -public: - vector trace; - C_MDC_ExportDirDiscover(Migrator *mig_, MExportDirDiscover *m_) : - mig(mig_), m(m_) {} - void finish(int r) { - CInode *in = 0; - if (r >= 0) in = trace[trace.size()-1]->get_inode(); - mig->handle_export_discover_2(m, in, r); - } -}; - void Migrator::handle_export_discover(MExportDirDiscover *m) { assert(m->get_source().num() != mds->get_nodeid()); @@ -1230,34 +1216,31 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) dout(7) << "handle_export_discover on " << m->get_path() << endl; // must discover it! - C_MDC_ExportDirDiscover *onfinish = new C_MDC_ExportDirDiscover(this, m); filepath fpath(m->get_path()); - cache->path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error -} - -void Migrator::handle_export_discover_2(MExportDirDiscover *m, CInode *in, int r) -{ - // yay! - if (in) { - dout(7) << "handle_export_discover_2 has " << *in << endl; - } - - if (r < 0 || !in->is_dir()) { + vector trace; + int r = cache->path_traverse(0, + 0, + fpath, trace, true, + m, new C_MDS_RetryMessage(mds,m), // on delay/retry + MDS_TRAVERSE_DISCOVER); + if (r > 0) return; // wait + if (r < 0) { dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - - mds->send_message_mds(new MExportDirDiscoverAck(m->get_dirfrag(), false), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; - return; } + CInode *in; + if (trace.empty()) { + in = cache->get_root(); + if (!in) { + cache->open_root(new C_MDS_RetryMessage(mds, m)); + return; + } + } else { + in = trace[trace.size()-1]->inode; + } assert(in->is_dir()); - + // pin inode in the cache (for now) in->get(CInode::PIN_IMPORTING); diff --git a/branches/sage/cephmds2/mds/Migrator.h b/branches/sage/cephmds2/mds/Migrator.h index d582fcfb7645a..a1dde351f8d7f 100644 --- a/branches/sage/cephmds2/mds/Migrator.h +++ b/branches/sage/cephmds2/mds/Migrator.h @@ -206,7 +206,6 @@ public: friend class C_MDS_ExportFinishLogged; // importer void handle_export_discover(MExportDirDiscover *m); - void handle_export_discover_2(MExportDirDiscover *m, CInode *in, int r); void handle_export_prep(MExportDirPrep *m); void handle_export_dir(MExportDir *m); int decode_import_dir(bufferlist& bl, @@ -225,7 +224,6 @@ public: void import_finish(CDir *dir, bool now=false); protected: - friend class C_MDC_ExportDirDiscover; friend class C_MDS_ImportDirLoggedStart; friend class C_MDS_ImportDirLoggedFinish; diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index b3d4af0c100bb..d8ffe3bc37bd6 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -18,7 +18,6 @@ #include "MDLog.h" #include "Migrator.h" #include "MDBalancer.h" -//#include "Renamer.h" #include "AnchorClient.h" #include "msg/Messenger.h" @@ -149,51 +148,19 @@ void Server::handle_client_unmount(Message *m) + /******* * some generic stuff for finishing off requests */ -/** C_MDS_CommitRequest - */ - -class C_MDS_CommitRequest : public Context { - Server *server; - MClientRequest *req; - MClientReply *reply; - CInode *tracei; // inode to include a trace for - LogEvent *event; - -public: - C_MDS_CommitRequest(Server *server, - MClientRequest *req, MClientReply *reply, CInode *tracei, - LogEvent *event=0) { - this->server = server; - this->req = req; - this->tracei = tracei; - this->reply = reply; - this->event = event; - } - void finish(int r) { - if (r != 0) { - // failure. set failure code and reply. - reply->set_result(r); - } - if (event) { - server->commit_request(req, reply, tracei, event); - } else { - // reply. - server->reply_request(req, reply, tracei); - } - } -}; - /* * send generic response (just and error code) */ -void Server::reply_request(MClientRequest *req, int r, CInode *tracei) +void Server::reply_request(MDRequest *mdr, int r, CInode *tracei) { - reply_request(req, new MClientReply(req, r), tracei); + MClientRequest *req = mdr->client_request(); + reply_request(mdr, new MClientReply(req, r), tracei); } @@ -201,8 +168,10 @@ void Server::reply_request(MClientRequest *req, int r, CInode *tracei) * send given reply * include a trace to tracei */ -void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei) +void Server::reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei) { + MClientRequest *req = mdr->client_request(); + dout(10) << "reply_request " << reply->get_result() << " (" << strerror(-reply->get_result()) << ") " << *req << endl; @@ -215,72 +184,21 @@ void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tra // send reply messenger->send_message(reply, req->get_client_inst()); - - // discard request - mdcache->request_finish(req); - - // stupid stats crap (FIXME) - stat_ops++; -} - - -void Server::submit_update(MClientRequest *req, - CInode *wrlockedi, - LogEvent *event, - Context *oncommit) -{ - // log - mdlog->submit_entry(event); - - // pin - mdcache->request_pin_inode(req, wrlockedi); - - // wait - mdlog->wait_for_sync(oncommit); + + // finish request + mdcache->request_finish(mdr); } -/* - * commit event(s) to the metadata journal, then reply. - * or, be sloppy and do it concurrently (see g_conf.mds_log_before_reply) - * - * NOTE: this is old and bad (write-behind!) - */ -void Server::commit_request(MClientRequest *req, - MClientReply *reply, - CInode *tracei, - LogEvent *event, - LogEvent *event2) -{ - // log - if (event) mdlog->submit_entry(event); - if (event2) mdlog->submit_entry(event2); - - if (g_conf.mds_log_before_reply && g_conf.mds_log && event) { - // SAFE mode! - - // pin inode so it doesn't go away! - if (tracei) mdcache->request_pin_inode(req, tracei); - - // wait for log sync - mdlog->wait_for_sync(new C_MDS_CommitRequest(this, req, reply, tracei)); - return; - } - else { - // just reply - reply_request(req, reply, tracei); - } -} /*** * process a client request */ - void Server::handle_client_request(MClientRequest *req) { - dout(4) << "req " << *req << endl; + dout(4) << "handle_client_request " << *req << endl; if (!mds->is_active()) { dout(5) << " not active, discarding client request." << endl; @@ -296,236 +214,459 @@ void Server::handle_client_request(MClientRequest *req) // okay, i want CInode *ref = 0; - vector trace; // might be blank, for fh guys - bool follow_trailing_symlink = false; - // operations on fh's or other non-files + // ----- + // some ops are on ino's switch (req->get_op()) { - /* case MDS_OP_FSTAT: - reply = handle_client_fstat(req, cur); - break; ****** fiX ME *** - */ + ref = mdcache->get_inode(req->args.fstat.ino); + assert(ref); + break; case MDS_OP_TRUNCATE: - if (!req->args.truncate.ino) break; // can be called w/ either fh OR path + if (!req->args.truncate.ino) + break; // can be called w/ either fh OR path + ref = mdcache->get_inode(req->args.truncate.ino); + assert(ref); + break; case MDS_OP_FSYNC: ref = mdcache->get_inode(req->args.fsync.ino); // fixme someday no ino needed? + assert(ref); + break; + } - if (!ref) { - int next = mds->get_nodeid() + 1; - if (next >= mds->mdsmap->get_num_mds()) next = 0; - dout(10) << "got request on ino we don't have, passing buck to " << next << endl; - mds->send_message_mds(req, next, MDS_PORT_SERVER); - return; - } + if (ref) { + MDRequest *mdr = mdcache->request_start(req); + dout(10) << "inode op on ref " << *ref << endl; + mdr->ref = ref; + mdr->pin(ref); + dispatch_request(mdr); + return; } - if (!ref) { - // we need to traverse a path - filepath refpath = req->get_filepath(); - - // ops on non-existing files --> directory paths - switch (req->get_op()) { - case MDS_OP_OPEN: - if (!(req->args.open.flags & O_CREAT)) break; - - case MDS_OP_MKNOD: - case MDS_OP_MKDIR: - case MDS_OP_SYMLINK: - case MDS_OP_LINK: - case MDS_OP_UNLINK: // also wrt parent dir, NOT the unlinked inode!! - case MDS_OP_RMDIR: - case MDS_OP_RENAME: - // remove last bit of path - refpath = refpath.prefixpath(refpath.depth()-1); - break; - } - dout(10) << "refpath = " << refpath << endl; - - Context *ondelay = new C_MDS_RetryMessage(mds, req); - - if (req->get_op() == MDS_OP_LSTAT) { - follow_trailing_symlink = false; - } - // do trace - int r = mdcache->path_traverse(refpath, trace, follow_trailing_symlink, - req, ondelay, - MDS_TRAVERSE_FORWARD, - 0, - true); // is MClientRequest - - if (r > 0) return; // delayed - if (r == -ENOENT || - r == -ENOTDIR || - r == -EISDIR) { - // error! - dout(10) << " path traverse error " << r << ", replying" << endl; + // ----- + // some ops are on existing inodes + + bool follow_trailing_symlink = false; + + switch (req->get_op()) { + case MDS_OP_LSTAT: + follow_trailing_symlink = false; + case MDS_OP_OPEN: + if (req->args.open.flags & O_CREAT) break; // handled below. + case MDS_OP_STAT: + case MDS_OP_UTIME: + case MDS_OP_CHMOD: + case MDS_OP_CHOWN: + case MDS_OP_READDIR: + { + filepath refpath = req->get_filepath(); + Context *ondelay = new C_MDS_RetryMessage(mds, req); + vector trace; - // send error - messenger->send_message(new MClientReply(req, r), - req->get_client_inst()); - - // - // is this a special debug command? - if (refpath.depth() - 1 == trace.size() && - refpath.last_dentry().find(".ceph.") == 0) { - /* -FIXME dirfrag - CDir *dir = 0; - if (!trace.empty()) - dir = mdcache->get_root()->dir; - else - dir = trace[trace.size()-1]->get_inode()->dir; - - dout(1) << "** POSSIBLE CEPH DEBUG COMMAND '" << refpath.last_dentry() << "' in " << *dir << endl; - - if (refpath.last_dentry() == ".ceph.hash" && - refpath.depth() > 1) { - dout(1) << "got explicit hash command " << refpath << endl; - /// .... - } - else if (refpath.last_dentry() == ".ceph.commit") { - dout(1) << "got explicit commit command on " << *dir << endl; - dir->commit(0, 0); + int r = mdcache->path_traverse(0, 0, + refpath, trace, follow_trailing_symlink, + req, ondelay, + MDS_TRAVERSE_FORWARD, + true); // is MClientRequest + + if (r > 0) return; // delayed + if (r < 0) { + dout(10) << "traverse error " << r << " " << strerror(-r) << endl; + + // send error. don't bother registering request. + messenger->send_message(new MClientReply(req, r), + req->get_client_inst()); + + // + // is this a special debug command? + if (refpath.depth() - 1 == trace.size() && + refpath.last_dentry().find(".ceph.") == 0) { + // ... } -*/ + // } - // + // can we dnlock whole path? + if (!mds->locker->dentry_can_rdlock_trace(trace, req)) + return; - delete req; + // go + MDRequest *mdr = mdcache->request_start(req); + mds->locker->dentry_anon_rdlock_trace_start(trace); + dispatch_request(mdr); return; } - - if (trace.size()) - ref = trace[trace.size()-1]->inode; - else - ref = mdcache->get_root(); } + - dout(10) << "ref is " << *ref << endl; + // ---- + // the rest handle things themselves. - // rename doesn't pin src path (initially) - if (req->get_op() == MDS_OP_RENAME) trace.clear(); + switch (req->get_op()) { + case MDS_OP_OPEN: + assert(req->args.open.flags & O_CREAT); + case MDS_OP_MKNOD: + case MDS_OP_MKDIR: + case MDS_OP_SYMLINK: + case MDS_OP_LINK: + case MDS_OP_UNLINK: + case MDS_OP_RMDIR: + case MDS_OP_RENAME: + { + // register request + MDRequest *mdr = mdcache->request_start(req); + dispatch_request(mdr); + return; + } + } - // register - if (!mdcache->request_start(req, ref, trace)) - return; - - // process - dispatch_request(req, ref); + assert(0); // we missed something! } - -void Server::dispatch_request(Message *m, CInode *ref) +void Server::dispatch_request(MDRequest *mdr) { - MClientRequest *req = 0; - - // MLock or MClientRequest? - /* this is a little weird. - client requests and mlocks both initial dentry xlocks, path pins, etc., - and thus both make use of the context C_MDS_RetryRequest. - */ - switch (m->get_type()) { - case MSG_CLIENT_REQUEST: - req = (MClientRequest*)m; - break; // continue below! - - case MSG_MDS_LOCK: - mds->locker->handle_lock_dn((MLock*)m); - return; // done + MClientRequest *req = mdr->client_request(); - default: - assert(0); // shouldn't get here + if (mdr->ref) { + dout(7) << "dispatch_request " << *req << " ref " << *mdr->ref << endl; + } else { + dout(7) << "dispatch_request " << *req << endl; } - // MClientRequest. - - dout(7) << "handle_client " << *m << " ref " << *ref << endl; - switch (req->get_op()) { - - // files - case MDS_OP_OPEN: - if (req->args.open.flags & O_CREAT) - handle_client_openc(req, ref); - else - handle_client_open(req, ref); - break; - case MDS_OP_TRUNCATE: - handle_client_truncate(req, ref); - break; - /* - case MDS_OP_FSYNC: - handle_client_fsync(req, ref); - break; - */ - /* - case MDS_OP_RELEASE: - handle_client_release(req, ref); - break; - */ - // inodes + // inodes ops. case MDS_OP_STAT: case MDS_OP_LSTAT: - handle_client_stat(req, ref); + handle_client_stat(mdr); break; case MDS_OP_UTIME: - handle_client_utime(req, ref); + handle_client_utime(mdr); break; case MDS_OP_CHMOD: - handle_client_chmod(req, ref); + handle_client_chmod(mdr); break; case MDS_OP_CHOWN: - handle_client_chown(req, ref); + handle_client_chown(mdr); + break; + case MDS_OP_TRUNCATE: + handle_client_truncate(mdr); break; - - // namespace case MDS_OP_READDIR: - handle_client_readdir(req, ref); + handle_client_readdir(mdr); + break; + case MDS_OP_FSYNC: + //handle_client_fsync(req, ref); + break; + + // funky. + case MDS_OP_OPEN: + if ((req->args.open.flags & O_CREAT) && + !mdr->ref) + handle_client_openc(mdr); + else + handle_client_open(mdr); break; + + // namespace. + // no prior locks. case MDS_OP_MKNOD: - handle_client_mknod(req, ref); + handle_client_mknod(mdr); break; case MDS_OP_LINK: - handle_client_link(req, ref); + handle_client_link(mdr); break; case MDS_OP_UNLINK: - handle_client_unlink(req, ref); + handle_client_unlink(mdr); break; case MDS_OP_RENAME: - handle_client_rename(req, ref); + handle_client_rename(mdr); break; case MDS_OP_RMDIR: - handle_client_unlink(req, ref); + handle_client_unlink(mdr); break; case MDS_OP_MKDIR: - handle_client_mkdir(req, ref); + handle_client_mkdir(mdr); break; case MDS_OP_SYMLINK: - handle_client_symlink(req, ref); + handle_client_symlink(mdr); break; - default: dout(1) << " unknown client op " << req->get_op() << endl; assert(0); } +} + - return; + +// --------------------------------------- +// HELPERS + + +/** request_pin_ref + * return the ref inode, referred to by the last dentry in the trace. + * open if it is remote. + * pin. + * return existing, if mdr->ref already set. + */ +CInode *Server::request_pin_ref(MDRequest *mdr) +{ + // already did it? + if (mdr->ref) + return mdr->ref; + + // open and pin ref inode in cache too + CInode *ref = 0; + if (mdr->trace.empty()) + ref = mdcache->get_root(); + else { + ref = mdcache->get_dentry_inode(mdr->trace[mdr->trace.size()-1], mdr); + if (!ref) return 0; + } + mdr->pin(ref); + mdr->ref = ref; + return ref; } + +/** validate_dentry_dir + * + * verify that the dir exists and would own the dname. + * do not check if the dentry exists. + */ +CDir *Server::validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname) +{ + // make sure parent is a dir? + if (!diri->is_dir()) { + dout(7) << "validate_dentry_dir: not a dir" << endl; + reply_request(mdr, -ENOTDIR); + return false; + } + + // which dirfrag? + frag_t fg = diri->pick_dirfrag(dname); + + CDir *dir = try_open_auth_dir(diri, fg, mdr); + if (!dir) + return 0; + + // frozen? + if (dir->is_frozen()) { + dout(7) << "dir is frozen " << *dir << endl; + dir->add_waiter(CDir::WAIT_UNFREEZE, + new C_MDS_RetryRequest(mdcache, mdr)); + return false; + } + + return dir; +} + + +/** prepare_null_dentry + * prepare a null (or existing) dentry in given dir. + * wait for any dn lock. + */ +CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist) +{ + dout(10) << "prepare_null_dentry " << dname << " in " << *dir << endl; + assert(dir->is_auth()); + + // does it already exist? + CDentry *dn = dir->lookup(dname); + if (dn) { + if (!dn->can_read(mdr)) { + dout(10) << "waiting on (existing!) unreadable dentry " << *dn << endl; + dir->add_waiter(CDir::WAIT_DNREAD, dname, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + if (!dn->is_null()) { + // name already exists + dout(10) << "dentry " << dname << " exists in " << *dir << endl; + if (!okexist) { + reply_request(mdr, -EEXIST); + return 0; + } + } + + return dn; + } + + // make sure dir is complete + if (!dir->is_complete()) { + dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + // create + dn = dir->add_dentry(dname, 0); + dout(10) << "prepare_null_dentry added " << *dn << endl; + + return dn; +} + + +/** prepare_new_inode + * + * create a new inode. set c/m/atime. hit dir pop. + */ +CInode* Server::prepare_new_inode(MClientRequest *req, CDir *dir) +{ + CInode *in = mdcache->create_inode(); + in->inode.uid = req->get_caller_uid(); + in->inode.gid = req->get_caller_gid(); + in->inode.ctime = in->inode.mtime = in->inode.atime = g_clock.gettime(); // now + dout(10) << "prepare_new_inode " << *in << endl; + + // bump modify pop + mds->balancer->hit_dir(dir, META_POP_DWR); + + return in; +} + + + +CDir *Server::traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath) +{ + // figure parent dir vs dname + if (refpath.depth() == 0) { + dout(7) << "can't do that to root" << endl; + reply_request(mdr, -EINVAL); + return 0; + } + string dname = refpath.last_dentry(); + refpath.pop_dentry(); + + dout(10) << "traverse_to_auth_dir dirpath " << refpath << " dname " << dname << endl; + + // traverse to parent dir + Context *ondelay = new C_MDS_RetryRequest(mdcache, mdr); + int r = mdcache->path_traverse(mdr, + 0, + refpath, trace, true, + mdr->request, ondelay, + MDS_TRAVERSE_FORWARD, + true); // is MClientRequest + if (r > 0) return 0; // delayed + if (r < 0) { + reply_request(mdr, r); + return 0; + } + + // open inode + CInode *diri; + if (trace.empty()) + diri = mdcache->get_root(); + else + diri = mdcache->get_dentry_inode(trace[trace.size()-1], mdr); + if (!diri) + return 0; // opening inode. + + // is it an auth dir? + CDir *dir = validate_dentry_dir(mdr, diri, dname); + if (!dir) + return 0; // forwarded or waiting for freeze + + dout(10) << "traverse_to_auth_dir " << *dir << endl; + return dir; +} + + +/** rdlock_path_xlock_dentry + * traverse path to the directory that could/would contain dentry. + * make sure i am auth for that dentry, forward as necessary. + * create null dentry in place (or use existing if okexist). + * get rdlocks on traversed dentries, xlock on new dentry. + */ +CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist) +{ + MClientRequest *req = mdr->client_request(); + + vector trace; + CDir *dir = traverse_to_auth_dir(mdr, trace, req->get_filepath()); + dout(10) << "rdlock_path_xlock_dentry dir " << *dir << endl; + + // make sure we can auth_pin dir + if (!dir->can_auth_pin()) { + dout(7) << "waiting for authpinnable on " << *dir << endl; + dir->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + // make a null dentry? + const string &dname = req->get_filepath().last_dentry(); + CDentry *dn; + if (mustexist) { + dn = dir->lookup(dname); + + // make sure dir is complete + if (!dn && !dir->is_complete()) { + dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + // readable? + if (dn && !dn->can_read(mdr)) { + dout(10) << "waiting on (existing!) unreadable dentry " << *dn << endl; + dir->add_waiter(CDir::WAIT_DNREAD, dname, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; + } + + // exists? + if (!dn || dn->is_null()) { + dout(7) << "dentry " << dname << " dne in " << *dir << endl; + reply_request(mdr, -ENOENT); + return 0; + } + } else { + dn = prepare_null_dentry(mdr, dir, dname, okexist); + if (!dn) + return 0; + } + + // -- lock -- + set dentry_rdlocks; + set dentry_xlocks; + set inode_empty; + + for (unsigned i=0; iis_null()) + dentry_xlocks.insert(dn); // new dn, xlock + else + dentry_rdlocks.insert(dn); // existing dn, rdlock + + if (!mds->locker->acquire_locks(mdr, + dentry_rdlocks, dentry_xlocks, + inode_empty, inode_empty)) + return 0; + + // save the locked trace. + mdr->trace.swap(trace); + + return dn; +} + + + + + // FIXME: this probably should go somewhere else. -CDir* Server::try_open_auth_dir(CInode *diri, frag_t fg, MClientRequest *req) +CDir* Server::try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr) { CDir *dir = diri->get_dirfrag(fg); @@ -533,7 +674,7 @@ CDir* Server::try_open_auth_dir(CInode *diri, frag_t fg, MClientRequest *req) if (!dir && !diri->is_auth()) { int inauth = diri->authority().first; dout(7) << "try_open_auth_dir: not open, not inode auth, fw to mds" << inauth << endl; - mdcache->request_forward(req, inauth); + mdcache->request_forward(mdr, inauth); return 0; } @@ -542,7 +683,7 @@ CDir* Server::try_open_auth_dir(CInode *diri, frag_t fg, MClientRequest *req) dout(10) << "try_open_dir: dir inode is frozen, waiting " << *diri << endl; assert(diri->get_parent_dir()); diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, diri)); + new C_MDS_RetryRequest(mdcache, mdr)); return 0; } @@ -558,15 +699,14 @@ CDir* Server::try_open_auth_dir(CInode *diri, frag_t fg, MClientRequest *req) int auth = dir->authority().first; dout(7) << "try_open_auth_dir: not auth for " << *dir << ", fw to mds" << auth << endl; - mdcache->request_forward(req, auth); + mdcache->request_forward(mdr, auth); return 0; } return dir; } -CDir* Server::try_open_dir(CInode *diri, frag_t fg, - MClientRequest *req, CInode *ref) +CDir* Server::try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr) { CDir *dir = diri->get_dirfrag(fg); if (dir) @@ -579,7 +719,7 @@ CDir* Server::try_open_dir(CInode *diri, frag_t fg, dout(10) << "try_open_dir: dir inode is auth+frozen, waiting " << *diri << endl; assert(diri->get_parent_dir()); diri->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, diri)); + new C_MDS_RetryRequest(mdcache, mdr)); return 0; } @@ -593,7 +733,7 @@ CDir* Server::try_open_dir(CInode *diri, frag_t fg, } else { // not auth mdcache->open_remote_dir(diri, fg, - new C_MDS_RetryRequest(mds, req, ref)); + new C_MDS_RetryRequest(mdcache, mdr)); return 0; } } @@ -602,18 +742,21 @@ CDir* Server::try_open_dir(CInode *diri, frag_t fg, // =============================================================================== // STAT -void Server::handle_client_stat(MClientRequest *req, - CInode *ref) +void Server::handle_client_stat(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + CInode *ref = request_pin_ref(mdr); + if (!ref) return; + // FIXME: this is really not the way to handle the statlite mask. // do I need file info? int mask = req->args.stat.mask; if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) { // yes. do a full stat. - if (!mds->locker->inode_file_read_start(ref, req, ref)) + if (!mds->locker->inode_file_rdlock_start(ref, mdr)) return; // syncing - mds->locker->inode_file_read_finish(ref); + mds->locker->inode_file_rdlock_finish(ref, mdr); } else { // nope! easy peasy. } @@ -623,7 +766,7 @@ void Server::handle_client_stat(MClientRequest *req, // reply //dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl; MClientReply *reply = new MClientReply(req); - reply_request(req, reply, ref); + reply_request(mdr, reply, ref); } @@ -638,13 +781,13 @@ void Server::handle_client_stat(MClientRequest *req, */ class C_MDS_utime_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CInode *in; version_t pv; time_t mtime, atime; public: - C_MDS_utime_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, time_t mt, time_t at) : - mds(m), req(r), in(i), + C_MDS_utime_finish(MDS *m, MDRequest *r, CInode *i, version_t pdv, time_t mt, time_t at) : + mds(m), mdr(r), in(i), pv(pdv), mtime(mt), atime(at) { } void finish(int r) { @@ -655,32 +798,32 @@ public: in->inode.atime = atime; in->mark_dirty(pv); - // unlock - mds->locker->inode_file_write_finish(in); - // reply - MClientReply *reply = new MClientReply(req, 0); + MClientReply *reply = new MClientReply(mdr->client_request(), 0); reply->set_result(0); - mds->server->reply_request(req, reply, in); + mds->server->reply_request(mdr, reply, in); } }; // utime -void Server::handle_client_utime(MClientRequest *req, - CInode *cur) +void Server::handle_client_utime(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + CInode *cur = request_pin_ref(mdr); + if (!cur) return; + // auth pin if (!cur->can_auth_pin()) { dout(7) << "waiting for authpinnable on " << *cur << endl; - cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); return; } - mdcache->request_auth_pin(req, cur); + mdr->auth_pin(cur); // write - if (!mds->locker->inode_file_write_start(cur, req, cur)) + if (!mds->locker->inode_file_xlock_start(cur, mdr)) return; // fw or (wait for) sync mds->balancer->hit_inode(cur, META_POP_IWR); @@ -689,7 +832,7 @@ void Server::handle_client_utime(MClientRequest *req, version_t pdv = cur->pre_dirty(); time_t mtime = req->args.utime.modtime; time_t atime = req->args.utime.actime; - C_MDS_utime_finish *fin = new C_MDS_utime_finish(mds, req, cur, pdv, + C_MDS_utime_finish *fin = new C_MDS_utime_finish(mds, mdr, cur, pdv, mtime, atime); // log + wait @@ -710,17 +853,17 @@ void Server::handle_client_utime(MClientRequest *req, // -------------- /* - * finisher: do a inode_hard_write_finish and reply. + * finisher: do a inode_hard_xlock_finish and reply. */ class C_MDS_chmod_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CInode *in; version_t pv; int mode; public: - C_MDS_chmod_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int mo) : - mds(m), req(r), in(i), pv(pdv), mode(mo) { } + C_MDS_chmod_finish(MDS *m, MDRequest *r, CInode *i, version_t pdv, int mo) : + mds(m), mdr(r), in(i), pv(pdv), mode(mo) { } void finish(int r) { assert(r == 0); @@ -729,32 +872,32 @@ public: in->inode.mode |= (mode & 04777); in->mark_dirty(pv); - // unlock - mds->locker->inode_hard_write_finish(in); - // reply - MClientReply *reply = new MClientReply(req, 0); + MClientReply *reply = new MClientReply(mdr->client_request(), 0); reply->set_result(0); - mds->server->reply_request(req, reply, in); + mds->server->reply_request(mdr, reply, in); } }; // chmod -void Server::handle_client_chmod(MClientRequest *req, - CInode *cur) +void Server::handle_client_chmod(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + CInode *cur = request_pin_ref(mdr); + if (!cur) return; + // auth pin if (!cur->can_auth_pin()) { dout(7) << "waiting for authpinnable on " << *cur << endl; - cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); return; } - mdcache->request_auth_pin(req, cur); + mdr->auth_pin(cur); // write - if (!mds->locker->inode_hard_write_start(cur, req, cur)) + if (!mds->locker->inode_hard_xlock_start(cur, mdr)) return; // fw or (wait for) lock mds->balancer->hit_inode(cur, META_POP_IWR); @@ -762,7 +905,7 @@ void Server::handle_client_chmod(MClientRequest *req, // prepare version_t pdv = cur->pre_dirty(); int mode = req->args.chmod.mode; - C_MDS_chmod_finish *fin = new C_MDS_chmod_finish(mds, req, cur, pdv, + C_MDS_chmod_finish *fin = new C_MDS_chmod_finish(mds, mdr, cur, pdv, mode); // log + wait @@ -783,13 +926,13 @@ void Server::handle_client_chmod(MClientRequest *req, class C_MDS_chown_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CInode *in; version_t pv; int uid, gid; public: - C_MDS_chown_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int u, int g) : - mds(m), req(r), in(i), pv(pdv), uid(u), gid(g) { } + C_MDS_chown_finish(MDS *m, MDRequest *r, CInode *i, version_t pdv, int u, int g) : + mds(m), mdr(r), in(i), pv(pdv), uid(u), gid(g) { } void finish(int r) { assert(r == 0); @@ -798,30 +941,30 @@ public: if (gid >= 0) in->inode.gid = gid; in->mark_dirty(pv); - // unlock - mds->locker->inode_hard_write_finish(in); - // reply - MClientReply *reply = new MClientReply(req, 0); + MClientReply *reply = new MClientReply(mdr->client_request(), 0); reply->set_result(0); - mds->server->reply_request(req, reply, in); + mds->server->reply_request(mdr, reply, in); } }; -void Server::handle_client_chown(MClientRequest *req, - CInode *cur) +void Server::handle_client_chown(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + CInode *cur = request_pin_ref(mdr); + if (!cur) return; + // auth pin if (!cur->can_auth_pin()) { dout(7) << "waiting for authpinnable on " << *cur << endl; - cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); return; } - mdcache->request_auth_pin(req, cur); + mdr->auth_pin(cur); // write - if (!mds->locker->inode_hard_write_start(cur, req, cur)) + if (!mds->locker->inode_hard_xlock_start(cur, mdr)) return; // fw or (wait for) lock mds->balancer->hit_inode(cur, META_POP_IWR); @@ -830,7 +973,7 @@ void Server::handle_client_chown(MClientRequest *req, version_t pdv = cur->pre_dirty(); int uid = req->args.chown.uid; int gid = req->args.chown.gid; - C_MDS_chown_finish *fin = new C_MDS_chown_finish(mds, req, cur, pdv, + C_MDS_chown_finish *fin = new C_MDS_chown_finish(mds, mdr, cur, pdv, uid, gid); // log + wait @@ -850,9 +993,6 @@ void Server::handle_client_chown(MClientRequest *req, - - - // ================================================================= // DIRECTORY and NAMESPACE OPS @@ -887,14 +1027,17 @@ int Server::encode_dir_contents(CDir *dir, } -void Server::handle_client_readdir(MClientRequest *req, - CInode *diri) +void Server::handle_client_readdir(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + CInode *diri = request_pin_ref(mdr); + if (!diri) return; + // it's a directory, right? if (!diri->is_dir()) { // not a dir dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl; - reply_request(req, -ENOTDIR); + reply_request(mdr, -ENOTDIR); return; } @@ -904,25 +1047,27 @@ void Server::handle_client_readdir(MClientRequest *req, // does it exist? if (diri->dirfragtree[fg] != fg) { dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << endl; - reply_request(req, -EAGAIN); + reply_request(mdr, -EAGAIN); return; } - CDir *dir = try_open_auth_dir(diri, fg, req); + CDir *dir = try_open_auth_dir(diri, fg, mdr); if (!dir) return; // ok! assert(dir->is_auth()); // check perm - if (!mds->locker->inode_hard_read_start(diri, req, diri)) + /* + if (!mds->locker->inode_hard_rdlock_start(diri, mdr)) return; - mds->locker->inode_hard_read_finish(diri); + mds->locker->inode_hard_rdlock_finish(diri, mdr); + */ if (!dir->is_complete()) { // fetch dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; - dir->fetch(new C_MDS_RetryRequest(mds, req, diri)); + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); return; } @@ -946,7 +1091,7 @@ void Server::handle_client_readdir(MClientRequest *req, //balancer->hit_dir(diri->dir); // reply - reply_request(req, reply, diri); + reply_request(mdr, reply, diri); } @@ -957,13 +1102,13 @@ void Server::handle_client_readdir(MClientRequest *req, class C_MDS_mknod_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CDentry *dn; CInode *newi; version_t pv; public: - C_MDS_mknod_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : - mds(m), req(r), dn(d), newi(ni), + C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : + mds(m), mdr(r), dn(d), newi(ni), pv(d->get_projected_version()) {} void finish(int r) { assert(r == 0); @@ -974,35 +1119,24 @@ public: // dirty inode, dn, dir newi->mark_dirty(pv); - // unlock - mds->locker->dentry_xlock_finish(dn); - // hit pop mds->balancer->hit_inode(newi, META_POP_IWR); // reply - MClientReply *reply = new MClientReply(req, 0); + MClientReply *reply = new MClientReply(mdr->client_request(), 0); reply->set_result(0); - mds->server->reply_request(req, reply, newi); + mds->server->reply_request(mdr, reply, newi); } }; -void Server::handle_client_mknod(MClientRequest *req, CInode *diri) -{ - CDir *dir = 0; - CDentry *dn = 0; - - // create null dentry - if (!prepare_null_dentry(req, diri, &dir, &dn)) - return; - assert(dir); - assert(dn); - - // xlock dentry - if (!mds->locker->dentry_xlock_start(dn, req, diri)) - return; +void Server::handle_client_mknod(MDRequest *mdr) +{ + MClientRequest *req = mdr->client_request(); + + CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); + if (!dn) return; - CInode *newi = prepare_new_inode(req, dir); + CInode *newi = prepare_new_inode(req, dn->dir); assert(newi); // it's a file. @@ -1012,10 +1146,10 @@ void Server::handle_client_mknod(MClientRequest *req, CInode *diri) newi->inode.mode |= INODE_MODE_FILE; // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi); EUpdate *le = new EUpdate("mknod"); le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(dir); + le->metablob.add_dir_context(dn->dir); inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); pi->version = dn->get_projected_version(); @@ -1026,166 +1160,17 @@ void Server::handle_client_mknod(MClientRequest *req, CInode *diri) -/** validate_dentry_dir - * - * verify that the dir exists and would own the dname. - * do not check if the dentry exists. - */ -CDir *Server::validate_dentry_dir(MClientRequest *req, CInode *ref, CInode *diri, const string& name) -{ - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "validate_dentry_dir: not a dir" << endl; - reply_request(req, -ENOTDIR); - return false; - } - - // which dirfrag? - frag_t fg = diri->pick_dirfrag(name); - - CDir *dir = try_open_auth_dir(diri, fg, req); - if (!dir) - return 0; - - /* - // dir auth pinnable? - if (!dir->can_auth_pin()) { - dout(7) << "validate_dentry_dir: dir " << *dir << " not pinnable, waiting" << endl; - dir->add_waiter(CDir::WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds, req, diri)); - return false; - } - */ - - // frozen? - if (dir->is_frozen()) { - dout(7) << "dir is frozen " << *dir << endl; - dir->add_waiter(CDir::WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, ref)); - return false; - } - - return dir; -} - -/** prepare_null_dentry - * - * prepare a mknod-type operation (mknod, mkdir, symlink, open+create). - * create the inode and dentry, but do not link them. - * pre_dirty the dentry+dir. - * xlock the dentry. - * - * return val - * 0 - wait for something - * 1 - created - * 2 - already exists (only if okexist=true) - */ -int Server::prepare_null_dentry(MClientRequest *req, - CInode *diri, CDir **pdir, CDentry **pdn, - bool okexist) -{ - // get containing directory (without last bit) - filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1); - string name = req->get_filepath().last_dentry(); - - return prepare_null_dentry(req, diri, - diri, name, - pdir, pdn, okexist); -} - -int Server::prepare_null_dentry(MClientRequest *req, CInode *ref, - CInode *diri, const string& name, - CDir **pdir, CDentry **pdn, - bool okexist) -{ - dout(10) << "prepare_null_dentry " << name << " in " << *diri << endl; - - CDir *dir = *pdir = validate_dentry_dir(req, ref, diri, name); - if (!dir) return 0; - - // make sure name doesn't already exist - *pdn = dir->lookup(name); - if (*pdn) { - if (!(*pdn)->can_read(req)) { - dout(10) << "waiting on (existing!) unreadable dentry " << **pdn << endl; - dir->add_waiter(CDir::WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, ref)); - return 0; - } - - if (!(*pdn)->is_null()) { - // name already exists - if (okexist) { - dout(10) << "dentry " << name << " exists in " << *dir << endl; - return 2; - } else { - dout(10) << "dentry " << name << " exists in " << *dir << endl; - reply_request(req, -EEXIST); - return 0; - } - } - } - - // make sure dir is complete - if (!dir->is_complete()) { - dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; - dir->fetch(new C_MDS_RetryRequest(mds, req, ref)); - return 0; - } - - // create null dentry - if (!*pdn) { - *pdn = dir->add_dentry(name, 0); - dout(10) << "prepare_null_dentry added " << **pdn << endl; - } else { - dout(10) << "prepare_null_dentry had " << **pdn << endl; - } - - - return 1; -} - - -/** prepare_new_inode - * - * create a new inode. set c/m/atime. hit dir pop. - */ -CInode* Server::prepare_new_inode(MClientRequest *req, CDir *dir) -{ - CInode *in = mdcache->create_inode(); - in->inode.uid = req->get_caller_uid(); - in->inode.gid = req->get_caller_gid(); - in->inode.ctime = in->inode.mtime = in->inode.atime = g_clock.gettime(); // now - dout(10) << "prepare_new_inode " << *in << endl; - - // bump modify pop - mds->balancer->hit_dir(dir, META_POP_DWR); - - return in; -} - - - - - // MKDIR -void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) +void Server::handle_client_mkdir(MDRequest *mdr) { - CDir *dir = 0; - CDentry *dn = 0; + MClientRequest *req = mdr->client_request(); - // make dentry - if (!prepare_null_dentry(req, diri, &dir, &dn)) - return; - assert(dir); - assert(dn); - - // xlock - if (!mds->locker->dentry_xlock_start(dn, req, diri)) - return; + CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); + if (!dn) return; // new inode - CInode *newi = prepare_new_inode(req, dir); + CInode *newi = prepare_new_inode(req, dn->dir); assert(newi); // it's a directory. @@ -1201,10 +1186,10 @@ void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) newdir->mark_dirty(newdir->pre_dirty()); // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi); EUpdate *le = new EUpdate("mkdir"); le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(dir); + le->metablob.add_dir_context(dn->dir); inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); pi->version = dn->get_projected_version(); le->metablob.add_dir(newdir, true); @@ -1230,25 +1215,16 @@ void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) } - // SYMLINK -void Server::handle_client_symlink(MClientRequest *req, CInode *diri) +void Server::handle_client_symlink(MDRequest *mdr) { - CDir *dir = 0; - CDentry *dn = 0; - - // make null dentry - if (!prepare_null_dentry(req, diri, &dir, &dn)) - return; - assert(dir); - assert(dn); - - // xlock - if (!mds->locker->dentry_xlock_start(dn, req, diri)) - return; + MClientRequest *req = mdr->client_request(); + + CDentry *dn = rdlock_path_xlock_dentry(mdr, false, false); + if (!dn) return; - CInode *newi = prepare_new_inode(req, dir); + CInode *newi = prepare_new_inode(req, dn->dir); assert(newi); // it's a symlink @@ -1258,10 +1234,10 @@ void Server::handle_client_symlink(MClientRequest *req, CInode *diri) newi->symlink = req->get_sarg(); // prepare finisher - C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, mdr, dn, newi); EUpdate *le = new EUpdate("symlink"); le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(dir); + le->metablob.add_dir_context(dn->dir); inode_t *pi = le->metablob.add_primary_dentry(dn, true, newi); pi->version = dn->get_projected_version(); @@ -1276,157 +1252,136 @@ void Server::handle_client_symlink(MClientRequest *req, CInode *diri) // LINK -class C_MDS_LinkTraverse : public Context { - Server *server; - MClientRequest *req; - CInode *ref; -public: - vector trace; - C_MDS_LinkTraverse(Server *server, MClientRequest *req, CInode *ref) { - this->server = server; - this->req = req; - this->ref = ref; - } - void finish(int r) { - server->handle_client_link_2(r, req, ref, trace); - } -}; - -void Server::handle_client_link(MClientRequest *req, CInode *ref) +void Server::handle_client_link(MDRequest *mdr) { - string dname = req->get_filepath().last_dentry(); - dout(7) << "handle_client_link " << dname << " in " << *ref + MClientRequest *req = mdr->client_request(); + + dout(7) << "handle_client_link " << req->get_filepath() << " to " << req->get_sarg() << endl; - // make sure we own the dname - CDir *dir = validate_dentry_dir(req, ref, ref, dname); - if (!dir) return; - - // discover link target - filepath target = req->get_sarg(); - dout(7) << "handle_client_link discovering target " << target << endl; - C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref); - Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); + // traverse to dest dir, make sure it's ours. + const filepath &linkpath = req->get_filepath(); + const string &dname = linkpath.last_dentry(); + vector linktrace; + CDir *dir = traverse_to_auth_dir(mdr, linktrace, linkpath); + dout(7) << "handle_client_link link " << dname << " in " << *dir << endl; - mdcache->path_traverse(target, onfinish->trace, false, - req, ondelay, - MDS_TRAVERSE_DISCOVER, //XLOCK, - onfinish); -} - - -void Server::handle_client_link_2(int r, MClientRequest *req, CInode *diri, vector& trace) -{ - // target dne? + // traverse to link target + filepath targetpath = req->get_sarg(); + dout(7) << "handle_client_link discovering target " << targetpath << endl; + Context *ondelay = new C_MDS_RetryRequest(mdcache, mdr); + vector targettrace; + int r = mdcache->path_traverse(mdr, 0, + targetpath, targettrace, false, + req, ondelay, + MDS_TRAVERSE_DISCOVER); + if (r > 0) return; // wait if (r < 0) { - dout(7) << "target " << req->get_sarg() << " dne" << endl; - reply_request(req, r); + reply_request(mdr, r); return; } - assert(r == 0); - + // identify target inode - CInode *targeti = mdcache->get_root(); - if (trace.size()) targeti = trace[trace.size()-1]->inode; + CInode *targeti; + if (targettrace.empty()) + targeti = mdcache->get_root(); + else + targeti = targettrace[targettrace.size()-1]->inode; assert(targeti); + assert(r == 0); - // not a dir? + // dir? dout(7) << "target is " << *targeti << endl; if (targeti->is_dir()) { dout(7) << "target is a dir, failing" << endl; - reply_request(req, -EINVAL); + reply_request(mdr, -EINVAL); return; } - + // does the target need an anchor? if (targeti->is_auth()) { - if (targeti->get_parent_dir()->get_inode() == diri) { - dout(7) << "target is in the same dir, sweet" << endl; + /*if (targeti->get_parent_dir() == dn->dir) { + dout(7) << "target is in the same dirfrag, sweet" << endl; } - else if (targeti->is_anchored() && !targeti->is_unanchoring()) { + else + */ + if (targeti->is_anchored() && !targeti->is_unanchoring()) { dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; } else { dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; mdcache->anchor_create(targeti, - new C_MDS_RetryRequest(mds, req, diri)); + new C_MDS_RetryRequest(mdcache, mdr)); return; } } // can we create the dentry? - CDir *dir = 0; CDentry *dn = 0; - // make dentry and inode, xlock dentry. - r = prepare_null_dentry(req, diri, &dir, &dn); - if (!r) return; // wait or forward or something - assert(dir); - assert(dn); + // make null link dentry + dn = prepare_null_dentry(mdr, dir, dname, false); + if (!dn) return; + + // create lock lists + set dentry_rdlocks; + set dentry_xlocks; + set inode_hard_rdlocks; + set inode_hard_xlocks; + + for (unsigned i=0; ilocker->acquire_locks(mdr, + dentry_rdlocks, dentry_xlocks, + inode_hard_rdlocks, inode_hard_xlocks)) + return; + + // go! // local or remote? if (targeti->is_auth()) - _link_local(req, diri, dn, targeti); + _link_local(mdr, dn, targeti); else - _link_remote(req, diri, dn, targeti); + _link_remote(mdr, dn, targeti); } class C_MDS_link_local_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CDentry *dn; CInode *targeti; version_t dpv; time_t tctime; time_t tpv; public: - C_MDS_link_local_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ti, time_t ct) : - mds(m), req(r), dn(d), targeti(ti), + C_MDS_link_local_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ti, time_t ct) : + mds(m), mdr(r), dn(d), targeti(ti), dpv(d->get_projected_version()), tctime(ct), tpv(targeti->get_parent_dn()->get_projected_version()) {} void finish(int r) { assert(r == 0); - mds->server->_link_local_finish(req, dn, targeti, dpv, tctime, tpv); + mds->server->_link_local_finish(mdr, dn, targeti, dpv, tctime, tpv); } }; -void Server::_link_local(MClientRequest *req, CInode *diri, - CDentry *dn, CInode *targeti) +void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) { dout(10) << "_link_local " << *dn << " to " << *targeti << endl; - // first, auth pin the dentry dir and targeti. - if (!mdcache->request_auth_pinned(req, dn->get_dir()) && - !dn->get_dir()->can_auth_pin()) { - dn->get_dir()->add_waiter(CDir::WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - if (!mdcache->request_auth_pinned(req, targeti) && - !targeti->can_auth_pin()) { - targeti->add_waiter(CDir::WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - mdcache->request_auth_pin(req, dn->get_dir()); - mdcache->request_auth_pin(req, targeti); - - // sweet. let's get our locks. - // lock dentry, target inode - if (!mds->locker->dentry_xlock_start(dn, req, diri)) - return; - if (!mds->locker->inode_hard_write_start(targeti, req, diri)) - return; - // ok, let's do it. // prepare log entry EUpdate *le = new EUpdate("link_local"); - le->metablob.add_client_req(req->get_reqid()); + le->metablob.add_client_req(mdr->reqid); // predirty dn->pre_dirty(); @@ -1444,14 +1399,14 @@ void Server::_link_local(MClientRequest *req, CInode *diri, pi->version = tpdv; // finisher - C_MDS_link_local_finish *fin = new C_MDS_link_local_finish(mds, req, dn, targeti, pi->ctime); + C_MDS_link_local_finish *fin = new C_MDS_link_local_finish(mds, mdr, dn, targeti, pi->ctime); // log + wait mdlog->submit_entry(le); mdlog->wait_for_sync(fin); } -void Server::_link_local_finish(MClientRequest *req, CDentry *dn, CInode *targeti, +void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, version_t dpv, time_t tctime, version_t tpv) { dout(10) << "_link_local_finish " << *dn << " to " << *targeti << endl; @@ -1466,25 +1421,20 @@ void Server::_link_local_finish(MClientRequest *req, CDentry *dn, CInode *target targeti->inode.ctime = tctime; targeti->mark_dirty(tpv); - // unlock the new dentry and target inode - mds->locker->dentry_xlock_finish(dn); - mds->locker->inode_hard_write_finish(targeti); - // bump target popularity mds->balancer->hit_inode(targeti, META_POP_IWR); // reply - MClientReply *reply = new MClientReply(req, 0); - reply_request(req, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref + MClientReply *reply = new MClientReply(mdr->client_request(), 0); + reply_request(mdr, reply, dn->get_dir()->get_inode()); // FIXME: imprecise ref } -void Server::_link_remote(MClientRequest *req, CInode *ref, - CDentry *dn, CInode *targeti) +void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) { dout(10) << "_link_remote " << *dn << " to " << *targeti << endl; - + /* // pin the target replica in our cache assert(!targeti->is_auth()); mdcache->request_pin_inode(req, targeti); @@ -1500,6 +1450,7 @@ void Server::_link_remote(MClientRequest *req, CInode *ref, // IMPLEMENT ME MClientReply *reply = new MClientReply(req, -EXDEV); reply_request(req, reply, dn->get_dir()->get_inode()); + */ } @@ -1571,88 +1522,47 @@ public: // UNLINK -void Server::handle_client_unlink(MClientRequest *req, CInode *diri) +void Server::handle_client_unlink(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + // rmdir or unlink? bool rmdir = false; if (req->get_op() == MDS_OP_RMDIR) rmdir = true; - - // find it - if (req->get_filepath().depth() == 0) { - dout(7) << "can't rmdir root" << endl; - reply_request(req, -EINVAL); - return; - } - string name = req->get_filepath().last_dentry(); - - // make sure parent is a dir? - if (!diri->is_dir()) { - dout(7) << "parent not a dir " << *diri << endl; - reply_request(req, -ENOTDIR); - return; - } - - // get the dir, if it's not frozen etc. - CDir *dir = validate_dentry_dir(req, diri, diri, name); - if (!dir) return; - // ok, it's auth, and authpinnable. - - // does the dentry exist? - CDentry *dn = dir->lookup(name); - if (!dn) { - if (!dir->is_complete()) { - dout(7) << "handle_client_rmdir/unlink missing dn " << name - << " but dir not complete, fetching " << *dir << endl; - dir->fetch(new C_MDS_RetryRequest(mds, req, diri)); - } else { - dout(7) << "handle_client_rmdir/unlink dne " << name << " in " << *dir << endl; - reply_request(req, -ENOENT); - } - return; - } - + + // get/lock the dentry and path + CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true); // must exist + if (!dn) return; + if (rmdir) { dout(7) << "handle_client_rmdir on " << *dn << endl; } else { dout(7) << "handle_client_unlink on " << *dn << endl; } - - // have it. locked? - if (!dn->can_read(req)) { - dout(10) << " waiting on " << *dn << endl; - dir->add_waiter(CDir::WAIT_DNREAD, name, - new C_MDS_RetryRequest(mds, req, diri)); - return; - } - - // null? - if (dn->is_null()) { - dout(10) << "unlink on null dn " << *dn << endl; - reply_request(req, -ENOENT); - return; - } + // dn looks ok. // get/open inode. - CInode *in = mdcache->get_dentry_inode(dn, req, diri); + CInode *in = request_pin_ref(mdr); if (!in) return; + dout(7) << "dn links to " << *in << endl; // rmdir vs is_dir if (in->is_dir()) { if (rmdir) { // do empty directory checks - if (!_verify_rmdir(req, diri, in)) + if (!_verify_rmdir(mdr, in)) return; } else { dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << endl; - reply_request(req, -EISDIR); + reply_request(mdr, -EISDIR); return; } } else { if (rmdir) { // unlink dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << endl; - reply_request(req, -ENOTDIR); + reply_request(mdr, -ENOTDIR); return; } } @@ -1662,58 +1572,52 @@ void Server::handle_client_unlink(MClientRequest *req, CInode *diri) // ok! if (dn->is_remote() && !dn->inode->is_auth()) - _unlink_remote(req, dn); + _unlink_remote(mdr, dn); else - _unlink_local(req, dn); + _unlink_local(mdr, dn); } class C_MDS_unlink_local_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CDentry *dn; CDentry *straydn; version_t ipv; // referred inode time_t ictime; version_t dpv; // deleted dentry public: - C_MDS_unlink_local_finish(MDS *m, MClientRequest *r, CDentry *d, CDentry *sd, + C_MDS_unlink_local_finish(MDS *m, MDRequest *r, CDentry *d, CDentry *sd, version_t v, time_t ct) : - mds(m), req(r), dn(d), straydn(sd), + mds(m), mdr(r), dn(d), straydn(sd), ipv(v), ictime(ct), dpv(d->get_projected_version()) { } void finish(int r) { assert(r == 0); - mds->server->_unlink_local_finish(req, dn, straydn, ipv, ictime, dpv); + mds->server->_unlink_local_finish(mdr, dn, straydn, ipv, ictime, dpv); } }; -void Server::_unlink_local(MClientRequest *req, CDentry *dn) +void Server::_unlink_local(MDRequest *mdr, CDentry *dn) { dout(10) << "_unlink_local " << *dn << endl; - // auth pin - if (!mdcache->request_auth_pinned(req, dn->get_dir()) && - !dn->get_dir()->can_auth_pin()) { - dn->get_dir()->add_waiter(CDir::WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds, req, dn->get_dir()->get_inode())); - return; - } - if (!mdcache->request_auth_pinned(req, dn->inode) && + // auth pin inode + if (!mdr->is_auth_pinned(dn->inode) && !dn->inode->can_auth_pin()) { - dn->inode->add_waiter(CInode::WAIT_AUTHPINNABLE, - new C_MDS_RetryRequest(mds, req, dn->get_dir()->get_inode())); + dn->inode->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + + // drop all locks while we wait (racey?) + mdcache->request_drop_locks(mdr); + mdr->drop_auth_pins(); return; } - mdcache->request_auth_pin(req, dn->get_dir()); - mdcache->request_auth_pin(req, dn->inode); + mdr->auth_pin(dn->inode); - // lock - if (!mds->locker->dentry_xlock_start(dn, req, dn->get_dir()->get_inode())) - return; - if (!mds->locker->inode_hard_write_start(dn->inode, req, dn->get_dir()->get_inode())) + // lock inode + if (!mds->locker->inode_hard_xlock_start(dn->inode, mdr)) return; @@ -1732,7 +1636,7 @@ void Server::_unlink_local(MClientRequest *req, CDentry *dn) // ok, let's do it. // prepare log entry EUpdate *le = new EUpdate("unlink_local"); - le->metablob.add_client_req(req->get_reqid()); + le->metablob.add_client_req(mdr->reqid); version_t ipv = 0; // dirty inode version inode_t *pi = 0; // the inode @@ -1761,7 +1665,7 @@ void Server::_unlink_local(MClientRequest *req, CDentry *dn) pi->version = ipv; // finisher - C_MDS_unlink_local_finish *fin = new C_MDS_unlink_local_finish(mds, req, dn, straydn, + C_MDS_unlink_local_finish *fin = new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, ipv, pi->ctime); // log + wait @@ -1771,12 +1675,13 @@ void Server::_unlink_local(MClientRequest *req, CDentry *dn) mds->balancer->hit_dir(dn->dir, META_POP_DWR); } -void Server::_unlink_local_finish(MClientRequest *req, +void Server::_unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, version_t ipv, time_t ictime, version_t dpv) { dout(10) << "_unlink_local " << *dn << endl; + /* // unlink main dentry CInode *in = dn->inode; dn->dir->unlink_inode(dn); @@ -1806,28 +1711,27 @@ void Server::_unlink_local_finish(MClientRequest *req, // unlock mds->locker->dentry_xlock_finish(dn); - mds->locker->inode_hard_write_finish(in); + mds->locker->inode_hard_xlock_finish(in); // bump target popularity mds->balancer->hit_dir(dn->dir, META_POP_DWR); // reply - MClientReply *reply = new MClientReply(req, 0); - reply_request(req, reply, dn->dir->get_inode()); // FIXME: imprecise ref + MClientReply *reply = new MClientReply(mdr->client_request(), 0); + reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref if (straydn) mdcache->eval_stray(straydn); + */ } -void Server::_unlink_remote(MClientRequest *req, CDentry *dn) +void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) { - - // IMPLEMENT ME - MClientReply *reply = new MClientReply(req, -EXDEV); - reply_request(req, reply, dn->get_dir()->get_inode()); + MClientReply *reply = new MClientReply(mdr->client_request(), -EXDEV); + reply_request(mdr, reply, dn->get_dir()->get_inode()); } @@ -1841,7 +1745,7 @@ void Server::_unlink_remote(MClientRequest *req, CDentry *dn) * * @param in is the inode being rmdir'd. */ -bool Server::_verify_rmdir(MClientRequest *req, CInode *ref, CInode *in) +bool Server::_verify_rmdir(MDRequest *mdr, CInode *in) { dout(10) << "_verify_rmdir " << *in << endl; assert(in->is_auth()); @@ -1862,14 +1766,14 @@ bool Server::_verify_rmdir(MClientRequest *req, CInode *ref, CInode *in) dir->get_size() == 0 && !dir->is_complete()) { dout(7) << "_verify_rmdir fetching incomplete dir " << *dir << endl; - dir->fetch(new C_MDS_RetryRequest(mds, req, ref)); + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); return false; } // does the frag _look_ empty? if (dir->get_size()) { dout(10) << "_verify_rmdir still " << dir->get_size() << " items in frag " << *dir << endl; - reply_request(req, -ENOTEMPTY); + reply_request(mdr, -ENOTEMPTY); return false; } @@ -1903,8 +1807,7 @@ bool Server::_verify_rmdir(MClientRequest *req, CInode *ref, CInode *in) class C_MDS_RenameTraverseDst : public Context { Server *server; - MClientRequest *req; - CInode *ref; + MDRequest *mdr; CInode *srci; CDir *srcdir; CDentry *srcdn; @@ -1913,18 +1816,16 @@ public: vector trace; C_MDS_RenameTraverseDst(Server *server, - MClientRequest *req, - CInode *ref, + MDRequest *r, CDentry *srcdn, filepath& destpath) { this->server = server; - this->req = req; - this->ref = ref; + this->mdr = r; this->srcdn = srcdn; this->destpath = destpath; } void finish(int r) { - server->handle_client_rename_2(req, ref, + server->handle_client_rename_2(mdr, srcdn, destpath, trace, r); } @@ -1943,14 +1844,14 @@ public: */ -bool Server::_rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MClientRequest *req, CInode *ref) +bool Server::_rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MDRequest *mdr) { // xlocked? - if (dn && !dn->can_read(req)) { + if (dn && !dn->can_read(mdr)) { dout(10) << "_rename_open_dn waiting on " << *dn << endl; dir->add_waiter(CDir::WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mds, req, ref)); + new C_MDS_RetryRequest(mdcache, mdr)); return false; } @@ -1958,27 +1859,29 @@ bool Server::_rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MClientRequ ((dn && dn->is_null()) || (!dn && dir->is_complete()))) { dout(10) << "_rename_open_dn dn dne in " << *dir << endl; - reply_request(req, -ENOENT); + reply_request(mdr, -ENOENT); return false; } if (!dn && !dir->is_complete()) { dout(10) << "_rename_open_dn readding incomplete dir" << endl; - dir->fetch(new C_MDS_RetryRequest(mds, req, ref)); + dir->fetch(new C_MDS_RetryRequest(mdcache, mdr)); return false; } assert(dn && !dn->is_null()); dout(10) << "_rename_open_dn dn is " << *dn << endl; - CInode *in = mdcache->get_dentry_inode(dn, req, ref); + CInode *in = mdcache->get_dentry_inode(dn, mdr); if (!in) return false; dout(10) << "_rename_open_dn inode is " << *in << endl; return true; } -void Server::handle_client_rename(MClientRequest *req, CInode *ref) +void Server::handle_client_rename(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + dout(7) << "handle_client_rename on " << *req << endl; // traverse to source @@ -1987,6 +1890,7 @@ void Server::handle_client_rename(MClientRequest *req, CInode *ref) (because we don't want to screw up the lock ordering) the ref inode (normally/initially srcdiri) may move, and this may fail. */ + /* filepath refpath = req->get_filepath(); string srcname = refpath.last_dentry(); refpath = refpath.prefixpath(refpath.depth()-1); @@ -1994,7 +1898,7 @@ void Server::handle_client_rename(MClientRequest *req, CInode *ref) dout(7) << "handle_client_rename src traversing to srcdir " << refpath << endl; vector trace; int r = mdcache->path_traverse(refpath, trace, true, - req, new C_MDS_RetryRequest(mds, req, ref), + req, new C_MDS_RetryRequest(mdcache, mdr), MDS_TRAVERSE_FORWARD); if (r > 0) return; if (r < 0) { // dne or something. got renamed out from under us, probably! @@ -2023,7 +1927,7 @@ void Server::handle_client_rename(MClientRequest *req, CInode *ref) frag_t srcfg = srcdiri->pick_dirfrag(srcname); // open dirfrag? is it mine? - CDir *srcdir = try_open_auth_dir(srcdiri, srcfg, req); + CDir *srcdir = try_open_auth_dir(srcdiri, srcfg, mdr); if (!srcdir) return; dout(7) << "handle_client_rename srcdir is " << *srcdir << endl; @@ -2031,7 +1935,7 @@ void Server::handle_client_rename(MClientRequest *req, CInode *ref) // src dentry CDentry *srcdn = srcdir->lookup(srcname); - if (!_rename_open_dn(srcdir, srcdn, true, req, ref)) + if (!_rename_open_dn(srcdir, srcdn, true, mdr)) return; // pin src dentry in cache (so it won't expire) @@ -2042,21 +1946,25 @@ void Server::handle_client_rename(MClientRequest *req, CInode *ref) filepath destpath = req->get_sarg(); C_MDS_RenameTraverseDst *onfinish = new C_MDS_RenameTraverseDst(this, req, ref, srcdn, destpath); - Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); + Context *ondelay = new C_MDS_RetryRequest(mdcache, mdr); - mdcache->path_traverse(destpath, onfinish->trace, false, + mdcache->path_traverse(mdr, + destpath, onfinish->trace, false, req, ondelay, MDS_TRAVERSE_DISCOVER, onfinish); + */ } -void Server::handle_client_rename_2(MClientRequest *req, - CInode *ref, +void Server::handle_client_rename_2(MDRequest *mdr, CDentry *srcdn, filepath& destpath, vector& trace, int r) { + /* + MClientRequest *req = mdr->client_request(); + dout(7) << "handle_client_rename_2 on " << *req << endl; dout(12) << " r = " << r << " trace depth " << trace.size() << " destpath depth " << destpath.depth() << endl; @@ -2199,6 +2107,7 @@ void Server::handle_client_rename_2(MClientRequest *req, srcdn, destdir, destdn, destname); } + */ } @@ -2206,7 +2115,7 @@ void Server::handle_client_rename_2(MClientRequest *req, class C_MDS_rename_local_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CDentry *srcdn; CDentry *destdn; CDentry *straydn; @@ -2218,10 +2127,10 @@ class C_MDS_rename_local_finish : public Context { public: version_t atid1; version_t atid2; - C_MDS_rename_local_finish(MDS *m, MClientRequest *r, + C_MDS_rename_local_finish(MDS *m, MDRequest *r, CDentry *sdn, CDentry *ddn, CDentry *stdn, version_t v, time_t ct) : - mds(m), req(r), + mds(m), mdr(r), srcdn(sdn), destdn(ddn), straydn(stdn), ipv(v), straypv(straydn ? straydn->get_projected_version():0), @@ -2231,7 +2140,7 @@ public: atid1(0), atid2(0) { } void finish(int r) { assert(r == 0); - mds->server->_rename_local_finish(req, srcdn, destdn, straydn, + mds->server->_rename_local_finish(mdr, srcdn, destdn, straydn, srcpv, destpv, straypv, ipv, ictime, atid1, atid2); } @@ -2251,17 +2160,17 @@ public: } }; -void Server::_rename_local(MClientRequest *req, - CInode *ref, +void Server::_rename_local(MDRequest *mdr, CDentry *srcdn, CDir *destdir, CDentry *destdn, const string& destname) { + /* dout(10) << "_rename_local " << *srcdn << " to " << destname << " in " << *destdir << endl; // make sure target (possibly null) dentry exists - int r = prepare_null_dentry(req, ref, + int r = prepare_null_dentry(mdr, destdir->inode, destname, &destdir, &destdn, true); if (!r) return; @@ -2305,7 +2214,7 @@ void Server::_rename_local(MClientRequest *req, dosrc = !dosrc; } if (destdn->inode && - !mds->locker->inode_hard_write_start(destdn->inode, req, ref)) + !mds->locker->inode_hard_xlock_start(destdn->inode, req, ref)) return; @@ -2441,6 +2350,7 @@ void Server::_rename_local(MClientRequest *req, mdlog->submit_entry(le); mdlog->wait_for_sync(fin); } + */ } @@ -2459,12 +2369,13 @@ void Server::_rename_local_reanchored(LogEvent *le, C_MDS_rename_local_finish *f } -void Server::_rename_local_finish(MClientRequest *req, +void Server::_rename_local_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn, version_t srcpv, version_t destpv, version_t straypv, version_t ipv, time_t ictime, version_t atid1, version_t atid2) { + /* dout(10) << "_rename_local_finish " << *req << endl; CInode *oldin = destdn->inode; @@ -2551,7 +2462,7 @@ void Server::_rename_local_finish(MClientRequest *req, mds->locker->dentry_xlock_finish(srcdn); mds->locker->dentry_xlock_finish(destdn); if (oldin) - mds->locker->inode_hard_write_finish(oldin); + mds->locker->inode_hard_xlock_finish(oldin); // reply MClientReply *reply = new MClientReply(req, 0); @@ -2560,6 +2471,7 @@ void Server::_rename_local_finish(MClientRequest *req, // clean up? if (straydn) mdcache->eval_stray(straydn); + */ } @@ -2733,8 +2645,9 @@ void Server::handle_client_rename_local(MClientRequest *req, * FIXME: this truncate implemention is WRONG WRONG WRONG */ -void Server::handle_client_truncate(MClientRequest *req, CInode *cur) +void Server::handle_client_truncate(MDRequest *mdr) { + /* // auth pin if (!cur->can_auth_pin()) { dout(7) << "waiting for authpinnable on " << *cur << endl; @@ -2744,7 +2657,7 @@ void Server::handle_client_truncate(MClientRequest *req, CInode *cur) mdcache->request_auth_pin(req, cur); // write - if (!mds->locker->inode_file_write_start(cur, req, cur)) + if (!mds->locker->inode_file_xlock_start(cur, req, cur)) return; // fw or (wait for) lock // check permissions @@ -2753,7 +2666,7 @@ void Server::handle_client_truncate(MClientRequest *req, CInode *cur) cur->inode.size = req->args.truncate.length; cur->_mark_dirty(); // fixme - mds->locker->inode_file_write_finish(cur); + mds->locker->inode_file_xlock_finish(cur); mds->balancer->hit_inode(cur, META_POP_IWR); @@ -2761,8 +2674,10 @@ void Server::handle_client_truncate(MClientRequest *req, CInode *cur) MClientReply *reply = new MClientReply(req, 0); // commit - commit_request(req, reply, cur, - new EString("truncate fixme")); + assert(0); // rewrite me + //commit_request(req, reply, cur, + //new EString("truncate fixme")); + */ } @@ -2770,18 +2685,22 @@ void Server::handle_client_truncate(MClientRequest *req, CInode *cur) // =========================== // open, openc, close -void Server::handle_client_open(MClientRequest *req, CInode *cur) +void Server::handle_client_open(MDRequest *mdr) { + MClientRequest *req = mdr->client_request(); + CInode *cur = request_pin_ref(mdr); + if (!cur) return; + int flags = req->args.open.flags; int cmode = req->get_open_file_mode(); dout(7) << "open " << flags << " on " << *cur << endl; dout(10) << "open flags = " << flags << " filemode = " << cmode << endl; - // is it a file? - if (!(cmode & INODE_MODE_FILE)) { - dout(7) << "not a regular file" << endl; - reply_request(req, -EINVAL); // FIXME what error do we want? + // regular file? + if ((cur->inode.mode & INODE_TYPE_MASK) != INODE_MODE_FILE) { + dout(7) << "not a regular file " << *cur << endl; + reply_request(mdr, -EINVAL); // FIXME what error do we want? return; } @@ -2792,7 +2711,7 @@ void Server::handle_client_open(MClientRequest *req, CInode *cur) assert(auth != mds->get_nodeid()); dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl; - mdcache->request_forward(req, auth); + mdcache->request_forward(mdr, auth); return; } @@ -2801,20 +2720,20 @@ void Server::handle_client_open(MClientRequest *req, CInode *cur) // auth pin if (!cur->can_auth_pin()) { dout(7) << "waiting for authpinnable on " << *cur << endl; - cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + cur->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); return; } - mdcache->request_auth_pin(req, cur); + mdr->auth_pin(cur); // write - if (!mds->locker->inode_file_write_start(cur, req, cur)) + if (!mds->locker->inode_file_xlock_start(cur, mdr)) return; // fw or (wait for) lock // do update cur->inode.size = 0; cur->_mark_dirty(); // fixme - mds->locker->inode_file_write_finish(cur); + mds->locker->inode_file_xlock_finish(cur, mdr); } @@ -2835,19 +2754,19 @@ void Server::handle_client_open(MClientRequest *req, CInode *cur) reply->set_file_caps(cap->pending()); reply->set_file_caps_seq(cap->get_last_seq()); reply->set_file_data_version(fdv); - reply_request(req, reply, cur); + reply_request(mdr, reply, cur); } class C_MDS_openc_finish : public Context { MDS *mds; - MClientRequest *req; + MDRequest *mdr; CDentry *dn; CInode *newi; version_t pv; public: - C_MDS_openc_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : - mds(m), req(r), dn(d), newi(ni), + C_MDS_openc_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni) : + mds(m), mdr(r), dn(d), newi(ni), pv(d->get_projected_version()) {} void finish(int r) { assert(r == 0); @@ -2858,83 +2777,72 @@ public: // dirty inode, dn, dir newi->mark_dirty(pv); - // unlock - mds->locker->dentry_xlock_finish(dn); + // downgrade xlock to rdlock + mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); + // set/pin ref inode for open() + mdr->ref = newi; + mdr->pin(newi); + // hit pop mds->balancer->hit_inode(newi, META_POP_IWR); // ok, do the open. - mds->server->handle_client_open(req, newi); + mds->server->handle_client_open(mdr); } }; -void Server::handle_client_openc(MClientRequest *req, CInode *diri) +void Server::handle_client_openc(MDRequest *mdr) { - dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; + MClientRequest *req = mdr->client_request(); - CDir *dir = 0; - CDentry *dn = 0; + dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; - // make dentry and inode, xlock dentry. bool excl = (req->args.open.flags & O_EXCL); - int r = prepare_null_dentry(req, diri, &dir, &dn, !excl); // okexist = !excl - if (r == 0) return; // wait on something - assert(dir); - assert(dn); - - - if (r == 1) { - // created null dn. - - // xlock - if (!mds->locker->dentry_xlock_start(dn, req, diri)) - return; - - // create inode. - CInode *in = prepare_new_inode(req, dir); - assert(in); - - // it's a file. - dn->pre_dirty(); - in->inode.mode = 0644; // FIXME req should have a umask - in->inode.mode |= INODE_MODE_FILE; - - // prepare finisher - C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, req, dn, in); - EUpdate *le = new EUpdate("openc"); - le->metablob.add_client_req(req->get_reqid()); - le->metablob.add_dir_context(dir); - inode_t *pi = le->metablob.add_primary_dentry(dn, true, in); - pi->version = dn->get_projected_version(); - - // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - /* - FIXME. this needs to be rewritten when the write capability stuff starts - getting journaled. - */ - } else { - // exists! + CDentry *dn = rdlock_path_xlock_dentry(mdr, !excl, false); + if (!dn) return; - // O_EXCL? + if (!dn->is_null()) { + // it existed. if (req->args.open.flags & O_EXCL) { - // fail. dout(10) << "O_EXCL, target exists, failing with -EEXIST" << endl; - reply_request(req, -EEXIST, diri); + reply_request(mdr, -EEXIST, dn->get_dir()->get_inode()); return; } - - // get inode - CInode *in = mdcache->get_dentry_inode(dn, req, diri); - if (!in) return; - - // FIXME: do i need to repin path based existent inode? hmm. - handle_client_open(req, in); + + // pass to regular open handler. + handle_client_open(mdr); + return; } + + // created null dn. + + // create inode. + CInode *in = prepare_new_inode(req, dn->dir); + assert(in); + + // it's a file. + dn->pre_dirty(); + in->inode.mode = req->args.open.mode; + in->inode.mode |= INODE_MODE_FILE; + + // prepare finisher + C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); + EUpdate *le = new EUpdate("openc"); + le->metablob.add_client_req(req->get_reqid()); + le->metablob.add_dir_context(dn->dir); + inode_t *pi = le->metablob.add_primary_dentry(dn, true, in); + pi->version = dn->get_projected_version(); + + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); + + /* + FIXME. this needs to be rewritten when the write capability stuff starts + getting journaled. + */ } diff --git a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h index 79e3e394e7540..2ad7ca6427d28 100644 --- a/branches/sage/cephmds2/mds/Server.h +++ b/branches/sage/cephmds2/mds/Server.h @@ -18,6 +18,7 @@ class LogEvent; class C_MDS_rename_local_finish; +class MDRequest; class Server { MDS *mds; @@ -25,154 +26,96 @@ class Server { MDLog *mdlog; Messenger *messenger; - __uint64_t stat_ops; - - public: Server(MDS *m) : mds(m), mdcache(mds->mdcache), mdlog(mds->mdlog), - messenger(mds->messenger), - stat_ops(0) { + messenger(mds->messenger) { } void dispatch(Message *m); - // generic request helpers - void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0); - void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei); - - void submit_update(MClientRequest *req, CInode *wrlockedi, - LogEvent *event, - Context *oncommit); - - void commit_request(MClientRequest *req, - MClientReply *reply, - CInode *tracei, - LogEvent *event, - LogEvent *event2 = 0); - - CDir* try_open_auth_dir(CInode *diri, frag_t, MClientRequest *req); - CDir* try_open_dir(CInode *diri, frag_t fg, - MClientRequest *req, CInode *ref); - - - // clients + // message handlers void handle_client_mount(class MClientMount *m); void handle_client_unmount(Message *m); - void handle_client_request(MClientRequest *m); - void handle_client_request_2(MClientRequest *req, - vector& trace, - int r); - // fs ops - void handle_client_fstat(MClientRequest *req); - // requests - void dispatch_request(Message *m, CInode *ref); - - // inode request *req, CInode *ref; - void handle_client_stat(MClientRequest *req, CInode *ref); - void handle_client_utime(MClientRequest *req, CInode *ref); - void handle_client_inode_soft_update_2(MClientRequest *req, - MClientReply *reply, - CInode *ref); - void handle_client_chmod(MClientRequest *req, CInode *ref); - void handle_client_chown(MClientRequest *req, CInode *ref); - void handle_client_inode_hard_update_2(MClientRequest *req, - MClientReply *reply, - CInode *ref); - - // readdir - void handle_client_readdir(MClientRequest *req, CInode *ref); - int encode_dir_contents(CDir *dir, - list& inls, - list& dnls); + void dispatch_request(MDRequest *mdr); + void reply_request(MDRequest *mdr, int r = 0, CInode *tracei = 0); + void reply_request(MDRequest *mdr, MClientReply *reply, CInode *tracei); + + // some helpers + CInode *request_pin_ref(MDRequest *mdr); + CDir *validate_dentry_dir(MDRequest *mdr, CInode *diri, const string& dname); + CDir *traverse_to_auth_dir(MDRequest *mdr, vector &trace, filepath refpath); + CDentry *prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dname, bool okexist=false); + CInode* prepare_new_inode(MClientRequest *req, CDir *dir); + CDentry* rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mustexist); + CDir* try_open_auth_dir(CInode *diri, frag_t fg, MDRequest *mdr); + + CDir* try_open_dir(CInode *diri, frag_t fg, MDRequest *mdr); + + // requests on existing inodes. + void handle_client_stat(MDRequest *mdr); + void handle_client_utime(MDRequest *mdr); + void handle_client_chmod(MDRequest *mdr); + void handle_client_chown(MDRequest *mdr); + void handle_client_readdir(MDRequest *mdr); + int encode_dir_contents(CDir *dir, list& inls, list& dnls); + void handle_client_truncate(MDRequest *mdr); + void handle_client_fsync(MDRequest *mdr); + + // open + void handle_client_open(MDRequest *mdr); + void handle_client_openc(MDRequest *mdr); // O_CREAT variant. // namespace changes - void handle_client_mknod(MClientRequest *req, CInode *ref); - void handle_client_mkdir(MClientRequest *req, CInode *ref); - void handle_client_symlink(MClientRequest *req, CInode *ref); + void handle_client_mknod(MDRequest *mdr); + void handle_client_mkdir(MDRequest *mdr); + void handle_client_symlink(MDRequest *mdr); // link - void handle_client_link(MClientRequest *req, CInode *ref); - void handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector& trace); - void _link_local(MClientRequest *req, CInode *diri, - CDentry *dn, CInode *targeti); - void _link_local_finish(MClientRequest *req, + void handle_client_link(MDRequest *mdr); + void _link_local(MDRequest *mdr, CDentry *dn, CInode *targeti); + void _link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, version_t, time_t, version_t); - void _link_remote(MClientRequest *req, CInode *diri, - CDentry *dn, CInode *targeti); + void _link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti); // unlink - void handle_client_unlink(MClientRequest *req, CInode *ref); - bool _verify_rmdir(MClientRequest *req, CInode *ref, CInode *rmdiri); - void _unlink_local(MClientRequest *req, CDentry *dn); - void _unlink_local_finish(MClientRequest *req, + void handle_client_unlink(MDRequest *mdr); + bool _verify_rmdir(MDRequest *mdr, CInode *rmdiri); + void _unlink_local(MDRequest *mdr, CDentry *dn); + void _unlink_local_finish(MDRequest *mdr, CDentry *dn, CDentry *straydn, version_t, time_t, version_t); - void _unlink_remote(MClientRequest *req, CDentry *dn); + void _unlink_remote(MDRequest *mdr, CDentry *dn); // rename - bool _rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MClientRequest *req, CInode *ref); - void handle_client_rename(MClientRequest *req, CInode *ref); - void handle_client_rename_2(MClientRequest *req, - CInode *ref, + bool _rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MDRequest *mdr); + void handle_client_rename(MDRequest *mdr); + void handle_client_rename_2(MDRequest *mdr, CDentry *srcdn, filepath& destpath, vector& trace, int r); - void _rename_local(MClientRequest *req, CInode *ref, + void _rename_local(MDRequest *mdr, CDentry *srcdn, CDir *destdir, CDentry *destdn, const string& destname); void _rename_local_reanchored(LogEvent *le, C_MDS_rename_local_finish *fin, version_t atid1, version_t atid2); - void _rename_local_finish(MClientRequest *req, + void _rename_local_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn, version_t srcpv, version_t destpv, version_t straypv, version_t ipv, time_t ictime, version_t atid1, version_t atid2); - // file - void handle_client_open(MClientRequest *req, CInode *in); - void handle_client_openc(MClientRequest *req, CInode *diri); - void handle_client_release(MClientRequest *req, CInode *in); - void handle_client_truncate(MClientRequest *req, CInode *in); - void handle_client_fsync(MClientRequest *req, CInode *in); - // some helpers - CDir *validate_dentry_dir(MClientRequest *req, CInode *ref, - CInode *diri, const string& dname); - int prepare_null_dentry(MClientRequest *req, - CInode *diri, CDir **pdir, CDentry **pdn, - bool okexist=false); - int prepare_null_dentry(MClientRequest *req, CInode *ref, - CInode *diri, const string& name, - CDir **pdir, CDentry **pdn, - bool okexist=false); - CInode *prepare_new_inode(MClientRequest *req, CDir *dir); - }; -class C_MDS_RetryRequest : public Context { - MDS *mds; - Message *req; // MClientRequest or MLock - CInode *ref; - public: - C_MDS_RetryRequest(MDS *mds, Message *req, CInode *ref) { - assert(ref); - this->mds = mds; - this->req = req; - this->ref = ref; - } - virtual void finish(int r) { - mds->server->dispatch_request(req, ref); - } -}; diff --git a/branches/sage/cephmds2/messages/MClientRequest.h b/branches/sage/cephmds2/messages/MClientRequest.h index d1282efad184e..71b8ffe7e2d78 100644 --- a/branches/sage/cephmds2/messages/MClientRequest.h +++ b/branches/sage/cephmds2/messages/MClientRequest.h @@ -48,6 +48,7 @@ #define MDS_OP_STAT 100 #define MDS_OP_LSTAT 101 +#define MDS_OP_FSTAT 102 #define MDS_OP_UTIME 1102 #define MDS_OP_CHMOD 1103 #define MDS_OP_CHOWN 1104 @@ -94,6 +95,10 @@ class MClientRequest : public Message { struct { int mask; } stat; + struct { + _inodeno_t ino; + int mask; + } fstat; struct { _frag_t frag; } readdir; @@ -117,7 +122,7 @@ class MClientRequest : public Message { mode_t mode; } open; struct { - _inodeno_t ino; + _inodeno_t ino; // optional off_t length; } truncate; struct { @@ -215,6 +220,8 @@ class MClientRequest : public Message { out << "stat"; break; case MDS_OP_LSTAT: out << "lstat"; break; + case MDS_OP_FSTAT: + out << "fstat"; break; case MDS_OP_UTIME: out << "utime"; break; case MDS_OP_CHMOD: -- 2.39.5