From 4cb8c4a9111e0da07151687b195d62b0692a3167 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 30 Mar 2007 22:49:38 +0000 Subject: [PATCH] * mds.locker: huge rewrite. cache objects now share lock code (for real). not tested yet. git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1326 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/TODO | 2 + branches/sage/cephmds2/include/types.h | 15 +- branches/sage/cephmds2/mds/CDentry.cc | 77 +- branches/sage/cephmds2/mds/CDentry.h | 129 +- branches/sage/cephmds2/mds/CDir.cc | 6 +- branches/sage/cephmds2/mds/CDir.h | 22 +- branches/sage/cephmds2/mds/CInode.cc | 49 +- branches/sage/cephmds2/mds/CInode.h | 98 +- branches/sage/cephmds2/mds/FileLock.h | 221 ++ branches/sage/cephmds2/mds/Lock.h | 321 --- branches/sage/cephmds2/mds/Locker.cc | 2121 ++++++----------- branches/sage/cephmds2/mds/Locker.h | 102 +- branches/sage/cephmds2/mds/MDCache.cc | 65 +- branches/sage/cephmds2/mds/MDCache.h | 18 +- branches/sage/cephmds2/mds/Migrator.cc | 23 +- branches/sage/cephmds2/mds/Server.cc | 118 +- branches/sage/cephmds2/mds/SimpleLock.h | 230 ++ branches/sage/cephmds2/mds/mdstypes.h | 39 +- .../sage/cephmds2/messages/MClientReply.h | 4 +- branches/sage/cephmds2/messages/MLock.h | 56 +- 20 files changed, 1569 insertions(+), 2147 deletions(-) create mode 100644 branches/sage/cephmds2/mds/FileLock.h delete mode 100644 branches/sage/cephmds2/mds/Lock.h create mode 100644 branches/sage/cephmds2/mds/SimpleLock.h diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO index 35e09b7e41d49..580248baa3f52 100644 --- a/branches/sage/cephmds2/TODO +++ b/branches/sage/cephmds2/TODO @@ -87,6 +87,8 @@ mds - statfs? +- fix lock caps gather ack versus ambiguous auth + foreign rename - question: can we generalize foreign and local rename? 
diff --git a/branches/sage/cephmds2/include/types.h b/branches/sage/cephmds2/include/types.h index 177a81c25aec6..98a33062f0d22 100644 --- a/branches/sage/cephmds2/include/types.h +++ b/branches/sage/cephmds2/include/types.h @@ -235,13 +235,16 @@ namespace __gnu_cxx { struct inode_t { // base (immutable) inodeno_t ino; - - // other. FileLayout layout; // ?immutable? - int nlink; // base, + + // affected by any inode change... time_t ctime; // inode change time - // hard/perm (namespace permissions) + // nlink + int nlink; + bool anchored; // auth only? + + // perm (namespace permissions) mode_t mode; uid_t uid; gid_t gid; @@ -255,8 +258,6 @@ struct inode_t { // special stuff version_t version; // auth only - unsigned char hash_seed; // only defined for dir; 0 if not hashed. - bool anchored; // auth only version_t file_data_version; // auth only bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; } @@ -267,8 +268,6 @@ struct inode_t { -// client types -typedef int fh_t; // file handle // dentries diff --git a/branches/sage/cephmds2/mds/CDentry.cc b/branches/sage/cephmds2/mds/CDentry.cc index 8ff67b11a0914..4f9a921a6535d 100644 --- a/branches/sage/cephmds2/mds/CDentry.cc +++ b/branches/sage/cephmds2/mds/CDentry.cc @@ -21,6 +21,8 @@ #include "MDS.h" #include "MDCache.h" +#include "messages/MLock.h" + #include #undef dout @@ -48,11 +50,7 @@ ostream& operator<<(ostream& out, CDentry& dn) if (dn.is_null()) out << " NULL"; if (dn.is_remote()) out << " REMOTE"; - if (dn.is_rdlocked()) out << " " << dn.get_num_rdlocks() << " rdlocks"; - - if (dn.get_lockstate() == DN_LOCK_UNPINNING) out << " unpinning"; - if (dn.get_lockstate() == DN_LOCK_PREXLOCK) out << " prexlock=" << dn.get_xlockedby() << " g=" << dn.get_gather_set(); - if (dn.get_lockstate() == DN_LOCK_XLOCK) out << " xlock=" << dn.get_xlockedby(); + out << " " << dn.lock; out << " v=" << dn.get_version(); out << " pv=" << dn.get_projected_version(); @@ -70,7 +68,7 @@ ostream& operator<<(ostream& 
out, CDentry& dn) } -bool operator<(CDentry& l, CDentry& r) +bool operator<(const CDentry& l, const CDentry& r) { if (l.get_dir()->ino() < r.get_dir()->ino()) return true; if (l.get_dir()->ino() == r.get_dir()->ino() && @@ -79,9 +77,9 @@ bool operator<(CDentry& l, CDentry& r) } - -CDentry::CDentry(const CDentry& m) { - assert(0); //std::cerr << "copy cons called, implement me" << endl; +void CDentry::print(ostream& out) +{ + out << *this; } @@ -196,6 +194,47 @@ CDentryDiscover *CDentry::replicate_to(int who) } +// ---------------------------- +// locking + +void CDentry::set_mlock_info(MLock *m) +{ + m->set_dn(dir->dirfrag(), name); +} + +void CDentry::encode_lock_state(int type, bufferlist& bl) +{ + +} + +void CDentry::decode_lock_state(int type, bufferlist& bl) +{ + +} + +int CDentry::convert_lock_mask(int mask) +{ + return mask << CDir::WAIT_DNLOCK_OFFSET; +} + +void CDentry::finish_lock_waiters(int type, int mask, int r) +{ + dir->finish_waiting(convert_lock_mask(mask), name, r); +} + +void CDentry::add_lock_waiter(int type, int mask, Context *c) +{ + dir->add_waiter(convert_lock_mask(mask), name, c); +} + +bool CDentry::is_lock_waiting(int type, int mask) +{ + return dir->waiting_for(convert_lock_mask(mask), name); +} + + + +/* // = @@ -204,22 +243,4 @@ const CDentry& CDentry::operator= (const CDentry& right) { return *this; } - // comparisons - bool CDentry::operator== (const CDentry& right) const { - return name == right.name; - } - bool CDentry::operator!= (const CDentry& right) const { - return name == right.name; - } - bool CDentry::operator< (const CDentry& right) const { - return name < right.name; - } - bool CDentry::operator> (const CDentry& right) const { - return name > right.name; - } - bool CDentry::operator>= (const CDentry& right) const { - return name >= right.name; - } - bool CDentry::operator<= (const CDentry& right) const { - return name <= right.name; - } +*/ diff --git a/branches/sage/cephmds2/mds/CDentry.h 
b/branches/sage/cephmds2/mds/CDentry.h index 3cd23283849c6..be5c96abe6cda 100644 --- a/branches/sage/cephmds2/mds/CDentry.h +++ b/branches/sage/cephmds2/mds/CDentry.h @@ -26,17 +26,12 @@ using namespace std; #include "include/lru.h" #include "mdstypes.h" +#include "SimpleLock.h" + class CInode; class CDir; class MDRequest; -#define DN_LOCK_SYNC 0 -#define DN_LOCK_PREXLOCK 1 -#define DN_LOCK_XLOCK 2 -#define DN_LOCK_UNPINNING 3 // waiting for pins to go away .. FIXME REVIEW THIS CODE .. - -#define DN_XLOCK_FOREIGN ((MDRequest*)0x1) // not 0, not a valid pointer. FIXME FIXME - class Message; class CDentryDiscover; class Anchor; @@ -44,7 +39,7 @@ class Anchor; class CDentry; // define an ordering -bool operator<(CDentry& l, CDentry& r); +bool operator<(const CDentry& l, const CDentry& r); // dentry class CDentry : public MDSCacheObject, public LRUObject { @@ -74,11 +69,9 @@ class CDentry : public MDSCacheObject, public LRUObject { static const int EXPORT_NONCE = 1; - struct ptr_lt { - bool operator()(const CDentry* l, const CDentry* r) const { - return *l < *r; - } - }; + bool is_lt(const MDSCacheObject *r) const { + return *this < *(CDentry*)r; + } protected: string name; @@ -90,14 +83,6 @@ class CDentry : public MDSCacheObject, public LRUObject { version_t version; // dir version when last touched. version_t projected_version; // what it will be when i unlock/commit. 
- // xlocks - int lockstate; - MDRequest *xlockedby; - set gather_set; - - // rdlocks - int num_rdlocks; - multiset rdlock_set; friend class Migrator; friend class Locker; @@ -108,6 +93,13 @@ class CDentry : public MDSCacheObject, public LRUObject { friend class CInode; friend class C_MDC_XlockRequest; + +public: + // lock + SimpleLock lock; + + + public: // cons CDentry() : @@ -116,9 +108,7 @@ class CDentry : public MDSCacheObject, public LRUObject { remote_ino(0), version(0), projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - num_rdlocks(0) { } + lock(this, LOCK_OTYPE_DN) { } CDentry(const string& n, inodeno_t ino, CInode *in=0) : name(n), inode(in), @@ -126,9 +116,7 @@ class CDentry : public MDSCacheObject, public LRUObject { remote_ino(ino), version(0), projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - num_rdlocks(0) { } + lock(this, LOCK_OTYPE_DN) { } CDentry(const string& n, CInode *in) : name(n), inode(in), @@ -136,13 +124,11 @@ class CDentry : public MDSCacheObject, public LRUObject { remote_ino(0), version(0), projected_version(0), - lockstate(DN_LOCK_SYNC), - xlockedby(0), - num_rdlocks(0) { } + lock(this, LOCK_OTYPE_DN) { } - CInode *get_inode() { return inode; } - CDir *get_dir() { return dir; } - const string& get_name() { return name; } + CInode *get_inode() const { return inode; } + CDir *get_dir() const { return dir; } + const string& get_name() const { return name; } inodeno_t get_ino(); inodeno_t get_remote_ino() { return remote_ino; } @@ -173,14 +159,6 @@ class CDentry : public MDSCacheObject, public LRUObject { CDentry(const CDentry& m); const CDentry& operator= (const CDentry& right); - // comparisons - bool operator== (const CDentry& right) const; - bool operator!= (const CDentry& right) const; - bool operator< (const CDentry& right) const; - bool operator> (const CDentry& right) const; - bool operator>= (const CDentry& right) const; - bool operator<= (const CDentry& right) const; - // misc void 
make_path(string& p); void make_anchor_trace(vector& trace, CInode *in); @@ -214,8 +192,7 @@ class CDentry : public MDSCacheObject, public LRUObject { bl.append((char*)&state, sizeof(state)); bl.append((char*)&version, sizeof(version)); bl.append((char*)&projected_version, sizeof(projected_version)); - bl.append((char*)&lockstate, sizeof(lockstate)); - ::_encode(gather_set, bl); + lock._encode(bl); ::_encode(replicas, bl); // twiddle @@ -233,9 +210,7 @@ class CDentry : public MDSCacheObject, public LRUObject { off += sizeof(version); bl.copy(off, sizeof(projected_version), (char*)&projected_version); off += sizeof(projected_version); - bl.copy(off, sizeof(lockstate), (char*)&lockstate); - off += sizeof(lockstate); - ::_decode(gather_set, bl, off); + lock._decode(bl, off); ::_decode(replicas, bl, off); // twiddle @@ -250,55 +225,21 @@ class CDentry : public MDSCacheObject, public LRUObject { remove_replica(to); } - // -- locking - int get_lockstate() { return lockstate; } - set& get_gather_set() { return gather_set; } - - bool is_sync() { return lockstate == DN_LOCK_SYNC; } - bool can_read() { return (lockstate == DN_LOCK_SYNC) || (lockstate == DN_LOCK_UNPINNING); } - bool can_read(MDRequest *m) { return is_xlockedbyme(m) || can_read(); } - bool is_xlocked() { return lockstate == DN_LOCK_XLOCK; } - MDRequest* get_xlockedby() { return xlockedby; } - bool is_xlockedbyother(MDRequest *m) { return (lockstate == DN_LOCK_XLOCK) && m != xlockedby; } - bool is_xlockedbyme(MDRequest *m) { return (lockstate == DN_LOCK_XLOCK) && m == xlockedby; } - bool is_prexlockbyother(MDRequest *m) { - return (lockstate == DN_LOCK_PREXLOCK) && m != xlockedby; + // -- locking -- + SimpleLock* get_lock(int type) { + assert(type == LOCK_OTYPE_DN); + return &lock; } + void set_mlock_info(MLock *m); + void encode_lock_state(int type, bufferlist& bl); + void decode_lock_state(int type, bufferlist& bl); + int convert_lock_mask(int mask); + void finish_lock_waiters(int type, int mask, int r=0); + 
void add_lock_waiter(int type, int mask, Context *c); + bool is_lock_waiting(int type, int mask); - int get_replica_lockstate() { - switch (lockstate) { - case DN_LOCK_XLOCK: - case DN_LOCK_SYNC: - return lockstate; - case DN_LOCK_PREXLOCK: - return DN_LOCK_XLOCK; - case DN_LOCK_UNPINNING: - return DN_LOCK_SYNC; - } - assert(0); - return 0; - } - void set_lockstate(int s) { lockstate = s; } - // path pins - void get_rdlock(MDRequest *m) { - num_rdlocks++; - rdlock_set.insert(m); - assert(rdlock_set.size() == (unsigned)num_rdlocks); - } - void put_rdlock(MDRequest *m) { - num_rdlocks--; - assert(num_rdlocks >= 0); - assert(rdlock_set.count(m) > 0); - rdlock_set.erase(rdlock_set.find(m)); - assert(rdlock_set.size() == (unsigned)num_rdlocks); - } - bool can_rdlock(MDRequest *m) { - return (lockstate == DN_LOCK_SYNC) || - (lockstate == DN_LOCK_UNPINNING && m && rdlock_set.count(m)); - } - bool is_rdlocked() { return num_rdlocks>0; } - int get_num_rdlocks() { return num_rdlocks; } + void print(ostream& out); friend class CDir; }; @@ -319,7 +260,7 @@ public: CDentryDiscover() {} CDentryDiscover(CDentry *dn, int nonce) : dname(dn->get_name()), replica_nonce(nonce), - lockstate(dn->get_replica_lockstate()), + lockstate(dn->lock.get_replica_state()), ino(dn->get_ino()), remote_ino(dn->get_remote_ino()) { } @@ -335,7 +276,7 @@ public: } void update_new_dentry(CDentry *dn) { update_dentry(dn); - dn->set_lockstate( lockstate ); + dn->lock.set_state( lockstate ); } void _encode(bufferlist& bl) { diff --git a/branches/sage/cephmds2/mds/CDir.cc b/branches/sage/cephmds2/mds/CDir.cc index 6281d3102800b..ecbefd8acee44 100644 --- a/branches/sage/cephmds2/mds/CDir.cc +++ b/branches/sage/cephmds2/mds/CDir.cc @@ -88,6 +88,11 @@ ostream& operator<<(ostream& out, CDir& dir) return out << "]"; } +void CDir::print(ostream& out) +{ + out << *this; +} + #include "config.h" #undef dout @@ -348,7 +353,6 @@ void CDir::remove_null_dentries() { it != dns.end(); it++) { CDentry *dn = *it; - 
assert(dn->is_sync()); remove_dentry(dn); } //assert(null_items.empty()); diff --git a/branches/sage/cephmds2/mds/CDir.h b/branches/sage/cephmds2/mds/CDir.h index 27d4902fa1ae8..2f3f2096cee24 100644 --- a/branches/sage/cephmds2/mds/CDir.h +++ b/branches/sage/cephmds2/mds/CDir.h @@ -164,12 +164,8 @@ class CDir : public MDSCacheObject { static const int WAIT_IMPORTED = (1<<4); // import finish static const int WAIT_SINGLEAUTH = (1<<5); - static const int WAIT_DNREAD = (1<<20); - static const int WAIT_DNLOCK = (1<<21); - static const int WAIT_DNUNPINNED = (1<<22); - static const int WAIT_DNPINNABLE = (WAIT_DNREAD|WAIT_DNUNPINNED); - static const int WAIT_DNREQXLOCK = (1<<23); - + static const int WAIT_DNLOCK_OFFSET = 6; + static const int WAIT_ANY = (0xffffffff); static const int WAIT_ATFREEZEROOT = (WAIT_AUTHPINNABLE|WAIT_UNFREEZE); static const int WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); @@ -184,6 +180,10 @@ class CDir : public MDSCacheObject { CInode *inode; // my inode frag_t frag; // my frag + bool is_lt(const MDSCacheObject *r) const { + return dirfrag() < ((const CDir*)r)->dirfrag(); + } + protected: // contents CDir_map_t items; // non-null AND null @@ -230,8 +230,8 @@ class CDir : public MDSCacheObject { // -- accessors -- - inodeno_t ino() { return inode->ino(); } // deprecate me? - dirfrag_t dirfrag() { return dirfrag_t(inode->ino(), frag); } + inodeno_t ino() const { return inode->ino(); } // deprecate me? 
+ dirfrag_t dirfrag() const { return dirfrag_t(inode->ino(), frag); } CInode *get_inode() { return inode; } CDir *get_parent_dir() { return inode->get_parent_dir(); } @@ -341,7 +341,8 @@ class CDir : public MDSCacheObject { return true; } - + + // -- fetch -- object_t get_ondisk_object() { return object_t(ino(), frag); } void fetch(Context *c); @@ -464,6 +465,9 @@ class CDir : public MDSCacheObject { CDir *get_frozen_tree_root(); + + + void print(ostream& out); }; diff --git a/branches/sage/cephmds2/mds/CInode.cc b/branches/sage/cephmds2/mds/CInode.cc index 67af1751904d4..9cef818b1a90c 100644 --- a/branches/sage/cephmds2/mds/CInode.cc +++ b/branches/sage/cephmds2/mds/CInode.cc @@ -23,6 +23,8 @@ #include "common/Clock.h" +#include "messages/MLock.h" + #include #include @@ -78,8 +80,15 @@ ostream& operator<<(ostream& out, CInode& in) } +void CInode::print(ostream& out) +{ + out << *this; +} + + // ====== CInode ======= -CInode::CInode(MDCache *c, bool auth) +CInode::CInode(MDCache *c, bool auth) : hardlock(this, LOCK_OTYPE_IHARD), + filelock(this, LOCK_OTYPE_IFILE) { mdcache = c; @@ -332,6 +341,44 @@ void CInode::mark_clean() +// ------------------ +// locking + +void CInode::set_mlock_info(MLock *m) +{ + m->set_ino(ino()); +} + +void CInode::encode_lock_state(int type, bufferlist& bl) +{ + switch (type) { + case LOCK_OTYPE_IFILE: + encode_file_state(bl); + break; + case LOCK_OTYPE_IHARD: + encode_hard_state(bl); + break; + default: + assert(0); + } +} + +void CInode::decode_lock_state(int type, bufferlist& bl) +{ + int off = 0; + switch (type) { + case LOCK_OTYPE_IFILE: + decode_file_state(bl, off); + break; + case LOCK_OTYPE_IHARD: + decode_hard_state(bl, off); + break; + default: + assert(0); + } +} + + // new state encoders diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h index 6eef2a9818205..4f6a9cfbaa7c3 100644 --- a/branches/sage/cephmds2/mds/CInode.h +++ b/branches/sage/cephmds2/mds/CInode.h @@ -23,7 +23,8 @@ #include 
"mdstypes.h" #include "CDentry.h" -#include "Lock.h" +#include "SimpleLock.h" +#include "FileLock.h" #include "Capability.h" @@ -110,22 +111,11 @@ class CInode : public MDSCacheObject { static const int WAIT_ANCHORED = (1<<5); static const int WAIT_UNANCHORED = (1<<6); static const int WAIT_UNLINK = (1<<7); // as in remotely nlink-- - static const int WAIT_HARDR = (1<<8); // 131072 - static const int WAIT_HARDW = (1<<9); // 262... - static const int WAIT_HARDB = (1<<10); - static const int WAIT_HARDRWB = (WAIT_HARDR|WAIT_HARDW|WAIT_HARDB); - static const int WAIT_HARDSTABLE = (1<<11); - static const int WAIT_HARDNORD = (1<<12); - static const int WAIT_FILER = (1<<13); - static const int WAIT_FILEW = (1<<14); - static const int WAIT_FILEB = (1<<15); - static const int WAIT_FILERWB = (WAIT_FILER|WAIT_FILEW|WAIT_FILEB); - static const int WAIT_FILESTABLE = (1<<16); - static const int WAIT_FILENORD = (1<<17); - static const int WAIT_FILENOWR = (1<<18); - static const int WAIT_RENAMEACK =(1<<19); - static const int WAIT_RENAMENOTIFYACK =(1<<20); - static const int WAIT_CAPS =(1<<21); + static const int WAIT_CAPS = (1<<8); + + static const int WAIT_HARDLOCK_OFFSET = 9; + static const int WAIT_FILELOCK_OFFSET = 17; + static const int WAIT_ANY = 0xffffffff; // misc @@ -172,10 +162,6 @@ class CInode : public MDSCacheObject { // -- distributed state -- -public: - // inode metadata locks - CLock hardlock; - CLock filelock; protected: // file capabilities map client_caps; // client -> caps @@ -228,11 +214,9 @@ protected: CDir *get_parent_dir(); CInode *get_parent_inode(); - struct ptr_lt { - bool operator()(const CInode* l, const CInode* r) const { - return l->ino() < r->ino(); - } - }; + bool is_lt(const MDSCacheObject *r) const { + return ino() < ((CInode*)r)->ino(); + } @@ -274,11 +258,42 @@ protected: void finish_waiting(int mask, int result = 0); - bool is_hardlock_write_wanted() { - return waiting_for(WAIT_HARDW); + // -- locks -- +public: + SimpleLock hardlock; + 
FileLock filelock; + + SimpleLock* get_lock(int type) { + switch (type) { + case LOCK_OTYPE_IFILE: return &filelock; + case LOCK_OTYPE_IHARD: return &hardlock; + default: assert(0); + } + } + void set_mlock_info(MLock *m); + void encode_lock_state(int type, bufferlist& bl); + void decode_lock_state(int type, bufferlist& bl); + + int convert_lock_mask(int type, int lmask) { + switch (type) { + case LOCK_OTYPE_IFILE: + return lmask << WAIT_FILELOCK_OFFSET; + break; + case LOCK_OTYPE_IHARD: + return lmask << WAIT_HARDLOCK_OFFSET; + break; + default: + assert(0); + } } - bool is_filelock_write_wanted() { - return waiting_for(WAIT_FILEW); + void finish_lock_waiters(int type, int mask, int r=0) { + finish_waiting(convert_lock_mask(type, mask), r); + } + void add_lock_waiter(int type, int mask, Context *c) { + add_waiter(convert_lock_mask(type, mask), c); + } + bool is_lock_waiting(int type, int mask) { + return waiting_for(convert_lock_mask(type, mask)); } // -- caps -- (new) @@ -461,6 +476,8 @@ public: } */ + void print(ostream& out); + }; @@ -552,7 +569,8 @@ class CInodeExport { map replicas; map cap_map; - CLock hardlock,filelock; + bufferlist hardlock; + bufferlist filelock; //int remaining_issued; public: @@ -565,8 +583,8 @@ public: st.is_dirty = in->is_dirty(); replicas = in->replicas; - hardlock = in->hardlock; - filelock = in->filelock; + in->hardlock._encode(hardlock); + in->filelock._encode(filelock); st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] ); st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); @@ -599,8 +617,10 @@ public: if (!replicas.empty()) in->get(CInode::PIN_CACHED); - in->hardlock = hardlock; - in->filelock = filelock; + int off = 0; + in->hardlock._decode(hardlock, off); + off = 0; + in->filelock._decode(filelock, off); // caps in->merge_client_caps(cap_map, new_client_caps); @@ -615,8 +635,8 @@ public: // cached_by + nonce ::_encode(replicas, bl); - hardlock.encode_state(bl); - filelock.encode_state(bl); + 
::_encode(hardlock, bl); + ::_encode(filelock, bl); // caps for (map::iterator it = cap_map.begin(); @@ -635,8 +655,8 @@ public: ::_decode(replicas, bl, off); - hardlock.decode_state(bl, off); - filelock.decode_state(bl, off); + ::_decode(hardlock, bl, off); + ::_decode(filelock, bl, off); // caps for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __FILELOCK_H +#define __FILELOCK_H + +#include +#include +using namespace std; + +#include "include/buffer.h" + +#include "SimpleLock.h" +#include "Capability.h" + +// states and such. +// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio + +// -----auth-------- ---replica------- +#define LOCK_SYNC_ 0 // AR R . / C R . . . L R . / C R . . . L stat() +#define LOCK_GSYNCL -11 // A . . / C ? . . . L loner -> sync (*) FIXME: let old loner keep R, somehow... +#define LOCK_GSYNCM -12 // A . . / . R . . . L + +#define LOCK_LOCK_ 1 // AR R W / C . . . . . . . / C . . . . . truncate() +#define LOCK_GLOCKR_ -2 // AR R . / C . . . . . . . / C . . . . . +#define LOCK_GLOCKL -3 // A . . / . . . . . . loner -> lock +#define LOCK_GLOCKM -4 // A . . / . . . . . . + +#define LOCK_MIXED 5 // AR . . / . R W A . L . . / . R . . . L +#define LOCK_GMIXEDR -6 // AR R . / . R . . . L . . / . R . . . L +#define LOCK_GMIXEDL -7 // A . . / . . . . . L loner -> mixed + +#define LOCK_LONER 8 // A . . / C R W A B L (lock) +#define LOCK_GLONERR -9 // A . . / . R . . . L +#define LOCK_GLONERM -10 // A . . / . R W A . 
L + + +// 4 stable +// +9 transition +// 13 total + +inline const char *get_filelock_state_name(int n) { + switch (n) { + case LOCK_SYNC: return "sync"; + case LOCK_GSYNCL: return "gsyncl"; + case LOCK_GSYNCM: return "gsyncm"; + case LOCK_LOCK: return "lock"; + case LOCK_GLOCKR: return "glockr"; + case LOCK_GLOCKL: return "glockl"; + case LOCK_GLOCKM: return "glockm"; + case LOCK_MIXED: return "mixed"; + case LOCK_GMIXEDR: return "gmixedr"; + case LOCK_GMIXEDL: return "gmixedl"; + case LOCK_LONER: return "loner"; + case LOCK_GLONERR: return "glonerr"; + case LOCK_GLONERM: return "glonerm"; + default: assert(0); + } +} + + +/* no append scenarios: + +loner + truncate(): + - loner needs to lose A (?unless it's the loner doing the truncate?) +loner + statlite(size): + - loner needs to lose A + +any + statlite(size) + - all lose A + +any + statlite(mtime) + - all lose W + +-> we need to add lonerfixed and mixedfixed states (and associated transitions) + in order to efficiently support statlite(size) and truncate(). until then, + we have to LOCK. + + */ + +// -- lock... hard or file + +class MDRequest; + +class FileLock : public SimpleLock { + public: + FileLock(MDSCacheObject *o, int t) : SimpleLock(o, t) { } + + char get_replica_state() { + switch (state) { + case LOCK_LOCK: + case LOCK_GLOCKM: + case LOCK_GLOCKL: + case LOCK_GLOCKR: + case LOCK_LONER: + case LOCK_GLONERR: + case LOCK_GLONERM: + return LOCK_LOCK; + case LOCK_MIXED: + case LOCK_GMIXEDR: + return LOCK_MIXED; + case LOCK_SYNC: + return LOCK_SYNC; + + // after gather auth will bc LOCK_AC_MIXED or whatever + case LOCK_GSYNCM: + return LOCK_MIXED; + case LOCK_GSYNCL: + case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. 
+ return LOCK_LOCK; + + default: + assert(0); + } + return 0; + } + + + // read/write access + bool can_rdlock(MDRequest *mdr) { + if (!parent->is_auth()) + return (state == LOCK_SYNC); + if (state == LOCK_LOCK && mdr && xlock_by == mdr) + return true; + return (state == LOCK_SYNC) || (state == LOCK_GMIXEDR) + || (state == LOCK_GLOCKR); + } + bool can_rdlock_soon() { + if (parent->is_auth()) + return (state == LOCK_GLOCKL); + else + return false; + } + bool can_xlock_soon() { + if (parent->is_auth()) + return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) + || (state == LOCK_GLOCKM); + else + return false; + } + + // client caps allowed + int caps_allowed_ever() { + if (parent->is_auth()) + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; + else + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; + } + int caps_allowed() { + if (parent->is_auth()) + switch (state) { + case LOCK_SYNC: + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; + case LOCK_LOCK: + case LOCK_GLOCKR: + return CAP_FILE_RDCACHE; + + case LOCK_GLOCKL: + case LOCK_GLOCKM: + return 0; + + case LOCK_MIXED: + return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; + case LOCK_GMIXEDR: + return CAP_FILE_RD | CAP_FILE_LAZYIO; + case LOCK_GMIXEDL: + return 0; + + case LOCK_LONER: // single client writer, of course. 
+ return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; + case LOCK_GLONERR: + return CAP_FILE_RD | CAP_FILE_LAZYIO; + case LOCK_GLONERM: + return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; + + case LOCK_GSYNCL: + return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; + case LOCK_GSYNCM: + return CAP_FILE_RD | CAP_FILE_LAZYIO; + } + else + switch (state) { + case LOCK_SYNC: + return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; + case LOCK_LOCK: + case LOCK_GLOCKR: + return CAP_FILE_RDCACHE; + case LOCK_GMIXEDR: + case LOCK_MIXED: + return CAP_FILE_RD | CAP_FILE_LAZYIO; + } + assert(0); + return 0; + } +}; + +inline ostream& operator<<(ostream& out, FileLock& l) +{ + out << "(" << get_lock_type_name(l.get_type()) + << get_filelock_state_name(l.get_state()); + if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set(); + if (l.get_num_rdlock()) + out << " r=" << l.get_num_rdlock(); + if (l.is_xlocked()) + out << " x=" << l.get_xlocked_by(); + out << ")"; + return out; +} + +#endif diff --git a/branches/sage/cephmds2/mds/Lock.h b/branches/sage/cephmds2/mds/Lock.h deleted file mode 100644 index b138a34d5f29f..0000000000000 --- a/branches/sage/cephmds2/mds/Lock.h +++ /dev/null @@ -1,321 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __LOCK_H -#define __LOCK_H - -#include -#include -using namespace std; - -#include "include/buffer.h" - -#include "Capability.h" - -// states and such. 
-// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio - -// basic lock -----auth-------- ---replica------- -#define LOCK_SYNC 0 // AR R . / C R . . . L R . / C R . . . L stat() -#define LOCK_LOCK 1 // AR R W / C . . . . . . . / C . . . . . truncate() -#define LOCK_GLOCKR 2 // AR R . / C . . . . . . . / C . . . . . - -// file lock states -#define LOCK_GLOCKL 3 // A . . / . . . . . . loner -> lock -#define LOCK_GLOCKM 4 // A . . / . . . . . . -#define LOCK_MIXED 5 // AR . . / . R W A . L . . / . R . . . L -#define LOCK_GMIXEDR 6 // AR R . / . R . . . L . . / . R . . . L -#define LOCK_GMIXEDL 7 // A . . / . . . . . L loner -> mixed - -#define LOCK_LONER 8 // A . . / C R W A B L (lock) -#define LOCK_GLONERR 9 // A . . / . R . . . L -#define LOCK_GLONERM 10 // A . . / . R W A . L - -#define LOCK_GSYNCL 11 // A . . / C ? . . . L loner -> sync (*) FIXME: let old loner keep R, somehow... -#define LOCK_GSYNCM 12 // A . . / . R . . . L - -// 4 stable -// +9 transition -// 13 total - -/* no append scenarios: - -loner + truncate(): - - loner needs to lose A (?unless it's the loner doing the truncate?) -loner + statlite(size): - - loner needs to lose A - -any + statlite(size) - - all lose A - -any + statlite(mtime) - - all lose W - --> we need to add lonerfixed and mixedfixed states (and associated transitions) - in order to efficiently support statlite(size) and truncate(). until then, - we have to LOCK. - - */ - -// -- lock... 
hard or file - -class MDRequest; - -class CLock { - protected: - // lock state - char state; - set gather_set; // auth - - // local state - int nread; - MDRequest *wrlock_by; - - - public: - CLock() : - state(LOCK_SYNC), - nread(0), - wrlock_by(0) { - } - - // encode/decode - void encode_state(bufferlist& bl) { - bl.append((char*)&state, sizeof(state)); - _encode(gather_set, bl); - - //bl.append((char*)&nread, sizeof(nread)); - //bl.append((char*)&nwrite, sizeof(nwrite)); - } - void decode_state(bufferlist& bl, int& off) { - bl.copy(off, sizeof(state), (char*)&state); - off += sizeof(state); - _decode(gather_set, bl, off); - - //bl.copy(off, sizeof(nread), (char*)&nread); - //off += sizeof(nread); - //bl.copy(off, sizeof(nwrite), (char*)&nwrite); - //off += sizeof(nwrite); - } - - char get_state() { return state; } - char set_state(char s) { - state = s; - assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. - return s; - }; - - char get_replica_state() { - switch (state) { - case LOCK_LOCK: - case LOCK_GLOCKM: - case LOCK_GLOCKL: - case LOCK_GLOCKR: - case LOCK_LONER: - case LOCK_GLONERR: - case LOCK_GLONERM: - return LOCK_LOCK; - case LOCK_MIXED: - case LOCK_GMIXEDR: - return LOCK_MIXED; - case LOCK_SYNC: - return LOCK_SYNC; - - // after gather auth will bc LOCK_AC_MIXED or whatever - case LOCK_GSYNCM: - return LOCK_MIXED; - case LOCK_GSYNCL: - case LOCK_GMIXEDL: // ** LOCK isn't exact right state, but works. 
- return LOCK_LOCK; - - default: - assert(0); - } - return 0; - } - - // gather set - set& get_gather_set() { return gather_set; } - void init_gather(const map& i) { - for (map::const_iterator p = i.begin(); p != i.end(); ++p) - gather_set.insert(p->first); - } - bool is_gathering(int i) { - return gather_set.count(i); - } - void clear_gather() { - gather_set.clear(); - } - - // ref counting - int get_read() { return ++nread; } - int put_read() { - assert(nread>0); - return --nread; - } - int get_nread() { return nread; } - - void get_write(MDRequest *who) { - assert(wrlock_by == 0); - wrlock_by = who; - } - void put_write() { - assert(wrlock_by); - wrlock_by = 0; - } - bool is_wrlocked() { return wrlock_by ? true:false; } - MDRequest *get_wrlocked_by() { return wrlock_by; } - bool is_used() { - return (is_wrlocked() || (nread>0)) ? true:false; - } - - - // stable - bool is_stable() { - return (state == LOCK_SYNC) || - (state == LOCK_LOCK) || - (state == LOCK_MIXED) || - (state == LOCK_LONER); - } - - // read/write access - bool can_read(bool auth) { - if (auth) - return (state == LOCK_SYNC) || (state == LOCK_GMIXEDR) - || (state == LOCK_GLOCKR) || (state == LOCK_LOCK); - else - return (state == LOCK_SYNC); - } - bool can_read_soon(bool auth) { - if (auth) - return (state == LOCK_GLOCKL); - else - return false; - } - - bool can_write(bool auth) { - if (auth) - return (state == LOCK_LOCK) && !is_wrlocked(); - else - return false; - } - bool can_write_soon(bool auth) { - if (auth) - return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL) - || (state == LOCK_GLOCKM); - else - return false; - } - - // client caps allowed - int caps_allowed_ever(bool auth) { - if (auth) - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - else - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - } - int caps_allowed(bool auth) { - if (auth) - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | 
CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - - case LOCK_GLOCKL: - case LOCK_GLOCKM: - return 0; - - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - case LOCK_GMIXEDR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GMIXEDL: - return 0; - - case LOCK_LONER: // single client writer, of course. - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO; - case LOCK_GLONERR: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_GLONERM: - return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO; - - case LOCK_GSYNCL: - return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO; - case LOCK_GSYNCM: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - else - switch (state) { - case LOCK_SYNC: - return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO; - case LOCK_LOCK: - case LOCK_GLOCKR: - return CAP_FILE_RDCACHE; - case LOCK_GMIXEDR: - case LOCK_MIXED: - return CAP_FILE_RD | CAP_FILE_LAZYIO; - } - assert(0); - return 0; - } - - friend class MDCache; - friend class Locker; - friend class Migrator; -}; - -//ostream& operator<<(ostream& out, CLock& l); -inline ostream& operator<<(ostream& out, CLock& l) -{ - static char* __lock_states[] = { - "sync", - "lock", - "glockr", - "glockl", - "glockm", - "mixed", - "gmixedr", - "gmixedl", - "loner", - "glonerr", - "glonerm", - "gsyncl", - "gsyncm" - }; - - out << "(" << __lock_states[(int)l.get_state()]; - - if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set(); - - if (l.get_nread()) - out << " r=" << l.get_nread(); - if (l.is_wrlocked()) - out << " w=" << l.get_wrlocked_by(); - - // rw? 
- /* - out << " "; - if (l.can_read(true)) out << "r[" << l.get_nread() << "]"; - if (l.can_write(true)) out << "w[" << l.get_nwrite() << "]"; - out << "/"; - if (l.can_read(false)) out << "r[" << l.get_nread() << "]"; - if (l.can_write(false)) out << "w[" << l.get_nwrite() << "]"; - */ - out << ")"; - return out; -} - -#endif diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc index 714d52ba9ee6f..7233fa02b0605 100644 --- a/branches/sage/cephmds2/mds/Locker.cc +++ b/branches/sage/cephmds2/mds/Locker.cc @@ -84,41 +84,27 @@ void Locker::dispatch(Message *m) } -void Locker::send_lock_message(CInode *in, int msg, int type) +void Locker::send_lock_message(SimpleLock *lock, int msg) { - for (map::iterator it = in->replicas_begin(); - it != in->replicas_end(); + for (map::iterator it = lock->get_parent()->replicas_begin(); + it != lock->get_parent()->replicas_end(); it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_ino(in->ino(), type); + MLock *m = new MLock(lock, msg, mds->get_nodeid()); mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); } } - -void Locker::send_lock_message(CInode *in, int msg, int type, bufferlist& data) +void Locker::send_lock_message(SimpleLock *lock, int msg, bufferlist &data) { - for (map::iterator it = in->replicas_begin(); - it != in->replicas_end(); + for (map::iterator it = lock->get_parent()->replicas_begin(); + it != lock->get_parent()->replicas_end(); it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_ino(in->ino(), type); + MLock *m = new MLock(lock, msg, mds->get_nodeid()); m->set_data(data); mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); } } -void Locker::send_lock_message(CDentry *dn, int msg) -{ - for (map::iterator it = dn->replicas_begin(); - it != dn->replicas_end(); - it++) { - MLock *m = new MLock(msg, mds->get_nodeid()); - m->set_dn(dn->dir->dirfrag(), dn->name); - mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); - } -} - @@ -130,194 +116,123 @@ 
void Locker::send_lock_message(CDentry *dn, int msg) bool Locker::acquire_locks(MDRequest *mdr, - set &dentry_rdlocks, - set &dentry_xlocks, - set &inode_hard_rdlocks, - set &inode_hard_xlocks) + set &rdlocks, + set &xlocks) { dout(10) << "acquire_locks " << *mdr << endl; + // sort everything we will lock + set sorted; + // (local) AUTH PINS // can i auth_pin everything? - for (set::iterator p = dentry_xlocks.begin(); - p != dentry_xlocks.end(); + for (set::iterator p = xlocks.begin(); + p != xlocks.end(); ++p) { - CDir *dir = (*p)->dir; - dout(10) << "might auth_pin " << *dir << endl; - - if (!dir->is_auth()) continue; - if (!mdr->is_auth_pinned(dir) && - !dir->can_auth_pin()) { - // wait - dir->add_waiter(CDir::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); - mdcache->request_drop_locks(mdr); - mdr->drop_auth_pins(); - return false; - } - } - for (set::iterator p = inode_hard_xlocks.begin(); - p != inode_hard_xlocks.end(); - ++p) { - CInode *in = *p; - if (!in->is_auth()) continue; - if (!mdr->is_auth_pinned(in) && - !in->can_auth_pin()) { - in->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); - mdcache->request_drop_locks(mdr); - mdr->drop_auth_pins(); - return false; + dout(10) << "will xlock " << **p << " " << *(*p)->get_parent() << endl; + + sorted.insert(*p); + if ((*p)->get_type() == LOCK_OTYPE_DN) { + CDir *dir = ((CDentry*)(*p)->get_parent())->dir; + dout(10) << "might auth_pin " << *dir << endl; + + if (!dir->is_auth()) continue; + if (!mdr->is_auth_pinned(dir) && + !dir->can_auth_pin()) { + // wait + dir->add_waiter(CDir::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + mdcache->request_drop_locks(mdr); + mdr->drop_auth_pins(); + return false; + } + } else { + CInode *in = (CInode*)(*p)->get_parent(); + if (!in->is_auth()) continue; + if (!mdr->is_auth_pinned(in) && + !in->can_auth_pin()) { + in->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + 
mdcache->request_drop_locks(mdr); + mdr->drop_auth_pins(); + return false; + } } } // ok, grab the auth pins - for (set::iterator p = dentry_xlocks.begin(); - p != dentry_xlocks.end(); + for (set::iterator p = xlocks.begin(); + p != xlocks.end(); ++p) { - CDir *dir = (*p)->dir; - if (!dir->is_auth()) continue; - dout(10) << "auth_pinning " << *dir << endl; - mdr->auth_pin(dir); + if ((*p)->get_type() == LOCK_OTYPE_DN) { + CDir *dir = ((CDentry*)(*p)->get_parent())->dir; + if (!dir->is_auth()) continue; + dout(10) << "auth_pinning " << *dir << endl; + mdr->auth_pin(dir); + } else { + CInode *in = (CInode*)(*p)->get_parent(); + if (!in->is_auth()) continue; + dout(10) << "auth_pinning " << *in << endl; + mdr->auth_pin(in); + } } - for (set::iterator p = inode_hard_xlocks.begin(); - p != inode_hard_xlocks.end(); + + for (set::iterator p = rdlocks.begin(); + p != rdlocks.end(); ++p) { - CInode *in = *p; - if (!in->is_auth()) continue; - dout(10) << "auth_pinning " << *in << endl; - mdr->auth_pin(in); + dout(10) << "will rdlock " << **p << " " << *(*p)->get_parent() << endl; + sorted.insert(*p); } + // acquire locks. + // make sure they match currently acquired locks. + set::iterator existing = mdr->locks.begin(); + for (set::iterator p = sorted.begin(); + p != sorted.end(); + ++p) { - // DENTRY LOCKS - { - // sort all the dentries we will lock - set sorted; - for (set::iterator p = dentry_xlocks.begin(); - p != dentry_xlocks.end(); - ++p) { - dout(10) << "will xlock " << **p << endl; - sorted.insert(*p); - } - for (set::iterator p = dentry_rdlocks.begin(); - p != dentry_rdlocks.end(); - ++p) { - dout(10) << "will rdlock " << **p << endl; - sorted.insert(*p); - } - - // acquire dentry locks. make sure they match currently acquired locks. - set::iterator existing = mdr->dentry_locks.begin(); - for (set::iterator p = sorted.begin(); - p != sorted.end(); - ++p) { - - // already locked? - if (existing != mdr->dentry_locks.end() && *existing == *p) { - // right kind? 
- CDentry *had = *existing; - if (dentry_xlocks.count(*p) == had->is_xlockedbyme(mdr)) { - dout(10) << "acquire_locks already locked " << *had << endl; - existing++; - continue; - } - } - - // hose any stray locks - while (existing != mdr->dentry_locks.end()) { - CDentry *had = *existing; + // already locked? + if (existing != mdr->locks.end() && *existing == *p) { + // right kind? + SimpleLock *had = *existing; + if (xlocks.count(*p) == (had->get_xlocked_by() == mdr)) { + dout(10) << "acquire_locks already locked " << *had << " " << *had->get_parent() << endl; existing++; - dout(10) << "acquire_locks had " << *had << " locked before " << **p - << ", unlocking" << endl; - if (had->is_xlockedbyme(mdr)) - dentry_xlock_finish(had, mdr); - else - dentry_rdlock_finish(had, mdr); - } - - // lock - if (dentry_xlocks.count(*p)) { - if (!dentry_xlock_start(*p, mdr)) - return false; - dout(10) << "acquire_locks got xlock on " << **p << endl; - } else { - if (!dentry_rdlock_start(*p, mdr)) - return false; - dout(10) << "acquire_locks got rdlock on " << **p << endl; + continue; } } - // any extra unneeded locks? 
- while (existing != mdr->dentry_locks.end()) { - dout(10) << "acquire_locks had " << *existing << " locked, unlocking" << endl; - if ((*existing)->is_xlockedbyme(mdr)) - dentry_xlock_finish(*existing, mdr); + // hose any stray locks + while (existing != mdr->locks.end()) { + SimpleLock *had = *existing; + existing++; + dout(10) << "acquire_locks unlocking out-of-order " << **existing + << " " << *(*existing)->get_parent() << endl; + if (had->get_xlocked_by() == mdr) + xlock_finish(had, mdr); else - dentry_rdlock_finish(*existing, mdr); + rdlock_finish(had, mdr); } - } - - // INODES - { - // sort all the dentries we will lock - set sorted; - for (set::iterator p = inode_hard_xlocks.begin(); - p != inode_hard_xlocks.end(); - ++p) - sorted.insert(*p); - for (set::iterator p = inode_hard_rdlocks.begin(); - p != inode_hard_rdlocks.end(); - ++p) - sorted.insert(*p); - - // acquire inode locks. make sure they match currently acquired locks. - set::iterator existing = mdr->inode_hard_locks.begin(); - for (set::iterator p = sorted.begin(); - p != sorted.end(); - ++p) { - // already locked? - if (existing != mdr->inode_hard_locks.end() && *existing == *p) { - // right kind? 
- CInode *had = *existing; - if (inode_hard_xlocks.count(*p) == (had->hardlock.get_wrlocked_by() == mdr)) { - dout(10) << "acquire_locks already locked " << *had << endl; - existing++; - continue; - } - } - - // hose any stray locks - while (existing != mdr->inode_hard_locks.end()) { - CInode *had = *existing; - existing++; - dout(10) << "acquire_locks had " << *had << " locked before " << **p - << ", unlocking" << endl; - if (had->hardlock.get_wrlocked_by() == mdr) - inode_hard_xlock_finish(had, mdr); - else - inode_hard_rdlock_finish(had, mdr); - } - // lock - if (inode_hard_xlocks.count(*p)) { - if (!inode_hard_xlock_start(*p, mdr)) - return false; - dout(10) << "acquire_locks got xlock on " << **p << endl; - } else { - if (!inode_hard_rdlock_start(*p, mdr)) - return false; - dout(10) << "acquire_locks got rdlock on " << **p << endl; - } + // lock + if (xlocks.count(*p)) { + if (!xlock_start(*p, mdr)) + return false; + dout(10) << "acquire_locks got xlock on " << **p << " " << *(*p)->get_parent() << endl; + } else { + if (!rdlock_start(*p, mdr)) + return false; + dout(10) << "acquire_locks got rdlock on " << **p << " " << *(*p)->get_parent() << endl; } + } - // any extra unneeded locks? - while (existing != mdr->inode_hard_locks.end()) { - dout(10) << "acquire_locks had " << **existing << " locked, unlocking" << endl; - if ((*existing)->hardlock.get_wrlocked_by() == mdr) - inode_hard_xlock_finish(*existing, mdr); - else - inode_hard_rdlock_finish(*existing, mdr); - } + // any extra unneeded locks? 
+ while (existing != mdr->locks.end()) { + dout(10) << "acquire_locks unlocking " << *existing + << " " << *(*existing)->get_parent() << endl; + if ((*existing)->get_xlocked_by() == mdr) + xlock_finish(*existing, mdr); + else + rdlock_finish(*existing, mdr); } return true; @@ -325,7 +240,59 @@ bool Locker::acquire_locks(MDRequest *mdr, +// generics + +/* +bool Locker::rdlock_try(SimpleLock *lock, Context *con) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IFILE: + return file_rdlock_try((FileLock*)lock, con); + default: + return simple_rdlock_try(lock, con); + } +} +*/ + +bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IFILE: + return file_rdlock_start((FileLock*)lock, mdr); + default: + return simple_rdlock_start(lock, mdr); + } +} + +void Locker::rdlock_finish(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IFILE: + return file_rdlock_finish((FileLock*)lock, mdr); + default: + return simple_rdlock_finish(lock, mdr); + } +} + +bool Locker::xlock_start(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IFILE: + return file_xlock_start((FileLock*)lock, mdr); + default: + return simple_xlock_start(lock, mdr); + } +} +void Locker::xlock_finish(SimpleLock *lock, MDRequest *mdr) +{ + switch (lock->get_type()) { + case LOCK_OTYPE_IFILE: + return file_xlock_finish((FileLock*)lock, mdr); + default: + return simple_xlock_finish(lock, mdr); + } +} @@ -378,7 +345,7 @@ Capability* Locker::issue_new_caps(CInode *in, if (in->is_auth()) { // [auth] twiddle mode? - inode_file_eval(in); + file_eval(&in->filelock); } else { // [replica] tell auth about any new caps wanted request_inode_file_caps(in); @@ -415,7 +382,7 @@ Capability* Locker::issue_new_caps(CInode *in, bool Locker::issue_caps(CInode *in) { // allowed caps are determined by the lock mode. 
- int allowed = in->filelock.caps_allowed(in->is_auth()); + int allowed = in->filelock.caps_allowed(); dout(7) << "issue_caps filelock allows=" << cap_string(allowed) << " on " << *in << endl; @@ -528,7 +495,7 @@ void Locker::handle_inode_file_caps(MInodeFileCaps *m) else in->mds_caps_wanted.erase(m->get_from()); - inode_file_eval(in); + file_eval(&in->filelock); delete m; } @@ -559,7 +526,7 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) assert(cap); // filter wanted based on what we could ever give out (given auth/replica status) - int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth()); + int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(); dout(7) << "handle_client_file_caps seq " << m->get_seq() << " confirms caps " << cap_string(m->get_caps()) @@ -621,7 +588,7 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) } // reevaluate, waiters - inode_file_eval(in); + file_eval(&in->filelock); in->finish_waiting(CInode::WAIT_CAPS, 0); delete m; @@ -708,20 +675,44 @@ ALSO: void Locker::handle_lock(MLock *m) { switch (m->get_otype()) { - case LOCK_OTYPE_IHARD: - handle_lock_inode_hard(m); - break; + case LOCK_OTYPE_DN: + { + CDir *dir = mdcache->get_dirfrag(m->get_dirfrag()); + CDentry *dn = 0; + if (dir) + dn = dir->lookup(m->get_dn()); + if (!dn) { + dout(7) << "dont' have dn " << m->get_dirfrag() << " " << m->get_dn() << endl; + delete m; + return; + } - case LOCK_OTYPE_IFILE: - handle_lock_inode_file(m); + handle_simple_lock(&dn->lock, m); + } break; - - case LOCK_OTYPE_DIR: - handle_lock_dir(m); + + case LOCK_OTYPE_IHARD: + { + CInode *in = mdcache->get_inode(m->get_ino()); + if (!in) { + dout(7) << "dont' have ino " << m->get_ino() << endl; + delete m; + return; + } + handle_simple_lock(&in->hardlock, m); + } break; - - case LOCK_OTYPE_DN: - handle_lock_dn(m); + + case LOCK_OTYPE_IFILE: + { + CInode *in = mdcache->get_inode(m->get_ino()); + if (!in) { + dout(7) << "dont' have ino " << m->get_ino() << endl; + 
delete m; + return; + } + handle_file_lock(&in->filelock, m); + } break; default: @@ -733,486 +724,481 @@ void Locker::handle_lock(MLock *m) -// =============================== -// hard inode metadata - -bool Locker::inode_hard_rdlock_try(CInode *in, Context *con) -{ - dout(7) << "inode_hard_rdlock_try on " << *in << endl; - - // can read? grab ref. - if (in->hardlock.can_read(in->is_auth())) - return true; - - assert(!in->is_auth()); - - // wait! - dout(7) << "inode_hard_rdlock_try waiting on " << *in << endl; - in->add_waiter(CInode::WAIT_HARDR, con); - return false; -} - -bool Locker::inode_hard_rdlock_start(CInode *in, MDRequest *mdr) -{ - dout(7) << "inode_hard_rdlock_start on " << *in << endl; - - // can read? grab ref. - if (in->hardlock.can_read(in->is_auth())) { - in->hardlock.get_read(); - mdr->inode_hard_rdlocks.insert(in); - mdr->inode_hard_locks.insert(in); - return true; - } - - // can't read, and replicated. - assert(!in->is_auth()); - - // wait! - dout(7) << "inode_hard_rdlock_start waiting on " << *in << endl; - in->add_waiter(CInode::WAIT_HARDR, new C_MDS_RetryRequest(mdcache, mdr)); - return false; -} - - -void Locker::inode_hard_rdlock_finish(CInode *in, MDRequest *mdr) +void Locker::handle_simple_lock(SimpleLock *lock, MLock *m) { - // drop ref - assert(in->hardlock.can_read(in->is_auth())); - in->hardlock.put_read(); - mdr->inode_hard_rdlocks.erase(in); - mdr->inode_hard_locks.erase(in); - - dout(7) << "inode_hard_rdlock_finish on " << *in << endl; + int from = m->get_asker(); - //if (in->hardlock.get_nread() == 0) in->finish_waiting(CInode::WAIT_HARDNORD); -} - + switch (m->get_action()) { + // -- replica -- + case LOCK_AC_SYNC: + assert(lock->get_state() == LOCK_LOCK); + lock->decode_locked_state(m->get_data()); + lock->set_state(LOCK_SYNC); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + break; + + case LOCK_AC_LOCK: + assert(lock->get_state() == LOCK_SYNC); + //|| lock->get_state() == LOCK_GLOCKR); + + // wait for 
readers to finish? + if (lock->get_num_rdlock() > 0) { + dout(7) << "handle_simple_lock has reader, waiting before ack on " << *lock << " on " << *lock->get_parent() + << endl; + lock->set_state(LOCK_GLOCKR); + lock->add_waiter(SimpleLock::WAIT_NORD, new C_MDS_RetryMessage(mds, m)); + return; + } -bool Locker::inode_hard_xlock_start(CInode *in, MDRequest *mdr) -{ - dout(7) << "inode_hard_xlock_start on " << *in << endl; + // update lock and reply + lock->set_state(LOCK_LOCK); + + { + MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); + mds->send_message_mds(reply, from, MDS_PORT_LOCKER); + } + break; - // if not replicated, i can twiddle lock at will - if (in->is_auth() && - !in->is_replicated() && - in->hardlock.get_state() != LOCK_LOCK) - in->hardlock.set_state(LOCK_LOCK); - - // can write? grab ref. - if (in->hardlock.can_write(in->is_auth())) { - assert(in->is_auth()); - in->hardlock.get_write(mdr); - mdr->inode_hard_xlocks.insert(in); - mdr->inode_hard_locks.insert(in); - return true; - } - - // can't write, replicated. 
- if (in->is_auth()) { - // auth - if (in->hardlock.can_write_soon(in->is_auth())) { - // just wait + // -- auth -- + case LOCK_AC_LOCKACK: + assert(lock->get_state() == LOCK_GLOCKR); + assert(lock->is_gathering(from)); + lock->remove_gather(from); + + if (lock->is_gathering()) { + dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from + << ", still gathering " << lock->get_gather_set() << endl; } else { - // initiate lock - inode_hard_lock(in); + dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from + << ", last one" << endl; + simple_eval(lock); } - - dout(7) << "inode_hard_xlock_start waiting on " << *in << endl; - in->add_waiter(CInode::WAIT_HARDW, new C_MDS_RetryRequest(mdcache, mdr)); - - return false; - } else { - // replica - // fw to auth - int auth = in->authority().first; - dout(7) << "inode_hard_xlock_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(mdr, auth); - return false; } -} - -void Locker::inode_hard_xlock_finish(CInode *in, MDRequest *mdr) -{ - // drop ref - //assert(in->hardlock.can_write(in->is_auth())); - in->hardlock.put_write(); - mdr->inode_hard_xlocks.erase(in); - mdr->inode_hard_locks.erase(in); - dout(7) << "inode_hard_xlock_finish on " << *in << endl; - - // others waiting? - if (in->is_hardlock_write_wanted()) { - // wake 'em up - in->take_waiting(CInode::WAIT_HARDW, mds->finished_queue); - } else { - // auto-sync if alone. - if (in->is_auth() && - !in->is_replicated() && - in->hardlock.get_state() != LOCK_SYNC) - in->hardlock.set_state(LOCK_SYNC); - - inode_hard_eval(in); - } + delete m; } -void Locker::inode_hard_eval(CInode *in) +void Locker::simple_eval(SimpleLock *lock) { // finished gather? 
- if (in->is_auth() && - !in->hardlock.is_stable() && - in->hardlock.gather_set.empty()) { - dout(7) << "inode_hard_eval finished gather on " << *in << endl; - switch (in->hardlock.get_state()) { + if (lock->get_parent()->is_auth() && + !lock->is_stable() && + !lock->is_gathering()) { + dout(7) << "simple_eval finished gather on " << *lock << " on " << *lock->get_parent() << endl; + switch (lock->get_state()) { case LOCK_GLOCKR: - in->hardlock.set_state(LOCK_LOCK); - - // waiters - //in->hardlock.get_write(); - in->finish_waiting(CInode::WAIT_HARDRWB|CInode::WAIT_HARDSTABLE); - //in->hardlock.put_write(); + lock->set_state(LOCK_LOCK); + lock->finish_waiters(SimpleLock::WAIT_LOCK|SimpleLock::WAIT_STABLE); break; default: assert(0); } } - if (!in->hardlock.is_stable()) return; + if (!lock->is_stable()) return; - if (in->is_auth()) { - + if (lock->get_parent()->is_auth()) { + // sync? - if (in->is_replicated() && - in->is_hardlock_write_wanted() && - in->hardlock.get_state() != LOCK_SYNC) { - dout(7) << "inode_hard_eval stable, syncing " << *in << endl; - inode_hard_sync(in); + if (lock->get_state() != LOCK_SYNC && + lock->get_parent()->is_replicated() && + !lock->is_waiting(SimpleLock::WAIT_WR)) { + dout(7) << "simple_eval stable, syncing " << *lock << " on " << *lock->get_parent() << endl; + simple_sync(lock); } } else { // replica } + } // mid -void Locker::inode_hard_sync(CInode *in) +void Locker::simple_sync(SimpleLock *lock) { - dout(7) << "inode_hard_sync on " << *in << endl; - assert(in->is_auth()); + dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << endl; + assert(lock->get_parent()->is_auth()); // check state - if (in->hardlock.get_state() == LOCK_SYNC) + if (lock->get_state() == LOCK_SYNC) return; // already sync - if (in->hardlock.get_state() == LOCK_GLOCKR) + if (lock->get_state() == LOCK_GLOCKR) assert(0); // um... hmm! 
- assert(in->hardlock.get_state() == LOCK_LOCK); + assert(lock->get_state() == LOCK_LOCK); // hard data - bufferlist harddata; - in->encode_hard_state(harddata); + bufferlist data; + lock->encode_locked_state(data); // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IHARD, harddata); + send_lock_message(lock, LOCK_AC_SYNC, data); // change lock - in->hardlock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); // waiters? - in->finish_waiting(CInode::WAIT_HARDSTABLE); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } -void Locker::inode_hard_lock(CInode *in) +void Locker::simple_lock(SimpleLock *lock) { - dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl; - assert(in->is_auth()); + dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << endl; + assert(lock->get_parent()->is_auth()); // check state - if (in->hardlock.get_state() == LOCK_LOCK || - in->hardlock.get_state() == LOCK_GLOCKR) + if (lock->get_state() == LOCK_LOCK || + lock->get_state() == LOCK_GLOCKR) return; // already lock or locking - assert(in->hardlock.get_state() == LOCK_SYNC); - - // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IHARD); + assert(lock->get_state() == LOCK_SYNC); - // change lock - in->hardlock.set_state(LOCK_GLOCKR); - in->hardlock.init_gather(in->get_replicas()); + if (lock->get_parent()->is_replicated()) { + // bcast to replicas + send_lock_message(lock, LOCK_AC_LOCK); + + // change lock + lock->set_state(LOCK_GLOCKR); + lock->init_gather(); + } else { + lock->set_state(LOCK_LOCK); + } } +// top +bool Locker::simple_rdlock_try(SimpleLock *lock, Context *con) +{ + dout(7) << "simple_rdlock_try on " << *lock << " on " << *lock->get_parent() << endl; + // can read? grab ref. + if (lock->can_rdlock(0)) + return true; + + assert(!lock->get_parent()->is_auth()); -// messenger + // wait! 
+ dout(7) << "simple_rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_RD, con); + return false; +} -void Locker::handle_lock_inode_hard(MLock *m) +bool Locker::simple_rdlock_start(SimpleLock *lock, MDRequest *mdr) { - assert(m->get_otype() == LOCK_OTYPE_IHARD); + dout(7) << "simple_rdlock_start on " << *lock << " on " << *lock->get_parent() << endl; + + // can read? grab ref. + if (lock->can_rdlock(mdr)) { + lock->get_rdlock(); + mdr->rdlocks.insert(lock); + mdr->locks.insert(lock); + return true; + } - if (mds->logger) mds->logger->inc("lih"); + // can't read, and replicated. + assert(!lock->get_parent()->is_auth()); - int from = m->get_asker(); - CInode *in = mdcache->get_inode(m->get_ino()); + // wait! + dout(7) << "simple_rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_RD, + new C_MDS_RetryRequest(mdcache, mdr)); + return false; +} + +void Locker::simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr) +{ + // drop ref + lock->put_rdlock(); + if (mdr) { + mdr->rdlocks.erase(lock); + mdr->locks.erase(lock); + } + + dout(7) << "simple_rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - if (LOCK_AC_FOR_AUTH(m->get_action())) { + if (lock->get_state() == LOCK_GLOCKR && + lock->get_num_rdlock() == 0) { + lock->set_state(LOCK_SYNC); // return state to sync, in case the unpinner flails + lock->finish_waiters(SimpleLock::WAIT_NORD); + } +} + +bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) +{ + dout(7) << "simple_xlock_start on " << *lock << " on " << *lock->get_parent() << endl; + + // auth? 
+ if (lock->get_parent()->is_auth()) { // auth - assert(in); - assert(in->is_auth());// || in->is_proxy()); - dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl; - - /*if (in->is_proxy()) { - // fw - int newauth = in->authority().first; - assert(newauth >= 0); - if (from == newauth) { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; - delete m; - } else { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); + + // lock. + if (lock->get_state() == LOCK_SYNC) + simple_lock(lock); + + // already locked? + if (lock->get_state() == LOCK_LOCK) { + if (lock->is_xlocked()) { + // by me? + if (lock->get_xlocked_by() == mdr) + return true; + // by someone else. + lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); + return false; } - return; + + // xlock. + lock->get_xlock(mdr); + mdr->xlocks.insert(lock); + mdr->locks.insert(lock); + return true; + } else { + // wait for lock + lock->add_waiter(SimpleLock::WAIT_LOCK, new C_MDS_RetryRequest(mdcache, mdr)); + return false; } - */ } else { // replica - if (!in) { - dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl; - /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness - to keep gather_set a proper/correct subset of cached_by. better to use the existing - cacheexpire mechanism instead! - */ - delete m; - return; + + assert(0); + + // wait for sync. 
+ if (lock->get_state() != LOCK_SYNC) { + lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); + return false; } - - assert(!in->is_auth()); + + // do remote xlock + assert(0); } +} - dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl; - - CLock *lock = &in->hardlock; - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_SYNC: - assert(lock->get_state() == LOCK_LOCK); - - { // assim data - int off = 0; - in->decode_hard_state(m->get_data(), off); - } - - // update lock - lock->set_state(LOCK_SYNC); - - // no need to reply + +void Locker::simple_xlock_finish(SimpleLock *lock, MDRequest *mdr) +{ + // drop ref + assert(lock->can_xlock(mdr)); + lock->put_xlock(); + mdr->xlocks.erase(lock); + mdr->locks.erase(lock); + dout(7) << "simple_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl; + + // others waiting? + if (lock->is_waiting(SimpleLock::WAIT_WR)) { + // wake 'em up + lock->finish_waiters(SimpleLock::WAIT_WR, 0); + } else { + // auto-sync if alone. + if (lock->get_parent()->is_auth() && + !lock->get_parent()->is_replicated() && + lock->get_state() != LOCK_SYNC) + lock->set_state(LOCK_SYNC); - // waiters - in->finish_waiting(CInode::WAIT_HARDR|CInode::WAIT_HARDSTABLE); - break; - - case LOCK_AC_LOCK: - assert(lock->get_state() == LOCK_SYNC); - //|| lock->get_state() == LOCK_GLOCKR); - - // wait for readers to finish? - if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl; - lock->set_state(LOCK_GLOCKR); - in->add_waiter(CInode::WAIT_HARDNORD, - new C_MDS_RetryMessage(mds, m)); - assert(0); // does this ever happen? (if so, fix hard_rdlock_finish, and CInodeExport.update_inode!) 
- return; - } else { + simple_eval(lock); + } +} - // update lock and reply - lock->set_state(LOCK_LOCK); - - { - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IHARD); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); + + +// dentry specific helpers + +// trace helpers + +/** dentry_can_rdlock_trace + * see if we can _anonymously_ rdlock an entire trace. + * if not, and req is specified, wait and retry that message. + */ +bool Locker::dentry_can_rdlock_trace(vector& trace, MClientRequest *req) +{ + // verify dentries are rdlockable. + // we do this because + // - we're being less aggressive about locks acquisition, and + // - we're not acquiring the locks in order! + for (vector::iterator it = trace.begin(); + it != trace.end(); + it++) { + CDentry *dn = *it; + if (!dn->lock.can_rdlock(0)) { + if (req) { + dout(10) << "can_rdlock_trace can't rdlock " << *dn << ", waiting" << endl; + dn->lock.add_waiter(SimpleLock::WAIT_RD, + new C_MDS_RetryMessage(mds, req)); + } else { + dout(10) << "can_rdlock_trace can't rdlock " << *dn << endl; } + return false; } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_GLOCKR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); + } + return true; +} - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; - } else { - dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl; - inode_hard_eval(in); - } - } - delete m; +void Locker::dentry_anon_rdlock_trace_start(vector& trace) +{ + // grab dentry rdlocks + for (vector::iterator it = trace.begin(); + it != trace.end(); + it++) + (*it)->lock.get_rdlock(); +} + + +void Locker::dentry_anon_rdlock_trace_finish(vector& trace) +{ + for (vector::iterator it = trace.begin(); + it != trace.end(); + it++) + simple_rdlock_finish(&(*it)->lock, 0); } -// 
===================== -// soft inode metadata + +// =============================== +// file lock -bool Locker::inode_file_rdlock_start(CInode *in, MDRequest *mdr) +bool Locker::file_rdlock_start(FileLock *lock, MDRequest *mdr) { - dout(7) << "inode_file_rdlock_start " << *in << " filelock=" << in->filelock << endl; + dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << endl; // can read? grab ref. - if (in->filelock.can_read(in->is_auth())) { - in->filelock.get_read(); + if (lock->can_rdlock(mdr)) { + lock->get_rdlock(); + mdr->rdlocks.insert(lock); + mdr->locks.insert(lock); return true; } // can't read, and replicated. - if (in->filelock.can_read_soon(in->is_auth())) { + if (lock->can_rdlock_soon()) { // wait - dout(7) << "inode_file_rdlock_start can_read_soon " << *in << endl; + dout(7) << "file_rdlock_start can_rdlock_soon " << *lock << " on " << *lock->get_parent() << endl; } else { - if (in->is_auth()) { + if (lock->get_parent()->is_auth()) { // auth // FIXME or qsync? - if (in->filelock.is_stable()) { - inode_file_lock(in); // lock, bc easiest to back off - - if (in->filelock.can_read(in->is_auth())) { - in->filelock.get_read(); + if (lock->is_stable()) { + file_lock(lock); // lock, bc easiest to back off ... 
FIXME + + if (lock->can_rdlock(mdr)) { + lock->get_rdlock(); + mdr->rdlocks.insert(lock); + mdr->locks.insert(lock); - //in->filelock.get_write(); - in->finish_waiting(CInode::WAIT_FILERWB|CInode::WAIT_FILESTABLE); - //in->filelock.put_write(); - - mdr->inode_file_rdlocks.insert(in); - mdr->inode_file_locks.insert(in); + lock->finish_waiters(SimpleLock::WAIT_LOCK|SimpleLock::WAIT_STABLE); return true; } } else { - dout(7) << "inode_file_rdlock_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CInode::WAIT_FILESTABLE, new C_MDS_RetryRequest(mdcache, mdr)); + dout(7) << "file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); return false; } } else { // replica - if (in->filelock.is_stable()) { - + if (lock->is_stable()) { + // fw to auth + CInode *in = (CInode*)lock->get_parent(); int auth = in->authority().first; - dout(7) << "inode_file_rdlock_start " << *in << " on replica and async, fw to auth " << auth << endl; + dout(7) << "file_rdlock_start " << *lock << " on " << *lock->get_parent() << " on replica and async, fw to auth " << auth << endl; assert(auth != mds->get_nodeid()); mdcache->request_forward(mdr, auth); return false; } else { // wait until stable - dout(7) << "inode_file_rdlock_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CInode::WAIT_FILESTABLE, new C_MDS_RetryRequest(mdcache, mdr)); + dout(7) << "inode_file_rdlock_start waiting until stable on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); return false; } } } - + // wait - dout(7) << "inode_file_rdlock_start waiting on " << *in << ", filelock=" << in->filelock << endl; - in->add_waiter(CInode::WAIT_FILER, new C_MDS_RetryRequest(mdcache, mdr)); + dout(7) << "file_rdlock_start waiting on " << 
*lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return false; } -void Locker::inode_file_rdlock_finish(CInode *in, MDRequest *mdr) + +void Locker::file_rdlock_finish(FileLock *lock, MDRequest *mdr) { // drop ref - assert(in->filelock.can_read(in->is_auth())); - in->filelock.put_read(); - mdr->inode_file_rdlocks.erase(in); - mdr->inode_file_locks.erase(in); + assert(lock->can_rdlock(mdr)); + lock->put_rdlock(); + mdr->rdlocks.erase(lock); + mdr->locks.erase(lock); - dout(7) << "inode_file_rdlock_finish on " << *in << ", filelock=" << in->filelock << endl; + dout(7) << "rdlock_finish on " << *lock << " on " << *lock->get_parent() << endl; - if (in->filelock.get_nread() == 0) { - in->finish_waiting(CInode::WAIT_FILENORD); - inode_file_eval(in); + if (lock->get_num_rdlock() == 0) { + lock->finish_waiters(SimpleLock::WAIT_NORD); + file_eval(lock); } } -bool Locker::inode_file_xlock_start(CInode *in, MDRequest *mdr) +bool Locker::file_xlock_start(FileLock *lock, MDRequest *mdr) { - dout(7) << "inode_file_xlock_start on " << *in << endl; + dout(7) << "file_xlock_start on " << *lock << " on " << *lock->get_parent() << endl; - // can't write? - if (!in->filelock.can_write(in->is_auth())) { - - // can't write. - if (in->is_auth()) { - // auth - if (!in->filelock.can_write_soon(in->is_auth())) { - if (!in->filelock.is_stable()) { - dout(7) << "inode_file_xlock_start on auth, waiting for stable on " << *in << endl; - in->add_waiter(CInode::WAIT_FILESTABLE, new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // initiate lock - inode_file_lock(in); + assert(lock->get_parent()->is_auth()); // remote file xlock not implemented + + // already xlocked by me? + if (lock->get_xlocked_by() == mdr) + return true; - // fall-thru to below. + // can't write? 
+ if (!lock->can_xlock(mdr)) { + + // auth + if (!lock->can_xlock_soon()) { + if (!lock->is_stable()) { + dout(7) << "file_xlock_start on auth, waiting for stable on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); + return false; } - } else { - // replica - // fw to auth - int auth = in->authority().first; - dout(7) << "inode_file_xlock_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(mdr, auth); - return false; + + // initiate lock + file_lock(lock); + + // fall-thru to below. } } // check again - if (in->filelock.can_write(in->is_auth())) { - // can i auth pin? - assert(in->is_auth()); - in->filelock.get_write(mdr); - mdr->inode_file_locks.insert(in); - mdr->inode_file_xlocks.insert(in); + if (lock->can_xlock(mdr)) { + assert(lock->get_parent()->is_auth()); + lock->get_xlock(mdr); + mdr->locks.insert(lock); + mdr->xlocks.insert(lock); return true; } else { - dout(7) << "inode_file_xlock_start on auth, waiting for write on " << *in << endl; - in->add_waiter(CInode::WAIT_FILEW, new C_MDS_RetryRequest(mdcache, mdr)); + dout(7) << "file_xlock_start on auth, waiting for write on " << *lock << " on " << *lock->get_parent() << endl; + lock->add_waiter(SimpleLock::WAIT_WR, new C_MDS_RetryRequest(mdcache, mdr)); return false; } } -void Locker::inode_file_xlock_finish(CInode *in, MDRequest *mdr) +void Locker::file_xlock_finish(FileLock *lock, MDRequest *mdr) { // drop ref - //assert(in->filelock.can_write(in->is_auth())); - in->filelock.put_write(); - mdr->inode_file_locks.erase(in); - mdr->inode_file_xlocks.erase(in); - dout(7) << "inode_file_xlock_finish on " << *in << ", filelock=" << in->filelock << endl; + assert(lock->can_xlock(mdr)); + lock->put_xlock(); + mdr->locks.erase(lock); + mdr->xlocks.erase(lock); + dout(7) << "file_xlock_finish on " << *lock << " on " << *lock->get_parent() << endl; // drop 
lock? - if (!in->is_filelock_write_wanted()) { - in->finish_waiting(CInode::WAIT_FILENOWR); - inode_file_eval(in); - } + if (!lock->is_waiting(SimpleLock::WAIT_LOCK)) + file_eval(lock); } @@ -1224,70 +1210,70 @@ void Locker::inode_file_xlock_finish(CInode *in, MDRequest *mdr) * - checks if soft state should change (eg bc last writer closed) */ -void Locker::inode_file_eval(CInode *in) +void Locker::file_eval(FileLock *lock) { + CInode *in = (CInode*)lock->get_parent(); + int issued = in->get_caps_issued(); // [auth] finished gather? if (in->is_auth() && - !in->filelock.is_stable() && - in->filelock.gather_set.size() == 0) { - dout(7) << "inode_file_eval finished mds gather on " << *in << endl; + !lock->is_stable() && + !lock->is_gathering()) { + dout(7) << "file_eval finished mds gather on " << *lock << " on " << *lock->get_parent() << endl; - switch (in->filelock.get_state()) { + switch (lock->get_state()) { // to lock case LOCK_GLOCKR: case LOCK_GLOCKM: case LOCK_GLOCKL: if (issued == 0) { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); // waiters - in->filelock.get_read(); - //in->filelock.get_write(); - in->finish_waiting(CInode::WAIT_FILERWB|CInode::WAIT_FILESTABLE); - in->filelock.put_read(); - //in->filelock.put_write(); + lock->get_rdlock(); + lock->finish_waiters(SimpleLock::WAIT_LOCK|SimpleLock::WAIT_STABLE); + lock->put_rdlock(); } break; // to mixed case LOCK_GMIXEDR: if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_MIXED); - in->finish_waiting(CInode::WAIT_FILESTABLE); + lock->set_state(LOCK_MIXED); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } break; case LOCK_GMIXEDL: if ((issued & ~(CAP_FILE_WR)) == 0) { - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); if (in->is_replicated()) { // data bufferlist softdata; - in->encode_file_state(softdata); + lock->encode_locked_state(softdata); // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); + 
send_lock_message(lock, LOCK_AC_MIXED, softdata); } - in->finish_waiting(CInode::WAIT_FILESTABLE); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } break; // to loner case LOCK_GLONERR: if (issued == 0) { - in->filelock.set_state(LOCK_LONER); - in->finish_waiting(CInode::WAIT_FILESTABLE); + lock->set_state(LOCK_LONER); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } break; case LOCK_GLONERM: if ((issued & ~CAP_FILE_WR) == 0) { - in->filelock.set_state(LOCK_LONER); - in->finish_waiting(CInode::WAIT_FILESTABLE); + lock->set_state(LOCK_LONER); + lock->finish_waiters(SimpleLock::WAIT_STABLE); } break; @@ -1295,19 +1281,19 @@ void Locker::inode_file_eval(CInode *in) case LOCK_GSYNCL: case LOCK_GSYNCM: if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); { // bcast data to replicas bufferlist softdata; - in->encode_file_state(softdata); + lock->encode_locked_state(softdata); - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); + send_lock_message(lock, LOCK_AC_SYNC, softdata); } // waiters - in->filelock.get_read(); - in->finish_waiting(CInode::WAIT_FILER|CInode::WAIT_FILESTABLE); - in->filelock.put_read(); + lock->get_rdlock(); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + lock->put_rdlock(); } break; @@ -1320,26 +1306,24 @@ void Locker::inode_file_eval(CInode *in) // [replica] finished caps gather? 
if (!in->is_auth() && - !in->filelock.is_stable()) { - switch (in->filelock.get_state()) { + !lock->is_stable()) { + switch (lock->get_state()) { case LOCK_GMIXEDR: if ((issued & ~(CAP_FILE_RD)) == 0) { - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); // ack - MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); } break; case LOCK_GLOCKR: if (issued == 0) { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); // ack - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); mds->send_message_mds(reply, in->authority().first, MDS_PORT_LOCKER); } break; @@ -1350,58 +1334,58 @@ void Locker::inode_file_eval(CInode *in) } // !stable -> do nothing. - if (!in->filelock.is_stable()) return; + if (!lock->is_stable()) return; // stable. - assert(in->filelock.is_stable()); + assert(lock->is_stable()); if (in->is_auth()) { // [auth] int wanted = in->get_caps_wanted(); bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); - dout(7) << "inode_file_eval wanted=" << cap_string(wanted) - << " filelock=" << in->filelock + dout(7) << "file_eval wanted=" << cap_string(wanted) + << " filelock=" << *lock << " on " << *lock->get_parent() << " loner=" << loner << endl; // * -> loner? 
- if (in->filelock.get_nread() == 0 && - !in->is_filelock_write_wanted() && + if (lock->get_num_rdlock() == 0 && + !lock->is_waiting(SimpleLock::WAIT_WR) && (wanted & CAP_FILE_WR) && loner && - in->filelock.get_state() != LOCK_LONER) { - dout(7) << "inode_file_eval stable, bump to loner " << *in << ", filelock=" << in->filelock << endl; - inode_file_loner(in); + lock->get_state() != LOCK_LONER) { + dout(7) << "file_eval stable, bump to loner " << *lock << " on " << *lock->get_parent() << endl; + file_loner(lock); } // * -> mixed? - else if (in->filelock.get_nread() == 0 && - !in->is_filelock_write_wanted() && + else if (lock->get_num_rdlock() == 0 && + !lock->is_waiting(SimpleLock::WAIT_WR) && (wanted & CAP_FILE_RD) && (wanted & CAP_FILE_WR) && - !(loner && in->filelock.get_state() == LOCK_LONER) && - in->filelock.get_state() != LOCK_MIXED) { - dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl; - inode_file_mixed(in); + !(loner && lock->get_state() == LOCK_LONER) && + lock->get_state() != LOCK_MIXED) { + dout(7) << "file_eval stable, bump to mixed " << *lock << " on " << *lock->get_parent() << endl; + file_mixed(lock); } // * -> sync? - else if (!in->is_filelock_write_wanted() && + else if (!in->filelock.is_waiting(SimpleLock::WAIT_WR) && !(wanted & CAP_FILE_WR) && ((wanted & CAP_FILE_RD) || in->is_replicated() || - (!loner && in->filelock.get_state() == LOCK_LONER)) && - in->filelock.get_state() != LOCK_SYNC) { - dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl; - inode_file_sync(in); + (!loner && lock->get_state() == LOCK_LONER)) && + lock->get_state() != LOCK_SYNC) { + dout(7) << "file_eval stable, bump to sync " << *lock << " on " << *lock->get_parent() << endl; + file_sync(lock); } // * -> lock? 
(if not replicated or open) else if (!in->is_replicated() && wanted == 0 && - in->filelock.get_state() != LOCK_LOCK) { - inode_file_lock(in); + lock->get_state() != LOCK_LOCK) { + file_lock(lock); } } else { @@ -1413,77 +1397,78 @@ void Locker::inode_file_eval(CInode *in) // mid -bool Locker::inode_file_sync(CInode *in) +bool Locker::file_sync(FileLock *lock) { - dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl; + CInode *in = (CInode*)lock->get_parent(); + dout(7) << "file_sync " << *lock << " on " << *lock->get_parent() << endl; assert(in->is_auth()); // check state - if (in->filelock.get_state() == LOCK_SYNC || - in->filelock.get_state() == LOCK_GSYNCL || - in->filelock.get_state() == LOCK_GSYNCM) + if (lock->get_state() == LOCK_SYNC || + lock->get_state() == LOCK_GSYNCL || + lock->get_state() == LOCK_GSYNCM) return true; - assert(in->filelock.is_stable()); + assert(lock->is_stable()); int issued = in->get_caps_issued(); assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); - if (in->filelock.get_state() == LOCK_LOCK) { + if (lock->get_state() == LOCK_LOCK) { if (in->is_replicated()) { // soft data bufferlist softdata; - in->encode_file_state(softdata); + lock->encode_locked_state(softdata); // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); + send_lock_message(lock, LOCK_AC_SYNC, softdata); } // change lock - in->filelock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); // reissue caps issue_caps(in); return true; } - else if (in->filelock.get_state() == LOCK_MIXED) { + else if (lock->get_state() == LOCK_MIXED) { // writers? 
if (issued & CAP_FILE_WR) { // gather client write caps - in->filelock.set_state(LOCK_GSYNCM); + lock->set_state(LOCK_GSYNCM); issue_caps(in); } else { // no writers, go straight to sync if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); + send_lock_message(lock, LOCK_AC_SYNC); } // change lock - in->filelock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); } return false; } - else if (in->filelock.get_state() == LOCK_LONER) { + else if (lock->get_state() == LOCK_LONER) { // writers? if (issued & CAP_FILE_WR) { // gather client write caps - in->filelock.set_state(LOCK_GSYNCL); + lock->set_state(LOCK_GSYNCL); issue_caps(in); } else { // no writers, go straight to sync if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); + send_lock_message(lock, LOCK_AC_SYNC); } // change lock - in->filelock.set_state(LOCK_SYNC); + lock->set_state(LOCK_SYNC); } return false; } @@ -1495,31 +1480,32 @@ bool Locker::inode_file_sync(CInode *in) -void Locker::inode_file_lock(CInode *in) +void Locker::file_lock(FileLock *lock) { - dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl; + CInode *in = (CInode*)lock->get_parent(); + dout(7) << "inode_file_lock " << *lock << " on " << *lock->get_parent() << endl; assert(in->is_auth()); // check state - if (in->filelock.get_state() == LOCK_LOCK || - in->filelock.get_state() == LOCK_GLOCKR || - in->filelock.get_state() == LOCK_GLOCKM || - in->filelock.get_state() == LOCK_GLOCKL) + if (lock->get_state() == LOCK_LOCK || + lock->get_state() == LOCK_GLOCKR || + lock->get_state() == LOCK_GLOCKM || + lock->get_state() == LOCK_GLOCKL) return; // lock or locking - assert(in->filelock.is_stable()); + assert(lock->is_stable()); int issued = in->get_caps_issued(); - if (in->filelock.get_state() == LOCK_SYNC) { + if (lock->get_state() == LOCK_SYNC) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, 
LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); // change lock - in->filelock.set_state(LOCK_GLOCKR); + lock->set_state(LOCK_GLOCKR); // call back caps if (issued) @@ -1527,22 +1513,22 @@ void Locker::inode_file_lock(CInode *in) } else { if (issued) { // call back caps - in->filelock.set_state(LOCK_GLOCKR); + lock->set_state(LOCK_GLOCKR); issue_caps(in); } else { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); } } } - else if (in->filelock.get_state() == LOCK_MIXED) { + else if (lock->get_state() == LOCK_MIXED) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); // change lock - in->filelock.set_state(LOCK_GLOCKM); + lock->set_state(LOCK_GLOCKM); // call back caps issue_caps(in); @@ -1550,25 +1536,25 @@ void Locker::inode_file_lock(CInode *in) //assert(issued); // ??? 
-sage 2/19/06 if (issued) { // change lock - in->filelock.set_state(LOCK_GLOCKM); + lock->set_state(LOCK_GLOCKM); // call back caps issue_caps(in); } else { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); } } } - else if (in->filelock.get_state() == LOCK_LONER) { + else if (lock->get_state() == LOCK_LONER) { if (issued & CAP_FILE_WR) { // change lock - in->filelock.set_state(LOCK_GLOCKL); + lock->set_state(LOCK_GLOCKL); // call back caps issue_caps(in); } else { - in->filelock.set_state(LOCK_LOCK); + lock->set_state(LOCK_LOCK); } } else @@ -1576,67 +1562,68 @@ void Locker::inode_file_lock(CInode *in) } -void Locker::inode_file_mixed(CInode *in) +void Locker::file_mixed(FileLock *lock) { - dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl; + dout(7) << "file_mixed " << *lock << " on " << *lock->get_parent() << endl; + CInode *in = (CInode*)lock->get_parent(); assert(in->is_auth()); // check state - if (in->filelock.get_state() == LOCK_GMIXEDR || - in->filelock.get_state() == LOCK_GMIXEDL) + if (lock->get_state() == LOCK_GMIXEDR || + lock->get_state() == LOCK_GMIXEDL) return; // mixed or mixing - assert(in->filelock.is_stable()); + assert(lock->is_stable()); int issued = in->get_caps_issued(); - if (in->filelock.get_state() == LOCK_SYNC) { + if (lock->get_state() == LOCK_SYNC) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_MIXED); + lock->init_gather(); - in->filelock.set_state(LOCK_GMIXEDR); + lock->set_state(LOCK_GMIXEDR); issue_caps(in); } else { if (issued) { - in->filelock.set_state(LOCK_GMIXEDR); + lock->set_state(LOCK_GMIXEDR); issue_caps(in); } else { - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); } } } - else if (in->filelock.get_state() == LOCK_LOCK) { + else if (lock->get_state() == LOCK_LOCK) { if (in->is_replicated()) { // data bufferlist 
softdata; - in->encode_file_state(softdata); + lock->encode_locked_state(softdata); // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); + send_lock_message(lock, LOCK_AC_MIXED, softdata); } // change lock - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); issue_caps(in); } - else if (in->filelock.get_state() == LOCK_LONER) { + else if (lock->get_state() == LOCK_LONER) { if (issued & CAP_FILE_WRBUFFER) { // gather up WRBUFFER caps - in->filelock.set_state(LOCK_GMIXEDL); + lock->set_state(LOCK_GMIXEDL); issue_caps(in); } else if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE); - in->filelock.set_state(LOCK_MIXED); + send_lock_message(lock, LOCK_AC_MIXED); + lock->set_state(LOCK_MIXED); issue_caps(in); } else { - in->filelock.set_state(LOCK_MIXED); + lock->set_state(LOCK_MIXED); issue_caps(in); } } @@ -1646,52 +1633,53 @@ void Locker::inode_file_mixed(CInode *in) } -void Locker::inode_file_loner(CInode *in) +void Locker::file_loner(FileLock *lock) { - dout(7) << "inode_file_loner " << *in << " filelock=" << in->filelock << endl; + CInode *in = (CInode*)lock->get_parent(); + dout(7) << "inode_file_loner " << *lock << " on " << *lock->get_parent() << endl; assert(in->is_auth()); // check state - if (in->filelock.get_state() == LOCK_LONER || - in->filelock.get_state() == LOCK_GLONERR || - in->filelock.get_state() == LOCK_GLONERM) + if (lock->get_state() == LOCK_LONER || + lock->get_state() == LOCK_GLONERR || + lock->get_state() == LOCK_GLONERM) return; - assert(in->filelock.is_stable()); + assert(lock->is_stable()); assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); - if (in->filelock.get_state() == LOCK_SYNC) { + if (lock->get_state() == LOCK_SYNC) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_LOCK); + 
lock->init_gather(); // change lock - in->filelock.set_state(LOCK_GLONERR); + lock->set_state(LOCK_GLONERR); } else { // only one guy with file open, who gets it all, so - in->filelock.set_state(LOCK_LONER); + lock->set_state(LOCK_LONER); issue_caps(in); } } - else if (in->filelock.get_state() == LOCK_LOCK) { + else if (lock->get_state() == LOCK_LOCK) { // change lock. ignore replicas; they don't know about LONER. - in->filelock.set_state(LOCK_LONER); + lock->set_state(LOCK_LONER); issue_caps(in); } - else if (in->filelock.get_state() == LOCK_MIXED) { + else if (lock->get_state() == LOCK_MIXED) { if (in->is_replicated()) { // bcast to replicas - send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); - in->filelock.init_gather(in->get_replicas()); + send_lock_message(lock, LOCK_AC_LOCK); + lock->init_gather(); // change lock - in->filelock.set_state(LOCK_GLONERM); + lock->set_state(LOCK_GLONERM); } else { - in->filelock.set_state(LOCK_LONER); + lock->set_state(LOCK_LONER); issue_caps(in); } } @@ -1700,52 +1688,20 @@ void Locker::inode_file_loner(CInode *in) assert(0); } + + // messenger -void Locker::handle_lock_inode_file(MLock *m) +void Locker::handle_file_lock(FileLock *lock, MLock *m) { - assert(m->get_otype() == LOCK_OTYPE_IFILE); - if (mds->logger) mds->logger->inc("lif"); - CInode *in = mdcache->get_inode(m->get_ino()); + CInode *in = (CInode*)lock->get_parent(); int from = m->get_asker(); - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - assert(in); - assert(in->is_auth());// || in->is_proxy()); - dout(7) << "handle_lock_inode_file " << *in << " hardlock=" << in->hardlock << endl; - - /*if (in->is_proxy()) { - // fw - int newauth = in->authority().first; - assert(newauth >= 0); - if (from == newauth) { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; - delete m; - } else { - dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; - 
mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); - } - return; - } - */ - } else { - // replica - if (!in) { - // drop it. don't nak. - dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl; - delete m; - return; - } - - assert(!in->is_auth()); - } - - dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl; + dout(7) << "handle_file_lock a=" << m->get_action() << " from " << from << " " + << *in << " filelock=" << *lock << endl; - CLock *lock = &in->filelock; int issued = in->get_caps_issued(); switch (m->get_action()) { @@ -1754,21 +1710,16 @@ void Locker::handle_lock_inode_file(MLock *m) assert(lock->get_state() == LOCK_LOCK || lock->get_state() == LOCK_MIXED); - { // assim data - int off = 0; - in->decode_file_state(m->get_data(), off); - } - - // update lock + lock->decode_locked_state(m->get_data()); lock->set_state(LOCK_SYNC); // no need to reply. // waiters - in->filelock.get_read(); - in->finish_waiting(CInode::WAIT_FILER|CInode::WAIT_FILESTABLE); - in->filelock.put_read(); - inode_file_eval(in); + lock->get_rdlock(); + lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE); + lock->put_rdlock(); + file_eval(lock); break; case LOCK_AC_LOCK: @@ -1777,13 +1728,12 @@ void Locker::handle_lock_inode_file(MLock *m) // call back caps? 
if (issued & CAP_FILE_RD) { - dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl; + dout(7) << "handle_file_lock client readers, gathering caps on " << *in << endl; issue_caps(in); } - if (lock->get_nread() > 0) { - dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl; - in->add_waiter(CInode::WAIT_FILENORD, - new C_MDS_RetryMessage(mds,m)); + if (lock->get_num_rdlock() > 0) { + dout(7) << "handle_file_lock readers, waiting before ack on " << *in << endl; + in->add_waiter(SimpleLock::WAIT_NORD, new C_MDS_RetryMessage(mds, m)); lock->set_state(LOCK_GLOCKR); assert(0);// i am broken.. why retry message when state captures all the info i need? return; @@ -1796,9 +1746,8 @@ void Locker::handle_lock_inode_file(MLock *m) // nothing to wait for, lock and ack. { lock->set_state(LOCK_LOCK); - - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + + MLock *reply = new MLock(lock, LOCK_AC_LOCKACK, mds->get_nodeid()); mds->send_message_mds(reply, from, MDS_PORT_LOCKER); } break; @@ -1819,8 +1768,7 @@ void Locker::handle_lock_inode_file(MLock *m) lock->set_state(LOCK_MIXED); // ack - MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + MLock *reply = new MLock(lock, LOCK_AC_MIXEDACK, mds->get_nodeid()); mds->send_message_mds(reply, from, MDS_PORT_LOCKER); } } else { @@ -1833,10 +1781,8 @@ void Locker::handle_lock_inode_file(MLock *m) issue_caps(in); // waiters - //in->filelock.get_write(); - in->finish_waiting(CInode::WAIT_FILEW|CInode::WAIT_FILESTABLE); - //in->filelock.put_write(); - inode_file_eval(in); + lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE); + file_eval(lock); break; @@ -1844,25 +1790,27 @@ void Locker::handle_lock_inode_file(MLock *m) // -- auth -- case LOCK_AC_LOCKACK: - assert(lock->state == LOCK_GLOCKR || - lock->state == LOCK_GLOCKM || - lock->state == 
LOCK_GLONERM || - lock->state == LOCK_GLONERR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); - - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + assert(lock->get_state() == LOCK_GLOCKR || + lock->get_state() == LOCK_GLOCKM || + lock->get_state() == LOCK_GLONERM || + lock->get_state() == LOCK_GLONERR); + assert(lock->is_gathering(from)); + lock->remove_gather(from); + + if (lock->is_gathering()) { + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", still gathering " << lock->get_gather_set() << endl; } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", last one" << endl; + file_eval(lock); } break; case LOCK_AC_SYNCACK: - assert(lock->state == LOCK_GSYNCM); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); + assert(lock->get_state() == LOCK_GSYNCM); + assert(lock->is_gathering(from)); + lock->remove_gather(from); /* not used currently { @@ -1872,24 +1820,28 @@ void Locker::handle_lock_inode_file(MLock *m) } */ - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + if (lock->is_gathering()) { + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", still gathering " << lock->get_gather_set() << endl; } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", last one" << endl; + file_eval(lock); } break; case LOCK_AC_MIXEDACK: - assert(lock->state == LOCK_GMIXEDR); - assert(lock->gather_set.count(from)); - lock->gather_set.erase(from); + assert(lock->get_state() == LOCK_GMIXEDR); + 
assert(lock->is_gathering(from)); + lock->remove_gather(from); - if (lock->gather_set.size()) { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + if (lock->is_gathering()) { + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", still gathering " << lock->get_gather_set() << endl; } else { - dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; - inode_file_eval(in); + dout(7) << "handle_lock_inode_file " << *in << " from " << from + << ", last one" << endl; + file_eval(lock); } break; @@ -1906,666 +1858,3 @@ void Locker::handle_lock_inode_file(MLock *m) - - - - - - - - -void Locker::handle_lock_dir(MLock *m) -{ -} - - - -// DENTRY - - -// trace helpers - -/** dentry_can_rdlock_trace - * see if we can _anonymously_ rdlock an entire trace. - * if not, and req is specified, wait and retry that message. - */ -bool Locker::dentry_can_rdlock_trace(vector& trace, MClientRequest *req) -{ - // verify dentries are rdlockable. - // we do this because - // - we're being less aggressive about locks acquisition, and - // - we're not acquiring the locks in order! 
- for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) { - CDentry *dn = *it; - if (!dn->can_rdlock(0)) { - if (req) { - dout(10) << "can_rdlock_trace can't rdlock " << *dn << ", waiting" << endl; - dn->dir->add_waiter(CDir::WAIT_DNPINNABLE, - dn->name, - new C_MDS_RetryMessage(mds, req)); - } else { - dout(10) << "can_rdlock_trace can't rdlock " << *dn << endl; - } - return false; - } - } - return true; -} - -void Locker::dentry_anon_rdlock_trace_start(vector& trace) -{ - // grab dentry rdlocks - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) - (*it)->get_rdlock(0); -} - - - -bool Locker::dentry_rdlock_start(CDentry *dn, MDRequest *mdr) -{ - // verify lockable - if (!dn->can_rdlock(mdr)) { - // wait - dout(10) << "dentry_rdlock_start waiting on " << *dn << endl; - dn->dir->add_waiter(CDir::WAIT_DNPINNABLE, - dn->name, - new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // rdlock - dout(10) << "dentry_rdlock_start " << *dn << endl; - dn->get_rdlock(mdr); - - mdr->dentry_rdlocks.insert(dn); - mdr->dentry_locks.insert(dn); - - return true; -} - - -void Locker::_dentry_rdlock_finish(CDentry *dn, MDRequest *mdr) -{ - dn->put_rdlock(mdr); - - // did we completely unpin a waiter? - if (dn->lockstate == DN_LOCK_UNPINNING && !dn->get_num_ref()) { - // return state to sync, in case the unpinner flails - dn->lockstate = DN_LOCK_SYNC; - - // run finisher right now to give them a fair shot. 
- dn->dir->finish_waiting(CDir::WAIT_DNUNPINNED, dn->name); - } -} - -void Locker::dentry_rdlock_finish(CDentry *dn, MDRequest *mdr) -{ - dout(10) << "dentry_rdlock_finish " << *dn << endl; - _dentry_rdlock_finish(dn, mdr); - mdr->dentry_rdlocks.erase(dn); - mdr->dentry_locks.erase(dn); -} - -void Locker::dentry_anon_rdlock_trace_finish(vector& trace) -{ - for (vector::iterator it = trace.begin(); - it != trace.end(); - it++) - _dentry_rdlock_finish(*it, 0); -} - -bool Locker::dentry_xlock_start(CDentry *dn, MDRequest *mdr) -{ - dout(7) << "dentry_xlock_start on " << *dn << endl; - - // locked? - if (dn->lockstate == DN_LOCK_XLOCK) { - if (dn->xlockedby == mdr) return true; // locked by me! - - // not by me, wait - dout(7) << "dentry " << *dn << " xlock by someone else" << endl; - dn->dir->add_waiter(CDir::WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // prelock? - if (dn->lockstate == DN_LOCK_PREXLOCK) { - if (dn->xlockedby == mdr) { - dout(7) << "dentry " << *dn << " prexlock by me" << endl; - dn->dir->add_waiter(CDir::WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(7) << "dentry " << *dn << " prexlock by someone else" << endl; - dn->dir->add_waiter(CDir::WAIT_DNREAD, dn->name, - new C_MDS_RetryRequest(mdcache, mdr)); - } - return false; - } - - - // lockable! - assert(dn->lockstate == DN_LOCK_SYNC || - dn->lockstate == DN_LOCK_UNPINNING); - - // is dentry path pinned? - if (dn->is_rdlocked()) { - dout(7) << "dentry " << *dn << " pinned, waiting" << endl; - dn->lockstate = DN_LOCK_UNPINNING; - dn->dir->add_waiter(CDir::WAIT_DNUNPINNED, - dn->name, - new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } - - // mine! - dn->xlockedby = mdr; - - // pin me! - dn->get(CDentry::PIN_XLOCK); - - if (dn->is_replicated()) { - dn->lockstate = DN_LOCK_PREXLOCK; - - // xlock with whom? 
- set who; - for (map::iterator p = dn->replicas_begin(); - p != dn->replicas_end(); - ++p) - who.insert(p->first); - dn->gather_set = who; - - // make path - string path; - dn->make_path(path); - dout(10) << "path is " << path << " for " << *dn << endl; - - for (set::iterator it = who.begin(); - it != who.end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_dn(dn->dir->dirfrag(), dn->name); - m->set_path(path); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } - - // wait - dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl; - dn->dir->add_waiter(CDir::WAIT_DNLOCK, dn->name, - new C_MDS_RetryRequest(mdcache, mdr)); - return false; - } else { - dn->lockstate = DN_LOCK_XLOCK; - mdr->dentry_xlocks.insert(dn); - mdr->dentry_locks.insert(dn); - return true; - } -} - -void Locker::dentry_xlock_finish(CDentry *dn, MDRequest *mdr, bool quiet) -{ - dout(7) << "dentry_xlock_finish on " << *dn << endl; - - assert(dn->xlockedby); - if (dn->xlockedby == DN_XLOCK_FOREIGN) { - dout(7) << "this was a foreign xlock" << endl; - } else { - // remove from request record - mdr->dentry_xlocks.erase(dn); - mdr->dentry_locks.erase(dn); - } - - dn->xlockedby = 0; - dn->lockstate = DN_LOCK_SYNC; - - // unpin - dn->put(CDentry::PIN_XLOCK); - - // tell replicas? - if (!quiet) { - // tell even if dn is null. 
- if (dn->is_replicated()) { - send_lock_message(dn, LOCK_AC_SYNC); - } - } - - // kick waiters - list finished; - dn->dir->take_waiting(CDir::WAIT_DNREAD, finished); - mds->queue_finished(finished); -} - - -void Locker::dentry_xlock_downgrade_to_rdlock(CDentry *dn, MDRequest *mdr) -{ - dout(7) << "dentry_xlock_downgrade_to_rdlock on " << *dn << endl; - - assert(dn->xlockedby); - if (dn->xlockedby == DN_XLOCK_FOREIGN) { - dout(7) << "this was a foreign xlock" << endl; - assert(0); // rewrite me - } - - // un-xlock - dn->xlockedby = 0; - dn->lockstate = DN_LOCK_SYNC; - mdr->dentry_xlocks.erase(dn); - dn->put(CDentry::PIN_XLOCK); - - // rdlock - mdr->dentry_rdlocks.insert(dn); - dn->get_rdlock(mdr); - - // tell replicas? - if (dn->is_replicated()) { - send_lock_message(dn, LOCK_AC_SYNC); - } - - // kick waiters - list finished; - dn->dir->take_waiting(CDir::WAIT_DNREAD, finished); - mds->queue_finished(finished); -} - - -/* - * onfinish->finish() will be called with - * 0 on successful xlock, - * -1 on failure - */ -/* -class C_MDC_XlockRequest : public Context { - Locker *mdc; - CDir *dir; - string dname; - Message *req; - Context *finisher; -public: - C_MDC_XlockRequest(Locker *mdc, - CDir *dir, const string& dname, - Message *req, - Context *finisher) { - this->mdc = mdc; - this->dir = dir; - this->dname = dname; - this->req = req; - this->finisher = finisher; - } - - void finish(int r) { - mdc->dentry_xlock_request_finish(r, dir, dname, req, finisher); - } -}; - -void Locker::dentry_xlock_request_finish(int r, - CDir *dir, const string& dname, - Message *req, - Context *finisher) -{ - dout(10) << "dentry_xlock_request_finish r = " << r << endl; - if (r == 1) { // 1 for xlock request success - CDentry *dn = dir->lookup(dname); - if (dn && dn->xlockedby == 0) { - // success - dn->xlockedby = req; // our request was the winner - dout(10) << "xlock request success, now xlocked by req " << req << " dn " << *dn << endl; - - // remember! 
- mdcache->active_requests[req].foreign_xlocks.insert(dn); - } - } - - // retry request (or whatever) - finisher->finish(0); - delete finisher; -} - -void Locker::dentry_xlock_request(CDir *dir, const string& dname, bool create, - Message *req, Context *onfinish) -{ - dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl; - // send request - int dauth = dir->dentry_authority(dname).first; - MLock *m = new MLock(create ? LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, mds->get_nodeid()); - m->set_dn(dir->dirfrag(), dname); - mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); - - // add waiter - dir->add_waiter(CDir::WAIT_DNREQXLOCK, dname, - new C_MDC_XlockRequest(this, - dir, dname, req, - onfinish)); -} -*/ - - - -void Locker::handle_lock_dn(MLock *m) -{ - assert(m->get_otype() == LOCK_OTYPE_DN); - - CDir *dir = mdcache->get_dirfrag(m->get_dirfrag()); // may be null - string dname = m->get_dn(); - int from = m->get_asker(); - CDentry *dn = 0; - - if (LOCK_AC_FOR_AUTH(m->get_action())) { - // auth - - // normally we have it always - if (dir) { - int dauth = dir->dentry_authority(dname).first; - assert(dauth == mds->get_nodeid() || dir->is_proxy() || // mine or proxy, - m->get_action() == LOCK_AC_REQXLOCKACK || // or we did a REQXLOCK and this is our ack/nak - m->get_action() == LOCK_AC_REQXLOCKNAK); - - if (dir->is_proxy()) { - - assert(dauth >= 0); - - if (dauth == m->get_asker() && - (m->get_action() == LOCK_AC_REQXLOCK || - m->get_action() == LOCK_AC_REQXLOCKC)) { - dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. 
dropping on floor (their import will have woken them up)" << endl; - /*if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - else - delete m; - */ - assert(0); // FIXME REWRITE ME >>>>>>> - return; - } - - dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl; - - /* ******* REWRITE ME SDFKJDSFDSFJK:SDFJKDFSJKFDSHJKDFSHJKDFS>>>>>>> - // forward - if (mdcache->active_requests.count(m)) { - // xlock requests are requests, use request_* functions! - assert(m->get_action() == LOCK_AC_REQXLOCK || - m->get_action() == LOCK_AC_REQXLOCKC); - // forward as a request - mdcache->request_forward(m, dauth, MDS_PORT_LOCKER); - } else { - // not an xlock req, or it is and we just didn't register the request yet - // forward normally - mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); - } - */ - return; - } - - dn = dir->lookup(dname); - } - - // except with.. an xlock request? - if (!dn) { - assert(dir); // we should still have the dir, though! the requester has the dir open. 
- switch (m->get_action()) { - - case LOCK_AC_LOCK: - dout(7) << "handle_lock_dn xlock on " << dname << ", adding (null)" << endl; - dn = dir->add_dentry(dname); - break; - - case LOCK_AC_REQXLOCK: - // send nak - if (dir->state_test(CDir::STATE_DELETED)) { - dout(7) << "handle_lock_dn reqxlock on deleted dir " << *dir << ", nak" << endl; - } else { - dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << " dne, nak" << endl; - } - { - MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); - reply->set_dn(dir->dirfrag(), dname); - reply->set_path(m->get_path()); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - } - - // finish request (if we got that far) - /* FIXME F>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - */ - - delete m; - return; - - case LOCK_AC_REQXLOCKC: - dout(7) << "handle_lock_dn reqxlockc on " << dname << " in " << *dir << " dne (yet!)" << endl; - break; - - default: - assert(0); - } - } - } else { - // replica - if (dir) dn = dir->lookup(dname); - if (!dn) { - dout(7) << "handle_lock_dn " << m << " don't have " << m->get_ino() << " dname " << dname << endl; - - if (m->get_action() == LOCK_AC_REQXLOCKACK || - m->get_action() == LOCK_AC_REQXLOCKNAK) { - dout(7) << "handle_lock_dn got reqxlockack/nak, but don't have dn " << m->get_path() << ", discovering" << endl; - //assert(0); // how can this happen? tell me now! 
- - vector trace; - filepath path = m->get_path(); - int r = mdcache->path_traverse(0, 0, // FIXME FIXME >>>>>>>>>>>>>>>>>>>>>>>> - path, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER, - false, true); - assert(r>0); - return; - } - - if (m->get_action() == LOCK_AC_LOCK) { - if (0) { // not anymore - dout(7) << "handle_lock_dn don't have " << m->get_path() << ", discovering" << endl; - - vector trace; - filepath path = m->get_path(); - int r = mdcache->path_traverse(0, 0, // FIXME >>>>>>>>>>>>>>>>>>>>>>>> - path, trace, true, - m, new C_MDS_RetryMessage(mds,m), - MDS_TRAVERSE_DISCOVER, - false, true); - assert(r>0); - } - if (1) { - // NAK - MLock *reply = new MLock(LOCK_AC_LOCKNAK, mds->get_nodeid()); - reply->set_dn(m->get_dirfrag(), dname); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - } - } else { - dout(7) << "safely ignoring." << endl; - delete m; - } - return; - } - - assert(dn); - } - - if (dn) { - dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << *dn << endl; - } else { - dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << dname << " in " << *dir << endl; - } - - switch (m->get_action()) { - // -- replica -- - case LOCK_AC_LOCK: - assert(dn->lockstate == DN_LOCK_SYNC || - dn->lockstate == DN_LOCK_UNPINNING || - dn->lockstate == DN_LOCK_XLOCK); // <-- bc the handle_lock_dn did the discover! - - if (dn->is_rdlocked()) { - dn->lockstate = DN_LOCK_UNPINNING; - - // wait - dout(7) << "dn pinned, waiting " << *dn << endl; - dn->dir->add_waiter(CDir::WAIT_DNUNPINNED, - dn->name, - new C_MDS_RetryMessage(mds, m)); - return; - } else { - dn->lockstate = DN_LOCK_XLOCK; - dn->xlockedby = 0; - - // ack now - MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); - reply->set_dn(dir->dirfrag(), dname); - mds->send_message_mds(reply, from, MDS_PORT_LOCKER); - } - - // wake up waiters - dir->finish_waiting(CDir::WAIT_DNLOCK, dname); // ? 
will this happen on replica ? - break; - - case LOCK_AC_SYNC: - assert(dn->lockstate == DN_LOCK_XLOCK); - dn->lockstate = DN_LOCK_SYNC; - dn->xlockedby = 0; - - // null? hose it. - if (dn->is_null()) { - dout(7) << "hosing null (and now sync) dentry " << *dn << endl; - dir->remove_dentry(dn); - } - - // wake up waiters - dir->finish_waiting(CDir::WAIT_DNREAD, dname); // will this happen either? YES: if a rename lock backs out - break; - - case LOCK_AC_REQXLOCKACK: - case LOCK_AC_REQXLOCKNAK: - { - dout(10) << "handle_lock_dn got ack/nak on a reqxlock for " << *dn << endl; - list finished; - dir->take_waiting(CDir::WAIT_DNREQXLOCK, m->get_dn(), finished, 1); // TAKE ONE ONLY! - finish_contexts(finished, - (m->get_action() == LOCK_AC_REQXLOCKACK) ? 1:-1); - } - break; - - - // -- auth -- - case LOCK_AC_LOCKACK: - case LOCK_AC_LOCKNAK: - assert(dn->gather_set.count(from) == 1); - dn->gather_set.erase(from); - if (dn->gather_set.size() == 0) { - dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl; - dn->lockstate = DN_LOCK_XLOCK; - // FIXME - mdcache->slave_requests[dn->xlockedby->reqid]->dentry_xlocks.insert(dn); - mdcache->slave_requests[dn->xlockedby->reqid]->dentry_locks.insert(dn); - dir->finish_waiting(CDir::WAIT_DNLOCK, dname); - } - break; - - - case LOCK_AC_REQXLOCKC: - // make sure it's a _file_, if it exists. 
- if (dn && dn->inode && dn->inode->is_dir()) { - dout(7) << "handle_lock_dn failing, reqxlockc on dir " << *dn->inode << endl; - - // nak - string path; - dn->make_path(path); - - MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); - reply->set_dn(dir->dirfrag(), dname); - reply->set_path(path); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - - assert(0); // FIXME - /* - // done - if (mdcache->active_requests.count(m)) - mdcache->request_finish(m); - else - delete m; - */ - return; - } - - /* REWRITE ME HELP - case LOCK_AC_REQXLOCK: - if (dn) { - dout(7) << "handle_lock_dn reqxlock on " << *dn << endl; - } else { - dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << endl; - } - - - // start request? - if (!mdcache->active_requests.count(m)) { - vector trace; - if (!mdcache->request_start(m, dir->inode, trace)) - return; // waiting for pin - } - - // try to xlock! - if (!dn) { - assert(m->get_action() == LOCK_AC_REQXLOCKC); - dn = dir->add_dentry(dname); - } - - if (dn->xlockedby != m) { - if (!dentry_xlock_start(dn, m, dir->inode)) { - // hose null dn if we're waiting on something - if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn); - return; // waiting for xlock - } - } else { - // successfully xlocked! on behalf of requestor. 
- string path; - dn->make_path(path); - - dout(7) << "handle_lock_dn reqxlock success for " << m->get_asker() << " on " << *dn << ", acking" << endl; - - // ACK xlock request - MLock *reply = new MLock(LOCK_AC_REQXLOCKACK, mds->get_nodeid()); - reply->set_dn(dir->dirfrag(), dname); - reply->set_path(path); - mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); - - // note: keep request around in memory (to hold the xlock/pins on behalf of requester) - return; - } - break; -*/ - - case LOCK_AC_UNXLOCK: - dout(7) << "handle_lock_dn unxlock on " << *dn << endl; - { - MDRequest *mdr = dn->xlockedby; - - // finish request - mdcache->request_finish(mdr); // this will drop the locks (and unpin paths!) - return; - } - break; - - default: - assert(0); - } - - delete m; -} - - - - - - - diff --git a/branches/sage/cephmds2/mds/Locker.h b/branches/sage/cephmds2/mds/Locker.h index f819e13209729..eeec966a4a150 100644 --- a/branches/sage/cephmds2/mds/Locker.h +++ b/branches/sage/cephmds2/mds/Locker.h @@ -43,6 +43,8 @@ class MClientRequest; class Anchor; class Capability; +class SimpleLock; +class FileLock; class Locker { private: @@ -53,50 +55,51 @@ private: Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} void dispatch(Message *m); + void handle_lock(MLock *m); - void send_lock_message(CInode *in, int msg, int type); - void send_lock_message(CInode *in, int msg, int type, bufferlist& data); - void send_lock_message(CDentry *dn, int msg); + void send_lock_message(SimpleLock *lock, int msg); + void send_lock_message(SimpleLock *lock, int msg, bufferlist &data); // -- locks -- bool acquire_locks(MDRequest *mdr, - set &dentry_rdlocks, - set &dentry_xlocks, - set &inode_hard_rdlocks, - set &inode_hard_xlocks); + set &rdlocks, + set &xlocks); + + bool rdlock_try(SimpleLock *lock, Context *con); + bool rdlock_start(SimpleLock *lock, MDRequest *mdr); + void rdlock_finish(SimpleLock *lock, MDRequest *mdr); + bool xlock_start(SimpleLock *lock, MDRequest *mdr); + void 
xlock_finish(SimpleLock *lock, MDRequest *mdr); + + // simple + void handle_simple_lock(SimpleLock *lock, MLock *m); + void simple_eval(SimpleLock *lock); + void simple_sync(SimpleLock *lock); + void simple_lock(SimpleLock *lock); + bool simple_rdlock_try(SimpleLock *lock, Context *con); + bool simple_rdlock_start(SimpleLock *lock, MDRequest *mdr); + void simple_rdlock_finish(SimpleLock *lock, MDRequest *mdr); + bool simple_xlock_start(SimpleLock *lock, MDRequest *mdr); + void simple_xlock_finish(SimpleLock *lock, MDRequest *mdr); + + bool dentry_can_rdlock_trace(vector& trace, MClientRequest *req); + void dentry_anon_rdlock_trace_start(vector& trace); + void dentry_anon_rdlock_trace_finish(vector& trace); + // file + void handle_file_lock(FileLock *lock, MLock *m); + void file_eval(FileLock *lock); + bool file_sync(FileLock *lock); + void file_lock(FileLock *lock); + void file_mixed(FileLock *lock); + void file_loner(FileLock *lock); + bool file_rdlock_try(FileLock *lock, Context *con); + bool file_rdlock_start(FileLock *lock, MDRequest *mdr); + void file_rdlock_finish(FileLock *lock, MDRequest *mdr); + bool file_xlock_start(FileLock *lock, MDRequest *mdr); + void file_xlock_finish(FileLock *lock, MDRequest *mdr); - // high level interface - public: - bool inode_hard_rdlock_try(CInode *in, Context *con); - bool inode_hard_rdlock_start(CInode *in, MDRequest *mdr); - void inode_hard_rdlock_finish(CInode *in, MDRequest *mdr); - bool inode_hard_xlock_start(CInode *in, MDRequest *mdr); - void inode_hard_xlock_finish(CInode *in, MDRequest *mdr); - bool inode_file_rdlock_start(CInode *in, MDRequest *mdr); - void inode_file_rdlock_finish(CInode *in, MDRequest *mdr); - bool inode_file_xlock_start(CInode *in, MDRequest *mdr); - void inode_file_xlock_finish(CInode *in, MDRequest *mdr); - - void inode_hard_eval(CInode *in); - void inode_file_eval(CInode *in); - protected: - void inode_hard_mode(CInode *in, int mode); - void inode_file_mode(CInode *in, int mode); - - // low 
level triggers - void inode_hard_sync(CInode *in); - void inode_hard_lock(CInode *in); - bool inode_file_sync(CInode *in); - void inode_file_lock(CInode *in); - void inode_file_mixed(CInode *in); - void inode_file_loner(CInode *in); - - // messengers - void handle_lock(MLock *m); - void handle_lock_inode_hard(MLock *m); - void handle_lock_inode_file(MLock *m); // -- file i/o -- public: @@ -111,31 +114,6 @@ private: void handle_inode_file_caps(class MInodeFileCaps *m); - // dirs - void handle_lock_dir(MLock *m); - - // dentry locks - void _dentry_rdlock_finish(CDentry *dn, MDRequest *mdr); - public: - bool dentry_rdlock_start(CDentry *dn, MDRequest *mdr); - void dentry_rdlock_finish(CDentry *dn, MDRequest *mdr); - bool dentry_can_rdlock_trace(vector& trace, MClientRequest *req); - void dentry_anon_rdlock_trace_start(vector& trace); - void dentry_anon_rdlock_trace_finish(vector& trace); - - bool dentry_xlock_start(CDentry *dn, MDRequest *mdr); - void dentry_xlock_finish(CDentry *dn, MDRequest *mdr, bool quiet=false); - //bool dentry_xlock_upgrade_from_rdlock(CDentry *dn, MDRequest *mdr); // from rdlock - void dentry_xlock_downgrade_to_rdlock(CDentry *dn, MDRequest *mdr); // to rdlock - void handle_lock_dn(MLock *m); - void dentry_xlock_request(CDir *dir, const string& dname, bool create, - Message *req, Context *onfinish); - void dentry_xlock_request_finish(int r, - CDir *dir, const string& dname, - Message *req, - Context *finisher); - - }; diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc index 6ff40c83b7ff0..249a627c1cf60 100644 --- a/branches/sage/cephmds2/mds/MDCache.cc +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -174,10 +174,7 @@ void MDCache::remove_inode(CInode *o) // FIXME: multiple parents? 
CDentry *dn = o->get_parent_dn(); assert(!dn->is_dirty()); - if (dn->is_sync()) - dn->dir->remove_dentry(dn); // unlink inode AND hose dentry - else - dn->dir->unlink_inode(dn); // leave dentry + dn->dir->unlink_inode(dn); // leave dentry ... FIXME? } // remove from inode map @@ -1476,7 +1473,7 @@ void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) assert(dn); int nonce = dn->add_replica(from); dout(10) << " has " << *dn << endl; - ack->add_dentry(*p, *q, dn->get_lockstate(), nonce); + ack->add_dentry(*p, *q, dn->lock.get_state(), nonce); } } @@ -1491,8 +1488,8 @@ void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) in->mds_caps_wanted[from] = p->second; else in->mds_caps_wanted.erase(from); - in->hardlock.gather_set.erase(from); // just in case - in->filelock.gather_set.erase(from); // just in case + in->hardlock.remove_gather(from); // just in case + in->filelock.remove_gather(from); // just in case dout(10) << " has " << *in << endl; ack->add_inode(p->first, in->hardlock.get_replica_state(), in->filelock.get_replica_state(), @@ -1528,7 +1525,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoinAck *m) CDentry *dn = dir->lookup(q->first); assert(dn); dn->set_replica_nonce(q->second.nonce); - dn->set_lockstate(q->second.lock); + dn->lock.set_state(q->second.lock); dout(10) << " got " << *dn << endl; } } @@ -2151,20 +2148,20 @@ void MDCache::inode_remove_replica(CInode *in, int from) // note: this code calls _eval more often than it needs to! 
// fix lock if (in->hardlock.is_gathering(from)) { - in->hardlock.gather_set.erase(from); - if (in->hardlock.gather_set.size() == 0) - mds->locker->inode_hard_eval(in); + in->hardlock.remove_gather(from); + if (!in->hardlock.is_gathering()) + mds->locker->simple_eval(&in->hardlock); } if (in->filelock.is_gathering(from)) { - in->filelock.gather_set.erase(from); - if (in->filelock.gather_set.size() == 0) - mds->locker->inode_file_eval(in); + in->filelock.remove_gather(from); + if (!in->filelock.is_gathering()) + mds->locker->file_eval(&in->filelock); } // alone now? if (!in->is_replicated()) { - mds->locker->inode_hard_eval(in); - mds->locker->inode_file_eval(in); + mds->locker->simple_eval(&in->hardlock); + mds->locker->file_eval(&in->filelock); } } @@ -2572,7 +2569,7 @@ int MDCache::path_traverse(MDRequest *mdr, */ // must read directory hard data (permissions, x bit) to traverse - if (!noperm && !mds->locker->inode_hard_rdlock_try(cur, ondelay)) { + if (!noperm && !mds->locker->simple_rdlock_try(&cur->hardlock, ondelay)) { return 1; } @@ -2601,11 +2598,9 @@ int MDCache::path_traverse(MDRequest *mdr, if (dn && !dn->is_null()) { // dentry exists. xlocked? 
- if (!noperm && dn->is_xlockedbyother(mdr)) { + if (!noperm && dn->lock.is_xlocked() && dn->lock.get_xlocked_by() != mdr) { dout(10) << "traverse: xlocked dentry at " << *dn << endl; - curdir->add_waiter(CDir::WAIT_DNREAD, - path[depth], - ondelay); + dn->lock.add_waiter(SimpleLock::WAIT_RD, ondelay); return 1; } @@ -2976,9 +2971,9 @@ void MDCache::dispatch_request(MDRequest *mdr) mds->server->dispatch_request(mdr); break; - case MSG_MDS_LOCK: - mds->locker->handle_lock_dn((MLock*)mdr->request); - break; + //case MSG_MDS_LOCK: + //mds->locker->handle_lock_dn((MLock*)mdr->request); + //break; default: assert(0); // shouldn't get here @@ -2990,23 +2985,11 @@ void MDCache::dispatch_request(MDRequest *mdr) void MDCache::request_drop_locks(MDRequest *mdr) { - // leftover dentry locks - while (!mdr->dentry_xlocks.empty()) - mds->locker->dentry_xlock_finish(*mdr->dentry_xlocks.begin(), mdr); - while (!mdr->dentry_rdlocks.empty()) - mds->locker->dentry_rdlock_finish(*mdr->dentry_rdlocks.begin(), mdr); - - // inode locks - while (!mdr->inode_hard_xlocks.empty()) - mds->locker->inode_hard_xlock_finish(*mdr->inode_hard_xlocks.begin(), mdr); - while (!mdr->inode_hard_rdlocks.empty()) - mds->locker->inode_hard_rdlock_finish(*mdr->inode_hard_rdlocks.begin(), mdr); - - while (!mdr->inode_file_xlocks.empty()) - mds->locker->inode_file_xlock_finish(*mdr->inode_file_xlocks.begin(), mdr); - while (!mdr->inode_file_rdlocks.empty()) - mds->locker->inode_file_rdlock_finish(*mdr->inode_file_rdlocks.begin(), mdr); - + // leftover locks + while (!mdr->xlocks.empty()) + mds->locker->xlock_finish(*mdr->xlocks.begin(), mdr); + while (!mdr->rdlocks.empty()) + mds->locker->rdlock_finish(*mdr->rdlocks.begin(), mdr); /* // foreign xlocks? 
diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h index a07a2ea4d2849..3fb3ed133eab6 100644 --- a/branches/sage/cephmds2/mds/MDCache.h +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -28,7 +28,6 @@ #include "CInode.h" #include "CDentry.h" #include "CDir.h" -#include "Lock.h" #include "include/Context.h" class MDS; @@ -81,24 +80,13 @@ struct MDRequest { set< CInode* > inode_auth_pins; // held locks - set< CDentry*, CDentry::ptr_lt > dentry_locks; // sorted list of dentry locks we hold - set< CDentry* > dentry_rdlocks; - set< CDentry* > dentry_xlocks; + set< SimpleLock* > rdlocks; + set< SimpleLock* > xlocks; + set< SimpleLock*, SimpleLock::ptr_lt > locks; - set< CInode*, CInode::ptr_lt > inode_hard_locks; // sorted list of inode locks we hold - set< CInode* > inode_hard_rdlocks; - set< CInode* > inode_hard_xlocks; - - set< CInode*, CInode::ptr_lt > inode_file_locks; // sorted list of inode locks we hold - set< CInode* > inode_file_rdlocks; - set< CInode* > inode_file_xlocks; - // projected updates map< inodeno_t, inode_t > projected_inode; - // old - set< CDentry* > xlocks; // xlocks (local) - set< CDentry* > foreign_xlocks; // xlocks on foreign hosts MDRequest() : request(0), ref(0) {} MDRequest(metareqid_t ri, Message *req=0) : reqid(ri), request(req), ref(0) {} diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc index 538eafa9e4d4e..46e64cdefadbc 100644 --- a/branches/sage/cephmds2/mds/Migrator.cc +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -870,7 +870,6 @@ int Migrator::encode_export_dir(list& dirstatelist, // null dentry? 
if (dn->is_null()) { enc_dir.append("N", 1); // null dentry - assert(dn->is_sync()); continue; } @@ -1792,10 +1791,10 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol // twiddle locks // hard if (in->hardlock.get_state() == LOCK_GLOCKR) { - in->hardlock.gather_set.erase(mds->get_nodeid()); - in->hardlock.gather_set.erase(oldauth); - if (in->hardlock.gather_set.empty()) - mds->locker->inode_hard_eval(in); + in->hardlock.remove_gather(mds->get_nodeid()); + in->hardlock.remove_gather(oldauth); + if (!in->hardlock.is_gathering()) + mds->locker->simple_eval(&in->hardlock); } // caps @@ -1816,18 +1815,18 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol // filelock if (!in->filelock.is_stable()) { // take me and old auth out of gather set - in->filelock.gather_set.erase(mds->get_nodeid()); - in->filelock.gather_set.erase(oldauth); - if (in->filelock.gather_set.empty()) // necessary but not suffient... - mds->locker->inode_file_eval(in); + in->filelock.remove_gather(mds->get_nodeid()); + in->filelock.remove_gather(oldauth); + if (!in->filelock.is_gathering()) // necessary but not suffient... + mds->locker->file_eval(&in->filelock); } } int Migrator::decode_import_dir(bufferlist& bl, - int oldauth, - CDir *import_root, - EImportStart *le) + int oldauth, + CDir *import_root, + EImportStart *le) { int off = 0; diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc index 4be711cb04687..3c07af279766b 100644 --- a/branches/sage/cephmds2/mds/Server.cc +++ b/branches/sage/cephmds2/mds/Server.cc @@ -376,9 +376,9 @@ CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dn // does it already exist? CDentry *dn = dir->lookup(dname); if (dn) { - if (!dn->can_read(mdr)) { + if (!dn->lock.can_rdlock(mdr)) { dout(10) << "waiting on (existing!) 
unreadable dentry " << *dn << endl; - dir->add_waiter(CDir::WAIT_DNREAD, dname, new C_MDS_RetryRequest(mdcache, mdr)); + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return 0; } @@ -550,16 +550,13 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) } // lock the path - set dentry_rdlocks; - set dentry_xlocks; - set inode_empty; + set rdlocks; + set xlocks; for (unsigned i=0; ilock); - if (!mds->locker->acquire_locks(mdr, - dentry_rdlocks, dentry_xlocks, - inode_empty, inode_empty)) + if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks)) return 0; // set and pin ref @@ -609,9 +606,9 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mus } // readable? - if (dn && !dn->can_read(mdr)) { + if (dn && !dn->lock.can_rdlock(mdr)) { dout(10) << "waiting on (existing!) unreadable dentry " << *dn << endl; - dir->add_waiter(CDir::WAIT_DNREAD, dname, new C_MDS_RetryRequest(mdcache, mdr)); + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return 0; } @@ -628,23 +625,20 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mus } // -- lock -- - set dentry_rdlocks; - set dentry_xlocks; - set inode_empty; + set rdlocks; + set xlocks; for (unsigned i=0; ilock); } dout(10) << "will rd or x lock " << *dn << endl; if (dn->is_null()) - dentry_xlocks.insert(dn); // new dn, xlock + xlocks.insert(&dn->lock); // new dn, xlock else - dentry_rdlocks.insert(dn); // existing dn, rdlock + rdlocks.insert(&dn->lock); // existing dn, rdlock - if (!mds->locker->acquire_locks(mdr, - dentry_rdlocks, dentry_xlocks, - inode_empty, inode_empty)) + if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks)) return 0; // save the locked trace. @@ -748,9 +742,9 @@ void Server::handle_client_stat(MDRequest *mdr) int mask = req->args.stat.mask; if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) { // yes. do a full stat. 
- if (!mds->locker->inode_file_rdlock_start(ref, mdr)) + if (!mds->locker->rdlock_start(&ref->filelock, mdr)) return; // syncing - mds->locker->inode_file_rdlock_finish(ref, mdr); + mds->locker->rdlock_finish(&ref->filelock, mdr); } else { // nope! easy peasy. } @@ -809,7 +803,7 @@ void Server::handle_client_utime(MDRequest *mdr) if (!cur) return; // write - if (!mds->locker->inode_file_xlock_start(cur, mdr)) + if (!mds->locker->xlock_start(&cur->filelock, mdr)) return; mds->balancer->hit_inode(cur, META_POP_IWR); @@ -875,7 +869,7 @@ void Server::handle_client_chmod(MDRequest *mdr) if (!cur) return; // write - if (!mds->locker->inode_hard_xlock_start(cur, mdr)) + if (!mds->locker->xlock_start(&cur->hardlock, mdr)) return; mds->balancer->hit_inode(cur, META_POP_IWR); @@ -934,7 +928,7 @@ void Server::handle_client_chown(MDRequest *mdr) if (!cur) return; // write - if (!mds->locker->inode_hard_xlock_start(cur, mdr)) + if (!mds->locker->xlock_start(&cur->hardlock, mdr)) return; mds->balancer->hit_inode(cur, META_POP_IWR); @@ -1293,21 +1287,17 @@ void Server::handle_client_link(MDRequest *mdr) if (!dn) return; // create lock lists - set dentry_rdlocks; - set dentry_xlocks; - set inode_hard_rdlocks; - set inode_hard_xlocks; + set rdlocks; + set xlocks; for (unsigned i=0; ilock); + xlocks.insert(&dn->lock); for (unsigned i=0; ilock); + xlocks.insert(&targeti->hardlock); - if (!mds->locker->acquire_locks(mdr, - dentry_rdlocks, dentry_xlocks, - inode_hard_rdlocks, inode_hard_xlocks)) + if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks)) return; // go! @@ -1511,9 +1501,9 @@ void Server::handle_client_unlink(MDRequest *mdr) } // readable? 
- if (!dn->can_read(mdr)) { + if (!dn->lock.can_rdlock(mdr)) { dout(10) << "waiting on unreadable dentry " << *dn << endl; - dn->dir->add_waiter(CDir::WAIT_DNREAD, dn->name, new C_MDS_RetryRequest(mdcache, mdr)); + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return; } @@ -1546,19 +1536,15 @@ void Server::handle_client_unlink(MDRequest *mdr) } // lock - set dentry_rdlocks; - set dentry_xlocks; - set inode_hard_rdlocks; - set inode_hard_xlocks; + set rdlocks; + set xlocks; for (unsigned i=0; ilock); + xlocks.insert(&dn->lock); + xlocks.insert(&in->hardlock); - if (!mds->locker->acquire_locks(mdr, - dentry_rdlocks, dentry_xlocks, - inode_hard_rdlocks, inode_hard_xlocks)) + if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks)) return; // ok! @@ -1814,10 +1800,9 @@ public: bool Server::_rename_open_dn(CDir *dir, CDentry *dn, bool mustexist, MDRequest *mdr) { // xlocked? - if (dn && !dn->can_read(mdr)) { + if (dn && !dn->lock.can_rdlock(mdr)) { dout(10) << "_rename_open_dn waiting on " << *dn << endl; - dir->add_waiter(CDir::WAIT_DNREAD, - dn->name, + dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr)); return false; } @@ -1903,8 +1888,9 @@ void Server::handle_client_rename(MDRequest *mdr) // identify/create dest dentry CDentry *destdn = destdir->lookup(destname); - if (destdn && !destdn->can_read(mdr)) { - destdir->add_waiter(CDir::WAIT_DNREAD, destname, new C_MDS_RetryRequest(mdcache, mdr)); + if (destdn && !destdn->lock.can_rdlock(mdr)) { + destdn->lock.add_waiter(SimpleLock::WAIT_RD, + new C_MDS_RetryRequest(mdcache, mdr)); return; } @@ -1939,27 +1925,23 @@ void Server::handle_client_rename(MDRequest *mdr) // -- locks -- - set dentry_rdlocks; - set dentry_xlocks; - set inode_hard_rdlocks; - set inode_hard_xlocks; + set rdlocks; + set xlocks; // rdlock sourcedir path, xlock src dentry for (unsigned i=0; ilock); + xlocks.insert(&srcdn->lock); // rdlock destdir path, xlock dest dentry for (unsigned i=0; 
ilock); + xlocks.insert(&destdn->lock); // xlock oldin - if (oldin) inode_hard_xlocks.insert(oldin); + if (oldin) xlocks.insert(&oldin->hardlock); - if (!mds->locker->acquire_locks(mdr, - dentry_rdlocks, dentry_xlocks, - inode_hard_rdlocks, inode_hard_xlocks)) + if (!mds->locker->acquire_locks(mdr, rdlocks, xlocks)) return; @@ -2499,7 +2481,7 @@ void Server::handle_client_truncate(MDRequest *mdr) // check permissions? // xlock inode - if (!mds->locker->inode_file_xlock_start(cur, mdr)) + if (!mds->locker->xlock_start(&cur->filelock, mdr)) return; // fw or (wait for) lock // already small enough? @@ -2564,7 +2546,7 @@ void Server::handle_client_open(MDRequest *mdr) assert(cur->is_auth()); // xlock file size - if (!mds->locker->inode_file_xlock_start(cur, mdr)) + if (!mds->locker->xlock_start(&cur->filelock, mdr)) return; if (cur->inode.size > 0) { @@ -2701,7 +2683,7 @@ public: newi->mark_dirty(pv); // downgrade xlock to rdlock - mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); + //mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); // set/pin ref inode for open() mdr->ref = newi; diff --git a/branches/sage/cephmds2/mds/SimpleLock.h b/branches/sage/cephmds2/mds/SimpleLock.h new file mode 100644 index 0000000000000..8787bd5167f43 --- /dev/null +++ b/branches/sage/cephmds2/mds/SimpleLock.h @@ -0,0 +1,230 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __SIMPLELOCK_H +#define __SIMPLELOCK_H + +// -- lock types -- +// NOTE: this also defines the lock ordering! +#define LOCK_OTYPE_DN 1 +#define LOCK_OTYPE_IFILE 2 +#define LOCK_OTYPE_IHARD 3 // deprecate me? 
+
+#define LOCK_OTYPE_IPERM    4
+#define LOCK_OTYPE_ILINK    5
+#define LOCK_OTYPE_IDIRTREE 6
+#define LOCK_OTYPE_DIR      7
+
+inline const char *get_lock_type_name(int t) {  // printable name for a LOCK_OTYPE_* value
+  switch (t) {
+  case LOCK_OTYPE_DN: return "dentry";
+  case LOCK_OTYPE_IFILE: return "inode_file";
+  case LOCK_OTYPE_IHARD: return "inode_hard";
+  case LOCK_OTYPE_IPERM: return "inode_perm";
+  case LOCK_OTYPE_ILINK: return "inode_link";
+  case LOCK_OTYPE_IDIRTREE: return "inode_dirtree";
+  case LOCK_OTYPE_DIR: return "dir";  // defined above; previously fell through to assert(0)
+  default: assert(0); return "unknown";  // still a defined, printable result when assert() is compiled out
+  }
+}
+
+// -- lock states --
+//                         auth  rep
+#define LOCK_SYNC    0  // AR   R .  R .
+#define LOCK_LOCK    1  // AR   R W  . .
+#define LOCK_GLOCKR  2  // AR   R .  . .
+
+inline const char *get_simplelock_state_name(int n) {  // printable name for a LOCK_* state above
+  switch (n) {
+  case LOCK_SYNC: return "sync";
+  case LOCK_LOCK: return "lock";
+  case LOCK_GLOCKR: return "glockr";
+  default: assert(0); return "unknown";  // see get_lock_type_name()
+  }
+}
+
+class MDRequest;
+
+// Lock on one aspect (type) of an MDSCacheObject: holds the state, the gather set and the
+// local rdlock/xlock bookkeeping; waiters and state encode/decode are forwarded to the parent.
+class SimpleLock {
+public:
+  static const int WAIT_RD = (1<<0); // to read
+  static const int WAIT_NORD = (1<<1); // for last rdlock to finish
+  static const int WAIT_WR = (1<<2); // to write
+  //static const int WAIT_RDWR = (1<<3); // to read+write
+  static const int WAIT_LOCK = (1<<4); // for locked state
+  static const int WAIT_STABLE = (1<<5); // for a stable state
+  static const int WAIT_BITS = 6;
+
+protected:
+  // parent (what i lock)
+  MDSCacheObject *parent;
+  int type;   // LOCK_OTYPE_*
+
+  // lock state
+  char state;  // LOCK_SYNC / LOCK_LOCK / LOCK_GLOCKR
+  set<__int32_t> gather_set; // auth
+
+  // local state
+  int num_rdlock;
+  MDRequest *xlock_by;  // 0 when not xlocked
+
+public:
+  SimpleLock(MDSCacheObject *o, int t) :
+    parent(o), type(t),
+    state(LOCK_SYNC),
+    num_rdlock(0), xlock_by(0) { }
+
+  // parent
+  MDSCacheObject *get_parent() { return parent; }
+  int get_type() { return type; }
+
+  struct ptr_lt {  // strict ordering: by lock type first, then by the parent's is_lt()
+    bool operator()(const SimpleLock* l, const SimpleLock* r) const {
+      if (l->type < r->type) return true;
+      if (l->type == r->type) return l->parent->is_lt(r->parent);
+      return false;
+    }
+  };
+
+  void decode_locked_state(bufferlist& bl) {
+    parent->decode_lock_state(type, bl);
+  }
+  void encode_locked_state(bufferlist& bl) {
+    parent->encode_lock_state(type, bl);
+  }
+  void finish_waiters(int mask, int r=0) {
+    parent->finish_lock_waiters(type, mask, r);
+  }
+  void add_waiter(int mask, Context *c) {
+    parent->add_lock_waiter(type, mask, c);
+  }
+  bool is_waiting(int mask) {
+    return parent->is_lock_waiting(type, mask);
+  }
+
+  // state
+  char get_state() { return state; }
+  char set_state(char s) {
+    state = s;
+    assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states.
+    return s;
+  }
+  bool is_stable() {
+    // NOTE(review): LOCK_GLOCKR (2) also passes this test, so the gathering state counts as stable -- confirm intent.
+    return state >= 0;
+  }
+
+  // gather set
+  const set<__int32_t>& get_gather_set() { return gather_set; }
+  void init_gather() {  // add every replica key of the parent to the gather set
+    // NOTE(review): template arguments were stripped from this copy of the patch; map<int,int> is assumed -- confirm against replicas_begin().
+    for (map<int,int>::const_iterator p = parent->replicas_begin();
+         p != parent->replicas_end();
+         ++p)
+      gather_set.insert(p->first);
+  }
+  bool is_gathering() { return !gather_set.empty(); }
+  bool is_gathering(int i) {
+    return gather_set.count(i);
+  }
+  void clear_gather() {
+    gather_set.clear();
+  }
+  void remove_gather(int i) {
+    gather_set.erase(i);
+  }
+
+  // ref counting
+  int get_rdlock() { return ++num_rdlock; }  // returns the new count
+  int put_rdlock() {
+    assert(num_rdlock>0);
+    return --num_rdlock;
+  }
+  int get_num_rdlock() { return num_rdlock; }
+
+  void get_xlock(MDRequest *who) {
+    assert(xlock_by == 0);  // at most one xlock holder
+    xlock_by = who;
+  }
+  void put_xlock() {
+    assert(xlock_by);
+    xlock_by = 0;
+  }
+  bool is_xlocked() { return xlock_by ? true:false; }
+  MDRequest *get_xlocked_by() { return xlock_by; }
+  bool is_used() {
+    return (is_xlocked() || (num_rdlock>0)) ? true:false;
+  }
+
+  // encode/decode
+  void _encode(bufferlist& bl) {
+    ::_encode(state, bl);
+    ::_encode(gather_set, bl);
+  }
+  void _decode(bufferlist& bl, int& off) {
+    ::_decode(state, bl, off);
+    ::_decode(gather_set, bl, off);
+  }
+
+  // simplelock specifics
+  char get_replica_state() {  // maps this state to LOCK_LOCK (from LOCK/GLOCKR) or LOCK_SYNC
+    switch (state) {
+    case LOCK_LOCK:
+    case LOCK_GLOCKR:
+      return LOCK_LOCK;
+    case LOCK_SYNC:
+      return LOCK_SYNC;
+    default:
+      assert(0);
+    }
+    return 0;
+  }
+
+  bool can_rdlock(MDRequest *mdr) {
+    if (state == LOCK_SYNC)
+      return true;
+    if (state == LOCK_LOCK && mdr && xlock_by == mdr)  // the xlock holder may also read
+      return true;
+    return false;
+  }
+  bool can_xlock(MDRequest *mdr) {
+    if (!parent->is_auth()) return false;
+    if (state != LOCK_LOCK) return false;
+    if (mdr && xlock_by == mdr) return true;  // NOTE(review): true only if mdr already holds the xlock -- confirm where a first xlock is granted
+    return false;
+  }
+  bool can_xlock_soon() {
+    if (parent->is_auth())
+      return (state == LOCK_GLOCKR);
+    else
+      return false;
+  }
+};
+
+inline ostream& operator<<(ostream& out, SimpleLock& l)
+{
+  out << "(" << get_lock_type_name(l.get_type())
+      << " " << get_simplelock_state_name(l.get_state());  // separator keeps type and state names apart
+  if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set();
+  if (l.get_num_rdlock())
+    out << " r=" << l.get_num_rdlock();
+  if (l.is_xlocked())
+    out << " w=" << l.get_xlocked_by();
+  out << ")";
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/mds/mdstypes.h b/branches/sage/cephmds2/mds/mdstypes.h
index 16738956e2092..1d228c959a9f3 100644
--- a/branches/sage/cephmds2/mds/mdstypes.h
+++ b/branches/sage/cephmds2/mds/mdstypes.h
@@ -236,7 +236,12 @@ inline mds_load_t operator/( mds_load_t& a, double d )
 
 // ================================================================
 
-#define MDS_PIN_REPLICATED 1
+#define MDS_PIN_REPLICATED  1
+#define MDS_STATE_AUTH      (1<<0)  // state bit tested by MDSCacheObject::is_auth()
+
+class MLock;
+class Context;
+class SimpleLock;
 
 class MDSCacheObject {
  protected:
@@ -263,6 +268,8 @@ class MDSCacheObject {
   unsigned state_test(unsigned mask) { return state & mask; }
   void state_reset(unsigned s) { state = s; }
 
+
bool is_auth() { return state & MDS_STATE_AUTH; }  // true when the MDS_STATE_AUTH bit is set in state
+
   // --------------------------------------------
   // pins
   int get_num_ref() { return ref; }
@@ -358,7 +365,37 @@ class MDSCacheObject {
 
   int get_replica_nonce() { return replica_nonce;}
   void set_replica_nonce(int n) { replica_nonce = n; }
+
+
+  // ---------------------------------------------
+  // locking
+  // default implementations assert(0); override them in subclasses that have locks.
+  virtual SimpleLock* get_lock(int type) { assert(0); return 0; }  // return keeps this defined when assert() is compiled out
+  virtual void set_mlock_info(MLock *m) { assert(0); }
+  virtual void encode_lock_state(int type, bufferlist& bl) { assert(0); }
+  virtual void decode_lock_state(int type, bufferlist& bl) { assert(0); }
+  virtual void finish_lock_waiters(int type, int mask, int r=0) { assert(0); }
+  virtual void add_lock_waiter(int type, int mask, Context *c) { assert(0); }
+  virtual bool is_lock_waiting(int type, int mask) { assert(0); return false; }
+
+
+  // ---------------------------------------------
+  // ordering
+  virtual bool is_lt(const MDSCacheObject *r) const = 0;
+  struct ptr_lt {  // comparator over pointers; defers to the virtual is_lt()
+    bool operator()(const MDSCacheObject* l, const MDSCacheObject* r) const {
+      return l->is_lt(r);
+    }
+  };
+
+  // printing
+  virtual void print(ostream& out) = 0;
 };
 
+inline ostream& operator<<(ostream& out, MDSCacheObject& o) {  // streams via the virtual print()
+  o.print(out);
+  return out;
+}
+
 #endif
 
diff --git a/branches/sage/cephmds2/messages/MClientReply.h b/branches/sage/cephmds2/messages/MClientReply.h
index 038190a774bf3..068ab52dabb29 100644
--- a/branches/sage/cephmds2/messages/MClientReply.h
+++ b/branches/sage/cephmds2/messages/MClientReply.h
@@ -68,9 +68,9 @@ class InodeStat {
   {
     // inode.mask
     inode.mask = INODE_MASK_BASE;
-    if (in->filelock.can_read(in->is_auth()))
+    if (in->filelock.can_rdlock(0))  // NOTE(review): filelock gates PERM and hardlock gates SIZE|MTIME -- looks inverted; confirm (see fixme below)
       inode.mask |= INODE_MASK_PERM;
-    if (in->hardlock.can_read(in->is_auth()))
+    if (in->hardlock.can_rdlock(0))  // mdr == 0: readable only while the lock is in LOCK_SYNC
       inode.mask |= INODE_MASK_SIZE | INODE_MASK_MTIME; // fixme when we separate this out.
 
     // symlink content?
diff --git a/branches/sage/cephmds2/messages/MLock.h b/branches/sage/cephmds2/messages/MLock.h
index 21492d6f72517..c851102f2e823 100644
--- a/branches/sage/cephmds2/messages/MLock.h
+++ b/branches/sage/cephmds2/messages/MLock.h
@@ -17,36 +17,30 @@
 
 #include "msg/Message.h"
 
-#define LOCK_OTYPE_IHARD 1
-#define LOCK_OTYPE_IFILE 2
-#define LOCK_OTYPE_DIR 3
-#define LOCK_OTYPE_DN 4
 
 
 // for replicas
-#define LOCK_AC_SYNC 0
-#define LOCK_AC_MIXED 1
-#define LOCK_AC_LOCK 2
+#define LOCK_AC_SYNC         (-1)  // negative values are parenthesized so they expand safely inside expressions
+#define LOCK_AC_MIXED        (-2)
+#define LOCK_AC_LOCK         (-3)
 
-#define LOCK_AC_REQXLOCKACK 9 // req dentry xlock
-#define LOCK_AC_REQXLOCKNAK 10 // req dentry xlock
-#define LOCK_AC_LOCKNAK 12 // for dentry xlock
+#define LOCK_AC_REQXLOCKACK  (-4) // req dentry xlock
+#define LOCK_AC_REQXLOCKNAK  (-5) // req dentry xlock
 
-#define LOCK_AC_FOR_REPLICA(a) ((a) <= 10)
-#define LOCK_AC_FOR_AUTH(a) ((a) >= 11)
+#define LOCK_AC_FOR_REPLICA(a)  ((a) < 0)  // the actions listed under "for replicas" are all negative
+#define LOCK_AC_FOR_AUTH(a)     ((a) > 0)  // the actions listed under "for auth" are all positive
 
 // for auth
+#define LOCK_AC_SYNCACK      1
+#define LOCK_AC_MIXEDACK     2
+#define LOCK_AC_LOCKACK      3
 
-#define LOCK_AC_SYNCACK 13
-#define LOCK_AC_MIXEDACK 14
-#define LOCK_AC_LOCKACK 15
+#define LOCK_AC_REQREAD      4
+#define LOCK_AC_REQWRITE     5
 
-#define LOCK_AC_REQREAD 19
-#define LOCK_AC_REQWRITE 20
-
-#define LOCK_AC_REQXLOCK 21
-#define LOCK_AC_REQXLOCKC 22 // create if necessary
-#define LOCK_AC_UNXLOCK 23
+#define LOCK_AC_REQXLOCK     6
+#define LOCK_AC_REQXLOCKC    7 // create if necessary
+#define LOCK_AC_UNXLOCK      8
 
 
 #define lock_ac_name(x)
@@ -59,8 +53,8 @@ class MLock : public Message {
   inodeno_t ino; // ino ref, or possibly
   dirfrag_t dirfrag;
   string dn; // dentry name
-  bufferlist data; // and possibly some data
-  string path; // possibly a path too (for dentry lock discovers)
+
+  bufferlist data; // and possibly some data
 
  public:
   inodeno_t get_ino() { return ino; }
@@ -70,7 +64,6 @@ class MLock : public Message {
   int get_asker() { return asker; }
   int get_action() { return action; }
   int get_otype() { return otype; }
-  string& get_path() { return path; }
 
   MLock() {}
   MLock(int action, int asker) :
@@ -78,12 +71,22 @@ class MLock : public Message {
     this->action = action;
     this->asker = asker;
   }
+  MLock(SimpleLock *lock, int action, int asker) :  // message about a specific lock
+    Message(MSG_MDS_LOCK) {
+    this->otype = lock->get_type();  // object type comes from the lock's type
+    lock->get_parent()->set_mlock_info(this);  // the parent cache object fills in its own identifying fields
+    this->action = action;
+    this->asker = asker;
+  }
   virtual char *get_type_name() { return "ILock"; }
 
   void set_ino(inodeno_t ino, char ot) {
     otype = ot;
     this->ino = ino;
   }
+  void set_ino(inodeno_t ino) {  // overload that leaves otype untouched
+    this->ino = ino;
+  }
   void set_dirfrag(dirfrag_t df) {
     otype = LOCK_OTYPE_DIR;
     this->dirfrag = df;
@@ -96,9 +99,6 @@ class MLock : public Message {
   void set_data(bufferlist& data) {
     this->data.claim( data );
   }
-  void set_path(const string& p) {
-    path = p;
-  }
 
   void decode_payload() {
     int off = 0;
@@ -113,7 +113,6 @@ class MLock : public Message {
     payload.copy(off,sizeof(dirfrag), (char*)&dirfrag);
     off += sizeof(dirfrag);
     ::_decode(dn, payload, off);
-    ::_decode(path, payload, off);
     ::_decode(data, payload, off);
   }
   virtual void encode_payload() {
@@ -123,7 +122,6 @@ class MLock : public Message {
     payload.append((char*)&ino, sizeof(ino));
     payload.append((char*)&dirfrag, sizeof(dirfrag));
     ::_encode(dn, payload);
-    ::_encode(path, payload);
     ::_encode(data, payload);
   }
 
-- 
2.39.5