From ca0df734f2a6fbbc4f0e0562e9018580cd3295e9 Mon Sep 17 00:00:00 2001 From: sage Date: Thu, 7 Apr 2005 04:42:39 +0000 Subject: [PATCH] rewrote most of the locking code. lots of stuff ripped out for the time being.. git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@140 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/TODO | 22 +- ceph/config.cc | 35 +- ceph/config.h | 28 + ceph/include/types.h | 60 +- ceph/mds/CInode.cc | 98 +- ceph/mds/CInode.h | 143 ++- ceph/mds/InoAllocator.cc | 88 -- ceph/mds/InoAllocator.h | 45 - ceph/mds/Lock.h | 202 ++-- ceph/mds/MDCache.cc | 1788 +++++++++++++++++----------------- ceph/mds/MDCache.h | 116 ++- ceph/mds/MDS.cc | 66 +- ceph/mds/MDS.h | 18 - ceph/mds/MDStore.cc | 4 +- ceph/mds/oldcachestuff.cc | 930 ++++++++++++++++++ ceph/messages/MClientReply.h | 2 +- ceph/messages/MLock.h | 51 +- ceph/msg/FakeMessenger.cc | 1 - ceph/msg/Messenger.cc | 33 +- 19 files changed, 2406 insertions(+), 1324 deletions(-) delete mode 100644 ceph/mds/InoAllocator.cc delete mode 100644 ceph/mds/InoAllocator.h create mode 100644 ceph/mds/oldcachestuff.cc diff --git a/ceph/TODO b/ceph/TODO index bf811b12fb67c..fa39a1c62bc71 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -1,25 +1,35 @@ +/- lock states +/ - discover needs to set initial state intelligently +/ - expire must remove from gather_set + - more gracefully than it does now? + + + + CLIENT - structure.. multithreaded until messenger +- client sync + +- openwr, softasync +- unlink vs soft/hard locks? MDS - qsync -- high level locker functions -- expand Lock - - can_*_soon(), is_stable()... -- add waiters for stable states (to match is_stable()) +- lock modes (sync, async) + - recalls - freeze interaction..... test! - - freeze state diagram + - freeze state diagram? -/- generalize inoallocator - add client file handles. - fh modes: RD, RDWR, WR ... append? :/ - fh states: SYNC, ASYNC, ...???? +- unlink in terms of dentry lock - symlinks! - stored in the inode.. how? 
diff --git a/ceph/config.cc b/ceph/config.cc index 76995921b5dc7..e5d866f27acd0 100644 --- a/ceph/config.cc +++ b/ceph/config.cc @@ -14,7 +14,7 @@ md_config_t g_conf = { num_mds: 33, num_osd: 10, - num_client: 10, + num_client: 100, osd_cow: false, // crashy? true, @@ -31,7 +31,7 @@ md_config_t g_conf = { mdlog_read_inc: 4096, fake_clock: true, - fakemessenger_serialize: true, + fakemessenger_serialize: false, debug: 10, @@ -44,7 +44,34 @@ md_config_t g_conf = { mdbal_replicate_threshold: 500, mdbal_unreplicate_threshold: 200, - mds_heartbeat_op_interval: 20000, - mds_verify_export_dirauth: true + mds_heartbeat_op_interval: 2000, + mds_verify_export_dirauth: true, + + client_op_statfs: false, + + client_op_stat: true, + client_op_touch: true, + client_op_utime: false, + client_op_chmod: true, + client_op_chown: false, + + client_op_readdir: true, + client_op_mknod: false, + client_op_link: false, + client_op_unlink: false, + client_op_rename: false, + + client_op_mkdir: false, + client_op_rmdir: false, + client_op_symlink: false, + + client_op_openrd: false, + client_op_openwr: false, + client_op_openwrc: false, + client_op_read: false, + client_op_write: false, + client_op_truncate: false, + client_op_fsync: false, + client_op_close: false }; diff --git a/ceph/config.h b/ceph/config.h index 6791a236f4c70..6bf1dc65c3e49 100644 --- a/ceph/config.h +++ b/ceph/config.h @@ -36,6 +36,34 @@ struct md_config_t { int mds_heartbeat_op_interval; bool mds_verify_export_dirauth; + + bool client_op_statfs; + + bool client_op_stat; + bool client_op_touch; + bool client_op_utime; + bool client_op_chmod; + bool client_op_chown; + + bool client_op_readdir; + bool client_op_mknod; + bool client_op_link; + bool client_op_unlink; + bool client_op_rename; + + bool client_op_mkdir; + bool client_op_rmdir; + bool client_op_symlink; + + bool client_op_openrd; + bool client_op_openwr; + bool client_op_openwrc; + bool client_op_read; + bool client_op_write; + bool client_op_truncate; + bool 
client_op_fsync; + bool client_op_close; + }; extern md_config_t g_conf; diff --git a/ceph/include/types.h b/ceph/include/types.h index 8d546ae66029c..6aeda72685277 100644 --- a/ceph/include/types.h +++ b/ceph/include/types.h @@ -9,6 +9,40 @@ using namespace std; +// md ops +#define MDS_OP_STATFS 1 + +#define MDS_OP_STAT 100 +#define MDS_OP_TOUCH 101 // this is made up, not a real POSIX thing +#define MDS_OP_UTIME 102 +#define MDS_OP_CHMOD 103 +#define MDS_OP_CHOWN 104 + + +#define MDS_OP_READDIR 200 +#define MDS_OP_MKNOD 201 +#define MDS_OP_LINK 202 +#define MDS_OP_UNLINK 203 +#define MDS_OP_RENAME 204 + +#define MDS_OP_MKDIR 220 +#define MDS_OP_RMDIR 221 +#define MDS_OP_SYMLINK 222 + +#define MDS_OP_OPENRD 301 +#define MDS_OP_OPENWR 302 +#define MDS_OP_OPENWRC 303 +#define OSD_OP_READ 304 +#define OSD_OP_WRITE 305 +#define MDS_OP_TRUNCATE 306 +#define MDS_OP_FSYNC 307 +#define MDS_OP_CLOSE 310 + + + + + + // -- stl crap -- @@ -34,25 +68,29 @@ typedef __uint64_t inodeno_t; // ino typedef __uint64_t mdloc_t; // dir locator? struct inode_t { + // immutable inodeno_t ino; // NOTE: this must come first + time_t ctime; - __uint32_t touched; - __uint64_t size; - __uint32_t mode; + // hard (perm) + mode_t mode; uid_t uid; gid_t gid; - time_t atime, mtime, ctime; - unsigned short isdir; // normal = 1, hashed = 2, file = 0 - unsigned short type; // see below + // soft + __uint64_t size; + time_t atime, mtime; + + // special stuff + unsigned char hash_seed; // 0 if not hashed. 
}; -#define INODE_DIR_NORMAL 1 -#define INODE_DIR_HASHED 2 +#define INODE_MODE_FILE 0100000 // S_IFREG +#define INODE_MODE_SYMLINK 0120000 // S_IFLNK +#define INODE_MODE_DIR 0040000 // S_IFDIR + + -#define INODE_TYPE_FILE 0 -#define INODE_TYPE_DIR 1 -#define INODE_TYPE_SYMLINK 2 #define MAX_DENTRY_LEN 255 diff --git a/ceph/mds/CInode.cc b/ceph/mds/CInode.cc index 84b31159b4702..9bf59b432a3a7 100644 --- a/ceph/mds/CInode.cc +++ b/ceph/mds/CInode.cc @@ -20,7 +20,7 @@ ostream& operator<<(ostream& out, CInode& in) { string path; in.make_path(path); - out << "[inode " << in.inode.ino << " " << path << " "; + out << "[inode " << in.inode.ino << " ~" << path << " "; if (in.is_auth()) { out << "auth"; if (in.is_cached_by_anyone()) @@ -32,17 +32,8 @@ ostream& operator<<(ostream& out, CInode& in) assert(in.get_replica_nonce() >= 0); } - if (in.is_syncbyauth()) out << " syncbyauth"; - if (in.is_syncbyme()) out << " syncbyme"; - if (in.is_presync()) out << " presync"; - if (in.is_softasync()) out << " softasync"; - if (in.is_waitonunsync()) out << " waitonunsync"; - - if (in.is_lockbyauth()) out << " lockbyauth"; - if (in.is_lockbyme()) out << " lockbyme"; - if (in.is_prelock()) out << " prelock"; - if (in.is_waitonunlock()) out << " waitonunluck"; - + out << " hard=" << in.hardlock; + out << " soft=" << in.softlock; if (in.is_pinned()) { out << " |"; @@ -60,7 +51,9 @@ ostream& operator<<(ostream& out, CInode& in) // ====== CInode ======= -CInode::CInode(bool auth) : LRUObject() { +CInode::CInode(bool auth) : LRUObject(), + hardlock(LOCK_TYPE_BASIC), + softlock(LOCK_TYPE_ASYNC) { ref = 0; parent = NULL; @@ -74,14 +67,10 @@ CInode::CInode(bool auth) : LRUObject() { state = 0; dist_state = 0; - lock_active_count = 0; - pending_sync_request = 0; - version = 0; - //this->auth = auth; // by default. 
- state_set(CINODE_STATE_AUTH); + if (auth) state_set(CINODE_STATE_AUTH); } CInode::~CInode() { @@ -267,6 +256,66 @@ crope CInode::encode_export_state() } */ + +// new state encoders + +void CInode::encode_soft_state(crope& r) +{ + r.append((char*)&inode.size, sizeof(inode.size)); + r.append((char*)&inode.mtime, sizeof(inode.mtime)); + r.append((char*)&inode.atime, sizeof(inode.atime)); // ?? +} + +void CInode::decode_soft_state(crope& r, int& off) +{ + r.copy(off, sizeof(inode.size), (char*)&inode.size); + off += sizeof(inode.size); + r.copy(off, sizeof(inode.mtime), (char*)&inode.mtime); + off += sizeof(inode.mtime); + r.copy(off, sizeof(inode.atime), (char*)&inode.atime); + off += sizeof(inode.atime); +} + +void CInode::decode_merge_soft_state(crope& r, int& off) +{ + __uint64_t size; + r.copy(off, sizeof(size), (char*)&size); + off += sizeof(size); + if (size > inode.size) inode.size = size; + + time_t t; + r.copy(off, sizeof(t), (char*)&t); + off += sizeof(t); + if (t > inode.mtime) inode.mtime = t; + + r.copy(off, sizeof(t), (char*)&t); + off += sizeof(t); + if (t > inode.atime) inode.atime = t; +} + +void CInode::encode_hard_state(crope& r) +{ + r.append((char*)&inode.mode, sizeof(inode.mode)); + r.append((char*)&inode.uid, sizeof(inode.uid)); + r.append((char*)&inode.gid, sizeof(inode.gid)); + r.append((char*)&inode.ctime, sizeof(inode.ctime)); +} + +void CInode::decode_hard_state(crope& r, int& off) +{ + r.copy(off, sizeof(inode.mode), (char*)&inode.mode); + off += sizeof(inode.mode); + r.copy(off, sizeof(inode.uid), (char*)&inode.uid); + off += sizeof(inode.uid); + r.copy(off, sizeof(inode.gid), (char*)&inode.gid); + off += sizeof(inode.gid); + r.copy(off, sizeof(inode.ctime), (char*)&inode.ctime); + off += sizeof(inode.ctime); +} + + +// old state encoders + crope CInode::encode_basic_state() { crope r; @@ -350,12 +399,6 @@ void CInode::add_waiter(int tag, Context *c) { get(CINODE_PIN_WAITER); waiting.insert(pair(tag,c)); dout(10) << "add_waiter " << tag 
<< " " << c << " on " << *this << endl; - - // specialness? - if (tag == CINODE_WAIT_LOCK) { - lock_active_count++; - dout(10) << "add_waiter context " << c << " inc lock_active_count now " << lock_active_count << " on " << *this << endl; - } } @@ -391,13 +434,6 @@ void CInode::finish_waiting(int mask, int result) it++) { Context *c = *it; - // HACK ugly - if (mask == CINODE_WAIT_LOCK) { - assert(lock_active_count > 0); - lock_active_count--; - dout(10) << "finish_waiting context " << c << " dec lock_active_count now " << lock_active_count << " on " << *this << endl; - } - dout(11) << "finish_waiting finishing " << c << endl; c->finish(result); delete c; diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h index 29fb4dd19a5ee..9f6ca73a29666 100644 --- a/ceph/mds/CInode.h +++ b/ceph/mds/CInode.h @@ -81,33 +81,35 @@ static char *cinode_pin_names[CINODE_NUM_PINS] = { // sync => coherent soft metadata (size, mtime, etc.) // lock => coherent hard metadata (owner, mode, etc. affecting namespace) -#define CINODE_DIST_PRESYNC 1 // mtime, size, etc. -#define CINODE_DIST_SYNCBYME 2 -#define CINODE_DIST_SYNCBYAUTH 4 -#define CINODE_DIST_WAITONUNSYNC 8 +//#define CINODE_DIST_PRESYNC 1 // mtime, size, etc. +//#define CINODE_DIST_SYNCBYME 2 +//#define CINODE_DIST_SYNCBYAUTH 4 +//#define CINODE_DIST_WAITONUNSYNC 8 #define CINODE_DIST_SOFTASYNC 16 // replica can soft write w/o sync -#define CINODE_DIST_PRELOCK 64 // file mode, owner, etc. -#define CINODE_DIST_LOCKBYME 128 // i am auth -#define CINODE_DIST_LOCKBYAUTH 256 // i am not auth -#define CINODE_DIST_WAITONUNLOCK 512 +//#define CINODE_DIST_PRELOCK 64 // file mode, owner, etc. 
+//#define CINODE_DIST_LOCKBYME 128 // i am auth +//#define CINODE_DIST_LOCKBYAUTH 256 // i am not auth +//#define CINODE_DIST_WAITONUNLOCK 512 // wait reasons -#define CINODE_WAIT_SYNC 128 +//#define CINODE_WAIT_SYNC 128 // waiters: read_soft_start, write_soft_start // trigger: handle_inode_sync_ack -#define CINODE_WAIT_UNSYNC 256 +//#define CINODE_WAIT_UNSYNC 256 // waiters: read_soft_start, write_soft_start // trigger: handle_inode_sync_release -#define CINODE_WAIT_LOCK 512 +//#define CINODE_WAIT_LOCK 512 // waiters: write_hard_start // trigger: handle_inode_lock_ack // SPECIALNESS: lock_active_count indicates waiter, active lock count. -#define CINODE_WAIT_UNLOCK 1024 +//#define CINODE_WAIT_UNLOCK 1024 // waiters: read_hard_try // trigger: handle_inode_lock_release + + #define CINODE_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE // waiters: write_hard_start, read_soft_start, write_soft_start (mdcache) // handle_client_chmod, handle_client_touch (mds) @@ -131,7 +133,21 @@ static char *cinode_pin_names[CINODE_NUM_PINS] = { // waiters: file_rename // triggers: file_rename_finish -#define CINODE_WAIT_ANY 0xffffff +#define CINODE_WAIT_HARDR (1<<17) // 131072 +#define CINODE_WAIT_HARDW (1<<18) +#define CINODE_WAIT_HARDB (1<<19) +#define CINODE_WAIT_HARDRWB (CINODE_WAIT_HARDR|CINODE_WAIT_HARDW|CINODE_WAIT_HARDB) +#define CINODE_WAIT_HARDSTABLE (1<<20) +#define CINODE_WAIT_SOFTR (1<<21) // 2097152 +#define CINODE_WAIT_SOFTW (1<<22) +#define CINODE_WAIT_SOFTB (1<<23) +#define CINODE_WAIT_SOFTRWB (CINODE_WAIT_SOFTR|CINODE_WAIT_SOFTW|CINODE_WAIT_SOFTB) +#define CINODE_WAIT_SOFTSTABLE (1<<24) + + + + +#define CINODE_WAIT_ANY 0xffffffff // state @@ -161,7 +177,7 @@ class MDCluster; class Message; class CInode; -class MInodeSyncStart; +//class MInodeSyncStart; ostream& operator<<(ostream& out, CInode& in); @@ -172,6 +188,11 @@ class CInode : LRUObject { inode_t inode; // the inode itself CDir *dir; // directory, if we have it opened. 
+ string symlink; // symlink dest, if symlink + + // inode metadata locks + CLock hardlock; + CLock softlock; protected: int ref; // reference count @@ -186,8 +207,9 @@ class CInode : LRUObject { CDentry *parent; // if 1 parent (usually) vector parents; // if > 1 + // dcache lru - CInode *lru_next, *lru_prev; + //CInode *lru_next, *lru_prev; // -- distributed caching //bool auth; // safety check; true if this is authoritative. @@ -198,15 +220,16 @@ class CInode : LRUObject { /* NOTE: if replica is_cacheproxy(), cached_by is still defined! */ map cached_by_nonce; // [auth] nonce issued to each replica int replica_nonce; // [replica] defined on replica + set soft_tokens; // replicas who can soft update the inode XXX FIXME /* ..and thus may have a newer mtime, size, etc.! .. w/o sync for authority: set of nodes; self is assumed, but not included for replica: undefined */ unsigned dist_state; - set sync_waiting_for_ack; - set lock_waiting_for_ack; - int lock_active_count; // count for in progress or waiting locks - bool sync_replicawantback; // avoids sticky sync + //set sync_waiting_for_ack; + //set lock_waiting_for_ack; + //int lock_active_count; // count for in progress or waiting locks + //bool sync_replicawantback; // avoids sticky sync set unlink_waiting_for_ack; set rename_waiting_for_ack; @@ -223,7 +246,7 @@ class CInode : LRUObject { multiset open_read; multiset open_write; - MInodeSyncStart *pending_sync_request; + //MInodeSyncStart *pending_sync_request; private: // waiters @@ -245,7 +268,10 @@ class CInode : LRUObject { // -- accessors -- - bool is_dir() { return inode.isdir; } + bool is_file() { return (inode.mode & INODE_MODE_FILE) ? true:false; } + bool is_symlink() { return (inode.mode & INODE_MODE_SYMLINK) ? true:false; } + bool is_dir() { return (inode.mode & INODE_MODE_DIR) ? 
true:false; } + bool is_root() { return state & CINODE_STATE_ROOT; } bool is_proxy() { return state & CINODE_STATE_PROXY; } @@ -264,7 +290,7 @@ class CInode : LRUObject { CDir *set_dir(CDir *newdir); bool dir_is_hashed() { - if (inode.isdir == INODE_DIR_HASHED) return true; + if (inode.hash_seed) return true; return false; } bool dir_is_auth(); @@ -298,7 +324,29 @@ class CInode : LRUObject { crope encode_export_state(); + void encode_soft_state(crope& r); + void decode_soft_state(crope& r, int& off); + void decode_merge_soft_state(crope& r, int& off); + + void encode_hard_state(crope& r); + void decode_hard_state(crope& r, int& off); + void replicate_relax_locks() { + assert(is_auth()); + assert(!is_cached_by_anyone()); + dout(10) << " relaxing locks on " << *this << endl; + + if (hardlock.get_state() == LOCK_LOCK && + !hardlock.is_used()) { + dout(10) << " hard now sync " << *this << endl; + hardlock.set_state(LOCK_SYNC); + } + if (softlock.get_state() == LOCK_LOCK && + !softlock.is_used()) { + softlock.set_state(LOCK_SYNC); + dout(10) << " soft now sync " << *this << endl; + } + } // -- dirtyness -- __uint64_t get_version() { return version; } @@ -379,6 +427,7 @@ class CInode : LRUObject { // -- sync, lock -- + /* bool is_sync() { return dist_state & (CINODE_DIST_SYNCBYME| CINODE_DIST_SYNCBYAUTH); } bool is_syncbyme() { return dist_state & CINODE_DIST_SYNCBYME; } @@ -391,6 +440,7 @@ class CInode : LRUObject { bool is_lockbyauth() { return dist_state & CINODE_DIST_LOCKBYAUTH; } bool is_prelock() { return dist_state & CINODE_DIST_PRELOCK; } bool is_waitonunlock() { return dist_state & CINODE_DIST_WAITONUNLOCK; } + */ // -- open files -- bool is_open() { @@ -519,18 +569,17 @@ class CInodeDiscover { inode_t inode; int replica_nonce; - bool is_syncbyauth; - bool is_softasync; - bool is_lockbyauth; + + int hardlock_state; + int softlock_state; public: CInodeDiscover() {} CInodeDiscover(CInode *in, int nonce) { inode = in->inode; replica_nonce = nonce; - is_syncbyauth = 
in->is_syncbyme() || in->is_presync(); - is_softasync = in->is_softasync(); - is_lockbyauth = in->is_lockbyme() || in->is_prelock(); + hardlock_state = in->hardlock.get_replica_state(); + softlock_state = in->softlock.get_replica_state(); } inodeno_t get_ino() { return inode.ino; } @@ -539,19 +588,16 @@ class CInodeDiscover { in->inode = inode; in->replica_nonce = replica_nonce; - - if (is_syncbyauth) in->dist_state |= CINODE_DIST_SYNCBYAUTH; - if (is_softasync) in->dist_state |= CINODE_DIST_SOFTASYNC; - if (is_lockbyauth) in->dist_state |= CINODE_DIST_LOCKBYAUTH; + in->hardlock.set_state(hardlock_state); + in->softlock.set_state(softlock_state); } crope _rope() { crope r; r.append((char*)&inode, sizeof(inode)); r.append((char*)&replica_nonce, sizeof(replica_nonce)); - r.append((char*)&is_syncbyauth, sizeof(bool)); - r.append((char*)&is_softasync, sizeof(bool)); - r.append((char*)&is_lockbyauth, sizeof(bool)); + r.append((char*)&hardlock_state, sizeof(hardlock_state)); + r.append((char*)&softlock_state, sizeof(softlock_state)); return r; } @@ -560,12 +606,10 @@ class CInodeDiscover { off += sizeof(inode_t); s.copy(off, sizeof(int), (char*)&replica_nonce); off += sizeof(int); - s.copy(off, sizeof(bool), (char*)&is_syncbyauth); - off += sizeof(bool); - s.copy(off, sizeof(bool), (char*)&is_softasync); - off += sizeof(bool); - s.copy(off, sizeof(bool), (char*)&is_lockbyauth); - off += sizeof(bool); + s.copy(off, sizeof(hardlock_state), (char*)&hardlock_state); + off += sizeof(hardlock_state); + s.copy(off, sizeof(softlock_state), (char*)&softlock_state); + off += sizeof(softlock_state); return off; } @@ -579,7 +623,6 @@ typedef struct { __uint64_t version; DecayCounter popularity; bool is_dirty; // dirty inode? 
- bool is_softasync; int ncached_by; // int pairs follow } CInodeExport_st; @@ -591,6 +634,8 @@ class CInodeExport { set cached_by; map cached_by_nonce; + CLock hardlock,softlock; + public: CInodeExport() {} CInodeExport(CInode *in) { @@ -598,9 +643,10 @@ public: st.version = in->get_version(); st.popularity = in->get_popularity(); st.is_dirty = in->is_dirty(); - st.is_softasync = in->is_softasync(); cached_by = in->cached_by; cached_by_nonce = in->cached_by_nonce; + hardlock = in->hardlock; + softlock = in->softlock; } inodeno_t get_ino() { return st.inode.ino; } @@ -614,14 +660,14 @@ public: if (st.is_dirty) in->mark_dirty(); - if (st.is_softasync) - in->dist_state |= CINODE_DIST_SOFTASYNC; - in->cached_by.clear(); in->cached_by = cached_by; in->cached_by_nonce = cached_by_nonce; if (!cached_by.empty()) in->get(CINODE_PIN_CACHED); + + in->hardlock = hardlock; + in->softlock = softlock; } crope _rope() { @@ -639,6 +685,8 @@ public: r.append((char*)&n, sizeof(int)); } + hardlock.encode_state(r); + softlock.encode_state(r); return r; } @@ -655,6 +703,9 @@ public: cached_by.insert(m); cached_by_nonce.insert(pair(m,n)); } + + hardlock.decode_state(s, off); + softlock.decode_state(s, off); return off; } }; diff --git a/ceph/mds/InoAllocator.cc b/ceph/mds/InoAllocator.cc deleted file mode 100644 index 83413d92d8f5d..0000000000000 --- a/ceph/mds/InoAllocator.cc +++ /dev/null @@ -1,88 +0,0 @@ - -#include "InoAllocator.h" -#include "MDS.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -char ifn[100]; - -char *IdAllocator::get_filename() { - sprintf(ifn,"osddata/idalloc.%d", mds->get_nodeid()); - return ifn; -} - -void IdAllocator::save() -{ - int fd; - fd = open(get_filename(), O_CREAT|O_WRONLY); - if (fd >= 0) { - fchmod(fd, 0644); - - int ntypes = free.size(); - write(fd, (char*)&ntypes, sizeof(ntypes)); - - // over types - for (map >::iterator ty = free.begin(); - ty != free.end(); - ty++) { - char type = *ty; - write(fd, 
&type, 1); - - int mapsize = free[type].map_size(); - write(fd, (char*)&mapsize, sizeof(mapsize)); - - // over entries - for (map::iterator it = free[type]..map_begin(); - it != free[type].map_end(); - it++) { - id_t a = it->first; - id_t b = it->second; - write(fd, &a, sizeof(a)); - write(fd, &b, sizeof(b)); - } - } - close(fd); - } else - assert(0); -} - - -void IdAllocator::load() -{ - int fd; - fd = open(get_filename(), O_RDONLY); - if (fd >= 0) { - int ntypes; - read(fd, &ntypes, sizeof(ntypes)); - - for (int ty = 0; ty < ntypes; ty++) { - char type; - read(fd, &type, 1); - - int mapsize = 0; - read(fd, &mapsize, sizeof(mapsize)); - for (int i=0; iget_nodeid()+1), - (long long)1000000000000LL * (mds->get_nodeid()+2) - 1); - free[ID_FH].map_insert((long long)1000000000000LL * (mds->get_nodeid()+1), - (long long)1000000000000LL * (mds->get_nodeid()+2) - 1); - } -} diff --git a/ceph/mds/InoAllocator.h b/ceph/mds/InoAllocator.h deleted file mode 100644 index 4034ba529bcfe..0000000000000 --- a/ceph/mds/InoAllocator.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef __INOALLOCATOR_H -#define __INOALLOCATOR_H - -#include "include/types.h" -#include "rangeset.h" - -class MDS; - -class InoAllocator { - MDS *mds; - - rangeset free; - - public: - InoAllocator(); - InoAllocator(MDS *mds) { - this->mds = mds; - load(); - } - //~InoAllocator(); - - inodeno_t get_ino() { - free.dump(); - inodeno_t ino = free.first(); - free.erase(ino); - cout << "ino is " << ino << endl; - free.dump(); - save(); - return ino; - } - void reclaim_ino(inodeno_t ino) { - free.insert(ino); - save(); - } - - // load/save from disk (hack) - void load(); - void save(); - - private: - char *get_filename(); - -}; - -#endif diff --git a/ceph/mds/Lock.h b/ceph/mds/Lock.h index cd56ce4998ea2..258c04faa2cac 100644 --- a/ceph/mds/Lock.h +++ b/ceph/mds/Lock.h @@ -3,6 +3,7 @@ #include #include +#include using namespace std; // STATES @@ -20,54 +21,164 @@ using namespace std; #define LOCK_GASYNC 8 // gather to async 
+#define LOCK_TYPE_BASIC 0 +#define LOCK_TYPE_ASYNC 1 + +#define LOCK_MODE_SYNC 0 +#define LOCK_MODE_ASYNC 1 + // -- basic lock -class BasicLock { +class CLock { protected: // lock state + char type; char state; + char mode; set gather_set; // auth + int nread, nwrite; + + bool req_read, req_write; // dual meaning: on replicas, whether we've requested; on auth, whether others have requested. public: - BasicLock() : state(0) { + CLock() {} + CLock(char t) : + type(t), + state(LOCK_LOCK), + mode(LOCK_MODE_SYNC), + nread(0), + nwrite(0) { + } + + // encode/decode + void encode_state(crope& r) { + r.append((char*)&type, sizeof(char)); + r.append((char*)&state, sizeof(state)); + r.append((char*)&mode, sizeof(mode)); + r.append((char*)&nread, sizeof(nread)); + r.append((char*)&nwrite, sizeof(nwrite)); + int n = gather_set.size(); + r.append((char*)&n, sizeof(n)); + for (set::iterator it = gather_set.begin(); + it != gather_set.end(); + it++) { + n = *it; + r.append((char*)&n, sizeof(n)); + } + } + void decode_state(crope& r, int& off) { + r.copy(off, sizeof(type), (char*)&type); + off += sizeof(type); + r.copy(off, sizeof(state), (char*)&state); + off += sizeof(state); + r.copy(off, sizeof(mode), (char*)&mode); + off += sizeof(mode); + r.copy(off, sizeof(nread), (char*)&nread); + off += sizeof(nread); + r.copy(off, sizeof(nwrite), (char*)&nwrite); + off += sizeof(nwrite); + + int n; + r.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + gather_set.clear(); + int x; + for (int i=0; i& get_gather_set() { return gather_set; } + char set_state(char s) { + state = s; + assert(!is_stable() || gather_set.size() == 0); // gather should be empty in stable states. 
+ }; + + char get_mode() { return mode; } + char set_mode(char m) { + mode = m; + } + + char get_replica_state() { + if (state == LOCK_PRELOCK) return LOCK_LOCK; + if (state == LOCK_GLOCK) return LOCK_LOCK; + return state; // SYNC, LOCK, GASYNC, GSYNC + } + // gather set + set& get_gather_set() { return gather_set; } void init_gather(set& i) { gather_set = i; } - + bool is_gathering(int i) { + return gather_set.count(i); + } + + // ref counting + int get_read() { return ++nread; } + int put_read() { + assert(nread>0); + return --nread; + } + int get_nread() { return nread; } + + int get_write() { return ++nwrite; } + int put_write() { + assert(nwrite>0); + return --nwrite; + } + int get_nwrite() { return nwrite; } + bool is_used() { + return (nwrite+nread)>0 ? true:false; + } + + bool get_req_read() { return req_read; } + bool get_req_write() { return req_write; } + void set_req_read(bool b) { req_read = b; } + void set_req_write(bool b) { req_write = b; } + + // stable bool is_stable() { - return (state == LOCK_SYNC) || (state == LOCK_LOCK); + return (state == LOCK_SYNC) || (state == LOCK_LOCK) || (state == LOCK_ASYNC); } + // read/write access bool can_read(bool auth) { if (auth) - return (state == LOCK_SYNC) || (state == LOCK_PRELOCK) || (state == LOCK_LOCK); - if (!auth) + return (state == LOCK_SYNC) || (state == LOCK_PRELOCK) + || (state == LOCK_LOCK) || (state == LOCK_GASYNC); + else return (state == LOCK_SYNC); } bool can_read_soon(bool auth) { - if (auth) - return false; - if (!auth) - return false; + if (auth) + return (state == LOCK_GSYNC) || (state == LOCK_GLOCK); + else + return (state == LOCK_GSYNC); } bool can_write(bool auth) { - return auth && state == LOCK_LOCK; + if (auth) + return (state == LOCK_LOCK) || (state == LOCK_ASYNC) || + (state == LOCK_GLOCK) || (state == LOCK_GSYNC); + else + return (state == LOCK_ASYNC); } bool can_write_soon(bool auth) { - return auth && (state == LOCK_PRELOCK); + if (auth) + return (state == LOCK_PRELOCK) || (state 
== LOCK_GASYNC); + else + return (state == LOCK_GASYNC); } friend class MDCache; }; -inline ostream& operator<<(ostream& out, BasicLock& l) { +//ostream& operator<<(ostream& out, CLock& l); +inline ostream& operator<<(ostream& out, CLock& l) +{ static char* __lock_states[] = { "sync", "prelock", @@ -80,63 +191,26 @@ inline ostream& operator<<(ostream& out, BasicLock& l) { "gasync" }; - out << "Lock(" << __lock_states[l.get_state()]; + out << "(" << __lock_states[l.get_state()]; if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set(); + if (l.get_nread()) + out << " " << l.get_nread() << "r"; + if (l.get_nwrite()) + out << " " << l.get_nwrite() << "w"; + // rw? + /* out << " "; - if (l.can_read(true)) out << "r"; - if (l.can_write(true)) out << "w"; + if (l.can_read(true)) out << "r[" << l.get_nread() << "]"; + if (l.can_write(true)) out << "w[" << l.get_nwrite() << "]"; out << "/"; - if (l.can_read(false)) out << "r"; - if (l.can_write(false)) out << "w"; - + if (l.can_read(false)) out << "r[" << l.get_nread() << "]"; + if (l.can_write(false)) out << "w[" << l.get_nwrite() << "]"; + */ out << ")"; return out; } - -// -- async lock - -class AsyncLock : public BasicLock { - public: - AsyncLock() : BasicLock() { - assert(state == 0); - } - bool is_stable() { - return (state == LOCK_SYNC) || (state == LOCK_LOCK) || (state == LOCK_ASYNC); - } - - bool can_read(bool auth) { - if (auth) - return (state == LOCK_SYNC) || (state == LOCK_PRELOCK) - || (state == LOCK_LOCK) || (state == LOCK_GASYNC); - if (!auth) - return (state == LOCK_SYNC); - } - bool can_read_soon(bool auth) { - if (auth) - return (state == LOCK_GSYNC) || (state == LOCK_GLOCK); - else - return (state == LOCK_GSYNC); - } - - bool can_write(bool auth) { - if (auth) - return (state == LOCK_LOCK) || (state == LOCK_ASYNC) || - (state == LOCK_GLOCK) || (state == LOCK_GSYNC); - if (!auth) - return (state == LOCK_ASYNC); - } - bool can_write_soon(bool auth) { - if (auth) - return (state == LOCK_PRELOCK) 
|| (state == LOCK_GASYNC); - else - return (state == LOCK_GASYNC); - } - - friend class MDCache; -}; - #endif diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index 4b1a783984a3d..cda820fe38c06 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -43,8 +43,6 @@ #include "messages/MInodeUnlink.h" #include "messages/MInodeUnlinkAck.h" -#include "messages/MLock.h" - #include "messages/MInodeSyncStart.h" #include "messages/MInodeSyncAck.h" #include "messages/MInodeSyncRelease.h" @@ -54,6 +52,8 @@ #include "messages/MInodeLockAck.h" #include "messages/MInodeLockRelease.h" +#include "messages/MLock.h" + #include "messages/MDirSyncStart.h" #include "messages/MDirSyncAck.h" #include "messages/MDirSyncRelease.h" @@ -76,6 +76,10 @@ using namespace std; + + + + MDCache::MDCache(MDS *m) { mds = m; @@ -268,8 +272,8 @@ void MDCache::shutdown_start() it++) { CInode *in = it->second; if (in->is_auth()) { - if (in->is_syncbyme()) inode_sync_release(in); - if (in->is_lockbyme()) inode_lock_release(in); + //if (in->is_syncbyme()) inode_sync_release(in); + //if (in->is_lockbyme()) inode_lock_release(in); } } @@ -305,21 +309,6 @@ bool MDCache::shutdown_pass() trim(0); dout(7) << "cache size now " << lru->lru_get_size() << endl; - - // send inode_expire's on all potentially cache pinned items - //no: expires now reliable; leaves will always expire - if (false && - !did_inode_updates) { - did_inode_updates = true; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - if (it->second->ref_set.count(CINODE_PIN_CACHED)) - send_inode_updates(it->second); // send an update to discover who dropped the ball - } - } - // send all imports back to 0. 
if (mds->get_nodeid() != 0) { @@ -409,12 +398,11 @@ int MDCache::open_root(Context *c) // i am root inode CInode *root = new CInode(); root->inode.ino = 1; - root->inode.isdir = true; // make it up (FIXME) - root->inode.mode = 0755; + root->inode.mode = 0755 | INODE_MODE_DIR; root->inode.size = 0; - root->inode.touched = 0; + root->inode.mtime = 0; root->state_set(CINODE_STATE_ROOT); @@ -520,15 +508,6 @@ int MDCache::proc_message(Message *m) handle_cache_expire((MCacheExpire*)m); break; - /* - case MSG_MDS_INODEEXPIRE: - handle_inode_expire((MInodeExpire*)m); - break; - - case MSG_MDS_DIREXPIRE: - handle_dir_expire((MDirExpire*)m); - break; - */ case MSG_MDS_INODEUNLINK: handle_inode_unlink((MInodeUnlink*)m); @@ -537,7 +516,7 @@ int MDCache::proc_message(Message *m) handle_inode_unlink_ack((MInodeUnlinkAck*)m); break; - + /* // sync case MSG_MDS_INODESYNCSTART: handle_inode_sync_start((MInodeSyncStart*)m); @@ -551,7 +530,8 @@ int MDCache::proc_message(Message *m) case MSG_MDS_INODESYNCRECALL: handle_inode_sync_recall((MInodeSyncRecall*)m); break; - + */ + /* // lock case MSG_MDS_INODELOCKSTART: handle_inode_lock_start((MInodeLockStart*)m); @@ -562,8 +542,11 @@ int MDCache::proc_message(Message *m) case MSG_MDS_INODELOCKRELEASE: handle_inode_lock_release((MInodeLockRelease*)m); break; - + */ + case MSG_MDS_LOCK: + handle_lock((MLock*)m); + break; // import case MSG_MDS_EXPORTDIRDISCOVER: @@ -643,7 +626,7 @@ int MDCache::path_traverse(filepath& path, dout(12) << " path seg " << path[depth] << endl; if (!cur->is_dir()) { - dout(7) << *cur << " not a dir " << cur->inode.isdir << endl; + dout(7) << *cur << " not a dir " << endl; return -ENOTDIR; } @@ -684,11 +667,12 @@ int MDCache::path_traverse(filepath& path, } // must read hard data to traverse - if (!read_hard_try(cur, req)) + if (!inode_hard_read_start(cur, req)) return 1; + inode_hard_read_finish(cur); // check permissions? 
- + // XXX // dentry CDentry *dn = cur->dir->lookup(path[depth]); @@ -916,24 +900,6 @@ void MDCache::handle_discover(MDiscover *dis) assert(cur); dout(10) << "dir is " << *cur->dir << endl; - - /* - if (cur->dir->is_proxy() || - cur->is_auth() && !cur->dir->is_auth()) { - // fwd to dir auth - int dirauth = cur->dir->authority(); - if (dirauth == dis->get_asker()) { - dout(7) << "from (new hopefully) dir auth, dropping on floor." << endl; - assert(dis->get_asker() == dis->get_source()); - delete dis; - } else { - dout(7) << "fwd to dir auth " << dirauth << endl; - mds->messenger->send_message( dis, - MSG_ADDR_MDS( dirauth ), MDS_PORT_CACHE, MDS_PORT_CACHE ); - } - return; - } - */ // create reply reply = new MDiscoverReply(cur->ino()); @@ -986,6 +952,11 @@ void MDCache::handle_discover(MDiscover *dis) CInode *next = dn->inode; assert(next->is_auth()); + // relax inode lock before we replicate? + if (!next->is_cached_by_anyone()) { + next->replicate_relax_locks(); + } + // add dentry + inode reply->add_dentry( dis->get_dentry(i) ); reply->add_inode( new CInodeDiscover(next, @@ -1147,6 +1118,7 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) // root in->state_set(CINODE_STATE_ROOT); set_root( in ); + dout(7) << " got root: " << *in << endl; finished.splice(finished.end(), waiting_for_root); } else { @@ -1313,6 +1285,47 @@ void MDCache::handle_cache_expire(MCacheExpire *m) // remove from our cached_by dout(7) << "inode_expire on " << *in << " from mds" << from << " cached_by was " << in->cached_by << endl; in->cached_by_remove(from); + + // fix locks + if (in->hardlock.is_gathering(from)) { + // ugly hack + int ac; + switch (in->hardlock.get_state()) { + case LOCK_PRELOCK: + ac = LOCK_AC_LOCKNAK; + break; + default: + assert(0); + } + MLock *m = new MLock(ac, from); + m->set_ino(in->ino(), LOCK_OTYPE_IHARD); + handle_lock_inode_hard(m); + } + if (in->softlock.is_gathering(from)) { + // ugly hack + int ac; + switch (in->hardlock.get_state()) { + case 
LOCK_GSYNC: + ac = LOCK_AC_GSYNCNAK; + break; + case LOCK_GLOCK: + ac = LOCK_AC_GLOCKNAK; + break; + case LOCK_GASYNC: + ac = LOCK_AC_GASYNCNAK; + break; + case LOCK_PRELOCK: + ac = LOCK_AC_LOCKNAK; + break; + default: + assert(0); + } + MLock *m = new MLock(ac, from); + m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + handle_lock_inode_soft(m); + } + + } else { // this is an old nonce, ignore expire. @@ -1447,6 +1460,7 @@ public: void MDCache::inode_unlink(CInode *in, Context *c) { + /* assert(in->is_auth()); assert(!in->is_presync()); assert(!in->is_prelock()); @@ -1485,6 +1499,7 @@ void MDCache::inode_unlink(CInode *in, Context *c) in->state_set(CINODE_STATE_UNLINKING); in->unlink_waiting_for_ack = in->cached_by; } + */ } @@ -1578,6 +1593,7 @@ public: void MDCache::file_rename(CInode *from, CDir *destdir, string& name, CInode *oldin, Context *c) { + /* assert(from->is_auth()); assert(!from->is_cached_by_anyone() || from->is_lockbyme()); assert(!oldin || !oldin->is_cached_by_anyone() || oldin->is_lockbyme()); @@ -1633,7 +1649,7 @@ void MDCache::file_rename(CInode *from, CDir *destdir, string& name, CInode *old // done file_rename_finish(from, destdir, oldin, c); } - + */ } void MDCache::file_rename_finish(CInode *from, CDir *destdir, CInode *oldin, Context *c) @@ -1642,10 +1658,10 @@ void MDCache::file_rename_finish(CInode *from, CDir *destdir, CInode *oldin, Con // drop locks? from->state_clear(CINODE_STATE_RENAMING); - write_hard_finish(from); + inode_hard_write_finish(from); if (oldin) { - write_hard_finish(oldin); + inode_hard_write_finish(oldin); oldin->state_clear(CINODE_STATE_RENAMINGTO); } @@ -1694,8 +1710,8 @@ INODES: = two types of inode metadata: hard - uid/gid, mode - soft - m/ctime, size - ? atime - atime (*) + soft - mtime, size + ? atime - atime (*) <-- we want a lazy update strategy? * if we want _strict_ atime behavior, atime can be folded into soft. for lazy atime, should we just leave the atime lock in async state? 
XXX @@ -1707,23 +1723,24 @@ INODES: -> These locks are completely orthogonal! = metadata ops and how they affect inode metadata: - scma=size ctime mtime atime + sma=size mtime atime HARD SOFT OP files: - R RRRR stat - RW chmod/chown - R wW touch ?ctime - R openr - W read atime - R openw - R w openwc ?ctime - W W write size mtime - close + R RRR stat + RW chmod/chown + R W touch ?ctime + R openr + W read atime + R openw + Wc openwc ?ctime + WW write size mtime + close + dirs: - R W readdir atime - RRRR ( + implied stats on files) - R W W link/unlink/rename/rmdir - R WwW mkdir (ctime on new dir, size+mtime on parent dir) + R W readdir atime + RRR ( + implied stats on files) + Rc WW mkdir (ctime on new dir, size+mtime on parent dir) + R WW link/unlink/rename/rmdir (size+mtime on dir) @@ -1735,7 +1752,7 @@ INODES: - truncate ... need to stop writers for the atomic truncate operation - need a full lock - ???? + ALSO: @@ -1747,1009 +1764,1008 @@ ALSO: */ -/* void MDCache::handle_lock(MLock *m) { - // action type switch (m->get_otype()) { - case LOCK_OTYPE_INO: - CInode *in = get_inode(m->get_ino()); - + case LOCK_OTYPE_IHARD: + handle_lock_inode_hard(m); break; - - case LOCK_OTYPE_DIRINO: - CInode *in = get_inode(m->get_ino()); - CDir *dir = in->dir; + + case LOCK_OTYPE_ISOFT: + handle_lock_inode_soft(m); break; + case LOCK_OTYPE_DIR: + handle_lock_dir(m); + break; case LOCK_OTYPE_DN: - CInode *in = get_inode(m->get_ino()); - CDir *dir = in->dir; - CDentry = dir->lookup(m->get_dn()); - + handle_lock_dn(m); break; - } - -} - -*/ - - -/* - -OLD LOCK CRAP - - - (old): - sync - soft metadata.. no reads/writes can proceed. (eg no stat) - lock - hard(+soft) metadata.. path traversals stop etc. (??) - - - replication consistency modes: - hard+soft - hard and soft are defined on all replicas. - all reads proceed (in absense of sync lock) - writes require sync lock; possibly fw to auth - -> normal behavior. 
- - hard - hard only, soft is undefined - reads require a sync - writes proceed if field updates are monotonic (e.g. size, m/c/atime) - -> 'softasync' - - types of access by cache users: - - hard soft - R - read_hard_try path traversal - R <= R read_soft_start stat - R <= W write_soft_start touch - W => W write_hard_start chmod - - note on those implications: - read_soft_start() calls read_hard_try() - write_soft_start() calls read_hard_try() - a hard lock implies/subsumes a soft sync (read_soft_start() returns true if a lock is held) - - - relationship with frozen directories: - - read_hard_try - can proceed, because any hard changes require a lock, which requires an active - authority, which implies things are unfrozen. - write_hard_start - waits (has to; only auth can initiate) - read_soft_start - ???? waits for now. (FIXME: if !softasync & !syncbyauth) - write_soft_start - ???? waits for now. (FIXME: if (softasync & !syncbyauth)) - - if sticky is on, an export_dir will drop any sync or lock so that the freeze will - proceed (otherwise, deadlock!). likewise, a sync will not stick if is_freezing(). - - - -NAMESPACE: + default: + dout(7) << "handle_lock got otype " << m->get_otype() << endl; + assert(0); + break; + } +} -*/ - - +// hard inode metadata - -/* soft sync locks: mtime, size, etc. - */ - -bool MDCache::read_soft_start(CInode *in, Message *m) +bool MDCache::inode_hard_read_start(CInode *in, Message *m) { - if (!read_hard_try(in, m)) - return false; - - // if frozen: i can't proceed (for now, see above) - if (in->is_frozen()) { - dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl; - in->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryMessage(mds, m)); - return false; - } - - - dout(5) << "read_soft_start " << *in << endl; - - // what soft sync mode? 
- - if (in->is_softasync()) { - // softasync: hard consistency only - - if (in->is_auth()) { - // i am auth: i need sync - if (in->is_syncbyme()) goto yes; - if (in->is_lockbyme()) goto yes; // lock => sync - if (!in->is_cached_by_anyone() && - !in->is_open_write()) goto yes; // i'm alone - } else { - // i am replica: fw to auth - int auth = in->authority(); - dout(5) << "read_soft_start " << *in << " is softasync, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mds->messenger->send_message(m, - MSG_ADDR_MDS(auth), m->get_dest_port(), - MDS_PORT_CACHE); - return false; - } - } else { - // normal: soft+hard consistency + dout(7) << "inode_hard_read_start on " << *in << " hardlock=" << in->hardlock << endl; - if (in->is_syncbyauth()) { - // wait for sync - } else { - // i'm consistent - goto yes; - } - } - - // we need sync - if (in->is_syncbyauth() && !in->is_softasync()) { - dout(5) << "read_soft_start " << *in << " is normal+replica+syncbyauth" << endl; - } else if (in->is_softasync() && in->is_auth()) { - dout(5) << "read_soft_start " << *in << " is softasync+auth, waiting on sync" << endl; - } else - assert(2+2==5); - - if (!in->can_auth_pin()) { - dout(5) << "read_soft_start " << *in << " waiting to auth_pin" << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, - new C_MDS_RetryMessage(mds,m)); - return false; + // can read? grab ref. + if (in->hardlock.can_read(in->is_auth())) { + in->hardlock.get_read(); + return true; } + + // can't read, and replicated. if (in->is_auth()) { - // wait for sync - in->add_waiter(CINODE_WAIT_SYNC, - new C_MDS_RetryMessage(mds, m)); - - if (!in->is_presync()) - inode_sync_start(in); + // auth + assert(0); // this shouldn't happen. } else { - // wait for unsync - in->add_waiter(CINODE_WAIT_UNSYNC, - new C_MDS_RetryMessage(mds, m)); - - assert(in->is_syncbyauth()); + // replica - if (!in->is_waitonunsync()) - inode_sync_wait(in); + // wait! 
+ dout(7) << "inode_hard_read_start waiting on " << *in << endl; + in->add_waiter(CINODE_WAIT_HARDR, new C_MDS_RetryMessage(mds, m)); } return false; - - yes: - mds->balancer->hit_inode(in, MDS_POP_SOFTRD); - mds->balancer->hit_inode(in, MDS_POP_ANY); - return true; } -int MDCache::read_soft_finish(CInode *in) +void MDCache::inode_hard_read_finish(CInode *in) { - dout(5) << "read_soft_finish " << *in << endl; // " soft_sync_count " << in->soft_sync_count << endl; - return 0; // do nothing, actually.. + // drop ref + assert(in->hardlock.can_read(in->is_auth())); + in->hardlock.put_read(); + + dout(7) << "inode_hard_read_finish on " << *in << ", hardlock=" << in->hardlock << endl; } -bool MDCache::write_soft_start(CInode *in, Message *m) +bool MDCache::inode_hard_write_start(CInode *in, Message *m) { - if (!read_hard_try(in, m)) - return false; + dout(7) << "inode_hard_write_start on " << *in << " hardlock=" << in->hardlock << endl; - // if frozen: i can't proceed (for now, see above) - if (in->is_frozen()) { - dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl; - in->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryMessage(mds, m)); - return false; + // if not replicated, i can twiddle lock at will + if (in->is_auth() && + !in->is_cached_by_anyone() && + in->hardlock.get_state() != LOCK_LOCK) + in->hardlock.set_state(LOCK_LOCK); + + // can write? grab ref. + if (in->hardlock.can_write(in->is_auth())) { + in->hardlock.get_write(); + return true; } - - dout(5) << "write_soft_start " << *in << endl; - // what soft sync mode? - - if (in->is_softasync()) { - // softasync: hard consistency only - - if (in->is_syncbyauth()) { - // wait for sync release + + // can't write, replicated. + if (in->is_auth()) { + // auth + if (in->hardlock.can_write_soon(in->is_auth())) { + // just wait } else { - // i'm inconsistent; write away! 
- goto yes; + // initiate lock + inode_hard_lock(in); } - - } else { - // normal: soft+hard consistency - if (in->is_auth()) { - // i am auth: i need sync - if (in->is_syncbyme()) goto yes; - if (in->is_lockbyme()) goto yes; // lock => sync - if (!in->is_cached_by_anyone() && - !in->is_open_write()) goto yes; // i'm alone - } else { - // i am replica: fw to auth - int auth = in->authority(); - dout(5) << "write_soft_start " << *in << " is !softasync, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mds->messenger->send_message(m, - MSG_ADDR_MDS(auth), m->get_dest_port(), - MDS_PORT_CACHE); - return false; - } - } + dout(7) << "inode_hard_write_start waiting on " << *in << endl; + in->add_waiter(CINODE_WAIT_HARDW, new C_MDS_RetryMessage(mds, m)); - // we need sync - if (in->is_syncbyauth() && in->is_softasync() && !in->is_auth()) { - dout(5) << "write_soft_start " << *in << " is softasync+replica+syncbyauth" << endl; - } else if (!in->is_softasync() && in->is_auth()) { - dout(5) << "write_soft_start " << *in << " is normal+auth, waiting on sync" << endl; - } else - assert(2+2==5); - - if (!in->can_auth_pin()) { - dout(5) << "write_soft_start " << *in << " waiting to auth_pin" << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, - new C_MDS_RetryMessage(mds,m)); return false; - } - - if (in->is_auth()) { - // wait for sync - in->add_waiter(CINODE_WAIT_SYNC, - new C_MDS_RetryMessage(mds, m)); - - if (!in->is_presync()) - inode_sync_start(in); } else { - // wait for unsync - in->add_waiter(CINODE_WAIT_UNSYNC, - new C_MDS_RetryMessage(mds, m)); - - assert(in->is_syncbyauth()); - assert(in->is_softasync()); - - if (!in->is_waitonunsync()) - inode_sync_wait(in); + // replica + // fw to auth + int auth = in->authority(); + dout(5) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mds->messenger->send_message(m, + MSG_ADDR_MDS(auth), m->get_dest_port(), + MDS_PORT_CACHE); + return 
false; } - - return false; - - yes: - mds->balancer->hit_inode(in, MDS_POP_SOFTWR); - mds->balancer->hit_inode(in, MDS_POP_ANY); - return true; } -int MDCache::write_soft_finish(CInode *in) +void MDCache::inode_hard_write_finish(CInode *in) { - dout(5) << "write_soft_finish " << *in << endl; //" soft_sync_count " << in->soft_sync_count << endl; - return 0; // do nothing, actually.. + // drop ref + assert(in->hardlock.can_write(in->is_auth())); + in->hardlock.put_write(); + dout(7) << "inode_hard_write_finish on " << *in << ", hardlock=" << in->hardlock << endl; + + // drop lock? + if (in->hardlock.get_nwrite() == 0) + inode_hard_eval(in); } -// sync interface - -void MDCache::inode_sync_wait(CInode *in) +void MDCache::inode_hard_eval(CInode *in) { - assert(!in->is_auth()); - - int auth = in->authority(); - dout(5) << "inode_sync_wait on " << *in << ", auth " << auth << endl; - - assert(in->is_syncbyauth()); - assert(!in->is_waitonunsync()); - - in->dist_state |= CINODE_DIST_WAITONUNSYNC; - in->get(CINODE_PIN_WAITONUNSYNC); - - if ((in->is_softasync() && g_conf.mdcache_sticky_sync_softasync) || - (!in->is_softasync() && g_conf.mdcache_sticky_sync_normal)) { - // actually recall; if !sticky, auth will immediately release. 
- dout(5) << "inode_sync_wait on " << *in << " sticky, recalling from auth" << endl; - mds->messenger->send_message(new MInodeSyncRecall(in->inode.ino), - MSG_ADDR_MDS(auth), MDS_PORT_CACHE, - MDS_PORT_CACHE); + assert(in->hardlock.get_nwrite() == 0); + + if (in->is_auth() && + in->is_cached_by_anyone() && + in->hardlock.is_stable()) { + dout(7) << "inode_hard_eval stable, syncing " << *in << ", hardlock=" << in->hardlock << endl; + inode_hard_sync(in); } } -void MDCache::inode_sync_start(CInode *in) -{ - // wait for all replicas - dout(5) << "inode_sync_start on " << *in << ", waiting for " << in->cached_by << " " << in->get_open_write()<< endl; +// mid +void MDCache::inode_hard_sync(CInode *in) +{ + dout(7) << "inode_hard_sync on " << *in << " hardlock=" << in->hardlock << endl; assert(in->is_auth()); - assert(!in->is_presync()); - assert(!in->is_sync()); - - in->sync_waiting_for_ack.clear(); - in->dist_state |= CINODE_DIST_PRESYNC; - in->get(CINODE_PIN_PRESYNC); - in->auth_pin(); - in->sync_replicawantback = false; - - // send messages + // check state + if (in->hardlock.get_state() == LOCK_SYNC) + return; // already sync + if (in->hardlock.get_state() == LOCK_PRELOCK) + assert(0); // um... hmm! 
+ assert(in->hardlock.get_state() == LOCK_LOCK); + + // hard data + crope harddata; + in->encode_hard_state(harddata); + + // bcast to replicas for (set::iterator it = in->cached_by_begin(); it != in->cached_by_end(); it++) { - in->sync_waiting_for_ack.insert(MSG_ADDR_MDS(*it)); - mds->messenger->send_message(new MInodeSyncStart(in->inode.ino, mds->get_nodeid()), + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IHARD); + m->set_data(harddata); + mds->messenger->send_message(m, MSG_ADDR_MDS(*it), MDS_PORT_CACHE, MDS_PORT_CACHE); } - - // sync clients - int last = -1; - for (multiset::iterator it = in->get_open_write().begin(); - it != in->get_open_write().end(); - it++) { - if (*it == last) continue; last = *it; // only 1 per client (even if open multiple times) - in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it)); - mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()), - MSG_ADDR_CLIENT(*it), 0, - MDS_PORT_CACHE); - } - + + // change lock + in->hardlock.set_state(LOCK_SYNC); + + // waiters? 
+ in->finish_waiting(CINODE_WAIT_HARDSTABLE); } -void MDCache::inode_sync_release(CInode *in) +void MDCache::inode_hard_lock(CInode *in) { - dout(5) << "inode_sync_release on " << *in << ", messages to " << in->get_cached_by() << " " << in->get_open_write() << endl; - - assert(in->is_syncbyme()); + dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl; assert(in->is_auth()); - - in->dist_state &= ~CINODE_DIST_SYNCBYME; - - // release replicas + + // check state + if (in->hardlock.get_state() == LOCK_LOCK || + in->hardlock.get_state() == LOCK_PRELOCK) + return; // already lock or locking + assert(in->hardlock.get_state() == LOCK_SYNC); + + // bcast to replicas for (set::iterator it = in->cached_by_begin(); it != in->cached_by_end(); it++) { - mds->messenger->send_message(new MInodeSyncRelease(in), + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IHARD); + mds->messenger->send_message(m, MSG_ADDR_MDS(*it), MDS_PORT_CACHE, MDS_PORT_CACHE); } - // release writers - for (multiset::iterator it = in->get_open_write().begin(); - it != in->get_open_write().end(); - it++) { - mds->messenger->send_message(new MInodeSyncRelease(in), - MSG_ADDR_CLIENT(*it), 0, - MDS_PORT_CACHE); - } - - in->auth_unpin(); -} - - - - - -int MDCache::ino_proxy_auth(inodeno_t ino, - int frommds, - map >& inomap) -{ - // check proxy sets for this ino - for (map >::iterator wit = inomap.begin(); - wit != inomap.end(); - wit++) { - CDir *dir = wit->first; - - // does this map apply to this node? - if (export_notify_ack_waiting[dir].count(frommds) == 0) continue; - - // is this ino in the set? 
- if (inomap[dir].count(ino)) { - int dirauth = dir->authority(); - assert(dirauth >= 0); - return dirauth; - } - } - return -1; // no proxy -} - - -void MDCache::do_ino_proxy(CInode *in, Message *m) -{ - // check proxy maps - int newauth = ino_proxy_auth(in->ino(), - m->get_source(), // works bc we only every proxy 1 hop - export_proxy_inos); - dout(7) << "inode " << *in << " proxy, new auth is " << newauth << endl; - assert(newauth >= 0); // we should know the new authority! - assert(in->is_frozen()); // i should be frozen right now! - assert(in->state_test(CINODE_STATE_PROXY)); - - // forward - mds->messenger->send_message(m, - MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, MDS_PORT_CACHE); - return; -} - - -void MDCache::do_dir_proxy(CDir *dir, Message *m) -{ - // check proxy maps - int newauth = ino_proxy_auth(dir->ino(), - m->get_source(), // works because we only every proxy 1 hop - export_proxy_dirinos); - dout(7) << "dir " << *dir << " proxy, new auth is " << newauth << endl; - assert(newauth >= 0); // we should know the new authority! - assert(dir->is_frozen()); // i should be frozen right now! - assert(dir->state_test(CDIR_STATE_PROXY)); - - // forward - mds->messenger->send_message(m, - MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, MDS_PORT_CACHE); - return; + // change lock + in->hardlock.set_state(LOCK_PRELOCK); + in->hardlock.init_gather(in->get_cached_by()); } +// messenger -// messages -void MDCache::handle_inode_sync_start(MInodeSyncStart *m) +void MDCache::handle_lock_inode_hard(MLock *m) { - // assume asker == authority for now. + assert(m->get_otype() == LOCK_OTYPE_IHARD); - // authority is requesting a lock + int from = m->get_asker(); CInode *in = get_inode(m->get_ino()); - if (!in) { - // don't have it anymore! 
- dout(7) << "handle_sync_start " << m->get_ino() << ": don't have it anymore, nak" << endl; - mds->messenger->send_message(new MInodeSyncAck(m->get_ino(), false), - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); - delete m; // done - return; - } - - dout(10) << "handle_sync_start " << *in << endl; - - // we shouldn't be authoritative... - assert(!in->is_auth()); - // sanity check: make sure we know who _is_ authoritative! - assert(m->get_asker() == in->authority()); - - // lock it - in->dist_state |= CINODE_DIST_SYNCBYAUTH; + if (LOCK_AC_FOR_AUTH(m->get_action())) { + // auth + assert(in); + assert(in->is_auth() || in->is_proxy()); + dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl; - // open for write by clients? - if (in->is_open_write()) { - dout(7) << "handle_sync_start " << *in << " syncing write clients " << in->get_open_write() << endl; - - // sync clients - in->sync_waiting_for_ack.clear(); - for (multiset::iterator it = in->get_open_write().begin(); - it != in->get_open_write().end(); - it++) { - in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it)); - mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()), - MSG_ADDR_CLIENT(*it), 0, + if (in->is_proxy()) { + // fw + int newauth = ino_proxy_auth(in->ino(), + from, + export_proxy_inos); + assert(newauth >= 0); + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; + mds->messenger->send_message(m, + MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, MDS_PORT_CACHE); + return; } - - in->pending_sync_request = m; } else { - // no writers, ack. 
- dout(7) << "handle_sync_start " << *in << ", sending ack" << endl; - - inode_sync_ack(in, m); - } -} - -void MDCache::inode_sync_ack(CInode *in, MInodeSyncStart *m, bool wantback) -{ - dout(7) << "sending inode_sync_ack " << *in << endl; - - // send ack - mds->messenger->send_message(new MInodeSyncAck(in->ino(), true, wantback), - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); - - delete m; -} - -void MDCache::handle_inode_sync_ack(MInodeSyncAck *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << endl; - - if (in->is_auth()) { - assert(in->is_presync()); - } else { - assert(in->is_syncbyauth()); - assert(in->pending_sync_request); + // replica + if (!in) { + dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl; + /* + * do NOT nak.. if we go that route we need to duplicate all the nonce funkiness + to keep gather_set a proper/correct subset of cached_by. better to use the existing + cacheexpire mechanism. + */ + /* + MLock *reply = new MLock(m->get_action() + 3, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IHARD); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + */ + delete m; + return; + } + + assert(!in->is_auth()); } - // remove it from waiting list - in->sync_waiting_for_ack.erase(m->get_source()); + dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl; + + CLock *lock = &in->hardlock; - if (MSG_ADDR_ISCLIENT(m->get_source()) && !m->did_have()) { - // erase from cached_by too!
- in->cached_by_remove(m->get_source()); - } - - if (m->replica_wantsback()) - in->sync_replicawantback = true; - - if (in->sync_waiting_for_ack.size()) { - - // more coming - dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", still waiting for " << in->sync_waiting_for_ack << endl; + switch (m->get_action()) { + // -- replica -- + case LOCK_AC_SYNC: + assert(lock->get_state() == LOCK_LOCK); - } else { + { // assim data + int off = 0; + in->decode_hard_state(m->get_data(), off); + } - // yay! - dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", last one" << endl; - - if (!in->is_auth()) { - // replica, sync ack back to auth - assert(in->pending_sync_request); - inode_sync_ack(in, in->pending_sync_request, true); - in->pending_sync_request = 0; - delete m; - return; + // update lock + lock->set_state(LOCK_SYNC); + + // no need to reply + + // waiters + in->finish_waiting(CINODE_WAIT_HARDR|CINODE_WAIT_HARDSTABLE); + break; + + case LOCK_AC_LOCK: + assert(lock->get_state() == LOCK_SYNC); + + // update lock and reply + lock->set_state(LOCK_LOCK); + + { + MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IHARD); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(from), MDS_PORT_CACHE, + MDS_PORT_CACHE); } - - in->dist_state &= ~CINODE_DIST_PRESYNC; - in->dist_state |= CINODE_DIST_SYNCBYME; - in->put(CINODE_PIN_PRESYNC); - - // do waiters! - in->finish_waiting(CINODE_WAIT_SYNC); - - - // release sync right away? 
- if (in->is_syncbyme()) { - if (in->is_freezing()) { - dout(7) << "handle_sync_ack freezing " << *in << ", dropping sync immediately" << endl; - inode_sync_release(in); - } - else if (in->sync_replicawantback) { - dout(7) << "handle_sync_ack replica wantback, releasing sync immediately" << endl; - inode_sync_release(in); - } - else if ((in->is_softasync() && !g_conf.mdcache_sticky_sync_softasync) || - (!in->is_softasync() && !g_conf.mdcache_sticky_sync_normal)) { - dout(7) << "handle_sync_ack !sticky, releasing sync immediately" << endl; - inode_sync_release(in); - } - else { - dout(7) << "handle_sync_ack sticky sync is on, keeping sync for now" << endl; - } + break; + + + // -- auth -- + case LOCK_AC_LOCKNAK: + // do NOT remove from cached_by; we don't know the nonce! + // and somewhere out there there's an expire that will take care of it. + + case LOCK_AC_LOCKACK: + assert(lock->state == LOCK_PRELOCK); + assert(lock->gather_set.count(from)); + lock->gather_set.erase(from); + + if (lock->gather_set.size()) { + dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; } else { - dout(7) << "handle_sync_ack don't have sync anymore, something must have just released it?" << endl; + dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl; + lock->set_state(LOCK_LOCK); + + // waiters + in->hardlock.get_write(); + in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE); + in->hardlock.put_write(); + inode_hard_eval(in); } - } - - delete m; // done + } + delete m; } -void MDCache::handle_inode_sync_release(MInodeSyncRelease *m) -{ - CInode *in = get_inode(m->get_ino()); - - if (!in) { - dout(7) << "handle_sync_release " << m->get_ino() << ", don't have it, dropping" << endl; - delete m; // done - return; - } - - if (!in->is_syncbyauth()) { - dout(7) << "handle_sync_release " << *in << ", not flagged as sync" << endl; - assert(0); // this shouldn't happen. 
- delete m; // done - return; - } - - dout(7) << "handle_sync_release " << *in << endl; - assert(!in->is_auth()); - - // release state - in->dist_state &= ~CINODE_DIST_SYNCBYAUTH; - - // waiters? - if (in->is_waitonunsync()) { - in->put(CINODE_PIN_WAITONUNSYNC); - in->dist_state &= ~CINODE_DIST_WAITONUNSYNC; - - // finish - in->finish_waiting(CINODE_WAIT_UNSYNC); - } - // client readers? - if (in->is_open_write()) { - dout(7) << "handle_sync_release releasing clients " << in->get_open_write() << endl; - for (multiset::iterator it = in->get_open_write().begin(); - it != in->get_open_write().end(); - it++) { - mds->messenger->send_message(new MInodeSyncRelease(in), - MSG_ADDR_CLIENT(*it), 0, - MDS_PORT_CACHE); - } - } - - // done - delete m; -} +// soft inode metadata - -void MDCache::handle_inode_sync_recall(MInodeSyncRecall *m) +bool MDCache::inode_soft_read_start(CInode *in, Message *m) { - CInode *in = get_inode(m->get_ino()); + dout(7) << "inode_soft_read_start " << *in << " softlock=" << in->softlock << endl; - if (!in) { - dout(7) << "handle_sync_recall " << m->get_ino() << ", don't have it, wtf" << endl; - assert(0); // shouldn't happen - delete m; // done - return; + if (in->is_auth() && + !in->softlock.can_read(in->is_auth()) && + !in->is_cached_by_anyone()) { + in->softlock.set_state(LOCK_LOCK); // twiddle lock at will } - if(!in->is_auth()) { - do_ino_proxy(in, m); - return; - } - - if (in->is_syncbyme()) { - dout(7) << "handle_sync_recall " << *in << ", releasing" << endl; - inode_sync_release(in); - } - else if (in->is_presync()) { - dout(7) << "handle_sync_recall " << *in << " is presync, flagging" << endl; - in->sync_replicawantback = true; - } - else { - dout(7) << "handle_sync_recall " << m->get_ino() << ", not flagged as sync or presync, dropping" << endl; - } - - // done - delete m; -} - - - -/* hard locks: owner, mode - */ - -bool MDCache::read_hard_try(CInode *in, - Message *m) -{ - //dout(5) << "read_hard_try " << *in << endl; + // can read? 
grab ref. + if (in->softlock.can_read(in->is_auth())) { + in->softlock.get_read(); + return true; + } - if (in->is_auth()) { - // auth - goto yes; // fine - } else { - // replica - if (in->is_lockbyauth()) { - // locked by auth; wait! - dout(7) << "read_hard_try waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_UNLOCK, new C_MDS_RetryMessage(mds, m)); - if (!in->is_waitonunlock()) - inode_lock_wait(in); - return false; + // can't read, and replicated. + if (in->softlock.can_read_soon(in->is_auth())) { + // wait + dout(7) << "inode_soft_read_start can_read_soon " << *in << endl; + } else { + if (in->is_auth()) { + // auth + // FIXME or qsync? + // sync or lock? + assert(in->softlock.is_stable()); // should be async! + inode_soft_lock(in); // lock, easier to back off } else { - // not locked. - goto yes; + // replica + if (in->softlock.is_stable()) { + // recall? + + // FIXME XXX + + } else { + // wait until stable + dout(7) << "inode_soft_read_start waiting until stable on " << *in << ", softlock=" << in->softlock << endl; + in->add_waiter(CINODE_WAIT_SOFTSTABLE, new C_MDS_RetryMessage(mds, m)); + return false; + } } } - yes: - mds->balancer->hit_inode(in, MDS_POP_HARDRD); - mds->balancer->hit_inode(in, MDS_POP_ANY); - return true; + // wait + dout(7) << "inode_soft_read_start waiting on " << *in << ", softlock=" << in->softlock << endl; + in->add_waiter(CINODE_WAIT_SOFTR, new C_MDS_RetryMessage(mds, m)); + + return false; } -bool MDCache::write_hard_start(CInode *in, - Message *m) +void MDCache::inode_soft_read_finish(CInode *in) { - // if frozen: i can't proceed; only auth can initiate lock - if (in->is_frozen()) { - dout(7) << "write_hard_start " << *in << " is frozen, waiting" << endl; - in->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryMessage(mds, m)); - return false; - } + // drop ref + assert(in->softlock.can_read(in->is_auth())); + in->softlock.put_read(); + + dout(7) << "inode_soft_read_finish on " << *in << ", softlock=" << in->softlock << endl; 
+ + if (in->softlock.get_nread() == 0) + inode_soft_eval(in); +} - // NOTE: if freezing, and locked, we must proceed, to avoid deadlock (where - // the freeze is waiting for our lock to be released) +bool MDCache::inode_soft_write_start(CInode *in, Message *m) +{ + dout(7) << "inode_soft_write_start " << *in << " softlock=" << in->softlock << endl; + // if not replicated, i can twiddle lock at will + if (in->is_auth() && + !in->is_cached_by_anyone() && + in->softlock.get_state() != LOCK_LOCK) + in->softlock.set_state(LOCK_LOCK); + + // can write? grab ref. + if (in->softlock.can_write(in->is_auth())) { + in->softlock.get_write(); + return true; + } + + // can't write, replicated. if (in->is_auth()) { // auth - if (in->is_lockbyme()) goto success; - if (!in->is_cached_by_anyone()) goto success; - - // need lock - if (!in->can_auth_pin()) { - dout(5) << "write_hard_start " << *in << " waiting to auth_pin" << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryMessage(mds, m)); - return false; + if (in->softlock.can_write_soon(in->is_auth())) { + // just wait + } else { + // initiate lock + // OR async ....... FIXME + inode_soft_lock(in); } - in->add_waiter(CINODE_WAIT_LOCK, new C_MDS_RetryMessage(mds, m)); - - if (!in->is_prelock()) - inode_lock_start(in); - + dout(7) << "inode_soft_write_start waiting on " << *in << endl; + in->add_waiter(CINODE_WAIT_SOFTW, new C_MDS_RetryMessage(mds, m)); + return false; } else { // replica + + // request ASYNC???
+ // fw to auth int auth = in->authority(); - dout(5) << "write_hard_start " << *in << " on replica, fw to auth " << auth << endl; + dout(5) << "inode_soft_write_start " << *in << " on replica, fw to auth " << auth << endl; assert(auth != mds->get_nodeid()); mds->messenger->send_message(m, MSG_ADDR_MDS(auth), m->get_dest_port(), MDS_PORT_CACHE); return false; } - - success: - in->lock_active_count++; - dout(5) << "write_hard_start " << *in << " count now " << in->lock_active_count << endl; - assert(in->lock_active_count > 0); - - mds->balancer->hit_inode(in, MDS_POP_HARDWR); - mds->balancer->hit_inode(in, MDS_POP_ANY); - return true; -} - -void MDCache::write_hard_finish(CInode *in) -{ - in->lock_active_count--; - dout(5) << "write_hard_finish " << *in << " count now " << in->lock_active_count << endl; - assert(in->lock_active_count >= 0); - - // release lock? - if (in->lock_active_count == 0 && - in->is_lockbyme() && - !g_conf.mdcache_sticky_lock) { - dout(7) << "write_hard_finish " << *in << " !sticky, releasing lock immediately" << endl; - inode_lock_release(in); - } + } -void MDCache::inode_lock_start(CInode *in) +void MDCache::inode_soft_write_finish(CInode *in) { - dout(5) << "lock_start on " << *in << ", waiting for " << in->cached_by << endl; - - assert(in->is_auth()); - assert(!in->is_prelock()); - assert(!in->is_lockbyme()); - assert(!in->is_lockbyauth()); - - in->lock_waiting_for_ack = in->cached_by; - in->dist_state |= CINODE_DIST_PRELOCK; - in->get(CINODE_PIN_PRELOCK); - in->auth_pin(); - - // send messages - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - mds->messenger->send_message(new MInodeLockStart(in->inode.ino, mds->get_nodeid()), - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); + // drop ref + assert(in->softlock.can_write(in->is_auth())); + in->softlock.put_write(); + dout(7) << "inode_soft_write_finish on " << *in << ", softlock=" << in->softlock << endl; + + // drop lock? 
+ if (in->softlock.get_nwrite() == 0) + inode_soft_eval(in); +} + + +void MDCache::inode_soft_eval(CInode *in) +{ + if (!in->softlock.is_stable()) return; // do nothing + + // bump to async? + // FIXME XXX + + // bump to sync? + if (in->is_auth() && + in->is_cached_by_anyone() && + in->softlock.get_nwrite() == 0) { + dout(7) << "inode_soft_eval stable, syncing " << *in << ", softlock=" << in->softlock << endl; + inode_soft_sync(in); } } +// mid -void MDCache::inode_lock_release(CInode *in) +bool MDCache::inode_soft_sync(CInode *in) { - dout(5) << "lock_release on " << *in << ", messages to " << in->get_cached_by() << endl; - - assert(in->is_lockbyme()); + dout(7) << "inode_soft_sync " << *in << " softlock=" << in->softlock << endl; + assert(in->is_auth()); - in->dist_state &= ~CINODE_DIST_LOCKBYME; + // check state + if (in->softlock.get_state() == LOCK_SYNC) return true; + if (in->softlock.get_state() == LOCK_GSYNC) return false; + + assert(in->softlock.is_stable()); + if (in->softlock.get_state() == LOCK_PRELOCK || + in->softlock.get_state() == LOCK_GLOCK) + assert(0); // hmm! 
+ assert(in->softlock.get_state() == LOCK_LOCK || + in->softlock.get_state() == LOCK_ASYNC); + + if (in->softlock.get_state() == LOCK_LOCK) { + // soft data + crope softdata; + in->encode_soft_state(softdata); + + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + m->set_data(softdata); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // change lock + in->softlock.set_state(LOCK_SYNC); - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - mds->messenger->send_message(new MInodeLockRelease(in), - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); + return true; } - in->auth_unpin(); -} + else if (in->softlock.get_state() == LOCK_ASYNC) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_GSYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // change lock + in->softlock.set_state(LOCK_GSYNC); + in->softlock.init_gather(in->get_cached_by()); -void MDCache::inode_lock_wait(CInode *in) -{ - dout(5) << "lock_wait on " << *in << endl; - assert(!in->is_auth()); - assert(in->is_lockbyauth()); - - in->dist_state |= CINODE_DIST_WAITONUNLOCK; - in->get(CINODE_PIN_WAITONUNLOCK); + return false; + } + else + assert(0); // wtf. } -void MDCache::handle_inode_lock_start(MInodeLockStart *m) +void MDCache::inode_soft_lock(CInode *in) { - // authority is requesting a lock - CInode *in = get_inode(m->get_ino()); - if (!in) { - // don't have it anymore! 
- dout(7) << "handle_lock_start " << m->get_ino() << ": don't have it anymore, nak" << endl; - mds->messenger->send_message(new MInodeLockAck(m->get_ino(), false), - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); - delete m; // done - return; - } - - // we shouldn't be authoritative... - assert(!in->is_auth()); - - dout(7) << "handle_lock_start " << *in << ", sending ack" << endl; - - // lock it - in->dist_state |= CINODE_DIST_LOCKBYAUTH; + dout(7) << "inode_soft_lock " << *in << " softlock=" << in->softlock << endl; - // sanity check: make sure we know who _is_ authoritative! - assert(m->get_asker() == in->authority()); + assert(in->is_auth()); - // send ack - mds->messenger->send_message(new MInodeLockAck(in->ino()), - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); + // check state + if (in->softlock.get_state() == LOCK_LOCK || + in->softlock.get_state() == LOCK_PRELOCK || + in->softlock.get_state() == LOCK_GLOCK) + return; // lock or locking + assert(in->softlock.is_stable()); + if (in->softlock.get_state() == LOCK_GSYNC) + assert(0); // hmm! 
+ assert(in->softlock.get_state() == LOCK_SYNC || + in->softlock.get_state() == LOCK_ASYNC); + + if (in->softlock.get_state() == LOCK_SYNC) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // change lock + in->softlock.set_state(LOCK_PRELOCK); + in->softlock.init_gather(in->get_cached_by()); + } - delete m; // done + else if (in->softlock.get_state() == LOCK_ASYNC) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_GLOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // change lock + in->softlock.set_state(LOCK_GLOCK); + in->softlock.init_gather(in->get_cached_by()); + } + else + assert(0); // wtf. } -void MDCache::handle_inode_lock_ack(MInodeLockAck *m) +void MDCache::inode_soft_async(CInode *in) { - CInode *in = get_inode(m->get_ino()); - int from = m->get_source(); - dout(7) << "handle_lock_ack from " << from << " on " << *in << endl; + dout(7) << "inode_soft_async " << *in << " softlock=" << in->softlock << endl; - assert(in); assert(in->is_auth()); - assert(in->dist_state & CINODE_DIST_PRELOCK); - - // remove it from waiting list - in->lock_waiting_for_ack.erase(from); - if (!m->did_have()) { - // erase from cached_by too! - in->cached_by_remove(from); + // check state + if (in->softlock.get_state() == LOCK_ASYNC) + return; // async + assert(in->softlock.is_stable()); + if (in->softlock.get_state() == LOCK_GSYNC || + in->softlock.get_state() == LOCK_GLOCK || + in->softlock.get_state() == LOCK_PRELOCK) + assert(0); // hmm! 
+ assert(in->softlock.get_state() == LOCK_SYNC || + in->softlock.get_state() == LOCK_LOCK); + + if (in->softlock.get_state() == LOCK_SYNC) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_GASYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // change lock + in->softlock.set_state(LOCK_GASYNC); + in->softlock.init_gather(in->get_cached_by()); } - if (in->lock_waiting_for_ack.size()) { - - // more coming - dout(7) << "handle_lock_ack " << *in << " from " << from << ", still waiting for " << in->lock_waiting_for_ack << endl; + else if (in->softlock.get_state() == LOCK_LOCK) { + // data + crope softdata; + in->encode_soft_state(softdata); - } else { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_ASYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + m->set_data(softdata); + mds->messenger->send_message(m, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } - // yay! - dout(7) << "handle_lock_ack " << *in << " from " << from << ", last one" << endl; - - in->dist_state &= ~CINODE_DIST_PRELOCK; - in->dist_state |= CINODE_DIST_LOCKBYME; - in->put(CINODE_PIN_PRELOCK); - - // do waiters! - in->finish_waiting(CINODE_WAIT_LOCK); + // change lock + in->softlock.set_state(LOCK_ASYNC); } - - delete m; // done + else + assert(0); // wtf. 
} -void MDCache::handle_inode_lock_release(MInodeLockRelease *m) +// messenger + +void MDCache::handle_lock_inode_soft(MLock *m) { + assert(m->get_otype() == LOCK_OTYPE_ISOFT); + CInode *in = get_inode(m->get_ino()); + int from = m->get_asker(); - if (!in) { - dout(7) << "handle_lock_release " << m->get_ino() << ", don't have it, dropping" << endl; - delete m; // done - return; - } - - if (!in->is_lockbyauth()) { - dout(7) << "handle_lock_release " << m->get_ino() << ", not flagged as locked, wtf" << endl; - assert(0); // i should have it, locked, or not have it at all! - delete m; // done - return; + if (LOCK_AC_FOR_AUTH(m->get_action())) { + // auth + assert(in); + assert(in->is_auth() || in->is_proxy()); + + if (in->is_proxy()) { + // fw + int newauth = ino_proxy_auth(in->ino(), + from, + export_proxy_inos); + assert(newauth >= 0); + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; + mds->messenger->send_message(m, + MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, + MDS_PORT_CACHE); + return; + } + } else { + // replica + if (!in) { + // nack + dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl; + + // DONT NAK + /* + MLock *reply = new MLock(m->get_action() + LOCK_AC_NAKOFFSET, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + */ + delete m; + return; + } + + assert(!in->is_auth()); } + + dout(7) << "handle_lock_inode_soft a=" << m->get_action() << " from " << from << " " << *in << " softlock=" << in->softlock << endl; - dout(7) << "handle_lock_release " << *in << endl; - assert(!in->is_auth()); + CLock *lock = &in->softlock; - // release state - in->dist_state &= ~CINODE_DIST_LOCKBYAUTH; + switch (m->get_action()) { + // -- replica -- + case LOCK_AC_SYNC: + assert(lock->get_state() == LOCK_LOCK || + lock->get_state() == LOCK_GSYNC); + + { // assim data + int off = 0; 
+ in->decode_soft_state(m->get_data(), off); + } + + // update lock + lock->set_state(LOCK_SYNC); + + // no need to reply. + + // waiters + in->softlock.get_read(); + in->finish_waiting(CINODE_WAIT_SOFTR|CINODE_WAIT_SOFTSTABLE); + in->softlock.put_read(); + inode_soft_eval(in); + break; + + case LOCK_AC_LOCK: + assert(lock->get_state() == LOCK_SYNC); + + // update lock + lock->set_state(LOCK_LOCK); + + // ack + { + MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(from), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + break; + + case LOCK_AC_ASYNC: + assert(lock->get_state() == LOCK_GASYNC || + lock->get_state() == LOCK_LOCK); + + // update lock + lock->set_state(LOCK_ASYNC); + + // waiters + in->softlock.get_write(); + in->finish_waiting(CINODE_WAIT_SOFTW|CINODE_WAIT_SOFTSTABLE); + in->softlock.put_write(); + inode_soft_eval(in); + break; + - // waiters? - if (in->is_waitonunlock()) { - in->put(CINODE_PIN_WAITONUNLOCK); - in->dist_state &= ~CINODE_DIST_WAITONUNLOCK; + case LOCK_AC_GASYNC: + assert(lock->get_state() == LOCK_SYNC); - // finish - in->finish_waiting(CINODE_WAIT_UNLOCK); - } + // update lock + lock->set_state(LOCK_GASYNC); + + // ack + { + MLock *reply = new MLock(LOCK_AC_GASYNCACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(from), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + break; + + + case LOCK_AC_GSYNC: + assert(lock->get_state() == LOCK_ASYNC); + + // update lock + lock->set_state(LOCK_GSYNC); + + // reply w/ our data + { + MLock *reply = new MLock(LOCK_AC_GSYNCACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + + // payload + crope sd; + in->encode_soft_state(sd); + reply->set_data(sd); + + mds->messenger->send_message(reply, + MSG_ADDR_MDS(from), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + break; + + case LOCK_AC_GLOCK: + assert(lock->get_state() == 
LOCK_ASYNC); + + // update lock + lock->set_state(LOCK_LOCK); + + // reply w/ our data + { + MLock *reply = new MLock(LOCK_AC_GLOCKACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + + // payload + crope sd; + in->encode_soft_state(sd); + reply->set_data(sd); + + mds->messenger->send_message(reply, + MSG_ADDR_MDS(from), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + break; + + + + // -- auth -- + case LOCK_AC_LOCKNAK: + // do NOT remove from cached_by; we don't know the nonce! + // and somewhere out there there's an expire that will take care of it. + + case LOCK_AC_LOCKACK: + assert(lock->state == LOCK_PRELOCK); + assert(lock->gather_set.count(from)); + lock->gather_set.erase(from); + + if (lock->gather_set.size()) { + dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + } else { + dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", last one" << endl; + lock->set_state(LOCK_LOCK); + + // waiters + in->softlock.get_read(); + in->softlock.get_write(); + in->finish_waiting(CINODE_WAIT_SOFTRWB|CINODE_WAIT_SOFTSTABLE); + in->softlock.put_read(); + in->softlock.put_write(); + inode_soft_eval(in); + } + break; + + + case LOCK_AC_GLOCKNAK: + // do NOT remove from cached_by; we don't know the nonce! + // and somewhere out there there's an expire that will take care of it. + + case LOCK_AC_GLOCKACK: + assert(lock->state == LOCK_GLOCK); + assert(lock->gather_set.count(from)); + lock->gather_set.erase(from); + + if (m->get_action() == LOCK_AC_GLOCKACK) { + // merge data (keep largest size, mtime, etc.) 
+ int off; + in->decode_merge_soft_state(m->get_data(), off); + } + + if (lock->gather_set.size()) { + dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + } else { + dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", last one" << endl; + lock->set_state(LOCK_LOCK); + + // waiters + in->softlock.get_read(); + in->softlock.get_write(); + in->finish_waiting(CINODE_WAIT_SOFTRWB|CINODE_WAIT_SOFTSTABLE); + in->softlock.put_read(); + in->softlock.put_write(); + inode_soft_eval(in); + } + break; + + + case LOCK_AC_GSYNCNAK: + // do NOT remove from cached_by; we don't know the nonce! + // and somewhere out there there's an expire that will take care of it. + + case LOCK_AC_GSYNCACK: + assert(lock->state == LOCK_GSYNC); + assert(lock->gather_set.count(from)); + lock->gather_set.erase(from); + + if (m->get_action() == LOCK_AC_GSYNCACK) { + // merge data (keep largest size, mtime, etc.) + int off; + in->decode_merge_soft_state(m->get_data(), off); + } + + if (lock->gather_set.size()) { + dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + } else { + dout(7) << "handle_lock_inode_soft " << *in << " from " << from << ", last one" << endl; + lock->set_state(LOCK_SYNC); + + // bcast data to replicas + crope softdata; + in->encode_soft_state(softdata); + + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_ISOFT); + reply->set_data(softdata); + mds->messenger->send_message(reply, + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // waiters + in->softlock.get_read(); + in->finish_waiting(CINODE_WAIT_SOFTR|CINODE_WAIT_SOFTSTABLE); + in->softlock.put_read(); + inode_soft_eval(in); + } + break; + } - // done delete m; } +void MDCache::handle_lock_dir(MLock *m) +{ +} +void 
MDCache::handle_lock_dn(MLock *m) +{ +} -// DIR SYNC -/* - dir sync - - this are used when a directory is HASHED only. namely, - - to stat the dir inode we need an accurate directory size (????) - - for a readdir -*/ -void MDCache::dir_sync_start(CDir *dir) -{ - // wait for all replicas - dout(5) << "sync_start on " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(!dir->is_presync()); - assert(!dir->is_sync()); +// ino proxy - dir->sync_waiting_for_ack = mds->get_cluster()->get_mds_set(); - dir->state_set(CDIR_STATE_PRESYNC); - dir->auth_pin(); - - //dir->sync_replicawantback = false; +int MDCache::ino_proxy_auth(inodeno_t ino, + int frommds, + map >& inomap) +{ + // check proxy sets for this ino + for (map >::iterator wit = inomap.begin(); + wit != inomap.end(); + wit++) { + CDir *dir = wit->first; - // send messages - for (set::iterator it = dir->sync_waiting_for_ack.begin(); - it != dir->sync_waiting_for_ack.end(); - it++) { - mds->messenger->send_message(new MDirSyncStart(dir->ino(), mds->get_nodeid()), - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); + // does this map apply to this node? + if (export_notify_ack_waiting[dir].count(frommds) == 0) continue; + + // is this ino in the set? + if (inomap[dir].count(ino)) { + int dirauth = dir->authority(); + assert(dirauth >= 0); + return dirauth; + } } + return -1; // no proxy } -void MDCache::dir_sync_release(CDir *dir) +void MDCache::do_ino_proxy(CInode *in, Message *m) { + // check proxy maps + int newauth = ino_proxy_auth(in->ino(), + m->get_source(), // works bc we only every proxy 1 hop + export_proxy_inos); + dout(7) << "inode " << *in << " proxy, new auth is " << newauth << endl; + assert(newauth >= 0); // we should know the new authority! + assert(in->is_frozen()); // i should be frozen right now! 
+ assert(in->state_test(CINODE_STATE_PROXY)); + + // forward + mds->messenger->send_message(m, + MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, MDS_PORT_CACHE); + return; +} +void MDCache::do_dir_proxy(CDir *dir, Message *m) +{ + // check proxy maps + int newauth = ino_proxy_auth(dir->ino(), + m->get_source(), // works because we only every proxy 1 hop + export_proxy_dirinos); + dout(7) << "dir " << *dir << " proxy, new auth is " << newauth << endl; + assert(newauth >= 0); // we should know the new authority! + assert(dir->is_frozen()); // i should be frozen right now! + assert(dir->state_test(CDIR_STATE_PROXY)); + + // forward + mds->messenger->send_message(m, + MSG_ADDR_MDS(newauth), MDS_PORT_CACHE, MDS_PORT_CACHE); + return; } -void MDCache::dir_sync_wait(CDir *dir) -{ -} -void handle_dir_sync_start(MDirSyncStart *m) -{ -} + @@ -2957,15 +2973,16 @@ void MDCache::export_dir(CDir *dir, new C_MDS_ExportGo(mds, dir, dest, pop)); // drop any sync or lock if sticky + /* if (g_conf.mdcache_sticky_sync_normal || g_conf.mdcache_sticky_sync_softasync) export_dir_dropsync(dir); - + */ // NOTE: we don't need to worry about hard locks; those aren't sticky (yet?). } -void MDCache::export_dir_dropsync(CDir *dir) +/*void MDCache::export_dir_dropsync(CDir *dir) { dout(7) << "export_dir_dropsync in " << *dir << endl; @@ -2985,7 +3002,7 @@ void MDCache::export_dir_dropsync(CDir *dir) export_dir_dropsync(in->dir); } } - +*/ void MDCache::handle_export_dir_discover_ack(MExportDirDiscoverAck *m) @@ -3262,9 +3279,14 @@ void MDCache::export_dir_walk(MExportDir *req, in->version++; // so log entries are ignored, etc. 
- // dentry + // -- dentry dir_rope.append( it->first.c_str(), it->first.length()+1 ); + // -- inode + // relax locks + if (!in->is_cached_by_anyone()) + in->replicate_relax_locks(); + // add inode CInodeExport istate( in ); dir_rope.append( istate._rope() ); @@ -4154,7 +4176,7 @@ void MDCache::handle_export_dir_notify(MExportDirNotify *m) // HASH on auth -void MDCache::drop_sync_in_dir(CDir *dir) +/*void MDCache::drop_sync_in_dir(CDir *dir) { for (CDir_map_t::iterator it = dir->begin(); it != dir->end(); it++) { CInode *in = it->second->inode; @@ -4165,7 +4187,7 @@ void MDCache::drop_sync_in_dir(CDir *dir) } } } - +*/ class C_MDS_HashFreeze : public Context { public: @@ -4223,9 +4245,11 @@ void MDCache::hash_dir(CDir *dir) hash_dir_complete(dir); // drop any sync or lock if sticky + /* if (g_conf.mdcache_sticky_sync_normal || g_conf.mdcache_sticky_sync_softasync) drop_sync_in_dir(dir); + */ } void MDCache::hash_dir_complete(CDir *dir) @@ -4499,7 +4523,7 @@ void MDCache::unhash_dir_finish(CDir *dir) dir->mark_complete(); // inode state - dir->inode->inode.isdir = INODE_DIR_NORMAL; + dir->inode->inode.hash_seed = 0; dir->inode->mark_dirty(); // unfreeze! 
@@ -4792,7 +4816,7 @@ vector MDCache::hack_add_file(string& fn, CInode *in) { if (!root) { root = in; add_inode( in ); - //dout(7) << " added root " << root << endl; + dout(7) << " added root " << root << endl; } else { root->inode.ino = in->inode.ino; // bleh } @@ -4816,7 +4840,9 @@ vector MDCache::hack_add_file(string& fn, CInode *in) { if (diri->dir == NULL) { dout(4) << " making " << *diri << " into a dir" << endl; - diri->inode.isdir = true; + diri->inode.mode &= ~INODE_MODE_FILE; + diri->inode.mode |= INODE_MODE_DIR; + assert(diri->is_dir()); diri->get_or_open_dir(mds); } diff --git a/ceph/mds/MDCache.h b/ceph/mds/MDCache.h index ea4073b713d0e..985287e3fb83c 100644 --- a/ceph/mds/MDCache.h +++ b/ceph/mds/MDCache.h @@ -10,10 +10,12 @@ #include "include/types.h" #include "include/filepath.h" + #include "CInode.h" #include "CDentry.h" #include "CDir.h" -//#include "InoProxySet.h" +#include "Lock.h" + class MDS; class Message; @@ -283,50 +285,82 @@ class MDCache { void handle_cache_expire(MCacheExpire *m); - - // -- lock and sync : inodes -- - // soft sync locks - bool read_soft_start(CInode *in, Message *m); - int read_soft_finish(CInode *in); - bool write_soft_start(CInode *in, Message *m); - int write_soft_finish(CInode *in); - - void inode_sync_start(CInode *in); - void inode_sync_release(CInode *in); - void inode_sync_wait(CInode *in); - - void handle_inode_sync_start(MInodeSyncStart *m); - void handle_inode_sync_ack(MInodeSyncAck *m); - void handle_inode_sync_release(MInodeSyncRelease *m); - void handle_inode_sync_recall(MInodeSyncRecall *m); - - void inode_sync_ack(CInode *in, MInodeSyncStart *m, bool wantback=false); - - // hard locks - bool read_hard_try(CInode *in, Message *m); - bool write_hard_start(CInode *in, Message *m); - void write_hard_finish(CInode *in); - - void inode_lock_start(CInode *in); - void inode_lock_release(CInode *in); - void inode_lock_wait(CInode *in); - - void handle_inode_lock_start(MInodeLockStart *m); - void 
handle_inode_lock_ack(MInodeLockAck *m); - void handle_inode_lock_release(MInodeLockRelease *m); - + // -- locks -- + // high level interface + bool inode_hard_read_start(CInode *in, Message *m); + void inode_hard_read_finish(CInode *in); + bool inode_hard_write_start(CInode *in, Message *m); + void inode_hard_write_finish(CInode *in); + bool inode_soft_read_start(CInode *in, Message *m); + void inode_soft_read_finish(CInode *in); + bool inode_soft_write_start(CInode *in, Message *m); + void inode_soft_write_finish(CInode *in); + + // low level triggers + void inode_hard_sync(CInode *in); + void inode_hard_lock(CInode *in); + bool inode_soft_sync(CInode *in); + void inode_soft_lock(CInode *in); + void inode_soft_async(CInode *in); + + void inode_hard_eval(CInode *in); + void inode_soft_eval(CInode *in); + + // messengers void handle_lock(MLock *m); + void handle_lock_inode_hard(MLock *m); + void handle_lock_inode_soft(MLock *m); + + void handle_lock_dir(MLock *m); + void handle_lock_dn(MLock *m); + + + + // -- lock and sync : inodes -- OLD CRAP XXX + /* + // soft sync locks + bool read_soft_start(CInode *in, Message *m); + int read_soft_finish(CInode *in); + bool write_soft_start(CInode *in, Message *m); + int write_soft_finish(CInode *in); + + void inode_sync_start(CInode *in); + void inode_sync_release(CInode *in); + void inode_sync_wait(CInode *in); + + void handle_inode_sync_start(MInodeSyncStart *m); + void handle_inode_sync_ack(MInodeSyncAck *m); + void handle_inode_sync_release(MInodeSyncRelease *m); + void handle_inode_sync_recall(MInodeSyncRecall *m); + + void inode_sync_ack(CInode *in, MInodeSyncStart *m, bool wantback=false); + */ + /* + // hard locks + bool read_hard_try(CInode *in, Message *m); + bool write_hard_start(CInode *in, Message *m); + void write_hard_finish(CInode *in); + + void inode_lock_start(CInode *in); + void inode_lock_release(CInode *in); + void inode_lock_wait(CInode *in); + + void handle_inode_lock_start(MInodeLockStart *m); + void 
handle_inode_lock_ack(MInodeLockAck *m); + void handle_inode_lock_release(MInodeLockRelease *m); + */ // -- sync : dirs -- - void dir_sync_start(CDir *dir); - void dir_sync_release(CDir *dir); - void dir_sync_wait(CDir *dir); - - void handle_dir_sync_start(MDirSyncStart *m); - void handle_dir_sync_ack(MDirSyncAck *m); - void handle_dir_sync_release(MDirSyncRelease *m); - + /* + void dir_sync_start(CDir *dir); + void dir_sync_release(CDir *dir); + void dir_sync_wait(CDir *dir); + + void handle_dir_sync_start(MDirSyncStart *m); + void handle_dir_sync_ack(MDirSyncAck *m); + void handle_dir_sync_release(MDirSyncRelease *m); + */ // == crap fns == CInode* hack_get_file(string& fn); diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc index 764ff7778aa6c..9b3eca4bdfd7c 100644 --- a/ceph/mds/MDS.cc +++ b/ceph/mds/MDS.cc @@ -462,16 +462,16 @@ int MDS::handle_client_request(MClientRequest *req) MClientReply *MDS::handle_client_stat(MClientRequest *req, CInode *cur) { - if (!mdcache->read_soft_start(cur, req)) + if (!mdcache->inode_soft_read_start(cur, req)) return 0; // sync - dout(10) << "reply to " << *req << " stat " << cur->inode.touched << " pop " << cur->get_popularity() << endl; + dout(10) << "reply to " << *req << " stat " << cur->inode.mtime << " pop " << cur->get_popularity() << endl; MClientReply *reply = new MClientReply(req); reply->set_trace_dist( cur, whoami ); // FIXME: put inode info in reply... 
- mdcache->read_soft_finish(cur); + mdcache->inode_soft_read_finish(cur); logger->inc("ostat"); stat_read.hit(); @@ -502,14 +502,13 @@ MClientReply *MDS::handle_client_touch(MClientRequest *req, CInode *cur) { // write - if (!mdcache->write_soft_start(cur, req)) + if (!mdcache->inode_soft_write_start(cur, req)) return 0; // fw or (wait for) sync cur->auth_pin(); // do update cur->inode.mtime++; // whatever - cur->inode.touched++; cur->mark_dirty(); // tell replicas @@ -522,7 +521,7 @@ MClientReply *MDS::handle_client_touch(MClientRequest *req, reply->set_result(0); // log it - dout(10) << "log for " << *req << " touch " << cur->inode.touched << endl; + dout(10) << "log for " << *req << " touch " << cur->inode.mtime << endl; mdlog->submit_entry(new EInodeUpdate(cur), new C_MDS_TouchFinish(this, req, cur, reply)); return 0; @@ -550,7 +549,7 @@ void MDS::handle_client_touch_2(MClientRequest *req, // unpin cur->auth_unpin(); - mdcache->write_soft_finish(cur); + mdcache->inode_soft_write_finish(cur); } @@ -577,14 +576,13 @@ MClientReply *MDS::handle_client_chmod(MClientRequest *req, CInode *cur) { // write - if (!mdcache->write_hard_start(cur, req)) + if (!mdcache->inode_hard_write_start(cur, req)) return 0; // fw or (wait for) lock cur->auth_pin(); // do update cur->inode.mtime++; // whatever - cur->inode.touched++; // blah cur->mark_dirty(); // start reply @@ -592,7 +590,7 @@ MClientReply *MDS::handle_client_chmod(MClientRequest *req, reply->set_trace_dist( cur, whoami ); reply->set_result(0); - mdcache->write_hard_finish(cur); + mdcache->inode_hard_write_finish(cur); // log it dout(10) << "log for " << *req << " chmod" << endl; @@ -630,8 +628,10 @@ void MDS::handle_client_chmod_2(MClientRequest *req, MClientReply *MDS::handle_client_readdir(MClientRequest *req, CInode *cur) { - if (!mdcache->read_hard_try(cur,req)) + // check perm + if (!mdcache->inode_hard_read_start(cur,req)) return NULL; + mdcache->inode_hard_read_finish(cur); // it's a directory, right? 
if (!cur->is_dir()) { @@ -757,7 +757,7 @@ MClientReply *MDS::handle_client_openwr(MClientRequest *req, CInode *cur) { if (!cur->is_auth()) { - if (!cur->is_softasync()) { + if (cur->softlock.get_mode() == LOCK_MODE_SYNC) { int auth = cur->authority(); assert(auth != whoami); dout(10) << "open (write) [replica] " << *cur << " on replica, fw to auth " << auth << endl; @@ -767,7 +767,7 @@ MClientReply *MDS::handle_client_openwr(MClientRequest *req, MDS_PORT_SERVER); return 0; } - + dout(10) << "open (write) [replica shared write] " << *cur << endl; assert(0); } @@ -904,17 +904,7 @@ void MDS::handle_client_unlink(MClientRequest *req, dout(7) << "handle_client_unlink on " << *in << endl; - // can't be presync/lock - if (in->is_presync()) { - in->add_waiter(CINODE_WAIT_SYNC, - new C_MDS_RetryMessage(this, req)); - return; - } - if (in->is_prelock()) { - in->add_waiter(CINODE_WAIT_LOCK, - new C_MDS_RetryMessage(this, req)); - return; - } + // presync, lock? mdcache->inode_unlink(in, new C_MDS_UnlinkInode(this,in,req)); @@ -969,9 +959,8 @@ void MDS::handle_client_mkdir(MClientRequest *req) // create! CInode *newi = mdcache->create_inode(); + newi->inode.mode |= INODE_MODE_DIR; mdcache->link_inode(dir, name, newi); - - newi->inode.isdir = 1; newi->mark_dirty(); @@ -1094,15 +1083,15 @@ if (!locked && flag=renaming) basic protocol with replicas: -> Lock (possibly x2?) -< LockAck (possibly x2?) -> Rename - src ino - dst dir - either dst ino (is unlinked) - or dst name -< RenameAck - (implicitly unlocks, unlinks, etc.) + > Lock (possibly x2?) + < LockAck (possibly x2?) + > Rename + src ino + dst dir + either dst ino (is unlinked) + or dst name + < RenameAck + (implicitly unlocks, unlinks, etc.) */ @@ -1133,6 +1122,7 @@ void MDS::handle_client_rename_file(MClientRequest *req, CDir *destdir, string& name) { + /* bool must_wait_for_lock = false; // does destination exist? (is this an overwrite?) 
@@ -1160,7 +1150,7 @@ void MDS::handle_client_rename_file(MClientRequest *req, return; } - if (mdcache->write_hard_start(oldin, req) == false) { + if (mdcache->inode_hard_write_start(oldin, req) == false) { // wait dout(7) << "dest/overwrite " << *oldin << " locking, waiting" << endl; must_wait_for_lock = true; @@ -1186,7 +1176,7 @@ void MDS::handle_client_rename_file(MClientRequest *req, return; } - if (mdcache->write_hard_start(from, req) == false) { + if (mdcache->inode_hard_write_start(from, req) == false) { // wait dout(7) << "from " << *from << " locking, waiting" << endl; must_wait_for_lock = true; @@ -1207,6 +1197,8 @@ void MDS::handle_client_rename_file(MClientRequest *req, // ok go! mdcache->file_rename(from, destdir, name, oldin, new C_MDS_RenameFinish(this, req)); + + */ } diff --git a/ceph/mds/MDS.h b/ceph/mds/MDS.h index 3f0c429e48f4f..1bc784a5fcf16 100644 --- a/ceph/mds/MDS.h +++ b/ceph/mds/MDS.h @@ -27,24 +27,6 @@ using namespace std; #define MDS_PORT_BALANCER 20 -// md ops -#define MDS_OP_STAT 100 // -#define MDS_OP_READDIR 101 // - -#define MDS_OP_OPENRD 111 // -#define MDS_OP_OPENWR 112 // -#define MDS_OP_OPENWRC 113 // -#define MDS_OP_CLOSE 119 - -#define MDS_OP_TOUCH 200 // utime, actually -#define MDS_OP_CHMOD 201 // chmod - -#define MDS_OP_RENAME 211 -#define MDS_OP_UNLINK 212 -#define MDS_OP_LINK 213 - -#define MDS_OP_MKDIR 220 -#define MDS_OP_RMDIR 221 #define MDS_TRAVERSE_FORWARD 1 #define MDS_TRAVERSE_DISCOVER 2 diff --git a/ceph/mds/MDStore.cc b/ceph/mds/MDStore.cc index 453d332c35f9f..c8c2d103d6643 100644 --- a/ceph/mds/MDStore.cc +++ b/ceph/mds/MDStore.cc @@ -531,7 +531,7 @@ void MDStore::do_fetch_dir_2( int result, if (mds->mdcache->have_inode(inode->ino)) { CInode *in = mds->mdcache->get_inode(inode->ino); - dout(10) << "readdir got (but i already had) " << *in << " isdir " << in->inode.isdir << " touched " << in->inode.touched<< endl; + dout(10) << "readdir got (but i already had) " << *in << " mode " << in->inode.mode << " mtime " 
<< in->inode.mtime << endl; continue; } @@ -543,7 +543,7 @@ void MDStore::do_fetch_dir_2( int result, mds->mdcache->add_inode( in ); mds->mdcache->link_inode( dir, dname, in ); - dout(10) << "readdir got " << *in << " isdir " << in->inode.isdir << " touched " << in->inode.touched<< endl; + dout(10) << "readdir got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; } } diff --git a/ceph/mds/oldcachestuff.cc b/ceph/mds/oldcachestuff.cc new file mode 100644 index 0000000000000..761e72ad28237 --- /dev/null +++ b/ceph/mds/oldcachestuff.cc @@ -0,0 +1,930 @@ + +/* + + +OLD LOCK CRAP: + (old): + sync - soft metadata.. no reads/writes can proceed. (eg no stat) + lock - hard(+soft) metadata.. path traversals stop etc. (??) + + + replication consistency modes: + hard+soft - hard and soft are defined on all replicas. + all reads proceed (in absense of sync lock) + writes require sync lock, fw to auth + -> normal behavior. + + hard - hard only, soft is undefined + reads require a sync + writes proceed if field updates are monotonic (e.g. size, m/c/atime) + -> 'softasync' + + types of access by cache users: + + hard soft + R - read_hard_try path traversal + R <= R read_soft_start stat + R <= W write_soft_start touch + W => W write_hard_start chmod + + note on those implications: + read_soft_start() calls read_hard_try() + write_soft_start() calls read_hard_try() + a hard lock implies/subsumes a soft sync (read_soft_start() returns true if a + lock is held) + + + relationship with frozen directories: + + read_hard_try - can proceed, because any hard changes require a lock, which + requires an active authority, which implies things are unfrozen. + write_hard_start - waits (has to; only auth can initiate) + read_soft_start - ???? waits for now. (FIXME: if !softasync & !syncbyauth) + write_soft_start - ???? waits for now. 
(FIXME: if (softasync & !syncbyauth)) + + if sticky is on, an export_dir will drop any sync or lock so that the freeze will + proceed (otherwise, deadlock!). likewise, a sync will not stick if is_freezing(). + + + +NAMESPACE: + + none right now. + + +*/ + + +/* soft sync locks: mtime, size, etc. + */ + +bool MDCache::read_soft_start(CInode *in, Message *m) +{ + // if (!read_hard_try(in, m)) + // return false; + + // if frozen: i can't proceed (for now, see above) + if (in->is_frozen()) { + dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl; + in->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryMessage(mds, m)); + return false; + } + + + dout(5) << "read_soft_start " << *in << endl; + + // what soft sync mode? + + if (in->is_softasync()) { + // softasync: hard consistency only + + if (in->is_auth()) { + // i am auth: i need sync + if (in->is_syncbyme()) goto yes; + if (in->is_lockbyme()) goto yes; // lock => sync + if (!in->is_cached_by_anyone() && + !in->is_open_write()) goto yes; // i'm alone + } else { + // i am replica: fw to auth + int auth = in->authority(); + dout(5) << "read_soft_start " << *in << " is softasync, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mds->messenger->send_message(m, + MSG_ADDR_MDS(auth), m->get_dest_port(), + MDS_PORT_CACHE); + return false; + } + } else { + // normal: soft+hard consistency + + if (in->is_syncbyauth()) { + // wait for sync + } else { + // i'm consistent + goto yes; + } + } + + // we need sync + if (in->is_syncbyauth() && !in->is_softasync()) { + dout(5) << "read_soft_start " << *in << " is normal+replica+syncbyauth" << endl; + } else if (in->is_softasync() && in->is_auth()) { + dout(5) << "read_soft_start " << *in << " is softasync+auth, waiting on sync" << endl; + } else + assert(2+2==5); + + if (!in->can_auth_pin()) { + dout(5) << "read_soft_start " << *in << " waiting to auth_pin" << endl; + in->add_waiter(CINODE_WAIT_AUTHPINNABLE, + new C_MDS_RetryMessage(mds,m)); + return 
false; + } + + if (in->is_auth()) { + // wait for sync + in->add_waiter(CINODE_WAIT_SYNC, + new C_MDS_RetryMessage(mds, m)); + + if (!in->is_presync()) + inode_sync_start(in); + } else { + // wait for unsync + in->add_waiter(CINODE_WAIT_UNSYNC, + new C_MDS_RetryMessage(mds, m)); + + assert(in->is_syncbyauth()); + + if (!in->is_waitonunsync()) + inode_sync_wait(in); + } + + return false; + + yes: + mds->balancer->hit_inode(in, MDS_POP_SOFTRD); + mds->balancer->hit_inode(in, MDS_POP_ANY); + return true; +} + + +int MDCache::read_soft_finish(CInode *in) +{ + dout(5) << "read_soft_finish " << *in << endl; // " soft_sync_count " << in->soft_sync_count << endl; + return 0; // do nothing, actually.. +} + + +bool MDCache::write_soft_start(CInode *in, Message *m) +{ + // if (!read_hard_try(in, m)) + //return false; + + // if frozen: i can't proceed (for now, see above) + if (in->is_frozen()) { + dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl; + in->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryMessage(mds, m)); + return false; + } + + dout(5) << "write_soft_start " << *in << endl; + // what soft sync mode? + + if (in->is_softasync()) { + // softasync: hard consistency only + + if (in->is_syncbyauth()) { + // wait for sync release + } else { + // i'm inconsistent; write away! 
+ goto yes; + } + + } else { + // normal: soft+hard consistency + + if (in->is_auth()) { + // i am auth: i need sync + if (in->is_syncbyme()) goto yes; + if (in->is_lockbyme()) goto yes; // lock => sync + if (!in->is_cached_by_anyone() && + !in->is_open_write()) goto yes; // i'm alone + } else { + // i am replica: fw to auth + int auth = in->authority(); + dout(5) << "write_soft_start " << *in << " is !softasync, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mds->messenger->send_message(m, + MSG_ADDR_MDS(auth), m->get_dest_port(), + MDS_PORT_CACHE); + return false; + } + } + + // we need sync + if (in->is_syncbyauth() && in->is_softasync() && !in->is_auth()) { + dout(5) << "write_soft_start " << *in << " is softasync+replica+syncbyauth" << endl; + } else if (!in->is_softasync() && in->is_auth()) { + dout(5) << "write_soft_start " << *in << " is normal+auth, waiting on sync" << endl; + } else + assert(2+2==5); + + if (!in->can_auth_pin()) { + dout(5) << "write_soft_start " << *in << " waiting to auth_pin" << endl; + in->add_waiter(CINODE_WAIT_AUTHPINNABLE, + new C_MDS_RetryMessage(mds,m)); + return false; + } + + if (in->is_auth()) { + // wait for sync + in->add_waiter(CINODE_WAIT_SYNC, + new C_MDS_RetryMessage(mds, m)); + + if (!in->is_presync()) + inode_sync_start(in); + } else { + // wait for unsync + in->add_waiter(CINODE_WAIT_UNSYNC, + new C_MDS_RetryMessage(mds, m)); + + assert(in->is_syncbyauth()); + assert(in->is_softasync()); + + if (!in->is_waitonunsync()) + inode_sync_wait(in); + } + + return false; + + yes: + mds->balancer->hit_inode(in, MDS_POP_SOFTWR); + mds->balancer->hit_inode(in, MDS_POP_ANY); + return true; +} + + +int MDCache::write_soft_finish(CInode *in) +{ + dout(5) << "write_soft_finish " << *in << endl; //" soft_sync_count " << in->soft_sync_count << endl; + return 0; // do nothing, actually.. 
+} + + + + + + + + +/* hard locks: owner, mode + */ + +/* +bool MDCache::read_hard_try(CInode *in, + Message *m) +{ + //dout(5) << "read_hard_try " << *in << endl; + + if (in->is_auth()) { + // auth + goto yes; // fine + } else { + // replica + if (in->is_lockbyauth()) { + // locked by auth; wait! + dout(7) << "read_hard_try waiting on " << *in << endl; + in->add_waiter(CINODE_WAIT_UNLOCK, new C_MDS_RetryMessage(mds, m)); + if (!in->is_waitonunlock()) + inode_lock_wait(in); + return false; + } else { + // not locked. + goto yes; + } + } + + yes: + mds->balancer->hit_inode(in, MDS_POP_HARDRD); + mds->balancer->hit_inode(in, MDS_POP_ANY); + return true; +} + + +bool MDCache::write_hard_start(CInode *in, + Message *m) +{ + // if frozen: i can't proceed; only auth can initiate lock + if (in->is_frozen()) { + dout(7) << "write_hard_start " << *in << " is frozen, waiting" << endl; + in->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryMessage(mds, m)); + return false; + } + + // NOTE: if freezing, and locked, we must proceed, to avoid deadlock (where + // the freeze is waiting for our lock to be released) + + + if (in->is_auth()) { + // auth + if (in->is_lockbyme()) goto success; + if (!in->is_cached_by_anyone()) goto success; + + // need lock + if (!in->can_auth_pin()) { + dout(5) << "write_hard_start " << *in << " waiting to auth_pin" << endl; + in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryMessage(mds, m)); + return false; + } + + in->add_waiter(CINODE_WAIT_LOCK, new C_MDS_RetryMessage(mds, m)); + + if (!in->is_prelock()) + inode_lock_start(in); + + return false; + } else { + // replica + // fw to auth + int auth = in->authority(); + dout(5) << "write_hard_start " << *in << " on replica, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mds->messenger->send_message(m, + MSG_ADDR_MDS(auth), m->get_dest_port(), + MDS_PORT_CACHE); + return false; + } + + success: + in->lock_active_count++; + dout(5) << "write_hard_start " << *in << " count now 
" << in->lock_active_count << endl; + assert(in->lock_active_count > 0); + + mds->balancer->hit_inode(in, MDS_POP_HARDWR); + mds->balancer->hit_inode(in, MDS_POP_ANY); + return true; +} + +void MDCache::write_hard_finish(CInode *in) +{ + in->lock_active_count--; + dout(5) << "write_hard_finish " << *in << " count now " << in->lock_active_count << endl; + assert(in->lock_active_count >= 0); + + // release lock? + if (in->lock_active_count == 0 && + in->is_lockbyme() && + !g_conf.mdcache_sticky_lock) { + dout(7) << "write_hard_finish " << *in << " !sticky, releasing lock immediately" << endl; + inode_lock_release(in); + } +} + + +void MDCache::inode_lock_start(CInode *in) +{ + dout(5) << "lock_start on " << *in << ", waiting for " << in->cached_by << endl; + + assert(in->is_auth()); + assert(!in->is_prelock()); + assert(!in->is_lockbyme()); + assert(!in->is_lockbyauth()); + + in->lock_waiting_for_ack = in->cached_by; + in->dist_state |= CINODE_DIST_PRELOCK; + in->get(CINODE_PIN_PRELOCK); + in->auth_pin(); + + // send messages + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + mds->messenger->send_message(new MInodeLockStart(in->inode.ino, mds->get_nodeid()), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } +} + + +void MDCache::inode_lock_release(CInode *in) +{ + dout(5) << "lock_release on " << *in << ", messages to " << in->get_cached_by() << endl; + + assert(in->is_lockbyme()); + assert(in->is_auth()); + + in->dist_state &= ~CINODE_DIST_LOCKBYME; + + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + mds->messenger->send_message(new MInodeLockRelease(in), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + in->auth_unpin(); +} + +void MDCache::inode_lock_wait(CInode *in) +{ + dout(5) << "lock_wait on " << *in << endl; + assert(!in->is_auth()); + assert(in->is_lockbyauth()); + + in->dist_state |= CINODE_DIST_WAITONUNLOCK; + in->get(CINODE_PIN_WAITONUNLOCK); +} + + 
+void MDCache::handle_inode_lock_start(MInodeLockStart *m) +{ + // authority is requesting a lock + CInode *in = get_inode(m->get_ino()); + if (!in) { + // don't have it anymore! + dout(7) << "handle_lock_start " << m->get_ino() << ": don't have it anymore, nak" << endl; + mds->messenger->send_message(new MInodeLockAck(m->get_ino(), false), + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + delete m; // done + return; + } + + // we shouldn't be authoritative... + assert(!in->is_auth()); + + dout(7) << "handle_lock_start " << *in << ", sending ack" << endl; + + // lock it + in->dist_state |= CINODE_DIST_LOCKBYAUTH; + + // sanity check: make sure we know who _is_ authoritative! + assert(m->get_asker() == in->authority()); + + // send ack + mds->messenger->send_message(new MInodeLockAck(in->ino()), + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + + delete m; // done +} + + +void MDCache::handle_inode_lock_ack(MInodeLockAck *m) +{ + CInode *in = get_inode(m->get_ino()); + int from = m->get_source(); + dout(7) << "handle_lock_ack from " << from << " on " << *in << endl; + + assert(in); + assert(in->is_auth()); + assert(in->dist_state & CINODE_DIST_PRELOCK); + + // remove it from waiting list + in->lock_waiting_for_ack.erase(from); + + if (!m->did_have()) { + // erase from cached_by too! + in->cached_by_remove(from); + } + + if (in->lock_waiting_for_ack.size()) { + + // more coming + dout(7) << "handle_lock_ack " << *in << " from " << from << ", still waiting for " << in->lock_waiting_for_ack << endl; + + } else { + + // yay! + dout(7) << "handle_lock_ack " << *in << " from " << from << ", last one" << endl; + + in->dist_state &= ~CINODE_DIST_PRELOCK; + in->dist_state |= CINODE_DIST_LOCKBYME; + in->put(CINODE_PIN_PRELOCK); + + // do waiters! 
+ in->finish_waiting(CINODE_WAIT_LOCK); + } + + delete m; // done +} + + +void MDCache::handle_inode_lock_release(MInodeLockRelease *m) +{ + CInode *in = get_inode(m->get_ino()); + + if (!in) { + dout(7) << "handle_lock_release " << m->get_ino() << ", don't have it, dropping" << endl; + delete m; // done + return; + } + + if (!in->is_lockbyauth()) { + dout(7) << "handle_lock_release " << m->get_ino() << ", not flagged as locked, wtf" << endl; + assert(0); // i should have it, locked, or not have it at all! + delete m; // done + return; + } + + dout(7) << "handle_lock_release " << *in << endl; + assert(!in->is_auth()); + + // release state + in->dist_state &= ~CINODE_DIST_LOCKBYAUTH; + + // waiters? + if (in->is_waitonunlock()) { + in->put(CINODE_PIN_WAITONUNLOCK); + in->dist_state &= ~CINODE_DIST_WAITONUNLOCK; + + // finish + in->finish_waiting(CINODE_WAIT_UNLOCK); + } + + // done + delete m; +} +*/ + + + + + + + + + +// sync interface + +void MDCache::inode_sync_wait(CInode *in) +{ + assert(!in->is_auth()); + + int auth = in->authority(); + dout(5) << "inode_sync_wait on " << *in << ", auth " << auth << endl; + + assert(in->is_syncbyauth()); + assert(!in->is_waitonunsync()); + + in->dist_state |= CINODE_DIST_WAITONUNSYNC; + in->get(CINODE_PIN_WAITONUNSYNC); + + if ((in->is_softasync() && g_conf.mdcache_sticky_sync_softasync) || + (!in->is_softasync() && g_conf.mdcache_sticky_sync_normal)) { + // actually recall; if !sticky, auth will immediately release. 
+ dout(5) << "inode_sync_wait on " << *in << " sticky, recalling from auth" << endl; + mds->messenger->send_message(new MInodeSyncRecall(in->inode.ino), + MSG_ADDR_MDS(auth), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } +} + + +void MDCache::inode_sync_start(CInode *in) +{ + // wait for all replicas + dout(5) << "inode_sync_start on " << *in << ", waiting for " << in->cached_by << " " << in->get_open_write()<< endl; + + assert(in->is_auth()); + assert(!in->is_presync()); + assert(!in->is_sync()); + + in->sync_waiting_for_ack.clear(); + in->dist_state |= CINODE_DIST_PRESYNC; + in->get(CINODE_PIN_PRESYNC); + in->auth_pin(); + + in->sync_replicawantback = false; + + // send messages + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + in->sync_waiting_for_ack.insert(MSG_ADDR_MDS(*it)); + mds->messenger->send_message(new MInodeSyncStart(in->inode.ino, mds->get_nodeid()), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // sync clients + int last = -1; + for (multiset::iterator it = in->get_open_write().begin(); + it != in->get_open_write().end(); + it++) { + if (*it == last) continue; last = *it; // only 1 per client (even if open multiple times) + in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it)); + mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()), + MSG_ADDR_CLIENT(*it), 0, + MDS_PORT_CACHE); + } + +} + +void MDCache::inode_sync_release(CInode *in) +{ + dout(5) << "inode_sync_release on " << *in << ", messages to " << in->get_cached_by() << " " << in->get_open_write() << endl; + + assert(in->is_syncbyme()); + assert(in->is_auth()); + + in->dist_state &= ~CINODE_DIST_SYNCBYME; + + // release replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + mds->messenger->send_message(new MInodeSyncRelease(in), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // release writers + for (multiset::iterator it = in->get_open_write().begin(); + it 
!= in->get_open_write().end(); + it++) { + mds->messenger->send_message(new MInodeSyncRelease(in), + MSG_ADDR_CLIENT(*it), 0, + MDS_PORT_CACHE); + } + + in->auth_unpin(); +} + + + + +// messages +void MDCache::handle_inode_sync_start(MInodeSyncStart *m) +{ + // assume asker == authority for now. + + // authority is requesting a lock + CInode *in = get_inode(m->get_ino()); + if (!in) { + // don't have it anymore! + dout(7) << "handle_sync_start " << m->get_ino() << ": don't have it anymore, nak" << endl; + mds->messenger->send_message(new MInodeSyncAck(m->get_ino(), false), + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + delete m; // done + return; + } + + dout(10) << "handle_sync_start " << *in << endl; + + // we shouldn't be authoritative... + assert(!in->is_auth()); + + // sanity check: make sure we know who _is_ authoritative! + assert(m->get_asker() == in->authority()); + + // lock it + in->dist_state |= CINODE_DIST_SYNCBYAUTH; + + // open for write by clients? + if (in->is_open_write()) { + dout(7) << "handle_sync_start " << *in << " syncing write clients " << in->get_open_write() << endl; + + // sync clients + in->sync_waiting_for_ack.clear(); + for (multiset::iterator it = in->get_open_write().begin(); + it != in->get_open_write().end(); + it++) { + in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it)); + mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()), + MSG_ADDR_CLIENT(*it), 0, + MDS_PORT_CACHE); + } + + in->pending_sync_request = m; + } else { + // no writers, ack. 
+ dout(7) << "handle_sync_start " << *in << ", sending ack" << endl; + + inode_sync_ack(in, m); + } +} + +void MDCache::inode_sync_ack(CInode *in, MInodeSyncStart *m, bool wantback) +{ + dout(7) << "sending inode_sync_ack " << *in << endl; + + // send ack + mds->messenger->send_message(new MInodeSyncAck(in->ino(), true, wantback), + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + + delete m; +} + +void MDCache::handle_inode_sync_ack(MInodeSyncAck *m) +{ + CInode *in = get_inode(m->get_ino()); + assert(in); + + dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << endl; + + if (in->is_auth()) { + assert(in->is_presync()); + } else { + assert(in->is_syncbyauth()); + assert(in->pending_sync_request); + } + + // remove it from waiting list + in->sync_waiting_for_ack.erase(m->get_source()); + + if (MSG_ADDR_ISCLIENT(m->get_source()) && !m->did_have()) { + // erase from cached_by too! + in->cached_by_remove(m->get_source()); + } + + if (m->replica_wantsback()) + in->sync_replicawantback = true; + + if (in->sync_waiting_for_ack.size()) { + + // more coming + dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", still waiting for " << in->sync_waiting_for_ack << endl; + + } else { + + // yay! + dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", last one" << endl; + + if (!in->is_auth()) { + // replica, sync ack back to auth + assert(in->pending_sync_request); + inode_sync_ack(in, in->pending_sync_request, true); + in->pending_sync_request = 0; + delete m; + return; + } + + in->dist_state &= ~CINODE_DIST_PRESYNC; + in->dist_state |= CINODE_DIST_SYNCBYME; + in->put(CINODE_PIN_PRESYNC); + + // do waiters! + in->finish_waiting(CINODE_WAIT_SYNC); + + + // release sync right away? 
+ if (in->is_syncbyme()) { + if (in->is_freezing()) { + dout(7) << "handle_sync_ack freezing " << *in << ", dropping sync immediately" << endl; + inode_sync_release(in); + } + else if (in->sync_replicawantback) { + dout(7) << "handle_sync_ack replica wantback, releasing sync immediately" << endl; + inode_sync_release(in); + } + else if ((in->is_softasync() && !g_conf.mdcache_sticky_sync_softasync) || + (!in->is_softasync() && !g_conf.mdcache_sticky_sync_normal)) { + dout(7) << "handle_sync_ack !sticky, releasing sync immediately" << endl; + inode_sync_release(in); + } + else { + dout(7) << "handle_sync_ack sticky sync is on, keeping sync for now" << endl; + } + } else { + dout(7) << "handle_sync_ack don't have sync anymore, something must have just released it?" << endl; + } + } + + delete m; // done +} + + +void MDCache::handle_inode_sync_release(MInodeSyncRelease *m) +{ + CInode *in = get_inode(m->get_ino()); + + if (!in) { + dout(7) << "handle_sync_release " << m->get_ino() << ", don't have it, dropping" << endl; + delete m; // done + return; + } + + if (!in->is_syncbyauth()) { + dout(7) << "handle_sync_release " << *in << ", not flagged as sync" << endl; + assert(0); // this shouldn't happen. + delete m; // done + return; + } + + dout(7) << "handle_sync_release " << *in << endl; + assert(!in->is_auth()); + + // release state + in->dist_state &= ~CINODE_DIST_SYNCBYAUTH; + + // waiters? + if (in->is_waitonunsync()) { + in->put(CINODE_PIN_WAITONUNSYNC); + in->dist_state &= ~CINODE_DIST_WAITONUNSYNC; + + // finish + in->finish_waiting(CINODE_WAIT_UNSYNC); + } + + // client readers? 
+ if (in->is_open_write()) { + dout(7) << "handle_sync_release releasing clients " << in->get_open_write() << endl; + for (multiset::iterator it = in->get_open_write().begin(); + it != in->get_open_write().end(); + it++) { + mds->messenger->send_message(new MInodeSyncRelease(in), + MSG_ADDR_CLIENT(*it), 0, + MDS_PORT_CACHE); + } + } + + + // done + delete m; +} + + +void MDCache::handle_inode_sync_recall(MInodeSyncRecall *m) +{ + CInode *in = get_inode(m->get_ino()); + + if (!in) { + dout(7) << "handle_sync_recall " << m->get_ino() << ", don't have it, wtf" << endl; + assert(0); // shouldn't happen + delete m; // done + return; + } + if(!in->is_auth()) { + do_ino_proxy(in, m); + return; + } + + if (in->is_syncbyme()) { + dout(7) << "handle_sync_recall " << *in << ", releasing" << endl; + inode_sync_release(in); + } + else if (in->is_presync()) { + dout(7) << "handle_sync_recall " << *in << " is presync, flagging" << endl; + in->sync_replicawantback = true; + } + else { + dout(7) << "handle_sync_recall " << m->get_ino() << ", not flagged as sync or presync, dropping" << endl; + } + + // done + delete m; +} + + + + + + + + + + +// DIR SYNC + +/* + + dir sync + + - this are used when a directory is HASHED only. namely, + - to stat the dir inode we need an accurate directory size (????) 
+ - for a readdir + +*/ + +void MDCache::dir_sync_start(CDir *dir) +{ + // wait for all replicas + dout(5) << "sync_start on " << *dir << endl; + + assert(dir->is_hashed()); + assert(dir->is_auth()); + assert(!dir->is_presync()); + assert(!dir->is_sync()); + + dir->sync_waiting_for_ack = mds->get_cluster()->get_mds_set(); + dir->state_set(CDIR_STATE_PRESYNC); + dir->auth_pin(); + + //dir->sync_replicawantback = false; + + // send messages + for (set::iterator it = dir->sync_waiting_for_ack.begin(); + it != dir->sync_waiting_for_ack.end(); + it++) { + mds->messenger->send_message(new MDirSyncStart(dir->ino(), mds->get_nodeid()), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } +} + + +void MDCache::dir_sync_release(CDir *dir) +{ + + +} + +void MDCache::dir_sync_wait(CDir *dir) +{ + +} + + +void handle_dir_sync_start(MDirSyncStart *m) +{ +} + + + + diff --git a/ceph/messages/MClientReply.h b/ceph/messages/MClientReply.h index 191f7ebbad2ef..716ba24e10014 100644 --- a/ceph/messages/MClientReply.h +++ b/ceph/messages/MClientReply.h @@ -150,7 +150,7 @@ class MClientReply : public Message { while (in) { c_inode_info *info = new c_inode_info; info->inode = in->inode; - info->is_sync = in->is_sync() || in->is_presync(); + //info->is_sync = in->is_sync() || in->is_presync(); in->get_dist_spec(info->dist, whoami); trace.insert(trace.begin(), info); diff --git a/ceph/messages/MLock.h b/ceph/messages/MLock.h index 92c5cf054b87e..4e9f3b2c117c1 100644 --- a/ceph/messages/MLock.h +++ b/ceph/messages/MLock.h @@ -3,23 +3,36 @@ #include "msg/Message.h" -#define LOCK_OTYPE_INO 1 -#define LOCK_OTYPE_DIRINO 2 -#define LOCK_OTYPE_DN 3 +#define LOCK_OTYPE_IHARD 1 +#define LOCK_OTYPE_ISOFT 2 +#define LOCK_OTYPE_DIR 3 +#define LOCK_OTYPE_DN 4 -// basic messages -#define LOCK_AC_LOCK 1 -#define LOCK_AC_LOCKACK 2 -#define LOCK_AC_SYNC 3 -#define LOCK_AC_SYNCACK 4 -#define LOCK_AC_REQSYNC 5 -#define LOCK_AC_DELETE 6 -#define LOCK_AC_DELETEACK 7 +// for replicas +#define 
LOCK_AC_SYNC 0 +#define LOCK_AC_ASYNC 1 +#define LOCK_AC_LOCK 2 // nakable +#define LOCK_AC_GSYNC 3 // " +#define LOCK_AC_GLOCK 4 // " +#define LOCK_AC_GASYNC 5 // " -// async messages -#define LOCK_AC_ASYNC 8 -#define LOCK_AC_ASYNCACK 9 -#define LOCK_AC_REQASYNC 10 +#define LOCK_AC_FOR_REPLICA(a) ((a) <= 5) +#define LOCK_AC_FOR_AUTH(a) ((a) >= 6) + +#define LOCK_AC_NAKOFFSET 4 // be careful with numbering! + +// for auth +#define LOCK_AC_LOCKNAK 6 +#define LOCK_AC_GSYNCNAK 7 +#define LOCK_AC_GLOCKNAK 8 +#define LOCK_AC_GASYNCNAK 9 +#define LOCK_AC_LOCKACK 10 +#define LOCK_AC_GSYNCACK 11 +#define LOCK_AC_GLOCKACK 12 +#define LOCK_AC_GASYNCACK 13 + + +#define lock_ac_name(x) class MLock : public Message { @@ -47,12 +60,12 @@ class MLock : public Message { } virtual char *get_type_name() { return "ILock"; } - void set_ino(inodeno_t ino) { - otype = LOCK_OTYPE_INO; + void set_ino(inodeno_t ino, char ot) { + otype = ot; this->ino = ino; } void set_dirino(inodeno_t dirino) { - otype = LOCK_OTYPE_DIRINO; + otype = LOCK_OTYPE_DIR; this->ino = ino; } void set_dn(inodeno_t dirino, string& dn) { @@ -75,7 +88,7 @@ class MLock : public Message { s.copy(off,sizeof(otype), (char*)&otype); off += sizeof(otype); - s.copy(off,sizeof(inodeno_t), (char*)&ino); + s.copy(off,sizeof(ino), (char*)&ino); off += sizeof(ino); dn = s.c_str() + off; diff --git a/ceph/msg/FakeMessenger.cc b/ceph/msg/FakeMessenger.cc index 38beaf46b7ae3..2fa04925031a5 100644 --- a/ceph/msg/FakeMessenger.cc +++ b/ceph/msg/FakeMessenger.cc @@ -17,7 +17,6 @@ using namespace std; #include -//#define SERIALIZE #include "include/config.h" diff --git a/ceph/msg/Messenger.cc b/ceph/msg/Messenger.cc index cb40ee4c2d2f6..ab1b5c9a67536 100644 --- a/ceph/msg/Messenger.cc +++ b/ceph/msg/Messenger.cc @@ -42,18 +42,11 @@ using namespace std; #include "messages/MDirExpire.h" #include "messages/MCacheExpire.h" -#include "messages/MInodeSyncStart.h" -#include "messages/MInodeSyncAck.h" -#include "messages/MInodeSyncRelease.h" 
-#include "messages/MInodeSyncRecall.h" - -#include "messages/MInodeLockStart.h" -#include "messages/MInodeLockAck.h" -#include "messages/MInodeLockRelease.h" - #include "messages/MInodeUnlink.h" #include "messages/MInodeUnlinkAck.h" +#include "messages/MLock.h" + Message * decode_message(crope& ser) { @@ -164,28 +157,10 @@ decode_message(crope& ser) m = new MDirExpire(); break; - case MSG_MDS_INODESYNCSTART: - m = new MInodeSyncStart(); - break; - case MSG_MDS_INODESYNCACK: - m = new MInodeSyncAck(); - break; - case MSG_MDS_INODESYNCRELEASE: - m = new MInodeSyncRelease(); - break; - case MSG_MDS_INODESYNCRECALL: - m = new MInodeSyncRecall(); + case MSG_MDS_LOCK: + m = new MLock(); break; - case MSG_MDS_INODELOCKSTART: - m = new MInodeLockStart(); - break; - case MSG_MDS_INODELOCKACK: - m = new MInodeLockAck(); - break; - case MSG_MDS_INODELOCKRELEASE: - m = new MInodeLockRelease(); - break; case MSG_MDS_INODEUNLINK: m = new MInodeUnlink(); -- 2.39.5