From 27cf8d889e5e3f0ecc7f5012239ddc3b4fa31407 Mon Sep 17 00:00:00 2001
From: sage
Date: Mon, 9 Aug 2004 22:40:51 +0000
Subject: [PATCH] locks (untested)

git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@79 29311d96-e01e-0410-9327-a35deaab8ce9
---
 ceph/config.cc      |   1 +
 ceph/config.h       |   1 +
 ceph/mds/CDir.h     |   2 +-
 ceph/mds/CInode.cc  |   2 +-
 ceph/mds/CInode.h   |  14 ++-
 ceph/mds/MDCache.cc | 288 +++++++++++++++++++++++++++++++++++++++-----
 ceph/mds/MDCache.h  |   6 +-
 ceph/mds/MDS.cc     | 142 ++++++++++++++++------
 8 files changed, 373 insertions(+), 83 deletions(-)

diff --git a/ceph/config.cc b/ceph/config.cc
index 9e37da2eb85cb..2a2afeca69a80 100644
--- a/ceph/config.cc
+++ b/ceph/config.cc
@@ -34,6 +34,7 @@ md_config_t g_conf = {
   mdcache_mid: .8,
   mdcache_sticky_sync_normal: true,
   mdcache_sticky_sync_softasync: false,
+  mdcache_sticky_lock: false,          // sticky probably a bad idea

   mdbal_replicate_threshold: 500,
   mdbal_unreplicate_threshold: 200,
diff --git a/ceph/config.h b/ceph/config.h
index 6861de41cc894..ab61acbbddfd2 100644
--- a/ceph/config.h
+++ b/ceph/config.h
@@ -24,6 +24,7 @@ struct md_config_t {
   float mdcache_mid;
   bool  mdcache_sticky_sync_normal;
   bool  mdcache_sticky_sync_softasync;
+  bool  mdcache_sticky_lock;

   float mdbal_replicate_threshold;
   float mdbal_unreplicate_threshold;
diff --git a/ceph/mds/CDir.h b/ceph/mds/CDir.h
index d331af39d44eb..15b47e260667a 100644
--- a/ceph/mds/CDir.h
+++ b/ceph/mds/CDir.h
@@ -182,7 +182,7 @@ class CDir {

   int is_hard_pinned() { return hard_pinned; }
   int adjust_nested_hard_pinned(int a);
-  bool can_hard_pin() { return !is_frozen(); }  // and is_freezing??
+  bool can_hard_pin() { return !(is_frozen() || is_freezing()); }
   void add_hard_pin_waiter(Context *c);
   void hard_pin();
   void hard_unpin();
diff --git a/ceph/mds/CInode.cc b/ceph/mds/CInode.cc
index 03f75fa8f5fe5..25d8dbecb35e3 100644
--- a/ceph/mds/CInode.cc
+++ b/ceph/mds/CInode.cc
@@ -31,7 +31,7 @@ CInode::CInode() : LRUObject() {
   nested_hard_pinned = 0;
   // state = 0;
   dist_state = 0;
-  soft_sync_count = 0;
+  lock_active_count = 0;

   version = 0;

diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h
index 7450c424e6738..3a5587ef868cb 100644
--- a/ceph/mds/CInode.h
+++ b/ceph/mds/CInode.h
@@ -53,6 +53,9 @@ using namespace std;
 #define CINODE_PIN_PRESYNC      70002  // waiter
 #define CINODE_PIN_WAITONUNSYNC 70003  // waiter

+#define CINODE_PIN_PRELOCK      70004
+#define CINODE_PIN_WAITONUNLOCK 70005
+
 // directory authority types
 //  >= is the auth mds
 #define CDIR_AUTH_PARENT   -1   // default
@@ -69,8 +72,8 @@ using namespace std;
 #define CINODE_DIST_PRELOCK       64   // file mode, owner, etc.
 #define CINODE_DIST_LOCKBYME     128   // i am auth
-#define CINODE_DIST_LOCKBYTHEM   256   // i am not auth
-
+#define CINODE_DIST_LOCKBYAUTH   256   // i am not auth
+#define CINODE_DIST_WAITONUNLOCK 512


 class Context;
 class CDentry;
@@ -118,8 +121,10 @@ class CInode : LRUObject {
   unsigned        dist_state;
   set<int>        sync_waiting_for_ack;
   list<Context*>  waiting_for_sync;
+  set<int>        lock_waiting_for_ack;
   list<Context*>  waiting_for_lock;
-  int             soft_sync_count;
+  int             lock_active_count;   // count for in progress or waiting locks
+
   // open file state
   // sets of client ids!

@@ -233,8 +238,9 @@ class CInode : LRUObject {
   void take_sync_waiting(list<Context*>& ls);

   bool is_lockbyme() { return dist_state & CINODE_DIST_LOCKBYME; }
-  bool is_lockbythem() { return dist_state & CINODE_DIST_LOCKBYTHEM; }
+  bool is_lockbyauth() { return dist_state & CINODE_DIST_LOCKBYAUTH; }
   bool is_prelock() { return dist_state & CINODE_DIST_PRELOCK; }
+  bool is_waitonunlock() { return dist_state & CINODE_DIST_WAITONUNLOCK; }
   void add_lock_waiter(Context *c);
   void take_lock_waiting(list<Context*>& ls);
diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc
index 231564200beab..7946950480e27 100644
--- a/ceph/mds/MDCache.cc
+++ b/ceph/mds/MDCache.cc
@@ -45,32 +45,6 @@ using namespace std;
 #undef dout
 #define dout(l)  if (l<=g_conf.debug) cout << "mds" << mds->get_nodeid() << ".cache "

-/*
-
-INODES:
-
- two types of inode metadata:
-  hard - uid/gid, mode
-  soft - m/c/atime, size
-
- correspondingly, two types of locks:
-  lock - freezes hard metadata.. path traversals stop etc.  (??)
-  sync - freezes soft metadata.. no reads/writes can proceed.  (eg no stat)
-
- replication consistency modes:
-  auth only - n/a
-
-  hard+soft - hard and soft are defined on all replicas.
-              all reads proceed (in absense of sync lock)
-              writes require sync lock; possibly fw to auth
-   -> normal behavior.
-
-  hard      - hard only, soft is undefined
-              reads require a sync
-              writes proceed if field updates are monotonic (e.g. size, m/c/atime)
-   -> 'softasync'
-
-*/


 MDCache::MDCache(MDS *m)
@@ -562,12 +536,19 @@ int MDCache::path_traverse(string& path,
     // frozen?
     if (cur->dir->is_freeze_root()) {
       // doh!
-      dout(7) << "mds" << whoami << " dir " << *cur << " is frozen, waiting" << endl;
+      dout(7) << " dir " << *cur << " is frozen, waiting" << endl;
       cur->dir->add_freeze_waiter(new C_MDS_RetryMessage(mds, req));
       return 1;
     }

+    // must read hard data to traverse
+    if (!read_hard_try(cur, req))
+      return 1;
+
+    // check permissions?
+
+
     // dentry:
     CDentry *dn = cur->dir->lookup(dname);
     if (dn && dn->inode) {
       // have it, keep going.
@@ -1047,11 +1028,52 @@ void MDCache::handle_dir_update(MDirUpdate *m)


 // locks ----------------------------------------------------------------

+/*
+
+INODES:
+
+ two types of inode metadata:
+  hard - uid/gid, mode
+  soft - m/c/atime, size
+
+ correspondingly, two types of locks:
+  lock - freezes hard metadata.. path traversals stop etc.  (??)
+  sync - freezes soft metadata.. no reads/writes can proceed.  (eg no stat)
+
+ replication consistency modes:
+  hard+soft - hard and soft are defined on all replicas.
+              all reads proceed (in absense of sync lock)
+              writes require sync lock; possibly fw to auth
+   -> normal behavior.
+
+  hard      - hard only, soft is undefined
+              reads require a sync
+              writes proceed if field updates are monotonic (e.g. size, m/c/atime)
+   -> 'softasync'
+
+ types of access by cache users:
+
+        hard  soft
+   R  -       read_hard_try       path traversal
+   R  <=  R   read_soft_start     stat
+   R  <=  W   write_soft_start    touch
+   W  =>  W   write_hard_start    chmod
+
+ note on those implications:
+  read_soft_start() calls read_hard_try()
+  write_soft_start() calls read_hard_try()
+  a hard lock implies/subsumes a soft sync
+*/
+
+
 /* soft sync locks: mtime, size, etc. */

 bool MDCache::read_soft_start(CInode *in, Message *m)
 {
+  if (!read_hard_try(in, m))
+    return false;
+
   dout(5) << "read_soft_start " << *in << endl;

   // what soft sync mode?
@@ -1062,6 +1084,7 @@ bool MDCache::read_soft_start(CInode *in, Message *m)
   if (in->is_auth()) {
     // i am auth: i need sync
     if (in->is_syncbyme()) return true;
+    if (in->is_lockbyme()) return true;    // lock => sync
     if (!in->is_cached_by_anyone()) return true;  // i'm alone
   } else {
     // i am replica: fw to auth
@@ -1123,6 +1146,9 @@ int MDCache::read_soft_finish(CInode *in)

 bool MDCache::write_soft_start(CInode *in, Message *m)
 {
+  if (!read_hard_try(in, m))
+    return false;
+
   dout(5) << "write_soft_start " << *in << endl;

   // what soft sync mode?
@@ -1142,6 +1168,7 @@ bool MDCache::write_soft_start(CInode *in, Message *m)
   if (in->is_auth()) {
     // i am auth: i need sync
     if (in->is_syncbyme()) return true;
+    if (in->is_lockbyme()) return true;    // lock => sync
     if (!in->is_cached_by_anyone()) return true;  // i'm alone
   } else {
     // i am replica: fw to auth
@@ -1207,7 +1234,6 @@ void MDCache::sync_wait(CInode *in)

   in->dist_state |= CINODE_DIST_WAITONUNSYNC;
   in->get(CINODE_PIN_WAITONUNSYNC);
-  in->hard_pin();

   if ((in->is_softasync() && g_conf.mdcache_sticky_sync_softasync) ||
       (!in->is_softasync() && g_conf.mdcache_sticky_sync_normal)) {
@@ -1368,6 +1394,7 @@ void MDCache::handle_inode_sync_release(MInodeSyncRelease *m)

   if (!in->is_syncbythem()) {
     dout(7) << "handle_sync_release " << m->get_ino() << ", not flagged as sync, dropping" << endl;
+    assert(0);
     delete m;  // done
     return;
   }
@@ -1383,7 +1410,6 @@ void MDCache::handle_inode_sync_release(MInodeSyncRelease *m)
   if (in->is_waitonunsync()) {
     in->put(CINODE_PIN_WAITONUNSYNC);
     in->dist_state &= ~CINODE_DIST_WAITONUNSYNC;
-    in->hard_unpin();

     // finish
     list<Context*> finished;
@@ -1433,13 +1459,37 @@ void MDCache::handle_inode_sync_recall(MInodeSyncRecall *m)

 /* hard locks: owner, mode */

+bool MDCache::read_hard_try(CInode *in,
+                            Message *m)
+{
+  //dout(5) << "read_hard_try " << *in << endl;
+
+  if (in->is_auth()) {
+    // auth
+    return true;  // fine
+  } else {
+    // replica
+    if (in->is_lockbyauth()) {
+      // locked by auth; wait!
+      dout(7) << "read_hard_try waiting on " << in << endl;
+      in->add_sync_waiter(new C_MDS_RetryMessage(mds, m));
+      inode_lock_wait(in);
+      return false;
+    } else {
+      // not locked.
+      return true;
+    }
+  }
+}
+
+
 bool MDCache::write_hard_start(CInode *in,
                                Message *m)
 {
   if (in->is_auth()) {
     // auth
-    if (in->is_lockbyme()) return true;
-    if (!in->is_cached_by_anyone()) return true;  // i'm alone
+    if (in->is_lockbyme()) goto success;
+    if (!in->is_cached_by_anyone()) goto success;

     // need lock
     if (!in->can_hard_pin()) {
@@ -1449,7 +1499,8 @@ bool MDCache::write_hard_start(CInode *in,
     }

     in->add_lock_waiter(new C_MDS_RetryMessage(mds, m));
-
+    in->lock_active_count++;
+
     if (!in->is_prelock())
       inode_lock_start(in);

@@ -1465,40 +1516,211 @@ bool MDCache::write_hard_start(CInode *in,
                                  MDS_PORT_CACHE);
     return false;
   }
+
+ success:
+  in->lock_active_count++;
+  assert(in->lock_active_count > 0);
 }


 void MDCache::write_hard_finish(CInode *in)
 {
   dout(5) << "write_hard_finish " << *in << endl;
+
+  assert(in->lock_active_count > 0);
+  in->lock_active_count--;
+
+  // release lock?
+  if (!g_conf.mdcache_sticky_lock) {
+    dout(7) << "write_hard_finish " << *in << " !sticky, releasing lock immediately" << endl;
+    inode_lock_release(in);
+  }
 }


 void MDCache::inode_lock_start(CInode *in)
 {
-
+  dout(5) << "lock_start on " << *in << ", waiting for " << in->cached_by << endl;
+
+  assert(in->is_auth());
+  assert(!in->is_prelock());
+  assert(!in->is_lockbyme());
+  assert(!in->is_lockbyauth());
+
+  in->lock_waiting_for_ack = in->cached_by;
+  in->dist_state |= CINODE_DIST_PRELOCK;
+  in->get(CINODE_PIN_PRELOCK);
+  in->hard_pin();
+
+  // send messages
+  for (set<int>::iterator it = in->cached_by_begin();
+       it != in->cached_by_end();
+       it++) {
+    mds->messenger->send_message(new MInodeLockStart(in->inode.ino, mds->get_nodeid()),
+                                 MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+  }
 }


 void MDCache::inode_lock_release(CInode *in)
 {
+  dout(5) << "lock_release on " << *in << ", messages to " << in->get_cached_by() << endl;
+  assert(in->is_lockbyme());
+  assert(in->is_auth());
+
+  in->hard_unpin();
+  in->dist_state &= ~CINODE_DIST_LOCKBYME;
+
+  for (set<int>::iterator it = in->cached_by_begin();
+       it != in->cached_by_end();
+       it++) {
+    mds->messenger->send_message(new MInodeLockRelease(in),
+                                 MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+  }
 }


+void MDCache::inode_lock_wait(CInode *in)
+{
+  dout(5) << "lock_wait on " << *in;
+  assert(!in->is_auth());
+  assert(in->is_lockbyauth());
+
+  in->dist_state |= CINODE_DIST_WAITONUNLOCK;
+  in->get(CINODE_PIN_WAITONUNLOCK);
+}
+
+
 void MDCache::handle_inode_lock_start(MInodeLockStart *m)
 {
+  // authority is requesting a lock
+  CInode *in = get_inode(m->get_ino());
+  if (!in) {
+    // don't have it anymore!
+    dout(7) << "handle_lock_start " << m->get_ino() << ": don't have it anymore, nak" << endl;
+    mds->messenger->send_message(new MInodeLockAck(m->get_ino(), false),
+                                 MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+    delete m;  // done
+    return;
+  }
+
+  // we shouldn't be authoritative...
+  assert(!in->is_auth());
+
+  dout(7) << "handle_lock_start " << *in << ", sending ack" << endl;
+
+  // lock it
+  //in->get(CINODE_PIN_LOCKBYAUTH);
+  in->dist_state |= CINODE_DIST_LOCKBYAUTH;
+
+  // send ack
+  mds->messenger->send_message(new MInodeLockAck(in->ino()),
+                               MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE,
+                               MDS_PORT_CACHE);
+  delete m;  // done
 }


 void MDCache::handle_inode_lock_ack(MInodeLockAck *m)
 {
+  CInode *in = get_inode(m->get_ino());
+  int from = m->get_source();
+  dout(7) << "handle_lock_ack from " << from << " on " << *in << endl;
+
+  assert(in);
+  assert(in->is_auth());
+  assert(in->dist_state & CINODE_DIST_PRELOCK);
+
+  // remove it from waiting list
+  in->lock_waiting_for_ack.erase(from);
+
+  if (!m->did_have()) {
+    // erase from cached_by too!
+    in->cached_by_remove(from);
+  }
+
+  if (in->lock_waiting_for_ack.size()) {
+
+    // more coming
+    dout(7) << "handle_lock_ack " << *in << " from " << from << ", still waiting for " << in->lock_waiting_for_ack << endl;
+
+  } else {
+
+    // yay!
+    dout(7) << "handle_lock_ack " << *in << " from " << from << ", last one" << endl;
+
+    in->dist_state &= ~CINODE_DIST_PRELOCK;
+    in->dist_state |= CINODE_DIST_LOCKBYME;
+    //in->get(CINODE_PIN_LOCKBYME);
+    in->put(CINODE_PIN_PRELOCK);
+    // do waiters!
+    list<Context*> finished;
+    in->take_lock_waiting(finished);
+
+    for (list<Context*>::iterator it = finished.begin();
+         it != finished.end();
+         it++) {
+      in->lock_active_count--;  // effectively dequeued
+      Context *c = *it;
+      if (c) {
+        c->finish(0);
+        delete c;
+      }
+    }
+  }
+
+  delete m;  // done
 }


 void MDCache::handle_inode_lock_release(MInodeLockRelease *m)
 {
+  CInode *in = get_inode(m->get_ino());
+  if (!in) {
+    dout(7) << "handle_lock_release " << m->get_ino() << ", don't have it, dropping" << endl;
+    delete m;  // done
+    return;
+  }
+
+  if (!in->is_syncbythem()) {
+    dout(7) << "handle_lock_release " << m->get_ino() << ", not flagged as locked, dropping" << endl;
+    assert(0);
+    delete m;  // done
+    return;
+  }
+
+  dout(7) << "handle_lock_release " << *in << endl;
+  assert(!in->is_auth());
+
+  // release state
+  //in->put(CINODE_PIN_SYNCBYTHEM);
+  in->dist_state &= ~CINODE_DIST_LOCKBYAUTH;
+
+  // waiters?
+  if (in->is_waitonunlock()) {
+    in->put(CINODE_PIN_WAITONUNLOCK);
+    in->dist_state &= ~CINODE_DIST_WAITONUNLOCK;
+    //in->hard_unpin();
+
+    // finish
+    list<Context*> finished;
+    in->take_lock_waiting(finished);
+    for (list<Context*>::iterator it = finished.begin();
+         it != finished.end();
+         it++) {
+      Context *c = *it;
+      c->finish(0);
+      delete c;
+    }
+  }
+
+  // done
+  delete m;
 }
diff --git a/ceph/mds/MDCache.h b/ceph/mds/MDCache.h
index dfbbe0b6352e0..51bc9342cad36 100644
--- a/ceph/mds/MDCache.h
+++ b/ceph/mds/MDCache.h
@@ -168,15 +168,13 @@ class MDCache {
   void handle_inode_sync_recall(MInodeSyncRecall *m);

   // hard locks
-  bool read_hard_start(CInode *in, Message *m);
-  int waitfor_lock(CInode *in, Message *m);
-  int read_hard_finish(CInode *in);
-
+  bool read_hard_try(CInode *in, Message *m);
   bool write_hard_start(CInode *in, Message *m);
   void write_hard_finish(CInode *in);

   void inode_lock_start(CInode *in);
   void inode_lock_release(CInode *in);
+  void inode_lock_wait(CInode *in);

   void handle_inode_lock_start(MInodeLockStart *m);
   void handle_inode_lock_ack(MInodeLockAck *m);
diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc
index f74ebc1fd52f4..0ed4228cb8c22 100644
--- a/ceph/mds/MDS.cc
+++ b/ceph/mds/MDS.cc
@@ -364,11 +364,11 @@ int MDS::handle_client_request(MClientRequest *req)
     break;

   case MDS_OP_TOUCH:
-  case MDS_OP_CHMOD:
     reply = handle_client_touch(req, cur);
     break;
-    //reply = handle_client_chmod(req, cur);
+  case MDS_OP_CHMOD:
+    reply = handle_client_chmod(req, cur);
     break;

   case MDS_OP_OPENRD:
@@ -440,49 +440,108 @@ public:

 MClientReply *MDS::handle_client_touch(MClientRequest *req,
                                        CInode *cur)
 {
-  int auth = cur->authority(mdcluster);
-
-  if (auth == whoami) {
-
-    if (!cur->can_hard_pin()) {
-      // wait
-      cur->add_hard_pin_waiter(new C_MDS_RetryMessage(this, req));
-      return 0;
-    }
+  if (!cur->can_hard_pin()) {
+    // wait
+    cur->add_hard_pin_waiter(new C_MDS_RetryMessage(this, req));
+    return 0;
+  }

-    // write
-    if (!mdcache->write_soft_start(cur, req))
-      return 0;  // sync
+  // write
+  if (!mdcache->write_soft_start(cur, req))
+    return 0;  // fw or (wait for) sync

-    cur->hard_pin();
+  cur->hard_pin();
+
+  // do update
+  cur->inode.mtime++;  // whatever
+  cur->inode.touched++;
+  cur->mark_dirty();
+
+  // tell replicas
+  // actually, no!  it's synced by me, or async.  they'll get told upon release.
+  //mdcache->send_inode_updates(cur);
+
+  // log it
+  dout(10) << "log for " << *req << " touch " << cur->inode.touched << endl;
+  mdlog->submit_entry(new EInodeUpdate(cur),
+                      new C_MDS_TouchFinish(this, req, cur));
+  return 0;
+}

-    // do update
-    cur->inode.mtime++;  // whatever
-    cur->inode.touched++;
-    cur->mark_dirty();
+
+void MDS::handle_client_touch_2(MClientRequest *req,
+                                CInode *cur)
+{
+  // reply
+  dout(10) << "reply to " << *req << " touch" << endl;
+  MClientReply *reply = new MClientReply(req);
+  reply->set_trace_dist( cur, whoami );
+  reply->set_result(0);
+
+  messenger->send_message(reply,
+                          MSG_ADDR_CLIENT(req->get_client()), 0,
+                          MDS_PORT_SERVER);

-    // tell replicas
-    // actually, no!  it's synced by me, or async.  they'll get told upon release.
-    //mdcache->send_inode_updates(cur);
+  logger->inc("otouch");
+  stat_write.hit();
+  stat_req.hit();
+  stat_ops++;

-    // log it
-    dout(10) << "log for " << *req << " touch " << cur->inode.touched << endl;
-    mdlog->submit_entry(new EInodeUpdate(cur),
-                        new C_MDS_TouchFinish(this, req, cur));
-    return 0;
-  } else {
+  // done
+  delete req;

-    // forward
-    dout(10) << "forwarding touch to authority " << auth << endl;
-    messenger->send_message(req,
-                            MSG_ADDR_MDS(auth), MDS_PORT_SERVER,
-                            MDS_PORT_SERVER);
+  // unpin
+  cur->hard_unpin();
+  mdcache->write_soft_finish(cur);
+}
+
+
+
+class C_MDS_ChmodFinish : public Context {
+public:
+  CInode *in;
+  MClientRequest *req;
+  MDS *mds;
+  C_MDS_ChmodFinish(MDS *mds, MClientRequest *req, CInode *cur) {
+    this->mds = mds;
+    this->in = cur;
+    this->req = req;
+  }
+  virtual void finish(int result) {
+    mds->handle_client_chmod_2(req, in);
+  }
+};
+
+
+MClientReply *MDS::handle_client_chmod(MClientRequest *req,
+                                       CInode *cur)
+{
+  if (!cur->can_hard_pin()) {
+    // wait
+    cur->add_hard_pin_waiter(new C_MDS_RetryMessage(this, req));
     return 0;
   }
+
+  // write
+  if (!mdcache->write_hard_start(cur, req))
+    return 0;  // fw or (wait for) lock
+
+  cur->hard_pin();
+
+  // do update
+  cur->inode.mtime++;  // whatever
+  cur->inode.touched++;  // blah
+  cur->mark_dirty();
+
+  // log it
+  dout(10) << "log for " << *req << " chmod" << endl;
+  mdlog->submit_entry(new EInodeUpdate(cur),
+                      new C_MDS_TouchFinish(this, req, cur));
+  return 0;
 }
-
-void MDS::handle_client_touch_2(MClientRequest *req,
+
+void MDS::handle_client_chmod_2(MClientRequest *req,
                                 CInode *cur)
 {
   // reply
@@ -494,25 +553,28 @@ void MDS::handle_client_touch_2(MClientRequest *req,
   messenger->send_message(reply,
                           MSG_ADDR_CLIENT(req->get_client()), 0,
                           MDS_PORT_SERVER);
-
+
   logger->inc("otouch");
   stat_write.hit();
   stat_req.hit();
   stat_ops++;

-  cur->hard_unpin();
-
   // done
   delete req;
-
+
   // unpin
-  mdcache->write_soft_finish(cur);
+  cur->hard_unpin();
+  mdcache->write_hard_finish(cur);
 }

+
 MClientReply *MDS::handle_client_readdir(MClientRequest *req,
                                          CInode *cur)
 {
+  if (!mdcache->read_hard_try(cur,req))
+    return NULL;
+
   // it's a directory, right?
   if (!cur->is_dir()) {
     // not a dir
-- 
2.39.5
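
A few sketches of the locking scheme this patch introduces follow. The INODES comment moved into MDCache.cc splits inode metadata into hard (uid/gid, mode) and soft (m/c/atime, size), and pairs the existing soft "sync" with a new hard lock. On the authority, inode_lock_start() marks the inode PRELOCK, hard-pins it, and sends MInodeLockStart to every replica in cached_by; each MInodeLockAck shrinks lock_waiting_for_ack, a nak also drops that replica from cached_by, and the last ack flips the inode to LOCKBYME and finishes the queued waiters. The standalone sketch below is a toy model of that handshake for illustration only: ToyInode, the std::function waiters and the console output are invented here, and the real messenger, pinning and journaling plumbing are omitted.

// Toy model of the auth-side hard-lock handshake sketched in this patch:
// PRELOCK is set and an MInodeLockStart goes to every replica; each
// MInodeLockAck shrinks the waiting set; when it is empty the inode flips
// to LOCKBYME and the queued waiters run.  All names here are illustrative.
#include <cassert>
#include <functional>
#include <iostream>
#include <list>
#include <set>

const unsigned DIST_PRELOCK  = 64;
const unsigned DIST_LOCKBYME = 128;

struct ToyInode {
  unsigned dist_state = 0;
  std::set<int> cached_by;                         // replica mds ranks
  std::set<int> lock_waiting_for_ack;
  std::list<std::function<void()>> waiting_for_lock;
};

void lock_start(ToyInode& in) {
  assert(!(in.dist_state & (DIST_PRELOCK | DIST_LOCKBYME)));
  in.lock_waiting_for_ack = in.cached_by;          // wait for every replica
  in.dist_state |= DIST_PRELOCK;
  for (int rank : in.cached_by)
    std::cout << "send MInodeLockStart to mds" << rank << "\n";
}

void handle_lock_ack(ToyInode& in, int from, bool replica_still_has_it) {
  assert(in.dist_state & DIST_PRELOCK);
  in.lock_waiting_for_ack.erase(from);
  if (!replica_still_has_it)
    in.cached_by.erase(from);                      // nak: replica dropped the inode
  if (!in.lock_waiting_for_ack.empty())
    return;                                        // more acks coming
  in.dist_state &= ~DIST_PRELOCK;
  in.dist_state |= DIST_LOCKBYME;                  // last ack: lock is held
  for (auto& waiter : in.waiting_for_lock)
    waiter();                                      // finish the queued contexts
  in.waiting_for_lock.clear();
}

int main() {
  ToyInode in;
  in.cached_by = {1, 2};
  in.waiting_for_lock.push_back([] { std::cout << "retry queued request\n"; });
  lock_start(in);
  handle_lock_ack(in, 1, true);
  handle_lock_ack(in, 2, false);                   // second replica naks
  assert(in.dist_state & DIST_LOCKBYME);
  assert(in.cached_by.count(2) == 0);
}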
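
On the replica side, read_hard_try() refuses to touch hard metadata while CINODE_DIST_LOCKBYAUTH is set: the request is parked as a waiter and the inode marked WAITONUNLOCK, and handle_inode_lock_release() clears both flags and re-drives the parked requests. The following minimal sketch mirrors that behaviour under the same caveats as above; ReplicaInode and the lambda waiters are invented names, and the Context/messenger machinery is left out.

// Toy model of the replica side: reads of hard metadata are parked while the
// authority holds the lock, and the release message wakes them up.
#include <functional>
#include <iostream>
#include <list>

const unsigned DIST_LOCKBYAUTH   = 256;
const unsigned DIST_WAITONUNLOCK = 512;

struct ReplicaInode {
  unsigned dist_state = 0;
  std::list<std::function<void()>> waiting_for_lock;
};

// Mirrors read_hard_try(): true means the caller may read hard metadata now.
bool read_hard_try(ReplicaInode& in, std::function<void()> retry) {
  if (!(in.dist_state & DIST_LOCKBYAUTH))
    return true;                                   // not locked by the authority
  in.waiting_for_lock.push_back(retry);            // park the request
  in.dist_state |= DIST_WAITONUNLOCK;
  return false;
}

// Mirrors handle_inode_lock_release(): unlock and wake the parked requests.
void handle_lock_release(ReplicaInode& in) {
  in.dist_state &= ~DIST_LOCKBYAUTH;
  if (in.dist_state & DIST_WAITONUNLOCK) {
    in.dist_state &= ~DIST_WAITONUNLOCK;
    for (auto& retry : in.waiting_for_lock)
      retry();
    in.waiting_for_lock.clear();
  }
}

int main() {
  ReplicaInode in;
  in.dist_state |= DIST_LOCKBYAUTH;                // authority announced its lock
  if (!read_hard_try(in, [] { std::cout << "stat retried and served\n"; }))
    std::cout << "stat parked while auth holds the hard lock\n";
  handle_lock_release(in);                         // auth released: waiter runs
}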
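
Callers such as handle_client_chmod() rely on a simple contract: write_hard_start() either lets the handler proceed (write_hard_finish() later drops lock_active_count and, unless mdcache_sticky_lock is set, releases the lock), or it queues the request behind a C_MDS_RetryMessage and returns false, in which case the handler just returns and the same request is re-driven once the lock is acquired. A toy illustration of that control flow, with invented names (Request, try_write_hard); it is not the MDS code itself.

// The *_start call either succeeds at once or parks the request and returns
// false; the parked request is simply replayed later and takes the fast path.
#include <iostream>
#include <list>
#include <string>

struct Request { std::string op; };

struct Lock {
  bool held = false;
  std::list<Request> waiters;    // stand-in for add_lock_waiter(Context*)
};

// Returns true if the caller may proceed; otherwise the request is parked.
bool try_write_hard(Lock& lk, const Request& req) {
  if (lk.held) return true;
  lk.waiters.push_back(req);     // a C_MDS_RetryMessage would be queued here
  return false;
}

void handle_chmod(Lock& lk, const Request& req) {
  if (!try_write_hard(lk, req)) {
    std::cout << req.op << ": lock not held yet, queued for retry\n";
    return;                      // nothing else to do now
  }
  std::cout << req.op << ": mutating hard metadata, then journaling\n";
  // write_hard_finish(...) would run after the journal entry commits.
}

int main() {
  Lock lk;
  Request req{"chmod"};
  handle_chmod(lk, req);         // parked: lock not yet acquired
  lk.held = true;                // pretend the last MInodeLockAck arrived
  for (auto& r : lk.waiters)
    handle_chmod(lk, r);         // retried request now proceeds
}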