mds/MDS.o\
mds/Server.o\
mds/MDCache.o\
+ mds/Locker.o\
mds/Migrator.o\
mds/Renamer.o\
mds/MDBalancer.o\
int mask;
// special stuff
+ version_t version; // auth only
unsigned char hash_seed; // only defined for dir; 0 if not hashed.
bool anchored; // auth only
version_t file_data_version; // auth only
multiset<Message*> pinset;
friend class Migrator;
+ friend class Locker;
friend class Renamer;
friend class Server;
friend class MDCache;
// friends
friend class Server;
+ friend class Locker;
friend class Migrator;
friend class MDCache;
friend class CDir;
committing_version = version;
- waitfor_save[version].push_back(onfinish);
+ if (onfinish)
+ waitfor_save[version].push_back(onfinish);
// write (async)
mds->filer->write(inode,
}
friend class MDCache;
+ friend class Locker;
friend class Migrator;
};
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDS.h"
+#include "MDCache.h"
+#include "Locker.h"
+#include "Server.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Migrator.h"
+
+#include "MDBalancer.h"
+#include "MDLog.h"
+#include "MDSMap.h"
+
+#include "include/filepath.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EUnlink.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+#include "messages/MDirUpdate.h"
+
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MInodeLink.h"
+#include "messages/MInodeLinkAck.h"
+#include "messages/MInodeUnlink.h"
+#include "messages/MInodeUnlinkAck.h"
+
+#include "messages/MLock.h"
+#include "messages/MDentryUnlink.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MClientFileCaps.h"
+
+#include <errno.h>
+#include <assert.h>
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".locker "
+
+
+
+/*
+ * Route a message delivered to the locker to its handler, keyed on the
+ * message type.  Lock traffic (MLock), replica cap requests
+ * (MInodeFileCaps) and client cap messages (MClientFileCaps) are
+ * understood; any other type trips an assert.
+ */
+void Locker::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+  case MSG_MDS_LOCK:
+    // locking
+    handle_lock((MLock*)m);
+    return;
+
+  case MSG_MDS_INODEFILECAPS:
+    // cache fun
+    handle_inode_file_caps((MInodeFileCaps*)m);
+    return;
+
+  case MSG_CLIENT_FILECAPS:
+    handle_client_file_caps((MClientFileCaps*)m);
+    return;
+
+  default:
+    break;
+  }
+
+  // not a message type the locker knows about
+  assert(0);
+}
+
+
+
+
+// file i/o -----------------------------------------
+
+/*
+ * Report the inode's current file_data_version (and log the request at
+ * debug level 7).  The version is only read here, never changed.
+ */
+__uint64_t Locker::issue_file_data_version(CInode *in)
+{
+  dout(7) << "issue_file_data_version on " << *in << endl;
+
+  const __uint64_t current = in->inode.file_data_version;
+  return current;
+}
+
+
+/*
+ * Register (or extend) the capability held by the requesting client on
+ * 'in' for an open in the given file mode, then issue whatever the file
+ * lock permits.
+ *
+ *  in   - inode being opened
+ *  mode - FILE_MODE_R / FILE_MODE_W bits, translated into wanted caps
+ *  req  - client request; supplies the client id and client instance
+ *
+ * Returns the client's Capability on 'in'.  Cap messages to this client
+ * are suppressed while we work, because the outcome is bundled with the
+ * open() reply.  file_data_version is bumped when the client newly gains
+ * CAP_FILE_WRBUFFER.
+ */
+Capability* Locker::issue_new_caps(CInode *in,
+                                   int mode,
+                                   MClientRequest *req)
+{
+  dout(7) << "issue_new_caps for mode " << mode << " on " << *in << endl;
+
+  // my needs
+  int my_client = req->get_client();
+  int my_want = 0;
+  if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD;
+  if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR;
+
+  // register a capability
+  Capability *cap = in->get_client_cap(my_client);
+  if (!cap) {
+    // new cap
+    Capability c(my_want);
+    in->add_client_cap(my_client, c);
+    cap = in->get_client_cap(my_client);
+
+    // note client addr
+    mds->clientmap.add_open(my_client, req->get_client_inst());
+
+  } else {
+    // make sure it has sufficient caps: augment when this open needs bits
+    // that the existing cap does not yet want.  (The test must look for
+    // bits of my_want missing from wanted(); the reverse test skips the
+    // update precisely when new bits are required.)
+    if (my_want & ~cap->wanted()) {
+      // augment wanted caps for this client
+      cap->set_wanted( cap->wanted() | my_want );
+    }
+  }
+
+  // suppress file cap messages for this guy for a few moments (we'll bundle with the open() reply)
+  cap->set_suppress(true);
+  int before = cap->pending();
+
+  if (in->is_auth()) {
+    // [auth] twiddle mode?
+    inode_file_eval(in);
+  } else {
+    // [replica] tell auth about any new caps wanted
+    request_inode_file_caps(in);
+  }
+
+  // issue caps (pot. incl new one)
+  issue_caps(in);  // note: _eval above may have done this already...
+
+  // re-issue whatever we can
+  cap->issue(cap->pending());
+
+  // ok, stop suppressing.
+  cap->set_suppress(false);
+
+  int now = cap->pending();
+  if (before != now &&
+      (before & CAP_FILE_WR) == 0 &&
+      (now & CAP_FILE_WR)) {
+    // FIXME FIXME FIXME
+  }
+
+  // twiddle file_data_version?
+  if ((before & CAP_FILE_WRBUFFER) == 0 &&
+      (now & CAP_FILE_WRBUFFER)) {
+    in->inode.file_data_version++;
+    dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << endl;
+  }
+
+  return cap;
+}
+
+
+
+/*
+ * Bring every client capability on 'in' in line with what the file lock
+ * currently allows.  For each cap whose issued set differs from
+ * (wanted & allowed) a new issue is recorded, file_data_version is bumped
+ * if the client newly gets CAP_FILE_WRBUFFER, and -- unless the cap is
+ * suppressed -- an MClientFileCaps message is sent to the client.
+ *
+ * Returns true if nothing had to be (re)issued.
+ */
+bool Locker::issue_caps(CInode *in)
+{
+  // allowed caps are determined by the lock mode.
+  const int allowed = in->filelock.caps_allowed(in->is_auth());
+  dout(7) << "issue_caps filelock allows=" << cap_string(allowed)
+          << " on " << *in << endl;
+
+  int nissued = 0;   // number of caps we had to (re)issue
+
+  map<int, Capability>::iterator p = in->client_caps.begin();
+  for (; p != in->client_caps.end(); ++p) {
+    const int client = p->first;
+    Capability &cap = p->second;
+
+    // already consistent with the lock?  nothing to do for this client.
+    if (cap.issued() == (cap.wanted() & allowed))
+      continue;
+
+    // issue
+    nissued++;
+
+    int before = cap.pending();
+    long seq = cap.issue(cap.wanted() & allowed);
+    int after = cap.pending();
+
+    // twiddle file_data_version?
+    const bool gained_wrbuffer =
+      !(before & CAP_FILE_WRBUFFER) && (after & CAP_FILE_WRBUFFER);
+    if (gained_wrbuffer) {
+      dout(7) << " incrementing file_data_version for " << *in << endl;
+      in->inode.file_data_version++;
+    }
+
+    // no message if the issue produced no sequence number, or if
+    // messages to this client are currently suppressed
+    if (seq <= 0 || cap.is_suppress())
+      continue;
+
+    dout(7) << " sending MClientFileCaps to client" << client << " seq " << cap.get_last_seq() << " new pending " << cap_string(cap.pending()) << " was " << cap_string(before) << endl;
+    mds->messenger->send_message(new MClientFileCaps(in->inode,
+                                                     cap.get_last_seq(),
+                                                     cap.pending(),
+                                                     cap.wanted()),
+                                 MSG_ADDR_CLIENT(client), mds->clientmap.get_inst(client),
+                                 0, MDS_PORT_LOCKER);
+  }
+
+  return (nissued == 0);  // true if no re-issued, no callbacks
+}
+
+
+
+/*
+ * [replica] Tell the auth MDS which file caps our clients want on 'in',
+ * if that differs from what we last reported (replica_caps_wanted).
+ *
+ * When the wanted set drops to zero the release is not reported
+ * immediately: the first call starts a 2 second grace period
+ * (replica_caps_wanted_keep_until), calls made during that period do
+ * nothing, and only a call made after the deadline reports the release.
+ * Any call that finds the wanted set nonzero or unchanged clears the
+ * grace timer.
+ *
+ * The original compared the deadline the wrong way round: it released
+ * while the deadline was still in the future and waited once it had
+ * passed.
+ * NOTE(review): a later call must arrive after the deadline for the
+ * release to go out; confirm the callers re-invoke this.
+ */
+void Locker::request_inode_file_caps(CInode *in)
+{
+  int wanted = in->get_caps_wanted();
+  if (wanted != in->replica_caps_wanted) {
+
+    if (wanted == 0) {
+      if (in->replica_caps_wanted_keep_until.sec() == 0) {
+        // no timer running yet: start the grace period and hold off.
+        in->replica_caps_wanted_keep_until = g_clock.recent_now();
+        in->replica_caps_wanted_keep_until.sec_ref() += 2;
+
+        dout(7) << "request_inode_file_caps " << cap_string(wanted)
+                << " was " << cap_string(in->replica_caps_wanted)
+                << " keeping until " << in->replica_caps_wanted_keep_until
+                << " on " << *in
+                << endl;
+        return;
+      }
+      else if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) {
+        // deadline still in the future: wait longer
+        return;
+      } else {
+        // deadline passed: ok, release them finally!
+        in->replica_caps_wanted_keep_until.sec_ref() = 0;
+        dout(7) << "request_inode_file_caps " << cap_string(wanted)
+                << " was " << cap_string(in->replica_caps_wanted)
+                << " no keeping anymore "
+                << " on " << *in
+                << endl;
+      }
+    } else {
+      // caps are wanted again; cancel any pending delayed release
+      in->replica_caps_wanted_keep_until.sec_ref() = 0;
+    }
+    assert(!in->is_auth());
+
+    int auth = in->authority();
+    dout(7) << "request_inode_file_caps " << cap_string(wanted)
+            << " was " << cap_string(in->replica_caps_wanted)
+            << " on " << *in << " to mds" << auth << endl;
+
+    // remember what we reported, and report it
+    in->replica_caps_wanted = wanted;
+    mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(),
+                                             in->replica_caps_wanted),
+                          auth, MDS_PORT_LOCKER);
+  } else {
+    // nothing changed; make sure no release timer is left running
+    in->replica_caps_wanted_keep_until.sec_ref() = 0;
+  }
+}
+
+/*
+ * Handle a replica's report of the file caps it wants on an inode.
+ *
+ * The inode must be in our cache and we must be its auth or a proxy for
+ * it.  A proxy just forwards the message to the inode's authority.  The
+ * auth records (or, for an empty cap set, forgets) the sender's wanted
+ * caps, re-evaluates the file lock, and frees the message.
+ */
+void Locker::handle_inode_file_caps(MInodeFileCaps *m)
+{
+  CInode *in = mdcache->get_inode(m->get_ino());
+  assert(in);
+  assert(in->is_auth() || in->is_proxy());
+
+  dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl;
+
+  if (in->is_proxy()) {
+    // not ours anymore: pass it along (the message is handed off, not freed)
+    dout(7) << "proxy, fw" << endl;
+    mds->send_message_mds(m, in->authority(), MDS_PORT_LOCKER);
+    return;
+  }
+
+  if (!m->get_caps()) {
+    // replica wants nothing; drop its entry
+    in->mds_caps_wanted.erase(m->get_from());
+  } else {
+    in->mds_caps_wanted[m->get_from()] = m->get_caps();
+  }
+
+  inode_file_eval(in);
+  delete m;
+}
+
+
+/*
+ * note: we only get these from the client if
+ * - we are calling back previously issued caps (fewer than the client previously had)
+ * - or if the client releases (any of) its caps on its own
+ */
+void Locker::handle_client_file_caps(MClientFileCaps *m)
+{ /*
+ * Process a file-caps message from a client: the client confirms the
+ * caps it now holds and states what it wants.
+ *
+ * Steps, in order:
+ *  - drop the message if we do not have the inode, or the client holds
+ *    no cap on it;
+ *  - update the cap's wanted set (filtered by what the lock could ever
+ *    allow) and confirm receipt of the given seq/caps;
+ *  - if the cap became null, remove it, notify the auth if we are a
+ *    replica, decrement the client's open count and send the client a
+ *    FILECAP_RELEASE message;
+ *  - merge in a newer atime; if the client had or has CAP_FILE_WR, also
+ *    merge a larger mtime/size and journal an EInodeUpdate when either
+ *    changed;
+ *  - re-evaluate the file lock and wake CINODE_WAIT_CAPS waiters.
+ * The message is always freed here.
+ */
+ int client = MSG_ADDR_NUM(m->get_source());
+ CInode *in = mdcache->get_inode(m->get_ino());
+ Capability *cap = 0;
+ if (in)
+ cap = in->get_client_cap(client);
+
+ if (!in || !cap) {
+ if (!in) {
+ dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << endl;
+ } else {
+ dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << endl;
+ }
+ delete m;
+ return;
+ }
+
+ assert(cap);
+
+ // filter wanted based on what we could ever give out (given auth/replica status)
+ int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth());
+
+ dout(7) << "handle_client_file_caps seq " << m->get_seq()
+ << " confirms caps " << cap_string(m->get_caps())
+ << " wants " << cap_string(wanted)
+ << " from client" << client
+ << " on " << *in
+ << endl;
+
+ // update wanted
+ if (cap->wanted() != wanted)
+ cap->set_wanted(wanted);
+
+ // confirm caps
+ int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); // value returned by confirm_receipt; used below as the "previously held" set
+ int has = cap->confirmed(); // confirmed set after this message
+ if (cap->is_null()) {
+ dout(7) << " cap for client" << client << " is now null, removing from " << *in << endl;
+ in->remove_client_cap(client); // 'cap' is not dereferenced again after this point
+ if (!in->is_auth())
+ request_inode_file_caps(in);
+
+ // dec client addr counter
+ mds->clientmap.dec_open(client);
+
+ // tell client.
+ MClientFileCaps *r = new MClientFileCaps(in->inode,
+ 0, 0, 0,
+ MClientFileCaps::FILECAP_RELEASE);
+ mds->messenger->send_message(r, m->get_source(), m->get_source_inst(), 0, MDS_PORT_LOCKER);
+ }
+
+ // merge in atime?
+ if (m->get_inode().atime > in->inode.atime) {
+ dout(7) << " taking atime " << m->get_inode().atime << " > "
+ << in->inode.atime << " for " << *in << endl;
+ in->inode.atime = m->get_inode().atime; // note: atime changes alone are not journaled here
+ }
+
+ if ((has|had) & CAP_FILE_WR) { // only a (current or former) writer may advance mtime/size
+ bool dirty = false;
+
+ // mtime
+ if (m->get_inode().mtime > in->inode.mtime) {
+ dout(7) << " taking mtime " << m->get_inode().mtime << " > "
+ << in->inode.mtime << " for " << *in << endl;
+ in->inode.mtime = m->get_inode().mtime;
+ dirty = true;
+ }
+ // size
+ if (m->get_inode().size > in->inode.size) { // size only ever grows via this path
+ dout(7) << " taking size " << m->get_inode().size << " > "
+ << in->inode.size << " for " << *in << endl;
+ in->inode.size = m->get_inode().size;
+ dirty = true;
+ }
+
+ if (dirty)
+ mds->mdlog->submit_entry(new EInodeUpdate(in));
+ }
+
+ // reevaluate, waiters
+ inode_file_eval(in);
+ in->finish_waiting(CINODE_WAIT_CAPS, 0);
+
+ delete m;
+}
+
+
+
+
+
+
+
+
+
+
+// locks ----------------------------------------------------------------
+
+/*
+
+
+INODES:
+
+= two types of inode metadata:
+ hard - uid/gid, mode
+ file - mtime, size
+ ? atime - atime (*) <-- we want a lazy update strategy?
+
+= correspondingly, two types of inode locks:
+ hardlock - hard metadata
+ filelock - file metadata
+
+ -> These locks are completely orthogonal!
+
+= metadata ops and how they affect inode metadata:
+ sma=size mtime atime
+ HARD FILE OP
+ files:
+ R RRR stat
+ RW chmod/chown
+ R W touch ?ctime
+ R openr
+ W read atime
+ R openw
+ Wc openwc ?ctime
+ WW write size mtime
+ close
+
+ dirs:
+ R W readdir atime
+ RRR ( + implied stats on files)
+ Rc WW mkdir (ctime on new dir, size+mtime on parent dir)
+ R WW link/unlink/rename/rmdir (size+mtime on dir)
+
+
+
+= relationship to client (writers):
+
+ - ops in question are
+ - stat ... need reasonable value for mtime (+ atime?)
+ - maybe we want a "quicksync" type operation instead of full lock
+ - truncate ... need to stop writers for the atomic truncate operation
+ - need a full lock
+
+
+
+
+= modes
+ - SYNC
+ Rauth Rreplica Wauth Wreplica
+ sync
+
+
+
+
+
+ALSO:
+
+ dirlock - no dir changes (prior to unhashing)
+ denlock - dentry lock (prior to unlink, rename)
+
+
+*/
+
+
+/*
+ * Hand an MLock message to the handler for the kind of object it locks
+ * (inode hard metadata, inode file metadata, directory, or dentry).
+ * An unknown object type is logged and trips an assert.
+ */
+void Locker::handle_lock(MLock *m)
+{
+  switch (m->get_otype()) {
+  case LOCK_OTYPE_IHARD:
+    handle_lock_inode_hard(m);
+    return;
+
+  case LOCK_OTYPE_IFILE:
+    handle_lock_inode_file(m);
+    return;
+
+  case LOCK_OTYPE_DIR:
+    handle_lock_dir(m);
+    return;
+
+  case LOCK_OTYPE_DN:
+    handle_lock_dn(m);
+    return;
+
+  default:
+    break;
+  }
+
+  // nothing above matched
+  dout(7) << "handle_lock got otype " << m->get_otype() << endl;
+  assert(0);
+}
+
+
+
+// ===============================
+// hard inode metadata
+
+/*
+ * Check whether the hard metadata of 'in' can be read right now, without
+ * taking a read reference.
+ *
+ * Returns true if readable.  Otherwise (asserted to happen only on a
+ * replica) 'con' is queued on CINODE_WAIT_HARDR and false is returned.
+ */
+bool Locker::inode_hard_read_try(CInode *in, Context *con)
+{
+  dout(7) << "inode_hard_read_try on " << *in << endl;
+
+  const bool readable = in->hardlock.can_read(in->is_auth());
+  if (!readable) {
+    // only a replica should ever find the hard lock unreadable
+    assert(!in->is_auth());
+
+    // wait!
+    dout(7) << "inode_hard_read_try waiting on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_HARDR, con);
+  }
+  return readable;
+}
+
+/*
+ * Begin a read of the hard metadata of 'in' on behalf of request 'm'.
+ *
+ * Returns true with a read reference taken on the hard lock if it is
+ * readable.  Otherwise (asserted to happen only on a replica) the
+ * request is queued for retry on CINODE_WAIT_HARDR and false is returned.
+ */
+bool Locker::inode_hard_read_start(CInode *in, MClientRequest *m)
+{
+  dout(7) << "inode_hard_read_start on " << *in << endl;
+
+  if (!in->hardlock.can_read(in->is_auth())) {
+    // can't read, and replicated.
+    assert(!in->is_auth());
+
+    // wait!
+    dout(7) << "inode_hard_read_start waiting on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_HARDR, new C_MDS_RetryRequest(mds, m, in));
+    return false;
+  }
+
+  // readable: grab ref.
+  in->hardlock.get_read();
+  return true;
+}
+
+
+/*
+ * Finish a read of the hard metadata of 'in': release the read reference
+ * on the hard lock.  The lock is asserted to still be readable.
+ */
+void Locker::inode_hard_read_finish(CInode *in)
+{
+  assert(in->hardlock.can_read(in->is_auth()));
+
+  // drop ref
+  in->hardlock.put_read();
+
+  dout(7) << "inode_hard_read_finish on " << *in << endl;
+
+  // (waking CINODE_WAIT_HARDNORD waiters is currently disabled)
+  //if (in->hardlock.get_nread() == 0) in->finish_waiting(CINODE_WAIT_HARDNORD);
+}
+
+
+/*
+ * Begin a write to the hard metadata of 'in' on behalf of request 'm'.
+ *
+ * Returns true with the inode auth-pinned and a write reference taken on
+ * the hard lock.  Returns false when the request could not proceed now;
+ * in that case it has either been queued for retry (waiting for the
+ * inode to become auth-pinnable, or for the lock to become writable) or,
+ * on a replica, forwarded to the auth MDS.
+ */
+bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m)
+{
+  dout(7) << "inode_hard_write_start on " << *in << endl;
+
+  // if not replicated, i can twiddle lock at will
+  const bool can_twiddle =
+    in->is_auth() &&
+    !in->is_cached_by_anyone() &&
+    in->hardlock.get_state() != LOCK_LOCK;
+  if (can_twiddle) {
+    in->hardlock.set_state(LOCK_LOCK);
+  }
+
+  // can write?
+  if (in->hardlock.can_write(in->is_auth())) {
+    assert(in->is_auth());
+    if (in->can_auth_pin()) {
+      // grab pin and ref.
+      in->auth_pin();  // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_*
+      in->hardlock.get_write();
+      return true;
+    }
+
+    dout(7) << "inode_hard_write_start waiting for authpinnable on " << *in << endl;
+    in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in));
+    return false;
+  }
+
+  // can't write, replicated.
+  if (!in->is_auth()) {
+    // replica: fw to auth
+    int auth = in->authority();
+    dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl;
+    assert(auth != mds->get_nodeid());
+    mdcache->request_forward(m, auth);
+    return false;
+  }
+
+  // auth: initiate the lock unless it is already on its way, then wait
+  if (!in->hardlock.can_write_soon(in->is_auth())) {
+    inode_hard_lock(in);
+  }
+
+  dout(7) << "inode_hard_write_start waiting on " << *in << endl;
+  in->add_waiter(CINODE_WAIT_HARDW, new C_MDS_RetryRequest(mds, m, in));
+  return false;
+}
+
+
+/*
+ * Finish a write to the hard metadata of 'in': release the write
+ * reference and the auth pin taken by inode_hard_write_start.  When the
+ * last writer is gone, an unreplicated auth inode is flipped straight to
+ * SYNC and the hard lock is re-evaluated.
+ */
+void Locker::inode_hard_write_finish(CInode *in)
+{
+  // drop ref
+  assert(in->hardlock.can_write(in->is_auth()));
+  in->hardlock.put_write();
+  in->auth_unpin();
+  dout(7) << "inode_hard_write_finish on " << *in << endl;
+
+  // other writers remain?  then the lock stays as it is.
+  if (in->hardlock.get_nwrite() != 0)
+    return;
+
+  // auto-sync if alone.
+  const bool alone_and_unsynced =
+    in->is_auth() &&
+    !in->is_cached_by_anyone() &&
+    in->hardlock.get_state() != LOCK_SYNC;
+  if (alone_and_unsynced) {
+    in->hardlock.set_state(LOCK_SYNC);
+  }
+
+  inode_hard_eval(in);
+}
+
+
+/*
+ * Re-evaluate the hard lock of 'in'.
+ *
+ *  - [auth] if the lock is unstable and its gather set is empty, finish
+ *    the transition: GLOCKR becomes LOCK and the HARDRWB/HARDSTABLE
+ *    waiters are run while a write reference is held.  Any other
+ *    unstable state trips an assert.
+ *  - if the lock is (now) stable, we are auth, the inode is replicated,
+ *    nobody is writing and the lock is not SYNC, start syncing.
+ */
+void Locker::inode_hard_eval(CInode *in)
+{
+  // finished gather?
+  const bool gather_done =
+    in->is_auth() &&
+    !in->hardlock.is_stable() &&
+    in->hardlock.gather_set.empty();
+  if (gather_done) {
+    dout(7) << "inode_hard_eval finished gather on " << *in << endl;
+    if (in->hardlock.get_state() == LOCK_GLOCKR) {
+      in->hardlock.set_state(LOCK_LOCK);
+
+      // waiters (run with a write ref held)
+      in->hardlock.get_write();
+      in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE);
+      in->hardlock.put_write();
+    } else {
+      assert(0);
+    }
+  }
+
+  // still in transition: nothing more to decide
+  if (!in->hardlock.is_stable())
+    return;
+
+  // replica: nothing to do
+  if (!in->is_auth())
+    return;
+
+  // sync?
+  if (in->is_cached_by_anyone() &&
+      in->hardlock.get_nwrite() == 0 &&
+      in->hardlock.get_state() != LOCK_SYNC) {
+    dout(7) << "inode_hard_eval stable, syncing " << *in << endl;
+    inode_hard_sync(in);
+  }
+}
+
+
+// mid
+
+/*
+ * [auth] Move the hard lock of 'in' from LOCK to SYNC: send every
+ * replica a LOCK_AC_SYNC message carrying the encoded hard state, set
+ * the lock to SYNC and wake CINODE_WAIT_HARDSTABLE waiters.
+ * A no-op if the lock is already SYNC; GLOCKR is not expected here.
+ */
+void Locker::inode_hard_sync(CInode *in)
+{
+  dout(7) << "inode_hard_sync on " << *in << endl;
+  assert(in->is_auth());
+
+  // check state
+  if (in->hardlock.get_state() == LOCK_SYNC) {
+    return;      // already sync
+  }
+  if (in->hardlock.get_state() == LOCK_GLOCKR) {
+    assert(0);   // um... hmm!
+  }
+  assert(in->hardlock.get_state() == LOCK_LOCK);
+
+  // hard data
+  bufferlist harddata;
+  in->encode_hard_state(harddata);
+
+  // bcast to replicas
+  set<int>::iterator replica = in->cached_by_begin();
+  while (replica != in->cached_by_end()) {
+    MLock *sync_msg = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+    sync_msg->set_ino(in->ino(), LOCK_OTYPE_IHARD);
+    sync_msg->set_data(harddata);
+    mds->send_message_mds(sync_msg, *replica, MDS_PORT_LOCKER);
+    ++replica;
+  }
+
+  // change lock
+  in->hardlock.set_state(LOCK_SYNC);
+
+  // waiters?
+  in->finish_waiting(CINODE_WAIT_HARDSTABLE);
+}
+
+/*
+ * [auth] Start moving the hard lock of 'in' from SYNC to LOCK: send
+ * every replica a LOCK_AC_LOCK message, set the lock to GLOCKR and
+ * initialise the gather set from the replica set.
+ * A no-op if the lock is already LOCK or GLOCKR.
+ */
+void Locker::inode_hard_lock(CInode *in)
+{
+  dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl;
+  assert(in->is_auth());
+
+  // check state
+  if (in->hardlock.get_state() == LOCK_LOCK) {
+    return;   // already lock
+  }
+  if (in->hardlock.get_state() == LOCK_GLOCKR) {
+    return;   // already locking
+  }
+  assert(in->hardlock.get_state() == LOCK_SYNC);
+
+  // bcast to replicas
+  set<int>::iterator replica = in->cached_by_begin();
+  while (replica != in->cached_by_end()) {
+    MLock *lock_msg = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+    lock_msg->set_ino(in->ino(), LOCK_OTYPE_IHARD);
+    mds->send_message_mds(lock_msg, *replica, MDS_PORT_LOCKER);
+    ++replica;
+  }
+
+  // change lock, and wait for every replica to ack
+  in->hardlock.set_state(LOCK_GLOCKR);
+  in->hardlock.init_gather(in->get_cached_by());
+}
+
+
+
+
+
+// messenger
+
+void Locker::handle_lock_inode_hard(MLock *m)
+{ /*
+ * Handle an MLock message for an inode's hard lock.
+ *
+ * Messages whose action satisfies LOCK_AC_FOR_AUTH must find the inode
+ * in cache with us as auth or proxy; a proxy forwards the message to
+ * the new authority (or drops it if it came from that authority).
+ * Other messages are for a replica; if the inode is no longer cached
+ * the message is silently dropped (no nak).
+ *
+ * Actions handled:
+ *   LOCK_AC_SYNC    (replica) decode hard state, LOCK -> SYNC, wake readers
+ *   LOCK_AC_LOCK    (replica) SYNC -> LOCK and reply with LOCK_AC_LOCKACK
+ *   LOCK_AC_LOCKACK (auth)    remove sender from the gather set; on the
+ *                             last ack call inode_hard_eval
+ * The message is freed on every path except when it is forwarded or
+ * queued for retry.
+ */
+ assert(m->get_otype() == LOCK_OTYPE_IHARD);
+
+ mds->logger->inc("lih");
+
+ int from = m->get_asker();
+ CInode *in = mdcache->get_inode(m->get_ino());
+
+ if (LOCK_AC_FOR_AUTH(m->get_action())) {
+ // auth
+ assert(in);
+ assert(in->is_auth() || in->is_proxy());
+ dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl;
+
+ if (in->is_proxy()) {
+ // fw
+ int newauth = in->authority();
+ assert(newauth >= 0);
+ if (from == newauth) {
+ dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
+ delete m;
+ } else {
+ dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl;
+ mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); // message handed off, not freed
+ }
+ return;
+ }
+ } else {
+ // replica
+ if (!in) {
+ dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl;
+ /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness
+ to keep gather_set a proper/correct subset of cached_by. better to use the existing
+ cacheexpire mechanism instead!
+ */
+ delete m;
+ return;
+ }
+
+ assert(!in->is_auth());
+ }
+
+ dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl;
+
+ CLock *lock = &in->hardlock;
+
+ switch (m->get_action()) { // note: no default case; an unlisted action falls through to 'delete m'
+ // -- replica --
+ case LOCK_AC_SYNC:
+ assert(lock->get_state() == LOCK_LOCK);
+
+ { // assim data
+ int off = 0;
+ in->decode_hard_state(m->get_data(), off);
+ }
+
+ // update lock
+ lock->set_state(LOCK_SYNC);
+
+ // no need to reply
+
+ // waiters
+ in->finish_waiting(CINODE_WAIT_HARDR|CINODE_WAIT_HARDSTABLE);
+ break;
+
+ case LOCK_AC_LOCK:
+ assert(lock->get_state() == LOCK_SYNC);
+ //|| lock->get_state() == LOCK_GLOCKR);
+
+ // wait for readers to finish?
+ if (lock->get_nread() > 0) {
+ dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl;
+ lock->set_state(LOCK_GLOCKR);
+ in->add_waiter(CINODE_WAIT_HARDNORD,
+ new C_MDS_RetryMessage(mds,m)); // the retry context keeps 'm', so it is not freed here
+ assert(0); // does this ever happen? (if so, fix hard_read_finish, and CInodeExport.update_inode!)
+ return;
+ } else {
+
+ // update lock and reply
+ lock->set_state(LOCK_LOCK);
+
+ {
+ MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+ reply->set_ino(in->ino(), LOCK_OTYPE_IHARD);
+ mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+ }
+ }
+ break;
+
+
+ // -- auth --
+ case LOCK_AC_LOCKACK:
+ assert(lock->state == LOCK_GLOCKR); // reads the state member directly (elsewhere get_state() is used)
+ assert(lock->gather_set.count(from));
+ lock->gather_set.erase(from);
+
+ if (lock->gather_set.size()) {
+ dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+ } else {
+ dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl;
+ inode_hard_eval(in); // gather complete: let eval finish the GLOCKR -> LOCK transition
+ }
+ }
+ delete m;
+}
+
+
+
+
+// =====================
+// soft inode metadata
+
+
+bool Locker::inode_file_read_start(CInode *in, MClientRequest *m)
+{ /*
+ * Begin a read of the file (soft) metadata of 'in' for request 'm'.
+ *
+ * Returns true with a read reference taken on the file lock.  Returns
+ * false when the request cannot proceed now; it has then been queued
+ * for retry on the inode (FILER or FILESTABLE) or, on a replica with a
+ * stable lock, forwarded to the auth MDS.
+ *
+ * On the auth with a stable but unreadable lock, inode_file_lock() is
+ * invoked; if that makes the lock readable immediately the read
+ * proceeds, otherwise the request waits on CINODE_WAIT_FILER.
+ */
+ dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl;
+
+ // can read? grab ref.
+ if (in->filelock.can_read(in->is_auth())) {
+ in->filelock.get_read();
+ return true;
+ }
+
+ // can't read, and replicated.
+ if (in->filelock.can_read_soon(in->is_auth())) {
+ // wait
+ dout(7) << "inode_file_read_start can_read_soon " << *in << endl;
+ } else {
+ if (in->is_auth()) {
+ // auth
+
+ // FIXME or qsync?
+
+ if (in->filelock.is_stable()) {
+ inode_file_lock(in); // lock, bc easiest to back off
+
+ if (in->filelock.can_read(in->is_auth())) {
+ in->filelock.get_read();
+
+ in->filelock.get_write(); // hold a write ref while waiters run
+ in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
+ in->filelock.put_write();
+ return true;
+ }
+ // else: not readable yet; falls through to the FILER wait at the bottom
+ } else {
+ dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl;
+ in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
+ return false;
+ }
+ } else {
+ // replica
+ if (in->filelock.is_stable()) {
+
+ // fw to auth
+ int auth = in->authority();
+ dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl;
+ assert(auth != mds->get_nodeid());
+ mdcache->request_forward(m, auth);
+ return false;
+
+ } else {
+ // wait until stable
+ dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl;
+ in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
+ return false;
+ }
+ }
+ }
+
+ // wait
+ dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl;
+ in->add_waiter(CINODE_WAIT_FILER, new C_MDS_RetryRequest(mds, m, in));
+
+ return false;
+}
+
+
+/*
+ * Finish a read of the file metadata of 'in': release the read
+ * reference.  When the last reader is gone, wake CINODE_WAIT_FILENORD
+ * waiters and re-evaluate the file lock.
+ */
+void Locker::inode_file_read_finish(CInode *in)
+{
+  // drop ref
+  assert(in->filelock.can_read(in->is_auth()));
+  in->filelock.put_read();
+
+  dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl;
+
+  // readers remain?  nothing more to do.
+  if (in->filelock.get_nread() != 0)
+    return;
+
+  in->finish_waiting(CINODE_WAIT_FILENORD);
+  inode_file_eval(in);
+}
+
+
+bool Locker::inode_file_write_start(CInode *in, MClientRequest *m)
+{ /*
+ * Begin a write to the file (soft) metadata of 'in' for request 'm'.
+ *
+ * Returns true with a write reference taken on the file lock.  Returns
+ * false when the request cannot proceed now; it has then been queued
+ * for retry on the inode (FILEW or FILESTABLE) or, on a replica,
+ * forwarded to the auth MDS.
+ */
+ // can write? grab ref.
+ if (in->filelock.can_write(in->is_auth())) {
+ in->filelock.get_write();
+ return true;
+ }
+
+ // can't write, replicated.
+ if (in->is_auth()) {
+ // auth
+ if (in->filelock.can_write_soon(in->is_auth())) {
+ // just wait
+ } else {
+ if (!in->filelock.is_stable()) {
+ dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl;
+ in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
+ return false;
+ }
+
+ // initiate lock
+ inode_file_lock(in);
+
+ if (in->filelock.can_write(in->is_auth())) { // the lock call made it writable immediately
+ in->filelock.get_write();
+
+ in->filelock.get_read(); // hold a read ref while waiters run
+ in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
+ in->filelock.put_read();
+ return true;
+ }
+ }
+
+ dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl;
+ in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in));
+ return false;
+ } else {
+ // replica
+ // fw to auth
+ int auth = in->authority();
+ dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl;
+ assert(auth != mds->get_nodeid());
+ mdcache->request_forward(m, auth);
+ return false;
+ }
+}
+
+
+/*
+ * Finish a write to the file metadata of 'in': release the write
+ * reference.  When the last writer is gone, wake CINODE_WAIT_FILENOWR
+ * waiters and re-evaluate the file lock.
+ */
+void Locker::inode_file_write_finish(CInode *in)
+{
+  // drop ref
+  assert(in->filelock.can_write(in->is_auth()));
+  in->filelock.put_write();
+  dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl;
+
+  // writers remain?  keep the lock as it is.
+  if (in->filelock.get_nwrite() != 0)
+    return;
+
+  // drop lock?
+  in->finish_waiting(CINODE_WAIT_FILENOWR);
+  inode_file_eval(in);
+}
+
+
+/*
+ * ...
+ *
+ * also called after client caps are acked to us
+ * - checks if we're in unstable soft state and can now move on to next state
+ * - checks if soft state should change (eg bc last writer closed)
+ */
+
+void Locker::inode_file_eval(CInode *in)
+{ /*
+ * Re-evaluate the file lock of 'in'.  Three phases:
+ *
+ *  1. [auth] lock unstable and gather set empty: if the client caps
+ *     still issued permit it, complete the pending transition
+ *     (to LOCK, MIXED, LONER or SYNC), notify replicas where the
+ *     branch does so, wake waiters, then re-issue caps.
+ *  2. [replica] lock unstable: once client caps have been called back
+ *     far enough, complete GMIXEDR -> MIXED or GLOCKR -> LOCK and ack
+ *     to the authority.
+ *  3. lock stable and we are auth: pick a better state based on what
+ *     caps are wanted, whether a single client is the only interested
+ *     party ("loner"), and whether the inode is replicated.
+ *
+ * Nothing happens in phase 3 while the lock is still unstable.
+ */
+ int issued = in->get_caps_issued(); // sampled once; not refreshed after state changes below
+
+ // [auth] finished gather?
+ if (in->is_auth() &&
+ !in->filelock.is_stable() &&
+ in->filelock.gather_set.size() == 0) {
+ dout(7) << "inode_file_eval finished mds gather on " << *in << endl;
+
+ switch (in->filelock.get_state()) {
+ // to lock
+ case LOCK_GLOCKR:
+ case LOCK_GLOCKM:
+ case LOCK_GLOCKL:
+ if (issued == 0) { // every client cap has been called back
+ in->filelock.set_state(LOCK_LOCK);
+
+ // waiters
+ in->filelock.get_read(); // hold read+write refs while waiters run
+ in->filelock.get_write();
+ in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
+ in->filelock.put_read();
+ in->filelock.put_write();
+ }
+ break;
+
+ // to mixed
+ case LOCK_GMIXEDR:
+ if ((issued & ~(CAP_FILE_RD)) == 0) { // nothing beyond RD is still out
+ in->filelock.set_state(LOCK_MIXED);
+ in->finish_waiting(CINODE_WAIT_FILESTABLE);
+ }
+ break;
+
+ case LOCK_GMIXEDL:
+ if ((issued & ~(CAP_FILE_WR)) == 0) { // nothing beyond WR is still out
+ in->filelock.set_state(LOCK_MIXED);
+
+ if (in->is_cached_by_anyone()) {
+ // data
+ bufferlist softdata;
+ in->encode_file_state(softdata);
+
+ // bcast to replicas
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+ m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ m->set_data(softdata);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+ }
+
+ in->finish_waiting(CINODE_WAIT_FILESTABLE);
+ }
+ break;
+
+ // to loner
+ case LOCK_GLONERR:
+ if (issued == 0) {
+ in->filelock.set_state(LOCK_LONER);
+ in->finish_waiting(CINODE_WAIT_FILESTABLE);
+ }
+ break;
+
+ case LOCK_GLONERM:
+ if ((issued & ~CAP_FILE_WR) == 0) {
+ in->filelock.set_state(LOCK_LONER);
+ in->finish_waiting(CINODE_WAIT_FILESTABLE);
+ }
+ break;
+
+ // to sync
+ case LOCK_GSYNCL:
+ case LOCK_GSYNCM:
+ if ((issued & ~(CAP_FILE_RD)) == 0) {
+ in->filelock.set_state(LOCK_SYNC);
+
+ { // bcast data to replicas
+ bufferlist softdata;
+ in->encode_file_state(softdata);
+
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+ reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ reply->set_data(softdata);
+ mds->send_message_mds(reply, *it, MDS_PORT_LOCKER);
+ }
+ }
+
+ // waiters
+ in->filelock.get_read(); // hold a read ref while waiters run
+ in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE);
+ in->filelock.put_read();
+ }
+ break;
+
+ default:
+ assert(0);
+ }
+
+ issue_caps(in); // runs whether or not the transition completed above
+ }
+
+ // [replica] finished caps gather?
+ if (!in->is_auth() &&
+ !in->filelock.is_stable()) {
+ switch (in->filelock.get_state()) {
+ case LOCK_GMIXEDR:
+ if ((issued & ~(CAP_FILE_RD)) == 0) {
+ in->filelock.set_state(LOCK_MIXED);
+
+ // ack
+ MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid());
+ reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER);
+ }
+ break;
+
+ case LOCK_GLOCKR:
+ if (issued == 0) {
+ in->filelock.set_state(LOCK_LOCK);
+
+ // ack
+ MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+ reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER);
+ }
+ break;
+
+ default:
+ assert(0);
+ }
+ }
+
+ // !stable -> do nothing.
+ if (!in->filelock.is_stable()) return;
+
+
+ // stable.
+ assert(in->filelock.is_stable());
+
+ if (in->is_auth()) {
+ // [auth]
+ int wanted = in->get_caps_wanted();
+ bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); // exactly one client cap and no replica wants caps
+ dout(7) << "inode_file_eval wanted=" << cap_string(wanted)
+ << " filelock=" << in->filelock
+ << " loner=" << loner
+ << endl;
+
+ // * -> loner?
+ if (in->filelock.get_nread() == 0 &&
+ in->filelock.get_nwrite() == 0 &&
+ (wanted & CAP_FILE_WR) &&
+ loner &&
+ in->filelock.get_state() != LOCK_LONER) {
+ dout(7) << "inode_file_eval stable, bump to loner " << *in << ", filelock=" << in->filelock << endl;
+ inode_file_loner(in);
+ }
+
+ // * -> mixed?
+ else if (in->filelock.get_nread() == 0 &&
+ in->filelock.get_nwrite() == 0 &&
+ (wanted & CAP_FILE_RD) &&
+ (wanted & CAP_FILE_WR) &&
+ !(loner && in->filelock.get_state() == LOCK_LONER) &&
+ in->filelock.get_state() != LOCK_MIXED) {
+ dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl;
+ inode_file_mixed(in);
+ }
+
+ // * -> sync?
+ else if (in->filelock.get_nwrite() == 0 &&
+ !(wanted & CAP_FILE_WR) &&
+ ((wanted & CAP_FILE_RD) ||
+ in->is_cached_by_anyone() ||
+ (!loner && in->filelock.get_state() == LOCK_LONER)) &&
+ in->filelock.get_state() != LOCK_SYNC) {
+ dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl;
+ inode_file_sync(in);
+ }
+
+ // * -> lock? (if not replicated or open)
+ else if (!in->is_cached_by_anyone() &&
+ wanted == 0 &&
+ in->filelock.get_state() != LOCK_LOCK) {
+ inode_file_lock(in);
+ }
+
+ } else {
+ // replica
+ // recall? check waiters? XXX
+ }
+}
+
+
+// mid
+
+bool Locker::inode_file_sync(CInode *in)
+{ /*
+ * [auth] Move the file lock of 'in' towards SYNC.
+ *
+ *  - already SYNC or on the way (GSYNCL/GSYNCM): nothing to do, true.
+ *  - LOCK:  send replicas LOCK_AC_SYNC with the encoded file state,
+ *           set SYNC, re-issue caps, return true.
+ *  - MIXED / LONER: if a client still has CAP_FILE_WR issued, enter
+ *           GSYNCM / GSYNCL and call issue_caps; otherwise send
+ *           replicas LOCK_AC_SYNC and set SYNC directly.  Both cases
+ *           return false.
+ *
+ * The lock must be stable and no one may want CAP_FILE_WR (asserted).
+ *
+ * NOTE(review): the MIXED/LONER "straight to sync" branches send
+ * LOCK_AC_SYNC without set_data(), do not call issue_caps(), and return
+ * false although the lock is now SYNC -- unlike the LOCK branch.
+ * Confirm this is intended.
+ */
+ dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl;
+
+ assert(in->is_auth());
+
+ // check state
+ if (in->filelock.get_state() == LOCK_SYNC ||
+ in->filelock.get_state() == LOCK_GSYNCL ||
+ in->filelock.get_state() == LOCK_GSYNCM)
+ return true;
+
+ assert(in->filelock.is_stable());
+
+ int issued = in->get_caps_issued();
+
+ assert((in->get_caps_wanted() & CAP_FILE_WR) == 0);
+
+ if (in->filelock.get_state() == LOCK_LOCK) {
+ if (in->is_cached_by_anyone()) {
+ // soft data
+ bufferlist softdata;
+ in->encode_file_state(softdata);
+
+ // bcast to replicas
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+ m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ m->set_data(softdata);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+ }
+
+ // change lock
+ in->filelock.set_state(LOCK_SYNC);
+
+ // reissue caps
+ issue_caps(in);
+ return true;
+ }
+
+ else if (in->filelock.get_state() == LOCK_MIXED) {
+ // writers?
+ if (issued & CAP_FILE_WR) {
+ // gather client write caps
+ in->filelock.set_state(LOCK_GSYNCM);
+ issue_caps(in);
+ } else {
+ // no writers, go straight to sync
+
+ if (in->is_cached_by_anyone()) {
+ // bcast to replicas
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); // no file state attached here
+ m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+ }
+
+ // change lock
+ in->filelock.set_state(LOCK_SYNC);
+ }
+ return false;
+ }
+
+ else if (in->filelock.get_state() == LOCK_LONER) {
+ // writers?
+ if (issued & CAP_FILE_WR) {
+ // gather client write caps
+ in->filelock.set_state(LOCK_GSYNCL);
+ issue_caps(in);
+ } else {
+ // no writers, go straight to sync
+ if (in->is_cached_by_anyone()) {
+ // bcast to replicas
+ for (set<int>::iterator it = in->cached_by_begin();
+ it != in->cached_by_end();
+ it++) {
+ MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); // no file state attached here
+ m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+ mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+ }
+ }
+
+ // change lock
+ in->filelock.set_state(LOCK_SYNC);
+ }
+ return false;
+ }
+ else
+ assert(0); // wtf.
+
+ return false;
+}
+
+
+/*
+ * inode_file_lock
+ *
+ * Drive the file lock of an inode we are auth for toward LOCK_LOCK.
+ * Depending on the current state this either locks immediately or enters
+ * one of the gathering states (GLOCKR / GLOCKM / GLOCKL) after asking
+ * replicas to lock and/or re-issuing client caps.
+ */
+void Locker::inode_file_lock(CInode *in)
+{
+  dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl;
+
+  assert(in->is_auth());
+
+  CLock *lock = &in->filelock;
+
+  // lock or locking?
+  if (lock->get_state() == LOCK_LOCK ||
+      lock->get_state() == LOCK_GLOCKR ||
+      lock->get_state() == LOCK_GLOCKM ||
+      lock->get_state() == LOCK_GLOCKL)
+    return;
+
+  assert(lock->is_stable());
+
+  const int issued = in->get_caps_issued();
+
+  // -- LONER -> LOCK: replicas are not consulted --
+  if (lock->get_state() == LOCK_LONER) {
+    if (issued & CAP_FILE_WR) {
+      lock->set_state(LOCK_GLOCKL);
+      issue_caps(in);   // call back caps
+    } else {
+      lock->set_state(LOCK_LOCK);
+    }
+    return;
+  }
+
+  // -- SYNC|MIXED -> LOCK --
+  const bool from_sync = (lock->get_state() == LOCK_SYNC);
+  if (!from_sync && lock->get_state() != LOCK_MIXED) {
+    assert(0);  // wtf.
+    return;
+  }
+
+  if (in->is_cached_by_anyone()) {
+    // ask every replica to lock, and wait for their acks
+    for (set<int>::iterator peer = in->cached_by_begin();
+         peer != in->cached_by_end();
+         ++peer) {
+      MLock *msg = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+      msg->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+      mds->send_message_mds(msg, *peer, MDS_PORT_LOCKER);
+    }
+    lock->init_gather(in->get_cached_by());
+
+    if (from_sync) {
+      lock->set_state(LOCK_GLOCKR);
+      if (issued)
+        issue_caps(in);   // call back caps
+    } else {
+      lock->set_state(LOCK_GLOCKM);
+      issue_caps(in);     // call back caps (unconditionally from MIXED)
+    }
+    return;
+  }
+
+  // no replicas
+  if (!issued) {
+    // nothing to gather at all.  (from MIXED: //assert(issued); ??? -sage 2/19/06)
+    lock->set_state(LOCK_LOCK);
+    return;
+  }
+
+  if (from_sync)
+    lock->set_state(LOCK_GLOCKR);
+  else
+    lock->set_state(LOCK_GLOCKM);
+  issue_caps(in);   // call back caps
+}
+
+
+/*
+ * inode_file_mixed
+ *
+ * Drive the file lock of an inode we are auth for toward LOCK_MIXED.
+ *
+ * A lock that is already MIXED, or already gathering toward MIXED
+ * (GMIXEDR / GMIXEDL), is left alone.  Previously an already-MIXED lock
+ * fell through to assert(0), unlike the sibling sync/lock/loner triggers
+ * which all return early when their target state is already reached.
+ */
+void Locker::inode_file_mixed(CInode *in)
+{
+  dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl;
+
+  assert(in->is_auth());
+
+  CLock *lock = &in->filelock;
+
+  // check state
+  if (lock->get_state() == LOCK_MIXED ||
+      lock->get_state() == LOCK_GMIXEDR ||
+      lock->get_state() == LOCK_GMIXEDL)
+    return;     // mixed or mixing
+
+  assert(lock->is_stable());
+
+  int issued = in->get_caps_issued();
+
+  if (lock->get_state() == LOCK_SYNC) {
+    if (in->is_cached_by_anyone()) {
+      // bcast to replicas, then gather their acks
+      for (set<int>::iterator it = in->cached_by_begin();
+           it != in->cached_by_end();
+           ++it) {
+        MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+      lock->init_gather(in->get_cached_by());
+
+      lock->set_state(LOCK_GMIXEDR);
+      issue_caps(in);
+    } else {
+      if (issued) {
+        // only client caps to call back
+        lock->set_state(LOCK_GMIXEDR);
+        issue_caps(in);
+      } else {
+        lock->set_state(LOCK_MIXED);
+      }
+    }
+  }
+
+  else if (lock->get_state() == LOCK_LOCK) {
+    if (in->is_cached_by_anyone()) {
+      // data: replicas get the current soft file state with the message
+      bufferlist softdata;
+      in->encode_file_state(softdata);
+
+      // bcast to replicas
+      for (set<int>::iterator it = in->cached_by_begin();
+           it != in->cached_by_end();
+           ++it) {
+        MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        m->set_data(softdata);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+    }
+
+    // change lock
+    lock->set_state(LOCK_MIXED);
+    issue_caps(in);
+  }
+
+  else if (lock->get_state() == LOCK_LONER) {
+    if (issued & CAP_FILE_WRBUFFER) {
+      // gather up WRBUFFER caps
+      lock->set_state(LOCK_GMIXEDL);
+      issue_caps(in);
+    }
+    else if (in->is_cached_by_anyone()) {
+      // bcast to replicas (no gather on this path)
+      for (set<int>::iterator it = in->cached_by_begin();
+           it != in->cached_by_end();
+           ++it) {
+        MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+      lock->set_state(LOCK_MIXED);
+      issue_caps(in);
+    } else {
+      lock->set_state(LOCK_MIXED);
+      issue_caps(in);
+    }
+  }
+
+  else
+    assert(0); // wtf.
+}
+
+
+/*
+ * inode_file_loner
+ *
+ * Drive the file lock of an inode we are auth for toward LOCK_LONER.
+ * Asserts that exactly one client holds caps and that no other mds wants
+ * caps.  With replicas present (from SYNC or MIXED) they are first asked
+ * to LOCK and we gather their acks in GLONERR / GLONERM.
+ */
+void Locker::inode_file_loner(CInode *in)
+{
+  dout(7) << "inode_file_loner " << *in << " filelock=" << in->filelock << endl;
+
+  assert(in->is_auth());
+
+  CLock *lock = &in->filelock;
+
+  // loner, or on the way there?
+  if (lock->get_state() == LOCK_LONER ||
+      lock->get_state() == LOCK_GLONERR ||
+      lock->get_state() == LOCK_GLONERM)
+    return;
+
+  assert(lock->is_stable());
+  assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty());
+
+  // -- LOCK -> LONER --
+  if (lock->get_state() == LOCK_LOCK) {
+    // change lock.  ignore replicas; they don't know about LONER.
+    lock->set_state(LOCK_LONER);
+    issue_caps(in);
+    return;
+  }
+
+  // -- SYNC|MIXED -> LONER --
+  const bool from_sync = (lock->get_state() == LOCK_SYNC);
+  if (!from_sync && lock->get_state() != LOCK_MIXED) {
+    assert(0);
+    return;
+  }
+
+  if (!in->is_cached_by_anyone()) {
+    // only one guy with file open, who gets it all, so
+    lock->set_state(LOCK_LONER);
+    issue_caps(in);
+    return;
+  }
+
+  // replicas must lock first; gather their acks
+  for (set<int>::iterator peer = in->cached_by_begin();
+       peer != in->cached_by_end();
+       ++peer) {
+    MLock *msg = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+    msg->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+    mds->send_message_mds(msg, *peer, MDS_PORT_LOCKER);
+  }
+  lock->init_gather(in->get_cached_by());
+
+  if (from_sync)
+    lock->set_state(LOCK_GLONERR);
+  else
+    lock->set_state(LOCK_GLONERM);
+}
+
+// messenger
+
+/*
+ * handle_lock_inode_file
+ *
+ * Process an MLock message about an inode's file lock.
+ *
+ * Actions addressed to the auth (LOCK_AC_FOR_AUTH) require that we have the
+ * inode and are auth or proxy for it; a proxy forwards the message to the
+ * new auth (or drops it if it came from there).  Actions addressed to a
+ * replica are silently dropped if we no longer have the inode.
+ *
+ * The message is deleted before returning unless it was forwarded or
+ * handed to a retry context.
+ */
+void Locker::handle_lock_inode_file(MLock *m)
+{
+  assert(m->get_otype() == LOCK_OTYPE_IFILE);
+
+  mds->logger->inc("lif");
+
+  CInode *in = mdcache->get_inode(m->get_ino());
+  int from = m->get_asker();
+
+  if (LOCK_AC_FOR_AUTH(m->get_action())) {
+    // auth
+    assert(in);
+    assert(in->is_auth() || in->is_proxy());
+    // (this used to print the hardlock, a leftover from the hard lock handler)
+    dout(7) << "handle_lock_inode_file " << *in << " filelock=" << in->filelock << endl;
+
+    if (in->is_proxy()) {
+      // fw
+      int newauth = in->authority();
+      assert(newauth >= 0);
+      if (from == newauth) {
+        dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
+        delete m;
+      } else {
+        dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl;
+        mds->send_message_mds(m, newauth, MDS_PORT_LOCKER);
+      }
+      return;
+    }
+  } else {
+    // replica
+    if (!in) {
+      // drop it.  don't nak.
+      dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl;
+      delete m;
+      return;
+    }
+
+    assert(!in->is_auth());
+  }
+
+  dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl;
+
+  CLock *lock = &in->filelock;
+  int issued = in->get_caps_issued();
+
+  switch (m->get_action()) {
+    // -- replica --
+  case LOCK_AC_SYNC:
+    assert(lock->get_state() == LOCK_LOCK ||
+           lock->get_state() == LOCK_MIXED);
+
+    { // assim data
+      // NOTE(review): the auth only attaches data on its LOCK->SYNC path;
+      // confirm decode_file_state copes with an empty payload.
+      int off = 0;
+      in->decode_file_state(m->get_data(), off);
+    }
+
+    // update lock
+    lock->set_state(LOCK_SYNC);
+
+    // no need to reply.
+
+    // waiters (hold a read ref across the callbacks)
+    in->filelock.get_read();
+    in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE);
+    in->filelock.put_read();
+    inode_file_eval(in);
+    break;
+
+  case LOCK_AC_LOCK:
+    assert(lock->get_state() == LOCK_SYNC ||
+           lock->get_state() == LOCK_MIXED);
+
+    // call back caps?
+    if (issued & CAP_FILE_RD) {
+      dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl;
+      issue_caps(in);
+    }
+    if (lock->get_nread() > 0) {
+      dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl;
+      in->add_waiter(CINODE_WAIT_FILENORD,
+                     new C_MDS_RetryMessage(mds,m));
+      lock->set_state(LOCK_GLOCKR);
+      assert(0);// i am broken.. why retry message when state captures all the info i need?
+      return;
+    }
+    if (issued & CAP_FILE_RD) {
+      // ack is deferred until the client caps come back
+      lock->set_state(LOCK_GLOCKR);
+      break;
+    }
+
+    // nothing to wait for, lock and ack.
+    {
+      lock->set_state(LOCK_LOCK);
+
+      MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+      reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+      mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+    }
+    break;
+
+  case LOCK_AC_MIXED:
+    assert(lock->get_state() == LOCK_SYNC ||
+           lock->get_state() == LOCK_LOCK);
+
+    if (lock->get_state() == LOCK_SYNC) {
+      // MIXED
+      if (issued & CAP_FILE_RD) {
+        // call back client caps
+        lock->set_state(LOCK_GMIXEDR);
+        issue_caps(in);
+        break;
+      } else {
+        // no clients, go straight to mixed
+        lock->set_state(LOCK_MIXED);
+
+        // ack
+        MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid());
+        reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+      }
+    } else {
+      // LOCK
+      lock->set_state(LOCK_MIXED);
+
+      // no ack needed.
+    }
+
+    issue_caps(in);
+
+    // waiters (hold a write ref across the callbacks)
+    in->filelock.get_write();
+    in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE);
+    in->filelock.put_write();
+    inode_file_eval(in);
+    break;
+
+
+    // -- auth --
+  case LOCK_AC_LOCKACK:
+    assert(lock->state == LOCK_GLOCKR ||
+           lock->state == LOCK_GLOCKM ||
+           lock->state == LOCK_GLONERM ||
+           lock->state == LOCK_GLONERR);
+    assert(lock->gather_set.count(from));
+    lock->gather_set.erase(from);
+
+    if (lock->gather_set.size()) {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+    } else {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
+      inode_file_eval(in);
+    }
+    break;
+
+  case LOCK_AC_SYNCACK:
+    assert(lock->state == LOCK_GSYNCM);
+    assert(lock->gather_set.count(from));
+    lock->gather_set.erase(from);
+
+    /* not used currently
+    {
+      // merge data  (keep largest size, mtime, etc.)
+      int off = 0;
+      in->decode_merge_file_state(m->get_data(), off);
+    }
+    */
+
+    if (lock->gather_set.size()) {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+    } else {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
+      inode_file_eval(in);
+    }
+    break;
+
+  case LOCK_AC_MIXEDACK:
+    assert(lock->state == LOCK_GMIXEDR);
+    assert(lock->gather_set.count(from));
+    lock->gather_set.erase(from);
+
+    if (lock->gather_set.size()) {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
+    } else {
+      dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
+      inode_file_eval(in);
+    }
+    break;
+
+
+  default:
+    assert(0);
+  }
+
+  delete m;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Handler for lock messages about a directory.  Currently an empty stub:
+// the message is neither inspected nor freed here.
+// NOTE(review): the other handle_lock_* handlers in this file delete or
+// forward m; confirm whether this stub is ever reached, and who owns m if so.
+void Locker::handle_lock_dir(MLock *m)
+{
+
+}
+
+
+
+// DENTRY
+
+/*
+ * dentry_xlock_start
+ *
+ * Try to exclusively lock dentry dn on behalf of request m.
+ *
+ * Returns true once dn is xlocked by m.  Returns false if we have to wait;
+ * in that case a C_MDS_RetryRequest(mds, m, ref) has been queued (directly
+ * as a dir waiter, or via path_pin) so the request is retried later.
+ */
+bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref)
+{
+  dout(7) << "dentry_xlock_start on " << *dn << endl;
+
+  CDir *pdir = dn->dir;
+
+  // locked?
+  if (dn->lockstate == DN_LOCK_XLOCK) {
+    if (dn->xlockedby == m)
+      return true;  // locked by me!
+
+    // not by me, wait
+    dout(7) << "dentry " << *dn << " xlock by someone else" << endl;
+    pdir->add_waiter(CDIR_WAIT_DNREAD, dn->name,
+                     new C_MDS_RetryRequest(mds,m,ref));
+    return false;
+  }
+
+  // prelock?
+  if (dn->lockstate == DN_LOCK_PREXLOCK) {
+    if (dn->xlockedby != m) {
+      dout(7) << "dentry " << *dn << " prexlock by someone else" << endl;
+      pdir->add_waiter(CDIR_WAIT_DNREAD, dn->name,
+                       new C_MDS_RetryRequest(mds,m,ref));
+    } else {
+      dout(7) << "dentry " << *dn << " prexlock by me" << endl;
+      pdir->add_waiter(CDIR_WAIT_DNLOCK, dn->name,
+                       new C_MDS_RetryRequest(mds,m,ref));
+    }
+    return false;
+  }
+
+  // lockable!
+  assert(dn->lockstate == DN_LOCK_SYNC ||
+         dn->lockstate == DN_LOCK_UNPINNING);
+
+  // dir auth pinnable?
+  if (!pdir->can_auth_pin()) {
+    dout(7) << "dentry " << *dn << " dir not pinnable, waiting" << endl;
+    pdir->add_waiter(CDIR_WAIT_AUTHPINNABLE,
+                     new C_MDS_RetryRequest(mds,m,ref));
+    return false;
+  }
+
+  // is dentry path pinned?
+  if (dn->is_pinned()) {
+    dout(7) << "dentry " << *dn << " pinned, waiting" << endl;
+    dn->lockstate = DN_LOCK_UNPINNING;
+    pdir->add_waiter(CDIR_WAIT_DNUNPINNED,
+                     dn->name,
+                     new C_MDS_RetryRequest(mds,m,ref));
+    return false;
+  }
+
+  // pin path up to dentry!            (if success, point of no return)
+  CDentry *parent_dn = pdir->inode->get_parent_dn();
+  if (parent_dn) {
+    if (!mdcache->active_requests[m].traces.count(parent_dn)) {
+      dout(7) << "pinning parent dentry " << *parent_dn << endl;
+      vector<CDentry*> trace;
+      mdcache->make_trace(trace, parent_dn->inode);
+      assert(trace.size());
+
+      if (!mdcache->path_pin(trace, m, new C_MDS_RetryRequest(mds, m, ref)))
+        return false;
+
+      // remember the pinned trace, keyed by its last dentry
+      mdcache->active_requests[m].traces[trace.back()] = trace;
+    } else {
+      dout(7) << "already path pinned parent dentry " << *parent_dn << endl;
+    }
+  }
+
+  // pin dir!
+  pdir->auth_pin();
+
+  // mine!
+  dn->xlockedby = m;
+
+  if (!pdir->is_open_by_anyone()) {
+    // no replicas to ask; xlock right away
+    dn->lockstate = DN_LOCK_XLOCK;
+    mdcache->active_requests[dn->xlockedby].xlocks.insert(dn);
+    return true;
+  }
+
+  dn->lockstate = DN_LOCK_PREXLOCK;
+
+  // xlock with whom?
+  set<int> who = pdir->get_open_by();
+  dn->gather_set = who;
+
+  // make path
+  string path;
+  dn->make_path(path);
+  dout(10) << "path is " << path << " for " << *dn << endl;
+
+  for (set<int>::iterator peer = who.begin();
+       peer != who.end();
+       ++peer) {
+    MLock *lockmsg = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
+    lockmsg->set_dn(pdir->ino(), dn->name);
+    lockmsg->set_path(path);
+    mds->send_message_mds(lockmsg, *peer, MDS_PORT_LOCKER);
+  }
+
+  // wait
+  dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl;
+  pdir->add_waiter(CDIR_WAIT_DNLOCK, dn->name,
+                   new C_MDS_RetryRequest(mds, m, ref));
+  return false;
+}
+
+/*
+ * dentry_xlock_finish
+ *
+ * Drop the xlock on dn: forget it in the owning request's record (unless
+ * the xlock is foreign), put the dentry back in SYNC, tell replicas of the
+ * dir (unless quiet), and drop the auth pin on the dir.
+ */
+void Locker::dentry_xlock_finish(CDentry *dn, bool quiet)
+{
+  dout(7) << "dentry_xlock_finish on " << *dn << endl;
+
+  assert(dn->xlockedby);
+  if (dn->xlockedby != DN_XLOCK_FOREIGN) {
+    // remove from request record
+    assert(mdcache->active_requests[dn->xlockedby].xlocks.count(dn) == 1);
+    mdcache->active_requests[dn->xlockedby].xlocks.erase(dn);
+  } else {
+    dout(7) << "this was a foreign xlock" << endl;
+  }
+
+  dn->xlockedby = 0;
+  dn->lockstate = DN_LOCK_SYNC;
+
+  // unpin parent dir?
+  // -> no?  because we might have xlocked 2 things in this dir.
+  //         instead, we let request_finish clean up the mess.
+
+  // tell replicas?  (tell even if dn is null.)
+  CDir *pdir = dn->dir;
+  if (!quiet && pdir->is_open_by_anyone()) {
+    for (set<int>::iterator peer = pdir->open_by_begin();
+         peer != pdir->open_by_end();
+         ++peer) {
+      MLock *syncmsg = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
+      syncmsg->set_dn(pdir->ino(), dn->name);
+      mds->send_message_mds(syncmsg, *peer, MDS_PORT_LOCKER);
+    }
+  }
+
+  // unpin dir
+  pdir->auth_unpin();
+}
+
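+/*
+ * remote xlock requests.
+ *
+ * the C_MDC_XlockRequest waiter is finished with
+ *    1 when the auth acks the request (REQXLOCKACK),
+ *   -1 when it naks it (REQXLOCKNAK).
+ * the caller's onfinish is then always called with finish(0); on an ack,
+ * dn->xlockedby is set to the request if the dentry is present and not
+ * already xlocked.
+ */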
+
+// Waiter queued by dentry_xlock_request(): captures the arguments of the
+// remote xlock request and hands them, together with the result code, to
+// Locker::dentry_xlock_request_finish().
+class C_MDC_XlockRequest : public Context {
+  Locker *mdc;        // locker to call back into
+  CDir *dir;          // dir holding the dentry
+  string dname;       // dentry name (copied)
+  Message *req;       // request the xlock is taken for
+  Context *finisher;  // caller's completion, passed through
+public:
+  C_MDC_XlockRequest(Locker *l,
+                     CDir *d, string& dn,
+                     Message *r,
+                     Context *fin)
+    : mdc(l), dir(d), dname(dn), req(r), finisher(fin) {}
+
+  void finish(int r) {
+    mdc->dentry_xlock_request_finish(r, dir, dname, req, finisher);
+  }
+};
+
+// Completion of a remote xlock request.  r == 1 means the request was
+// acked: if the dentry is present and not xlocked, it becomes xlocked by
+// req and is recorded as a foreign xlock of that request.  In all cases
+// finisher is called with 0 and then deleted.
+void Locker::dentry_xlock_request_finish(int r,
+                                         CDir *dir, string& dname,
+                                         Message *req,
+                                         Context *finisher)
+{
+  dout(10) << "dentry_xlock_request_finish r = " << r << endl;
+
+  // 1 for xlock request success; only then do we look at the dentry
+  const bool acked = (r == 1);
+  CDentry *dn = acked ? dir->lookup(dname) : 0;
+
+  if (dn && dn->xlockedby == 0) {
+    // success: our request was the winner
+    dn->xlockedby = req;
+    dout(10) << "xlock request success, now xlocked by req " << req << " dn " << *dn << endl;
+
+    // remember!
+    mdcache->active_requests[req].foreign_xlocks.insert(dn);
+  }
+
+  // retry request (or whatever)
+  finisher->finish(0);
+  delete finisher;
+}
+
+// Ask the mds that is auth for dname in dir to xlock that dentry for us
+// (REQXLOCKC if create is set, REQXLOCK otherwise), and queue a
+// C_MDC_XlockRequest waiter on the dir that will run onfinish once the
+// reply has been handled.
+void Locker::dentry_xlock_request(CDir *dir, string& dname, bool create,
+                                  Message *req, Context *onfinish)
+{
+  dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl;
+
+  // send request to the dentry's authority
+  const int target = dir->dentry_authority(dname);
+  MLock *ask = new MLock(create ? LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, mds->get_nodeid());
+  ask->set_dn(dir->ino(), dname);
+  mds->send_message_mds(ask, target, MDS_PORT_LOCKER);
+
+  // add waiter
+  Context *waiter = new C_MDC_XlockRequest(this, dir, dname, req, onfinish);
+  dir->add_waiter(CDIR_WAIT_DNREQXLOCK, dname, waiter);
+}
+
+
+
+
+/*
+ * handle_lock_dn
+ *
+ * Process an MLock message about a dentry lock.
+ *
+ * Message ownership: m is either deleted here, handed to
+ * mdcache->request_finish()/request_forward() when it is registered as an
+ * active request, forwarded to another mds, or parked in a retry context.
+ * Each path must dispose of it exactly once.  Fixed relative to the
+ * previous revision:
+ *  - the REQXLOCK "dne, nak" path called request_finish(m) and then also
+ *    deleted m, unlike every other finish-or-delete site in this function;
+ *  - the replica LOCK "don't have it, NAK" path returned without freeing m;
+ *  - the UNXLOCK path shadowed m with the xlocking request and returned
+ *    without freeing the UNXLOCK message itself.
+ */
+void Locker::handle_lock_dn(MLock *m)
+{
+  assert(m->get_otype() == LOCK_OTYPE_DN);
+
+  CInode *diri = mdcache->get_inode(m->get_ino());  // may be null
+  CDir *dir = 0;
+  if (diri) dir = diri->dir;           // may be null
+  string dname = m->get_dn();
+  int from = m->get_asker();
+  CDentry *dn = 0;
+
+  if (LOCK_AC_FOR_AUTH(m->get_action())) {
+    // auth
+
+    // normally we have it always
+    if (diri && dir) {
+      int dauth = dir->dentry_authority(dname);
+      assert(dauth == mds->get_nodeid() || dir->is_proxy() ||  // mine or proxy,
+             m->get_action() == LOCK_AC_REQXLOCKACK ||         // or we did a REQXLOCK and this is our ack/nak
+             m->get_action() == LOCK_AC_REQXLOCKNAK);
+
+      if (dir->is_proxy()) {
+
+        assert(dauth >= 0);
+
+        if (dauth == m->get_asker() &&
+            (m->get_action() == LOCK_AC_REQXLOCK ||
+             m->get_action() == LOCK_AC_REQXLOCKC)) {
+          dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. dropping on floor (their import will have woken them up)" << endl;
+          if (mdcache->active_requests.count(m))
+            mdcache->request_finish(m);
+          else
+            delete m;
+          return;
+        }
+
+        dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl;
+
+        // forward
+        if (mdcache->active_requests.count(m)) {
+          // xlock requests are requests, use request_* functions!
+          assert(m->get_action() == LOCK_AC_REQXLOCK ||
+                 m->get_action() == LOCK_AC_REQXLOCKC);
+          // forward as a request
+          mdcache->request_forward(m, dauth, MDS_PORT_LOCKER);
+        } else {
+          // not an xlock req, or it is and we just didn't register the request yet
+          // forward normally
+          mds->send_message_mds(m, dauth, MDS_PORT_LOCKER);
+        }
+        return;
+      }
+
+      dn = dir->lookup(dname);
+    }
+
+    // except with.. an xlock request?
+    if (!dn) {
+      assert(dir);  // we should still have the dir, though!  the requester has the dir open.
+      switch (m->get_action()) {
+
+      case LOCK_AC_LOCK:
+        dout(7) << "handle_lock_dn xlock on " << dname << ", adding (null)" << endl;
+        dn = dir->add_dentry(dname);
+        break;
+
+      case LOCK_AC_REQXLOCK:
+        // send nak
+        if (dir->state_test(CDIR_STATE_DELETED)) {
+          dout(7) << "handle_lock_dn reqxlock on deleted dir " << *dir << ", nak" << endl;
+        } else {
+          dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << " dne, nak" << endl;
+        }
+        {
+          MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid());
+          reply->set_dn(dir->ino(), dname);
+          reply->set_path(m->get_path());
+          mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+        }
+
+        // finish request (if we got that far); otherwise just free the
+        // message.  never both.
+        if (mdcache->active_requests.count(m))
+          mdcache->request_finish(m);
+        else
+          delete m;
+        return;
+
+      case LOCK_AC_REQXLOCKC:
+        dout(7) << "handle_lock_dn reqxlockc on " << dname << " in " << *dir << " dne (yet!)" << endl;
+        break;
+
+      default:
+        assert(0);
+      }
+    }
+  } else {
+    // replica
+    if (dir) dn = dir->lookup(dname);
+    if (!dn) {
+      dout(7) << "handle_lock_dn " << m << " don't have " << m->get_ino() << " dname " << dname << endl;
+
+      if (m->get_action() == LOCK_AC_REQXLOCKACK ||
+          m->get_action() == LOCK_AC_REQXLOCKNAK) {
+        dout(7) << "handle_lock_dn got reqxlockack/nak, but don't have dn " << m->get_path() << ", discovering" << endl;
+        //assert(0);  // how can this happen?  tell me now!
+
+        // m is retried once the path has been discovered
+        vector<CDentry*> trace;
+        filepath path = m->get_path();
+        int r = mdcache->path_traverse(path, trace, true,
+                                       m, new C_MDS_RetryMessage(mds,m),
+                                       MDS_TRAVERSE_DISCOVER);
+        assert(r>0);
+        return;
+      }
+
+      if (m->get_action() == LOCK_AC_LOCK) {
+        // NAK.  (we used to discover the path and retry the message
+        // instead; not anymore.)
+        MLock *reply = new MLock(LOCK_AC_LOCKNAK, mds->get_nodeid());
+        reply->set_dn(m->get_ino(), dname);
+        mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+      } else {
+        dout(7) << "safely ignoring." << endl;
+      }
+
+      // either way we are done with the message
+      delete m;
+      return;
+    }
+
+    assert(dn);
+  }
+
+  if (dn) {
+    dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << *dn << endl;
+  } else {
+    dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << dname << " in " << *dir << endl;
+  }
+
+  switch (m->get_action()) {
+    // -- replica --
+  case LOCK_AC_LOCK:
+    assert(dn->lockstate == DN_LOCK_SYNC ||
+           dn->lockstate == DN_LOCK_UNPINNING ||
+           dn->lockstate == DN_LOCK_XLOCK);   // <-- bc the handle_lock_dn did the discover!
+
+    if (dn->is_pinned()) {
+      dn->lockstate = DN_LOCK_UNPINNING;
+
+      // wait; m is retried once the dentry is unpinned
+      dout(7) << "dn pinned, waiting " << *dn << endl;
+      dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED,
+                          dn->name,
+                          new C_MDS_RetryMessage(mds, m));
+      return;
+    } else {
+      dn->lockstate = DN_LOCK_XLOCK;
+      dn->xlockedby = 0;
+
+      // ack now
+      MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
+      reply->set_dn(diri->ino(), dname);
+      mds->send_message_mds(reply, from, MDS_PORT_LOCKER);
+    }
+
+    // wake up waiters
+    dir->finish_waiting(CDIR_WAIT_DNLOCK, dname);   // ? will this happen on replica ?
+    break;
+
+  case LOCK_AC_SYNC:
+    assert(dn->lockstate == DN_LOCK_XLOCK);
+    dn->lockstate = DN_LOCK_SYNC;
+    dn->xlockedby = 0;
+
+    // null?  hose it.
+    if (dn->is_null()) {
+      dout(7) << "hosing null (and now sync) dentry " << *dn << endl;
+      dir->remove_dentry(dn);
+    }
+
+    // wake up waiters
+    dir->finish_waiting(CDIR_WAIT_DNREAD, dname);   // will this happen either?  YES: if a rename lock backs out
+    break;
+
+  case LOCK_AC_REQXLOCKACK:
+  case LOCK_AC_REQXLOCKNAK:
+    {
+      dout(10) << "handle_lock_dn got ack/nak on a reqxlock for " << *dn << endl;
+      list<Context*> finished;
+      dir->take_waiting(CDIR_WAIT_DNREQXLOCK, m->get_dn(), finished, 1);  // TAKE ONE ONLY!
+      finish_contexts(finished,
+                      (m->get_action() == LOCK_AC_REQXLOCKACK) ? 1:-1);
+    }
+    break;
+
+
+    // -- auth --
+  case LOCK_AC_LOCKACK:
+  case LOCK_AC_LOCKNAK:
+    assert(dn->gather_set.count(from) == 1);
+    dn->gather_set.erase(from);
+    if (dn->gather_set.size() == 0) {
+      dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl;
+      dn->lockstate = DN_LOCK_XLOCK;
+      mdcache->active_requests[dn->xlockedby].xlocks.insert(dn);
+      dir->finish_waiting(CDIR_WAIT_DNLOCK, dname);
+    }
+    break;
+
+
+  case LOCK_AC_REQXLOCKC:
+    // make sure it's a _file_, if it exists.
+    if (dn && dn->inode && dn->inode->is_dir()) {
+      dout(7) << "handle_lock_dn failing, reqxlockc on dir " << *dn->inode << endl;
+
+      // nak
+      string path;
+      dn->make_path(path);
+
+      MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid());
+      reply->set_dn(dir->ino(), dname);
+      reply->set_path(path);
+      mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+
+      // done
+      if (mdcache->active_requests.count(m))
+        mdcache->request_finish(m);
+      else
+        delete m;
+      return;
+    }
+    // fall through: otherwise REQXLOCKC is handled like REQXLOCK
+
+  case LOCK_AC_REQXLOCK:
+    if (dn) {
+      dout(7) << "handle_lock_dn reqxlock on " << *dn << endl;
+    } else {
+      dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << endl;
+    }
+
+
+    // start request?
+    if (!mdcache->active_requests.count(m)) {
+      vector<CDentry*> trace;
+      if (!mdcache->request_start(m, dir->inode, trace))
+        return;  // waiting for pin
+    }
+
+    // try to xlock!
+    if (!dn) {
+      assert(m->get_action() == LOCK_AC_REQXLOCKC);
+      dn = dir->add_dentry(dname);
+    }
+
+    if (dn->xlockedby != m) {
+      if (!dentry_xlock_start(dn, m, dir->inode)) {
+        // hose null dn if we're waiting on something
+        if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn);
+        return;    // waiting for xlock
+      }
+      // NOTE(review): if dentry_xlock_start() succeeds immediately we fall
+      // out of the switch and delete m without acking, although m is now
+      // recorded as dn->xlockedby.  behavior left unchanged here; confirm
+      // whether this path should ack like the branch below.
+    } else {
+      // successfully xlocked!  on behalf of requestor.
+      string path;
+      dn->make_path(path);
+
+      dout(7) << "handle_lock_dn reqxlock success for " << m->get_asker() << " on " << *dn << ", acking" << endl;
+
+      // ACK xlock request
+      MLock *reply = new MLock(LOCK_AC_REQXLOCKACK, mds->get_nodeid());
+      reply->set_dn(dir->ino(), dname);
+      reply->set_path(path);
+      mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER);
+
+      // note: keep request around in memory (to hold the xlock/pins on behalf of requester)
+      return;
+    }
+    break;
+
+  case LOCK_AC_UNXLOCK:
+    dout(7) << "handle_lock_dn unxlock on " << *dn << endl;
+    {
+      // the request that has been holding the xlock for the remote requester
+      Message *req = dn->xlockedby;
+
+      // finish request
+      mdcache->request_finish(req);  // this will drop the locks (and unpin paths!)
+    }
+    // the UNXLOCK message itself is freed below
+    break;
+
+  default:
+    assert(0);
+  }
+
+  delete m;
+}
+
+
+
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_LOCKER_H
+#define __MDS_LOCKER_H
+
+#include "include/types.h"
+
+#include <map>
+#include <list>
+#include <set>
+using std::map;
+using std::list;
+using std::set;
+
+class MDS;
+class CDir;
+class CInode;
+class CDentry;
+
+class Message;
+
+class MDiscover;
+class MDiscoverReply;
+class MCacheExpire;
+class MDirUpdate;
+class MDentryUnlink;
+class MLock;
+
+class MClientRequest;
+
+
+class Anchor;
+class Capability;
+
+
+// Locker: the inode (hard/file) lock, file capability and dentry xlock
+// entry points of the mds.  It holds only pointers to the owning MDS and
+// its MDCache; the constructor below just stores them.
+// NOTE(review): MDCache, Context and string are used below but are not
+// forward-declared or using-declared in this header; presumably they come
+// in via include/types.h or the includer -- confirm.
+class Locker {
+private:
+ MDS *mds;
+ MDCache *mdcache;
+
+ public:
+ Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {}
+
+ // message entry point (defined in Locker.cc)
+ void dispatch(Message *m);
+
+ // -- locks --
+ // high level interface
+ public:
+ bool inode_hard_read_try(CInode *in, Context *con);
+ bool inode_hard_read_start(CInode *in, MClientRequest *m);
+ void inode_hard_read_finish(CInode *in);
+ bool inode_hard_write_start(CInode *in, MClientRequest *m);
+ void inode_hard_write_finish(CInode *in);
+ bool inode_file_read_start(CInode *in, MClientRequest *m);
+ void inode_file_read_finish(CInode *in);
+ bool inode_file_write_start(CInode *in, MClientRequest *m);
+ void inode_file_write_finish(CInode *in);
+
+ void inode_hard_eval(CInode *in);
+ void inode_file_eval(CInode *in);
+
+ protected:
+ void inode_hard_mode(CInode *in, int mode);
+ void inode_file_mode(CInode *in, int mode);
+
+ // low level triggers
+ // (the inode_file_* triggers assert in->is_auth() and move in->filelock
+ // toward the state named by the function)
+ void inode_hard_sync(CInode *in);
+ void inode_hard_lock(CInode *in);
+ bool inode_file_sync(CInode *in);
+ void inode_file_lock(CInode *in);
+ void inode_file_mixed(CInode *in);
+ void inode_file_loner(CInode *in);
+
+ // messengers
+ void handle_lock(MLock *m);
+ void handle_lock_inode_hard(MLock *m);
+ void handle_lock_inode_file(MLock *m);
+
+ // -- file i/o --
+ public:
+ version_t issue_file_data_version(CInode *in);
+ Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req);
+ bool issue_caps(CInode *in);
+
+ protected:
+ void handle_client_file_caps(class MClientFileCaps *m);
+
+ void request_inode_file_caps(CInode *in);
+ void handle_inode_file_caps(class MInodeFileCaps *m);
+
+
+ // dirs
+ // (currently an empty stub in Locker.cc)
+ void handle_lock_dir(MLock *m);
+
+ // dentry locks
+ public:
+ // returns true once dn is xlocked by m; false if the request was queued
+ // for retry
+ bool dentry_xlock_start(CDentry *dn,
+ Message *m, CInode *ref);
+ // quiet=true skips notifying replicas
+ void dentry_xlock_finish(CDentry *dn, bool quiet=false);
+ void handle_lock_dn(MLock *m);
+ // ask the dentry's auth mds for an xlock; onfinish runs after the reply
+ void dentry_xlock_request(CDir *dir, string& dname, bool create,
+ Message *req, Context *onfinish);
+ // r is 1 on ack; finisher is always finished with 0 and deleted
+ void dentry_xlock_request_finish(int r,
+ CDir *dir, string& dname,
+ Message *req,
+ Context *finisher);
+
+
+};
+
+
+#endif
#include "MDStore.h"
#include "MDS.h"
#include "Server.h"
+#include "Locker.h"
#include "MDLog.h"
#include "MDBalancer.h"
#include "AnchorClient.h"
break;
- // locking
- case MSG_MDS_LOCK:
- handle_lock((MLock*)m);
- break;
-
- // cache fun
- case MSG_MDS_INODEFILECAPS:
- handle_inode_file_caps((MInodeFileCaps*)m);
- break;
-
- case MSG_CLIENT_FILECAPS:
- handle_client_file_caps((MClientFileCaps*)m);
- break;
case MSG_MDS_DENTRYUNLINK:
handle_dentry_unlink((MDentryUnlink*)m);
*/
// must read directory hard data (permissions, x bit) to traverse
- if (!noperm && !inode_hard_read_try(cur, ondelay)) {
+ if (!noperm && !mds->locker->inode_hard_read_try(cur, ondelay)) {
if (onfinish) delete onfinish;
return 1;
}
dout(7) << "request_cleanup leftover xlock " << *dn << endl;
- dentry_xlock_finish(dn);
+ mds->locker->dentry_xlock_finish(dn);
// queue finishers
dn->dir->take_waiting(CDIR_WAIT_ANY, dn->name, mds->finished_queue);
if (in->hardlock.is_gathering(from)) {
in->hardlock.gather_set.erase(from);
if (in->hardlock.gather_set.size() == 0)
- inode_hard_eval(in);
+ mds->locker->inode_hard_eval(in);
}
if (in->filelock.is_gathering(from)) {
in->filelock.gather_set.erase(from);
if (in->filelock.gather_set.size() == 0)
- inode_file_eval(in);
+ mds->locker->inode_file_eval(in);
}
// alone now?
if (!in->is_cached_by_anyone()) {
- inode_hard_eval(in);
- inode_file_eval(in);
+ mds->locker->inode_hard_eval(in);
+ mds->locker->inode_file_eval(in);
}
}
string dname = dn->name;
// unpin dir / unxlock
- dentry_xlock_finish(dn, true); // quiet, no need to bother replicas since they're already unlinking
+ mds->locker->dentry_xlock_finish(dn, true); // quiet, no need to bother replicas since they're already unlinking
// did i empty out an imported dir?
if (dir->is_import() && !dir->inode->is_root() && dir->get_size() == 0)
-// file i/o -----------------------------------------
-// Logs the call at debug level 7 and returns in->inode.file_data_version unchanged.
-__uint64_t MDCache::issue_file_data_version(CInode *in)
-{
- dout(7) << "issue_file_data_version on " << *in << endl;
- return in->inode.file_data_version;
-}
-// Creates or reuses the Capability on `in` for the client that sent `req`, with a want mask derived from `mode`.
-// Then evaluates the file lock (auth) or notifies the auth (replica), issues caps, and returns the Capability.
-Capability* MDCache::issue_new_caps(CInode *in,
- int mode,
- MClientRequest *req)
-{
- dout(7) << "issue_new_caps for mode " << mode << " on " << *in << endl;
-
- // my needs
- int my_client = req->get_client();
- int my_want = 0;
- if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD;
- if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR;
-
- // register a capability
- Capability *cap = in->get_client_cap(my_client);
- if (!cap) {
- // new cap
- Capability c(my_want);
- in->add_client_cap(my_client, c);
- cap = in->get_client_cap(my_client);
-
- // note client addr
- mds->clientmap.add_open(my_client, req->get_client_inst());
-
- } else {
- // make sure it has sufficient caps
- if (cap->wanted() & ~my_want) { // NOTE(review): true when the cap wants bits NOT in my_want; a "missing bits" test would be my_want & ~cap->wanted() -- confirm
- // augment wanted caps for this client
- cap->set_wanted( cap->wanted() | my_want );
- }
- }
-
- // suppress file cap messages for this guy for a few moments (we'll bundle with the open() reply)
- cap->set_suppress(true);
- int before = cap->pending();
-
- if (in->is_auth()) {
- // [auth] twiddle mode?
- inode_file_eval(in);
- } else {
- // [replica] tell auth about any new caps wanted
- request_inode_file_caps(in);
- }
-
- // issue caps (pot. incl new one)
- issue_caps(in); // note: _eval above may have done this already...
-
- // re-issue whatever we can
- cap->issue(cap->pending());
-
- // ok, stop suppressing.
- cap->set_suppress(false);
-
- int now = cap->pending();
- if (before != now &&
- (before & CAP_FILE_WR) == 0 &&
- (now & CAP_FILE_WR)) {
- // FIXME FIXME FIXME
- }
- // (the branch above is empty: nothing is done yet when CAP_FILE_WR becomes newly pending)
- // twiddle file_data_version?
- if ((before & CAP_FILE_WRBUFFER) == 0 &&
- (now & CAP_FILE_WRBUFFER)) {
- in->inode.file_data_version++;
- dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << endl;
- }
-
- return cap;
-}
-
-// Brings every client cap on `in` in line with (wanted & caps allowed by the current filelock state).
-// Sends MClientFileCaps when issue() returns seq > 0 and the cap is not suppressed; returns true if nothing was re-issued.
-bool MDCache::issue_caps(CInode *in)
-{
- // allowed caps are determined by the lock mode.
- int allowed = in->filelock.caps_allowed(in->is_auth());
- dout(7) << "issue_caps filelock allows=" << cap_string(allowed)
- << " on " << *in << endl;
-
- // count conflicts with
- int nissued = 0;
-
- // client caps
- for (map<int, Capability>::iterator it = in->client_caps.begin();
- it != in->client_caps.end();
- it++) {
- if (it->second.issued() != (it->second.wanted() & allowed)) {
- // issue
- nissued++;
- // pending() is sampled around issue() to detect a newly granted WRBUFFER bit
- int before = it->second.pending();
- long seq = it->second.issue(it->second.wanted() & allowed);
- int after = it->second.pending();
-
- // twiddle file_data_version?
- if (!(before & CAP_FILE_WRBUFFER) &&
- (after & CAP_FILE_WRBUFFER)) {
- dout(7) << " incrementing file_data_version for " << *in << endl;
- in->inode.file_data_version++;
- }
-
- if (seq > 0 &&
- !it->second.is_suppress()) {
- dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << it->second.get_last_seq() << " new pending " << cap_string(it->second.pending()) << " was " << cap_string(before) << endl;
- mds->messenger->send_message(new MClientFileCaps(in->inode,
- it->second.get_last_seq(),
- it->second.pending(),
- it->second.wanted()),
- MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first),
- 0, MDS_PORT_CACHE);
- }
- }
- }
-
- return (nissued == 0); // true if no re-issued, no callbacks
-}
-
-// Sends in->get_caps_wanted() to the inode's authority (MInodeFileCaps) when it differs from in->replica_caps_wanted.
-// A drop to zero is deferred through replica_caps_wanted_keep_until; asserts that we are not the auth before sending.
-void MDCache::request_inode_file_caps(CInode *in)
-{
- int wanted = in->get_caps_wanted();
- if (wanted != in->replica_caps_wanted) {
-
- if (wanted == 0) {
- if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) { // NOTE(review): releases while keep_until is still in the future; once it has passed, the final else below just returns -- confirm comparison is not inverted
- // ok, release them finally!
- in->replica_caps_wanted_keep_until.sec_ref() = 0;
- dout(7) << "request_inode_file_caps " << cap_string(wanted)
- << " was " << cap_string(in->replica_caps_wanted)
- << " no keeping anymore "
- << " on " << *in
- << endl;
- }
- else if (in->replica_caps_wanted_keep_until.sec() == 0) {
- in->replica_caps_wanted_keep_until = g_clock.recent_now();
- in->replica_caps_wanted_keep_until.sec_ref() += 2;
- // keep_until is now recent_now() + 2 on its seconds field; nothing is sent on this call
- dout(7) << "request_inode_file_caps " << cap_string(wanted)
- << " was " << cap_string(in->replica_caps_wanted)
- << " keeping until " << in->replica_caps_wanted_keep_until
- << " on " << *in
- << endl;
- return;
- } else {
- // wait longer
- return;
- }
- } else {
- in->replica_caps_wanted_keep_until.sec_ref() = 0;
- }
- assert(!in->is_auth());
-
- int auth = in->authority();
- dout(7) << "request_inode_file_caps " << cap_string(wanted)
- << " was " << cap_string(in->replica_caps_wanted)
- << " on " << *in << " to mds" << auth << endl;
- assert(!in->is_auth());
- // remember what we told the auth so the next call can detect a change
- in->replica_caps_wanted = wanted;
- mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(),
- in->replica_caps_wanted),
- auth, MDS_PORT_CACHE);
- } else {
- in->replica_caps_wanted_keep_until.sec_ref() = 0;
- }
-}
-// Records (or erases, when zero) the caps wanted by mds m->get_from() in in->mds_caps_wanted, then re-evaluates the file lock.
-void MDCache::handle_inode_file_caps(MInodeFileCaps *m)
-{
- CInode *in = get_inode(m->get_ino());
- assert(in);
- assert(in->is_auth() || in->is_proxy());
-
- dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl;
-
- if (in->is_proxy()) {
- dout(7) << "proxy, fw" << endl;
- mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
- return; // m was handed to send_message_mds, so it is not deleted here
- }
-
- if (m->get_caps())
- in->mds_caps_wanted[m->get_from()] = m->get_caps();
- else
- in->mds_caps_wanted.erase(m->get_from());
-
- inode_file_eval(in);
- delete m;
-}
-
-
-/* Handles MClientFileCaps from a client: updates wanted/confirmed caps, merges atime/mtime/size, re-evaluates the file lock.
- * note: we only get these from the client if
- * - we are calling back previously issued caps (fewer than the client previously had)
- * - or if the client releases (any of) its caps on its own
- */
-void MDCache::handle_client_file_caps(MClientFileCaps *m)
-{
- int client = MSG_ADDR_NUM(m->get_source());
- CInode *in = get_inode(m->get_ino());
- Capability *cap = 0;
- if (in)
- cap = in->get_client_cap(client);
- // unknown inode or no cap for this client: log and drop the message
- if (!in || !cap) {
- if (!in) {
- dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << endl;
- } else {
- dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << endl;
- }
- delete m;
- return;
- }
-
- assert(cap);
-
- // filter wanted based on what we could ever give out (given auth/replica status)
- int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth());
-
- dout(7) << "handle_client_file_caps seq " << m->get_seq()
- << " confirms caps " << cap_string(m->get_caps())
- << " wants " << cap_string(wanted)
- << " from client" << client
- << " on " << *in
- << endl;
-
- // update wanted
- if (cap->wanted() != wanted)
- cap->set_wanted(wanted);
-
- // confirm caps
- int had = cap->confirm_receipt(m->get_seq(), m->get_caps());
- int has = cap->confirmed();
- if (cap->is_null()) {
- dout(7) << " cap for client" << client << " is now null, removing from " << *in << endl;
- in->remove_client_cap(client);
- if (!in->is_auth())
- request_inode_file_caps(in);
- // NOTE(review): cap presumably dangles after remove_client_cap(); it is not dereferenced below
- // dec client addr counter
- mds->clientmap.dec_open(client);
-
- // tell client.
- MClientFileCaps *r = new MClientFileCaps(in->inode,
- 0, 0, 0,
- MClientFileCaps::FILECAP_RELEASE);
- mds->messenger->send_message(r, m->get_source(), m->get_source_inst(), 0, MDS_PORT_CACHE);
- }
-
- // merge in atime?
- if (m->get_inode().atime > in->inode.atime) {
- dout(7) << " taking atime " << m->get_inode().atime << " > "
- << in->inode.atime << " for " << *in << endl;
- in->inode.atime = m->get_inode().atime;
- }
- // mtime/size are only considered when CAP_FILE_WR is set in has or had; values are taken only if larger than ours
- if ((has|had) & CAP_FILE_WR) {
- bool dirty = false;
-
- // mtime
- if (m->get_inode().mtime > in->inode.mtime) {
- dout(7) << " taking mtime " << m->get_inode().mtime << " > "
- << in->inode.mtime << " for " << *in << endl;
- in->inode.mtime = m->get_inode().mtime;
- dirty = true;
- }
- // size
- if (m->get_inode().size > in->inode.size) {
- dout(7) << " taking size " << m->get_inode().size << " > "
- << in->inode.size << " for " << *in << endl;
- in->inode.size = m->get_inode().size;
- dirty = true;
- }
- // an atime-only change (above) does not set dirty, so it is not journaled here
- if (dirty)
- mds->mdlog->submit_entry(new EInodeUpdate(in));
- }
-
- // reevaluate, waiters
- inode_file_eval(in);
- in->finish_waiting(CINODE_WAIT_CAPS, 0);
-
- delete m;
-}
-
-
-
-
-
-
-
-
-
-
-// locks ----------------------------------------------------------------
-
-/*
-
-
-INODES:
-
-= two types of inode metadata:
- hard - uid/gid, mode
- file - mtime, size
- ? atime - atime (*) <-- we want a lazy update strategy?
-
-= correspondingly, two types of inode locks:
- hardlock - hard metadata
- filelock - file metadata
-
- -> These locks are completely orthogonal!
-
-= metadata ops and how they affect inode metadata:
- sma=size mtime atime
- HARD FILE OP
- files:
- R RRR stat
- RW chmod/chown
- R W touch ?ctime
- R openr
- W read atime
- R openw
- Wc openwc ?ctime
- WW write size mtime
- close
-
- dirs:
- R W readdir atime
- RRR ( + implied stats on files)
- Rc WW mkdir (ctime on new dir, size+mtime on parent dir)
- R WW link/unlink/rename/rmdir (size+mtime on dir)
-
-
-
-= relationship to client (writers):
-
- - ops in question are
- - stat ... need reasonable value for mtime (+ atime?)
- - maybe we want a "quicksync" type operation instead of full lock
- - truncate ... need to stop writers for the atomic truncate operation
- - need a full lock
-
-
-
-
-= modes
- - SYNC
- Rauth Rreplica Wauth Wreplica
- sync
-
-
-
-
-
-ALSO:
-
- dirlock - no dir changes (prior to unhashing)
- denlock - dentry lock (prior to unlink, rename)
-
-
-*/
-
-// Dispatches an MLock message by object type (inode hard, inode file, dir, dentry); asserts on any other type.
-void MDCache::handle_lock(MLock *m)
-{
- switch (m->get_otype()) {
- case LOCK_OTYPE_IHARD:
- handle_lock_inode_hard(m);
- break;
-
- case LOCK_OTYPE_IFILE:
- handle_lock_inode_file(m);
- break;
-
- case LOCK_OTYPE_DIR:
- handle_lock_dir(m);
- break;
-
- case LOCK_OTYPE_DN:
- handle_lock_dn(m);
- break;
-
- default:
- dout(7) << "handle_lock got otype " << m->get_otype() << endl;
- assert(0);
- break;
- }
-}
-
-
-
-// ===============================
-// hard inode metadata
-// Returns true if hardlock is readable now (no read ref is taken); otherwise queues `con` on CINODE_WAIT_HARDR and returns false.
-bool MDCache::inode_hard_read_try(CInode *in, Context *con)
-{
- dout(7) << "inode_hard_read_try on " << *in << endl;
-
- // can read? (this variant does not grab a ref)
- if (in->hardlock.can_read(in->is_auth()))
- return true;
- // an unreadable hardlock on the auth is treated as a bug
- assert(!in->is_auth());
-
- // wait!
- dout(7) << "inode_hard_read_try waiting on " << *in << endl;
- in->add_waiter(CINODE_WAIT_HARDR, con);
- return false;
-}
-// Takes a read ref on hardlock and returns true if readable; otherwise queues a retry of request m on CINODE_WAIT_HARDR and returns false.
-bool MDCache::inode_hard_read_start(CInode *in, MClientRequest *m)
-{
- dout(7) << "inode_hard_read_start on " << *in << endl;
-
- // can read? grab ref.
- if (in->hardlock.can_read(in->is_auth())) {
- in->hardlock.get_read();
- return true;
- }
-
- // can't read, and replicated.
- assert(!in->is_auth());
-
- // wait!
- dout(7) << "inode_hard_read_start waiting on " << *in << endl;
- in->add_waiter(CINODE_WAIT_HARDR, new C_MDS_RetryRequest(mds, m, in));
- return false;
-}
-
-// Drops one read ref on hardlock (put_read); asserts the lock is still readable. No waiters are woken here.
-void MDCache::inode_hard_read_finish(CInode *in)
-{
- // drop ref
- assert(in->hardlock.can_read(in->is_auth()));
- in->hardlock.put_read();
-
- dout(7) << "inode_hard_read_finish on " << *in << endl;
-
- //if (in->hardlock.get_nread() == 0) in->finish_waiting(CINODE_WAIT_HARDNORD);
-}
-
-// Tries to take a write ref and auth pin on hardlock for request m. Returns true on success; otherwise m is queued for retry (auth) or forwarded to the auth (replica) and false is returned.
-bool MDCache::inode_hard_write_start(CInode *in, MClientRequest *m)
-{
- dout(7) << "inode_hard_write_start on " << *in << endl;
-
- // if not replicated, i can twiddle lock at will
- if (in->is_auth() &&
- !in->is_cached_by_anyone() &&
- in->hardlock.get_state() != LOCK_LOCK)
- in->hardlock.set_state(LOCK_LOCK);
-
- // can write? grab ref.
- if (in->hardlock.can_write(in->is_auth())) {
- assert(in->is_auth());
- if (!in->can_auth_pin()) {
- dout(7) << "inode_hard_write_start waiting for authpinnable on " << *in << endl;
- in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in));
- return false;
- }
- // each successful start pairs one auth_pin with one write ref
- in->auth_pin(); // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_*
- in->hardlock.get_write();
- return true;
- }
-
- // can't write, replicated.
- if (in->is_auth()) {
- // auth
- if (in->hardlock.can_write_soon(in->is_auth())) {
- // just wait
- } else {
- // initiate lock
- inode_hard_lock(in);
- }
- // in both cases the request is retried once CINODE_WAIT_HARDW waiters are finished
- dout(7) << "inode_hard_write_start waiting on " << *in << endl;
- in->add_waiter(CINODE_WAIT_HARDW, new C_MDS_RetryRequest(mds, m, in));
-
- return false;
- } else {
- // replica
- // fw to auth
- int auth = in->authority();
- dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl;
- assert(auth != mds->get_nodeid());
- request_forward(m, auth);
- return false;
- }
-}
-
-// Drops a write ref and the auth pin on `in`; when no writers remain, sets an auth inode cached by nobody else to SYNC and re-evaluates hardlock.
-void MDCache::inode_hard_write_finish(CInode *in)
-{
- // drop ref
- assert(in->hardlock.can_write(in->is_auth()));
- in->hardlock.put_write();
- in->auth_unpin();
- dout(7) << "inode_hard_write_finish on " << *in << endl;
-
- // drop lock?
- if (in->hardlock.get_nwrite() == 0) {
-
- // auto-sync if alone.
- if (in->is_auth() &&
- !in->is_cached_by_anyone() &&
- in->hardlock.get_state() != LOCK_SYNC)
- in->hardlock.set_state(LOCK_SYNC);
-
- inode_hard_eval(in);
- }
-}
-
-// Advances hardlock: on the auth, a finished GLOCKR gather becomes LOCK and waiters are woken; then, if stable, replicated and writer-free, it syncs.
-void MDCache::inode_hard_eval(CInode *in)
-{
- // finished gather?
- if (in->is_auth() &&
- !in->hardlock.is_stable() &&
- in->hardlock.gather_set.empty()) {
- dout(7) << "inode_hard_eval finished gather on " << *in << endl;
- switch (in->hardlock.get_state()) {
- case LOCK_GLOCKR:
- in->hardlock.set_state(LOCK_LOCK);
-
- // waiters (a temporary write ref is held while they run)
- in->hardlock.get_write();
- in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE);
- in->hardlock.put_write();
- break;
-
- default:
- assert(0);
- }
- }
- if (!in->hardlock.is_stable()) return;
-
- if (in->is_auth()) {
-
- // sync?
- if (in->is_cached_by_anyone() &&
- in->hardlock.get_nwrite() == 0 &&
- in->hardlock.get_state() != LOCK_SYNC) {
- dout(7) << "inode_hard_eval stable, syncing " << *in << endl;
- inode_hard_sync(in);
- }
-
- } else {
- // replica
- }
-}
-
-
-// mid
-// Auth only: moves hardlock from LOCK to SYNC, sending the encoded hard state to every replica (LOCK_AC_SYNC), then wakes HARDSTABLE waiters.
-void MDCache::inode_hard_sync(CInode *in)
-{
- dout(7) << "inode_hard_sync on " << *in << endl;
- assert(in->is_auth());
-
- // check state
- if (in->hardlock.get_state() == LOCK_SYNC)
- return; // already sync
- if (in->hardlock.get_state() == LOCK_GLOCKR)
- assert(0); // um... hmm!
- assert(in->hardlock.get_state() == LOCK_LOCK);
-
- // hard data
- bufferlist harddata;
- in->encode_hard_state(harddata);
-
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IHARD);
- m->set_data(harddata);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
-
- // change lock
- in->hardlock.set_state(LOCK_SYNC);
-
- // waiters?
- in->finish_waiting(CINODE_WAIT_HARDSTABLE);
-}
-// Auth only: starts SYNC -> LOCK by sending LOCK_AC_LOCK to every replica and entering GLOCKR with gather_set = current replicas; returns early if already LOCK or GLOCKR.
-void MDCache::inode_hard_lock(CInode *in)
-{
- dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl;
- assert(in->is_auth());
-
- // check state
- if (in->hardlock.get_state() == LOCK_LOCK ||
- in->hardlock.get_state() == LOCK_GLOCKR)
- return; // already lock or locking
- assert(in->hardlock.get_state() == LOCK_SYNC);
-
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IHARD);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
-
- // change lock
- in->hardlock.set_state(LOCK_GLOCKR);
- in->hardlock.init_gather(in->get_cached_by());
-}
-
-
-
-
-
-// messenger
-// Handles an MLock for an inode's hardlock: replicas apply SYNC/LOCK, the auth collects LOCKACKs, and a proxy forwards or drops the message.
-void MDCache::handle_lock_inode_hard(MLock *m)
-{
- assert(m->get_otype() == LOCK_OTYPE_IHARD);
-
- mds->logger->inc("lih");
-
- int from = m->get_asker();
- CInode *in = get_inode(m->get_ino());
-
- if (LOCK_AC_FOR_AUTH(m->get_action())) {
- // auth
- assert(in);
- assert(in->is_auth() || in->is_proxy());
- dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl;
-
- if (in->is_proxy()) {
- // fw
- int newauth = in->authority();
- assert(newauth >= 0);
- if (from == newauth) {
- dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
- delete m;
- } else {
- dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl;
- mds->send_message_mds(m, newauth, MDS_PORT_CACHE);
- }
- return;
- }
- } else {
- // replica
- if (!in) {
- dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl;
- /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness
- to keep gather_set a proper/correct subset of cached_by. better to use the existing
- cacheexpire mechanism instead!
- */
- delete m;
- return;
- }
-
- assert(!in->is_auth());
- }
-
- dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl;
-
- CLock *lock = &in->hardlock;
-
- switch (m->get_action()) {
- // -- replica --
- case LOCK_AC_SYNC:
- assert(lock->get_state() == LOCK_LOCK);
-
- { // assim data
- int off = 0;
- in->decode_hard_state(m->get_data(), off);
- }
-
- // update lock
- lock->set_state(LOCK_SYNC);
-
- // no need to reply
-
- // waiters
- in->finish_waiting(CINODE_WAIT_HARDR|CINODE_WAIT_HARDSTABLE);
- break;
-
- case LOCK_AC_LOCK:
- assert(lock->get_state() == LOCK_SYNC);
- //|| lock->get_state() == LOCK_GLOCKR);
-
- // wait for readers to finish?
- if (lock->get_nread() > 0) {
- dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl;
- lock->set_state(LOCK_GLOCKR);
- in->add_waiter(CINODE_WAIT_HARDNORD,
- new C_MDS_RetryMessage(mds,m));
- assert(0); // does this ever happen? (if so, fix hard_read_finish, and CInodeExport.update_inode!)
- return;
- } else {
-
- // update lock and reply
- lock->set_state(LOCK_LOCK);
-
- {
- MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
- reply->set_ino(in->ino(), LOCK_OTYPE_IHARD);
- mds->send_message_mds(reply, from, MDS_PORT_CACHE);
- }
- }
- break;
-
-
- // -- auth --
- case LOCK_AC_LOCKACK:
- assert(lock->state == LOCK_GLOCKR);
- assert(lock->gather_set.count(from));
- lock->gather_set.erase(from);
- // the last ack lets inode_hard_eval finish the GLOCKR gather
- if (lock->gather_set.size()) {
- dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
- } else {
- dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl;
- inode_hard_eval(in);
- }
- } // no default case: any other action falls through to the delete below
- delete m;
-}
-
-
-
-
-// =====================
-// soft inode metadata
-
-// Takes a read ref on filelock for request m if possible (a stable auth first tries inode_file_lock); otherwise queues a retry or forwards m to the auth and returns false.
-bool MDCache::inode_file_read_start(CInode *in, MClientRequest *m)
-{
- dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl;
-
- // can read? grab ref.
- if (in->filelock.can_read(in->is_auth())) {
- in->filelock.get_read();
- return true;
- }
-
- // can't read, and replicated.
- if (in->filelock.can_read_soon(in->is_auth())) {
- // wait
- dout(7) << "inode_file_read_start can_read_soon " << *in << endl;
- } else {
- if (in->is_auth()) {
- // auth
-
- // FIXME or qsync?
-
- if (in->filelock.is_stable()) {
- inode_file_lock(in); // lock, bc easiest to back off
-
- if (in->filelock.can_read(in->is_auth())) {
- in->filelock.get_read();
- // wake waiters with a temporary write ref held, in addition to the read ref just taken
- in->filelock.get_write();
- in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
- in->filelock.put_write();
- return true;
- }
- } else {
- dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl;
- in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
- return false;
- }
- } else {
- // replica
- if (in->filelock.is_stable()) {
-
- // fw to auth
- int auth = in->authority();
- dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl;
- assert(auth != mds->get_nodeid());
- request_forward(m, auth);
- return false;
-
- } else {
- // wait until stable
- dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl;
- in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
- return false;
- }
- }
- }
- // reached when readable soon, or when the auth called inode_file_lock but still cannot read
- // wait
- dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl;
- in->add_waiter(CINODE_WAIT_FILER, new C_MDS_RetryRequest(mds, m, in));
-
- return false;
-}
-
-// Drops one read ref on filelock; when no readers remain, wakes CINODE_WAIT_FILENORD waiters and re-evaluates the lock.
-void MDCache::inode_file_read_finish(CInode *in)
-{
- // drop ref
- assert(in->filelock.can_read(in->is_auth()));
- in->filelock.put_read();
-
- dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl;
-
- if (in->filelock.get_nread() == 0) {
- in->finish_waiting(CINODE_WAIT_FILENORD);
- inode_file_eval(in);
- }
-}
-
-// Takes a write ref on filelock for request m if possible (a stable auth first tries inode_file_lock); otherwise queues a retry (auth) or forwards m to the auth (replica) and returns false.
-bool MDCache::inode_file_write_start(CInode *in, MClientRequest *m)
-{
- // can write? grab ref.
- if (in->filelock.can_write(in->is_auth())) {
- in->filelock.get_write();
- return true;
- }
-
- // can't write, replicated.
- if (in->is_auth()) {
- // auth
- if (in->filelock.can_write_soon(in->is_auth())) {
- // just wait
- } else {
- if (!in->filelock.is_stable()) {
- dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl;
- in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in));
- return false;
- }
-
- // initiate lock
- inode_file_lock(in);
-
- if (in->filelock.can_write(in->is_auth())) {
- in->filelock.get_write();
- // wake waiters with a temporary read ref held, in addition to the write ref just taken
- in->filelock.get_read();
- in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
- in->filelock.put_read();
- return true;
- }
- }
-
- dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl;
- in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in));
- return false;
- } else {
- // replica
- // fw to auth
- int auth = in->authority();
- dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl;
- assert(auth != mds->get_nodeid());
- request_forward(m, auth);
- return false;
- }
-}
-
-// Drops one write ref on filelock; when no writers remain, wakes CINODE_WAIT_FILENOWR waiters and re-evaluates the lock.
-void MDCache::inode_file_write_finish(CInode *in)
-{
- // drop ref
- assert(in->filelock.can_write(in->is_auth()));
- in->filelock.put_write();
- dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl;
-
- // drop lock?
- if (in->filelock.get_nwrite() == 0) {
- in->finish_waiting(CINODE_WAIT_FILENOWR);
- inode_file_eval(in);
- }
-}
-
-
-/*
- * Re-evaluates in->filelock: completes a pending (unstable) transition if possible, then picks a new stable mode on the auth.
- *
- * also called after client caps are acked to us
- * - checks if we're in unstable soft state and can now move on to next state
- * - checks if soft state should change (eg bc last writer closed)
- */
-
-void MDCache::inode_file_eval(CInode *in)
-{
- int issued = in->get_caps_issued();
- // issued is sampled once here and reused by every check below
- // [auth] finished gather?
- if (in->is_auth() &&
- !in->filelock.is_stable() &&
- in->filelock.gather_set.size() == 0) {
- dout(7) << "inode_file_eval finished mds gather on " << *in << endl;
-
- switch (in->filelock.get_state()) {
- // to lock
- case LOCK_GLOCKR:
- case LOCK_GLOCKM:
- case LOCK_GLOCKL:
- if (issued == 0) {
- in->filelock.set_state(LOCK_LOCK);
-
- // waiters
- in->filelock.get_read();
- in->filelock.get_write();
- in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE);
- in->filelock.put_read();
- in->filelock.put_write();
- }
- break;
-
- // to mixed
- case LOCK_GMIXEDR:
- if ((issued & ~(CAP_FILE_RD)) == 0) {
- in->filelock.set_state(LOCK_MIXED);
- in->finish_waiting(CINODE_WAIT_FILESTABLE);
- }
- break;
-
- case LOCK_GMIXEDL:
- if ((issued & ~(CAP_FILE_WR)) == 0) {
- in->filelock.set_state(LOCK_MIXED);
-
- if (in->is_cached_by_anyone()) {
- // data
- bufferlist softdata;
- in->encode_file_state(softdata);
-
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- m->set_data(softdata);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- }
-
- in->finish_waiting(CINODE_WAIT_FILESTABLE);
- }
- break;
-
- // to loner
- case LOCK_GLONERR:
- if (issued == 0) {
- in->filelock.set_state(LOCK_LONER);
- in->finish_waiting(CINODE_WAIT_FILESTABLE);
- }
- break;
-
- case LOCK_GLONERM:
- if ((issued & ~CAP_FILE_WR) == 0) {
- in->filelock.set_state(LOCK_LONER);
- in->finish_waiting(CINODE_WAIT_FILESTABLE);
- }
- break;
-
- // to sync
- case LOCK_GSYNCL:
- case LOCK_GSYNCM:
- if ((issued & ~(CAP_FILE_RD)) == 0) {
- in->filelock.set_state(LOCK_SYNC);
-
- { // bcast data to replicas
- bufferlist softdata;
- in->encode_file_state(softdata);
-
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- reply->set_data(softdata);
- mds->send_message_mds(reply, *it, MDS_PORT_CACHE);
- }
- }
-
- // waiters
- in->filelock.get_read();
- in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE);
- in->filelock.put_read();
- }
- break;
-
- default:
- assert(0);
- }
- // caps are re-issued whether or not the state advanced above
- issue_caps(in);
- }
- // unlike the auth branch, the replica branch checks no gather_set; only locally issued caps gate the ack
- // [replica] finished caps gather?
- if (!in->is_auth() &&
- !in->filelock.is_stable()) {
- switch (in->filelock.get_state()) {
- case LOCK_GMIXEDR:
- if ((issued & ~(CAP_FILE_RD)) == 0) {
- in->filelock.set_state(LOCK_MIXED);
-
- // ack
- MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid());
- reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(reply, in->authority(), MDS_PORT_CACHE);
- }
- break;
-
- case LOCK_GLOCKR:
- if (issued == 0) {
- in->filelock.set_state(LOCK_LOCK);
-
- // ack
- MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
- reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(reply, in->authority(), MDS_PORT_CACHE);
- }
- break;
-
- default:
- assert(0);
- }
- }
-
- // !stable -> do nothing.
- if (!in->filelock.is_stable()) return;
-
-
- // stable.
- assert(in->filelock.is_stable());
-
- if (in->is_auth()) {
- // [auth]
- int wanted = in->get_caps_wanted();
- bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty();
- dout(7) << "inode_file_eval wanted=" << cap_string(wanted)
- << " filelock=" << in->filelock
- << " loner=" << loner
- << endl;
- // the checks below form one if / else-if chain, so at most one transition is started per call
- // * -> loner?
- if (in->filelock.get_nread() == 0 &&
- in->filelock.get_nwrite() == 0 &&
- (wanted & CAP_FILE_WR) &&
- loner &&
- in->filelock.get_state() != LOCK_LONER) {
- dout(7) << "inode_file_eval stable, bump to loner " << *in << ", filelock=" << in->filelock << endl;
- inode_file_loner(in);
- }
-
- // * -> mixed?
- else if (in->filelock.get_nread() == 0 &&
- in->filelock.get_nwrite() == 0 &&
- (wanted & CAP_FILE_RD) &&
- (wanted & CAP_FILE_WR) &&
- !(loner && in->filelock.get_state() == LOCK_LONER) &&
- in->filelock.get_state() != LOCK_MIXED) {
- dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl;
- inode_file_mixed(in);
- }
-
- // * -> sync?
- else if (in->filelock.get_nwrite() == 0 &&
- !(wanted & CAP_FILE_WR) &&
- ((wanted & CAP_FILE_RD) ||
- in->is_cached_by_anyone() ||
- (!loner && in->filelock.get_state() == LOCK_LONER)) &&
- in->filelock.get_state() != LOCK_SYNC) {
- dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl;
- inode_file_sync(in);
- }
-
- // * -> lock? (if not replicated or open)
- else if (!in->is_cached_by_anyone() &&
- wanted == 0 &&
- in->filelock.get_state() != LOCK_LOCK) {
- inode_file_lock(in);
- }
-
- } else {
- // replica
- // recall? check waiters? XXX
- }
-}
-
-
-// mid
-// Auth only: drives filelock toward SYNC. Returns true if already SYNC/GSYNC* or if LOCK -> SYNC was done here; returns false on the MIXED and LONER paths.
-bool MDCache::inode_file_sync(CInode *in)
-{
- dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl;
-
- assert(in->is_auth());
-
- // check state
- if (in->filelock.get_state() == LOCK_SYNC ||
- in->filelock.get_state() == LOCK_GSYNCL ||
- in->filelock.get_state() == LOCK_GSYNCM)
- return true;
-
- assert(in->filelock.is_stable());
-
- int issued = in->get_caps_issued();
-
- assert((in->get_caps_wanted() & CAP_FILE_WR) == 0);
-
- if (in->filelock.get_state() == LOCK_LOCK) {
- if (in->is_cached_by_anyone()) {
- // soft data
- bufferlist softdata;
- in->encode_file_state(softdata);
-
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- m->set_data(softdata);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- }
-
- // change lock
- in->filelock.set_state(LOCK_SYNC);
-
- // reissue caps
- issue_caps(in);
- return true;
- }
-
- else if (in->filelock.get_state() == LOCK_MIXED) {
- // writers?
- if (issued & CAP_FILE_WR) {
- // gather client write caps
- in->filelock.set_state(LOCK_GSYNCM);
- issue_caps(in);
- } else {
- // no writers, go straight to sync
- // NOTE(review): unlike the LOCK case above, no encoded file state is attached and issue_caps() is not called -- confirm
- if (in->is_cached_by_anyone()) {
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- }
-
- // change lock
- in->filelock.set_state(LOCK_SYNC);
- }
- return false;
- }
-
- else if (in->filelock.get_state() == LOCK_LONER) {
- // writers?
- if (issued & CAP_FILE_WR) {
- // gather client write caps
- in->filelock.set_state(LOCK_GSYNCL);
- issue_caps(in);
- } else {
- // no writers, go straight to sync (as in the MIXED case: no file state attached, no issue_caps)
- if (in->is_cached_by_anyone()) {
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- }
-
- // change lock
- in->filelock.set_state(LOCK_SYNC);
- }
- return false;
- }
- else
- assert(0); // wtf.
-
- return false;
-}
-
-// Auth only: drives filelock toward LOCK from SYNC, MIXED or LONER; enters a GLOCK* gather state when replicas must ack or client caps must be called back, else sets LOCK directly.
-void MDCache::inode_file_lock(CInode *in)
-{
- dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl;
-
- assert(in->is_auth());
-
- // check state
- if (in->filelock.get_state() == LOCK_LOCK ||
- in->filelock.get_state() == LOCK_GLOCKR ||
- in->filelock.get_state() == LOCK_GLOCKM ||
- in->filelock.get_state() == LOCK_GLOCKL)
- return; // lock or locking
-
- assert(in->filelock.is_stable());
-
- int issued = in->get_caps_issued();
-
- if (in->filelock.get_state() == LOCK_SYNC) {
- if (in->is_cached_by_anyone()) {
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- in->filelock.init_gather(in->get_cached_by());
-
- // change lock
- in->filelock.set_state(LOCK_GLOCKR);
-
- // call back caps
- if (issued)
- issue_caps(in);
- } else {
- if (issued) {
- // call back caps
- in->filelock.set_state(LOCK_GLOCKR);
- issue_caps(in);
- } else {
- in->filelock.set_state(LOCK_LOCK);
- }
- }
- }
-
- else if (in->filelock.get_state() == LOCK_MIXED) {
- if (in->is_cached_by_anyone()) {
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- in->filelock.init_gather(in->get_cached_by());
-
- // change lock
- in->filelock.set_state(LOCK_GLOCKM);
-
- // call back caps
- issue_caps(in);
- } else {
- //assert(issued); // ??? -sage 2/19/06
- if (issued) {
- // change lock
- in->filelock.set_state(LOCK_GLOCKM);
-
- // call back caps
- issue_caps(in);
- } else {
- in->filelock.set_state(LOCK_LOCK);
- }
- }
-
- }
- else if (in->filelock.get_state() == LOCK_LONER) {
- if (issued & CAP_FILE_WR) {
- // change lock
- in->filelock.set_state(LOCK_GLOCKL);
- // the LONER branch sends nothing to replicas; it only calls back client caps
- // call back caps
- issue_caps(in);
- } else {
- in->filelock.set_state(LOCK_LOCK);
- }
- }
- else
- assert(0); // wtf.
-}
-
-
-void MDCache::inode_file_mixed(CInode *in)
-{
- dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl;
-
- assert(in->is_auth());
-
- // check state
- if (in->filelock.get_state() == LOCK_GMIXEDR ||
- in->filelock.get_state() == LOCK_GMIXEDL)
- return; // mixed or mixing
-
- assert(in->filelock.is_stable());
-
- int issued = in->get_caps_issued();
-
- if (in->filelock.get_state() == LOCK_SYNC) {
- if (in->is_cached_by_anyone()) {
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- in->filelock.init_gather(in->get_cached_by());
-
- in->filelock.set_state(LOCK_GMIXEDR);
- issue_caps(in);
- } else {
- if (issued) {
- in->filelock.set_state(LOCK_GMIXEDR);
- issue_caps(in);
- } else {
- in->filelock.set_state(LOCK_MIXED);
- }
- }
- }
-
- else if (in->filelock.get_state() == LOCK_LOCK) {
- if (in->is_cached_by_anyone()) {
- // data
- bufferlist softdata;
- in->encode_file_state(softdata);
-
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- m->set_data(softdata);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- }
-
- // change lock
- in->filelock.set_state(LOCK_MIXED);
- issue_caps(in);
- }
-
- else if (in->filelock.get_state() == LOCK_LONER) {
- if (issued & CAP_FILE_WRBUFFER) {
- // gather up WRBUFFER caps
- in->filelock.set_state(LOCK_GMIXEDL);
- issue_caps(in);
- }
- else if (in->is_cached_by_anyone()) {
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- in->filelock.set_state(LOCK_MIXED);
- issue_caps(in);
- } else {
- in->filelock.set_state(LOCK_MIXED);
- issue_caps(in);
- }
- }
-
- else
- assert(0); // wtf.
-}
-
-
-void MDCache::inode_file_loner(CInode *in)
-{
- dout(7) << "inode_file_loner " << *in << " filelock=" << in->filelock << endl;
-
- assert(in->is_auth());
-
- // check state
- if (in->filelock.get_state() == LOCK_LONER ||
- in->filelock.get_state() == LOCK_GLONERR ||
- in->filelock.get_state() == LOCK_GLONERM)
- return;
-
- assert(in->filelock.is_stable());
- assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty());
-
- if (in->filelock.get_state() == LOCK_SYNC) {
- if (in->is_cached_by_anyone()) {
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- in->filelock.init_gather(in->get_cached_by());
-
- // change lock
- in->filelock.set_state(LOCK_GLONERR);
- } else {
- // only one guy with file open, who gets it all, so
- in->filelock.set_state(LOCK_LONER);
- issue_caps(in);
- }
- }
-
- else if (in->filelock.get_state() == LOCK_LOCK) {
- // change lock. ignore replicas; they don't know about LONER.
- in->filelock.set_state(LOCK_LONER);
- issue_caps(in);
- }
-
- else if (in->filelock.get_state() == LOCK_MIXED) {
- if (in->is_cached_by_anyone()) {
- // bcast to replicas
- for (set<int>::iterator it = in->cached_by_begin();
- it != in->cached_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- in->filelock.init_gather(in->get_cached_by());
-
- // change lock
- in->filelock.set_state(LOCK_GLONERM);
- } else {
- in->filelock.set_state(LOCK_LONER);
- issue_caps(in);
- }
- }
-
- else
- assert(0);
-}
-
-// messenger
-
-void MDCache::handle_lock_inode_file(MLock *m)
-{
- assert(m->get_otype() == LOCK_OTYPE_IFILE);
-
- mds->logger->inc("lif");
-
- CInode *in = get_inode(m->get_ino());
- int from = m->get_asker();
-
- if (LOCK_AC_FOR_AUTH(m->get_action())) {
- // auth
- assert(in);
- assert(in->is_auth() || in->is_proxy());
- dout(7) << "handle_lock_inode_file " << *in << " hardlock=" << in->hardlock << endl;
-
- if (in->is_proxy()) {
- // fw
- int newauth = in->authority();
- assert(newauth >= 0);
- if (from == newauth) {
- dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl;
- delete m;
- } else {
- dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl;
- mds->send_message_mds(m, newauth, MDS_PORT_CACHE);
- }
- return;
- }
- } else {
- // replica
- if (!in) {
- // drop it. don't nak.
- dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl;
- delete m;
- return;
- }
-
- assert(!in->is_auth());
- }
-
- dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl;
-
- CLock *lock = &in->filelock;
- int issued = in->get_caps_issued();
-
- switch (m->get_action()) {
- // -- replica --
- case LOCK_AC_SYNC:
- assert(lock->get_state() == LOCK_LOCK ||
- lock->get_state() == LOCK_MIXED);
-
- { // assim data
- int off = 0;
- in->decode_file_state(m->get_data(), off);
- }
-
- // update lock
- lock->set_state(LOCK_SYNC);
-
- // no need to reply.
-
- // waiters
- in->filelock.get_read();
- in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE);
- in->filelock.put_read();
- inode_file_eval(in);
- break;
-
- case LOCK_AC_LOCK:
- assert(lock->get_state() == LOCK_SYNC ||
- lock->get_state() == LOCK_MIXED);
-
- // call back caps?
- if (issued & CAP_FILE_RD) {
- dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl;
- issue_caps(in);
- }
- if (lock->get_nread() > 0) {
- dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl;
- in->add_waiter(CINODE_WAIT_FILENORD,
- new C_MDS_RetryMessage(mds,m));
- lock->set_state(LOCK_GLOCKR);
- assert(0);// i am broken.. why retry message when state captures all the info i need?
- return;
- }
- if (issued & CAP_FILE_RD) {
- lock->set_state(LOCK_GLOCKR);
- break;
- }
-
- // nothing to wait for, lock and ack.
- {
- lock->set_state(LOCK_LOCK);
-
- MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
- reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(reply, from, MDS_PORT_CACHE);
- }
- break;
-
- case LOCK_AC_MIXED:
- assert(lock->get_state() == LOCK_SYNC ||
- lock->get_state() == LOCK_LOCK);
-
- if (lock->get_state() == LOCK_SYNC) {
- // MIXED
- if (issued & CAP_FILE_RD) {
- // call back client caps
- lock->set_state(LOCK_GMIXEDR);
- issue_caps(in);
- break;
- } else {
- // no clients, go straight to mixed
- lock->set_state(LOCK_MIXED);
-
- // ack
- MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid());
- reply->set_ino(in->ino(), LOCK_OTYPE_IFILE);
- mds->send_message_mds(reply, from, MDS_PORT_CACHE);
- }
- } else {
- // LOCK
- lock->set_state(LOCK_MIXED);
-
- // no ack needed.
- }
-
- issue_caps(in);
-
- // waiters
- in->filelock.get_write();
- in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE);
- in->filelock.put_write();
- inode_file_eval(in);
- break;
-
-
-
-
- // -- auth --
- case LOCK_AC_LOCKACK:
- assert(lock->state == LOCK_GLOCKR ||
- lock->state == LOCK_GLOCKM ||
- lock->state == LOCK_GLONERM ||
- lock->state == LOCK_GLONERR);
- assert(lock->gather_set.count(from));
- lock->gather_set.erase(from);
-
- if (lock->gather_set.size()) {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
- } else {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
- inode_file_eval(in);
- }
- break;
-
- case LOCK_AC_SYNCACK:
- assert(lock->state == LOCK_GSYNCM);
- assert(lock->gather_set.count(from));
- lock->gather_set.erase(from);
-
- /* not used currently
- {
- // merge data (keep largest size, mtime, etc.)
- int off = 0;
- in->decode_merge_file_state(m->get_data(), off);
- }
- */
-
- if (lock->gather_set.size()) {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
- } else {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
- inode_file_eval(in);
- }
- break;
-
- case LOCK_AC_MIXEDACK:
- assert(lock->state == LOCK_GMIXEDR);
- assert(lock->gather_set.count(from));
- lock->gather_set.erase(from);
-
- if (lock->gather_set.size()) {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl;
- } else {
- dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl;
- inode_file_eval(in);
- }
- break;
-
-
- default:
- assert(0);
- }
-
- delete m;
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-void MDCache::handle_lock_dir(MLock *m)
-{
-
-}
-
-
-
-// DENTRY
-
-bool MDCache::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref)
-{
- dout(7) << "dentry_xlock_start on " << *dn << endl;
-
- // locked?
- if (dn->lockstate == DN_LOCK_XLOCK) {
- if (dn->xlockedby == m) return true; // locked by me!
-
- // not by me, wait
- dout(7) << "dentry " << *dn << " xlock by someone else" << endl;
- dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name,
- new C_MDS_RetryRequest(mds,m,ref));
- return false;
- }
-
- // prelock?
- if (dn->lockstate == DN_LOCK_PREXLOCK) {
- if (dn->xlockedby == m) {
- dout(7) << "dentry " << *dn << " prexlock by me" << endl;
- dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name,
- new C_MDS_RetryRequest(mds,m,ref));
- } else {
- dout(7) << "dentry " << *dn << " prexlock by someone else" << endl;
- dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name,
- new C_MDS_RetryRequest(mds,m,ref));
- }
- return false;
- }
-
-
- // lockable!
- assert(dn->lockstate == DN_LOCK_SYNC ||
- dn->lockstate == DN_LOCK_UNPINNING);
-
- // dir auth pinnable?
- if (!dn->dir->can_auth_pin()) {
- dout(7) << "dentry " << *dn << " dir not pinnable, waiting" << endl;
- dn->dir->add_waiter(CDIR_WAIT_AUTHPINNABLE,
- new C_MDS_RetryRequest(mds,m,ref));
- return false;
- }
-
- // is dentry path pinned?
- if (dn->is_pinned()) {
- dout(7) << "dentry " << *dn << " pinned, waiting" << endl;
- dn->lockstate = DN_LOCK_UNPINNING;
- dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED,
- dn->name,
- new C_MDS_RetryRequest(mds,m,ref));
- return false;
- }
-
- // pin path up to dentry! (if success, point of no return)
- CDentry *pdn = dn->dir->inode->get_parent_dn();
- if (pdn) {
- if (active_requests[m].traces.count(pdn)) {
- dout(7) << "already path pinned parent dentry " << *pdn << endl;
- } else {
- dout(7) << "pinning parent dentry " << *pdn << endl;
- vector<CDentry*> trace;
- make_trace(trace, pdn->inode);
- assert(trace.size());
-
- if (!path_pin(trace, m, new C_MDS_RetryRequest(mds, m, ref))) return false;
-
- active_requests[m].traces[trace[trace.size()-1]] = trace;
- }
- }
-
- // pin dir!
- dn->dir->auth_pin();
-
- // mine!
- dn->xlockedby = m;
-
- if (dn->dir->is_open_by_anyone()) {
- dn->lockstate = DN_LOCK_PREXLOCK;
-
- // xlock with whom?
- set<int> who = dn->dir->get_open_by();
- dn->gather_set = who;
-
- // make path
- string path;
- dn->make_path(path);
- dout(10) << "path is " << path << " for " << *dn << endl;
-
- for (set<int>::iterator it = who.begin();
- it != who.end();
- it++) {
- MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid());
- m->set_dn(dn->dir->ino(), dn->name);
- m->set_path(path);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
-
- // wait
- dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl;
- dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name,
- new C_MDS_RetryRequest(mds, m, ref));
- return false;
- } else {
- dn->lockstate = DN_LOCK_XLOCK;
- active_requests[dn->xlockedby].xlocks.insert(dn);
- return true;
- }
-}
-
-void MDCache::dentry_xlock_finish(CDentry *dn, bool quiet)
-{
- dout(7) << "dentry_xlock_finish on " << *dn << endl;
-
- assert(dn->xlockedby);
- if (dn->xlockedby == DN_XLOCK_FOREIGN) {
- dout(7) << "this was a foreign xlock" << endl;
- } else {
- // remove from request record
- assert(active_requests[dn->xlockedby].xlocks.count(dn) == 1);
- active_requests[dn->xlockedby].xlocks.erase(dn);
- }
-
- dn->xlockedby = 0;
- dn->lockstate = DN_LOCK_SYNC;
-
- // unpin parent dir?
- // -> no? because we might have xlocked 2 things in this dir.
- // instead, we let request_finish clean up the mess.
-
- // tell replicas?
- if (!quiet) {
- // tell even if dn is null.
- if (dn->dir->is_open_by_anyone()) {
- for (set<int>::iterator it = dn->dir->open_by_begin();
- it != dn->dir->open_by_end();
- it++) {
- MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid());
- m->set_dn(dn->dir->ino(), dn->name);
- mds->send_message_mds(m, *it, MDS_PORT_CACHE);
- }
- }
- }
-
- // unpin dir
- dn->dir->auth_unpin();
-}
-
-/*
- * onfinish->finish() will be called with
- * 0 on successful xlock,
- * -1 on failure
- */
-
-class C_MDC_XlockRequest : public Context {
- MDCache *mdc;
- CDir *dir;
- string dname;
- Message *req;
- Context *finisher;
-public:
- C_MDC_XlockRequest(MDCache *mdc,
- CDir *dir, string& dname,
- Message *req,
- Context *finisher) {
- this->mdc = mdc;
- this->dir = dir;
- this->dname = dname;
- this->req = req;
- this->finisher = finisher;
- }
-
- void finish(int r) {
- mdc->dentry_xlock_request_finish(r, dir, dname, req, finisher);
- }
-};
-
-void MDCache::dentry_xlock_request_finish(int r,
- CDir *dir, string& dname,
- Message *req,
- Context *finisher)
-{
- dout(10) << "dentry_xlock_request_finish r = " << r << endl;
- if (r == 1) { // 1 for xlock request success
- CDentry *dn = dir->lookup(dname);
- if (dn && dn->xlockedby == 0) {
- // success
- dn->xlockedby = req; // our request was the winner
- dout(10) << "xlock request success, now xlocked by req " << req << " dn " << *dn << endl;
-
- // remember!
- active_requests[req].foreign_xlocks.insert(dn);
- }
- }
-
- // retry request (or whatever)
- finisher->finish(0);
- delete finisher;
-}
-
-void MDCache::dentry_xlock_request(CDir *dir, string& dname, bool create,
- Message *req, Context *onfinish)
-{
- dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl;
- // send request
- int dauth = dir->dentry_authority(dname);
- MLock *m = new MLock(create ? LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, mds->get_nodeid());
- m->set_dn(dir->ino(), dname);
- mds->send_message_mds(m, dauth, MDS_PORT_CACHE);
-
- // add waiter
- dir->add_waiter(CDIR_WAIT_DNREQXLOCK, dname,
- new C_MDC_XlockRequest(this,
- dir, dname, req,
- onfinish));
-}
-
-
-
-
-void MDCache::handle_lock_dn(MLock *m)
-{
- assert(m->get_otype() == LOCK_OTYPE_DN);
-
- CInode *diri = get_inode(m->get_ino()); // may be null
- CDir *dir = 0;
- if (diri) dir = diri->dir; // may be null
- string dname = m->get_dn();
- int from = m->get_asker();
- CDentry *dn = 0;
-
- if (LOCK_AC_FOR_AUTH(m->get_action())) {
- // auth
-
- // normally we have it always
- if (diri && dir) {
- int dauth = dir->dentry_authority(dname);
- assert(dauth == mds->get_nodeid() || dir->is_proxy() || // mine or proxy,
- m->get_action() == LOCK_AC_REQXLOCKACK || // or we did a REQXLOCK and this is our ack/nak
- m->get_action() == LOCK_AC_REQXLOCKNAK);
-
- if (dir->is_proxy()) {
-
- assert(dauth >= 0);
-
- if (dauth == m->get_asker() &&
- (m->get_action() == LOCK_AC_REQXLOCK ||
- m->get_action() == LOCK_AC_REQXLOCKC)) {
- dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. dropping on floor (their import will have woken them up)" << endl;
- if (active_requests.count(m))
- request_finish(m);
- else
- delete m;
- return;
- }
-
- dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl;
-
- // forward
- if (active_requests.count(m)) {
- // xlock requests are requests, use request_* functions!
- assert(m->get_action() == LOCK_AC_REQXLOCK ||
- m->get_action() == LOCK_AC_REQXLOCKC);
- // forward as a request
- request_forward(m, dauth, MDS_PORT_CACHE);
- } else {
- // not an xlock req, or it is and we just didn't register the request yet
- // forward normally
- mds->send_message_mds(m, dauth, MDS_PORT_CACHE);
- }
- return;
- }
-
- dn = dir->lookup(dname);
- }
-
- // except with.. an xlock request?
- if (!dn) {
- assert(dir); // we should still have the dir, though! the requester has the dir open.
- switch (m->get_action()) {
-
- case LOCK_AC_LOCK:
- dout(7) << "handle_lock_dn xlock on " << dname << ", adding (null)" << endl;
- dn = dir->add_dentry(dname);
- break;
-
- case LOCK_AC_REQXLOCK:
- // send nak
- if (dir->state_test(CDIR_STATE_DELETED)) {
- dout(7) << "handle_lock_dn reqxlock on deleted dir " << *dir << ", nak" << endl;
- } else {
- dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << " dne, nak" << endl;
- }
- {
- MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid());
- reply->set_dn(dir->ino(), dname);
- reply->set_path(m->get_path());
- mds->send_message_mds(reply, m->get_asker(), MDS_PORT_CACHE);
- }
-
- // finish request (if we got that far)
- if (active_requests.count(m)) request_finish(m);
-
- delete m;
- return;
-
- case LOCK_AC_REQXLOCKC:
- dout(7) << "handle_lock_dn reqxlockc on " << dname << " in " << *dir << " dne (yet!)" << endl;
- break;
-
- default:
- assert(0);
- }
- }
- } else {
- // replica
- if (dir) dn = dir->lookup(dname);
- if (!dn) {
- dout(7) << "handle_lock_dn " << m << " don't have " << m->get_ino() << " dname " << dname << endl;
-
- if (m->get_action() == LOCK_AC_REQXLOCKACK ||
- m->get_action() == LOCK_AC_REQXLOCKNAK) {
- dout(7) << "handle_lock_dn got reqxlockack/nak, but don't have dn " << m->get_path() << ", discovering" << endl;
- //assert(0); // how can this happen? tell me now!
-
- vector<CDentry*> trace;
- filepath path = m->get_path();
- int r = path_traverse(path, trace, true,
- m, new C_MDS_RetryMessage(mds,m),
- MDS_TRAVERSE_DISCOVER);
- assert(r>0);
- return;
- }
-
- if (m->get_action() == LOCK_AC_LOCK) {
- if (0) { // not anymore
- dout(7) << "handle_lock_dn don't have " << m->get_path() << ", discovering" << endl;
-
- vector<CDentry*> trace;
- filepath path = m->get_path();
- int r = path_traverse(path, trace, true,
- m, new C_MDS_RetryMessage(mds,m),
- MDS_TRAVERSE_DISCOVER);
- assert(r>0);
- }
- if (1) {
- // NAK
- MLock *reply = new MLock(LOCK_AC_LOCKNAK, mds->get_nodeid());
- reply->set_dn(m->get_ino(), dname);
- mds->send_message_mds(reply, m->get_asker(), MDS_PORT_CACHE);
- }
- } else {
- dout(7) << "safely ignoring." << endl;
- delete m;
- }
- return;
- }
-
- assert(dn);
- }
-
- if (dn) {
- dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << *dn << endl;
- } else {
- dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << dname << " in " << *dir << endl;
- }
-
- switch (m->get_action()) {
- // -- replica --
- case LOCK_AC_LOCK:
- assert(dn->lockstate == DN_LOCK_SYNC ||
- dn->lockstate == DN_LOCK_UNPINNING ||
- dn->lockstate == DN_LOCK_XLOCK); // <-- bc the handle_lock_dn did the discover!
-
- if (dn->is_pinned()) {
- dn->lockstate = DN_LOCK_UNPINNING;
-
- // wait
- dout(7) << "dn pinned, waiting " << *dn << endl;
- dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED,
- dn->name,
- new C_MDS_RetryMessage(mds, m));
- return;
- } else {
- dn->lockstate = DN_LOCK_XLOCK;
- dn->xlockedby = 0;
-
- // ack now
- MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid());
- reply->set_dn(diri->ino(), dname);
- mds->send_message_mds(reply, from, MDS_PORT_CACHE);
- }
-
- // wake up waiters
- dir->finish_waiting(CDIR_WAIT_DNLOCK, dname); // ? will this happen on replica ?
- break;
-
- case LOCK_AC_SYNC:
- assert(dn->lockstate == DN_LOCK_XLOCK);
- dn->lockstate = DN_LOCK_SYNC;
- dn->xlockedby = 0;
-
- // null? hose it.
- if (dn->is_null()) {
- dout(7) << "hosing null (and now sync) dentry " << *dn << endl;
- dir->remove_dentry(dn);
- }
-
- // wake up waiters
- dir->finish_waiting(CDIR_WAIT_DNREAD, dname); // will this happen either? YES: if a rename lock backs out
- break;
-
- case LOCK_AC_REQXLOCKACK:
- case LOCK_AC_REQXLOCKNAK:
- {
- dout(10) << "handle_lock_dn got ack/nak on a reqxlock for " << *dn << endl;
- list<Context*> finished;
- dir->take_waiting(CDIR_WAIT_DNREQXLOCK, m->get_dn(), finished, 1); // TAKE ONE ONLY!
- finish_contexts(finished,
- (m->get_action() == LOCK_AC_REQXLOCKACK) ? 1:-1);
- }
- break;
-
-
- // -- auth --
- case LOCK_AC_LOCKACK:
- case LOCK_AC_LOCKNAK:
- assert(dn->gather_set.count(from) == 1);
- dn->gather_set.erase(from);
- if (dn->gather_set.size() == 0) {
- dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl;
- dn->lockstate = DN_LOCK_XLOCK;
- active_requests[dn->xlockedby].xlocks.insert(dn);
- dir->finish_waiting(CDIR_WAIT_DNLOCK, dname);
- }
- break;
-
-
- case LOCK_AC_REQXLOCKC:
- // make sure it's a _file_, if it exists.
- if (dn && dn->inode && dn->inode->is_dir()) {
- dout(7) << "handle_lock_dn failing, reqxlockc on dir " << *dn->inode << endl;
-
- // nak
- string path;
- dn->make_path(path);
-
- MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid());
- reply->set_dn(dir->ino(), dname);
- reply->set_path(path);
- mds->send_message_mds(reply, m->get_asker(), MDS_PORT_CACHE);
-
- // done
- if (active_requests.count(m))
- request_finish(m);
- else
- delete m;
- return;
- }
-
- case LOCK_AC_REQXLOCK:
- if (dn) {
- dout(7) << "handle_lock_dn reqxlock on " << *dn << endl;
- } else {
- dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << endl;
- }
-
-
- // start request?
- if (!active_requests.count(m)) {
- vector<CDentry*> trace;
- if (!request_start(m, dir->inode, trace))
- return; // waiting for pin
- }
-
- // try to xlock!
- if (!dn) {
- assert(m->get_action() == LOCK_AC_REQXLOCKC);
- dn = dir->add_dentry(dname);
- }
-
- if (dn->xlockedby != m) {
- if (!dentry_xlock_start(dn, m, dir->inode)) {
- // hose null dn if we're waiting on something
- if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn);
- return; // waiting for xlock
- }
- } else {
- // successfully xlocked! on behalf of requestor.
- string path;
- dn->make_path(path);
-
- dout(7) << "handle_lock_dn reqxlock success for " << m->get_asker() << " on " << *dn << ", acking" << endl;
-
- // ACK xlock request
- MLock *reply = new MLock(LOCK_AC_REQXLOCKACK, mds->get_nodeid());
- reply->set_dn(dir->ino(), dname);
- reply->set_path(path);
- mds->send_message_mds(reply, m->get_asker(), MDS_PORT_CACHE);
-
- // note: keep request around in memory (to hold the xlock/pins on behalf of requester)
- return;
- }
- break;
-
- case LOCK_AC_UNXLOCK:
- dout(7) << "handle_lock_dn unxlock on " << *dn << endl;
- {
- string dname = dn->name;
- Message *m = dn->xlockedby;
-
- // finish request
- request_finish(m); // this will drop the locks (and unpin paths!)
- return;
- }
- break;
-
- default:
- assert(0);
- }
-
- delete m;
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
/*
bool did_shutdown_exports;
friend class C_MDC_ShutdownCommit;
+ friend class Locker;
friend class Migrator;
friend class Renamer;
friend class MDBalancer;
void handle_cache_expire(MCacheExpire *m);
- // -- locks --
- // high level interface
- public:
- bool inode_hard_read_try(CInode *in, Context *con);
- bool inode_hard_read_start(CInode *in, MClientRequest *m);
- void inode_hard_read_finish(CInode *in);
- bool inode_hard_write_start(CInode *in, MClientRequest *m);
- void inode_hard_write_finish(CInode *in);
- bool inode_file_read_start(CInode *in, MClientRequest *m);
- void inode_file_read_finish(CInode *in);
- bool inode_file_write_start(CInode *in, MClientRequest *m);
- void inode_file_write_finish(CInode *in);
-
- void inode_hard_eval(CInode *in);
- void inode_file_eval(CInode *in);
-
- protected:
- void inode_hard_mode(CInode *in, int mode);
- void inode_file_mode(CInode *in, int mode);
-
- // low level triggers
- void inode_hard_sync(CInode *in);
- void inode_hard_lock(CInode *in);
- bool inode_file_sync(CInode *in);
- void inode_file_lock(CInode *in);
- void inode_file_mixed(CInode *in);
- void inode_file_loner(CInode *in);
-
- // messengers
- void handle_lock(MLock *m);
- void handle_lock_inode_hard(MLock *m);
- void handle_lock_inode_file(MLock *m);
-
- // -- file i/o --
- public:
- version_t issue_file_data_version(CInode *in);
- Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req);
- bool issue_caps(CInode *in);
-
- protected:
- void handle_client_file_caps(class MClientFileCaps *m);
-
- void request_inode_file_caps(CInode *in);
- void handle_inode_file_caps(class MInodeFileCaps *m);
-
- // dirs
- void handle_lock_dir(MLock *m);
-
- // dentry locks
- public:
- bool dentry_xlock_start(CDentry *dn,
- Message *m, CInode *ref);
- void dentry_xlock_finish(CDentry *dn, bool quiet=false);
- void handle_lock_dn(MLock *m);
- void dentry_xlock_request(CDir *dir, string& dname, bool create,
- Message *req, Context *onfinish);
- void dentry_xlock_request_finish(int r,
- CDir *dir, string& dname,
- Message *req,
- Context *finisher);
-
-
-
// == crap fns ==
public:
void dump() {
// we just read an event.
if (le->can_expire(mds) == true) {
// obsolete
- dout(7) << "trim obsolete " << *le << endl;
+ dout(7) << "trim obsolete: " << *le << endl;
delete le;
logger->inc("obs");
} else {
assert ((int)trimming.size() < g_conf.mds_log_max_trimming);
// trim!
- dout(7) << "trim trimming " << *le << endl;
+ dout(7) << "trim trimming: " << *le << endl;
trimming[le->_end_off] = le;
le->retire(mds, new C_MDL_Trimmed(this, le));
logger->inc("retire");
#include "MDS.h"
#include "Server.h"
+#include "Locker.h"
#include "MDCache.h"
#include "MDStore.h"
#include "MDLog.h"
server = new Server(this);
+ locker = new Locker(this, mdcache);
req_rate = 0;
case MDS_PORT_CACHE:
mdcache->dispatch(m);
break;
+ case MDS_PORT_LOCKER:
+ locker->dispatch(m);
+ break;
case MDS_PORT_MIGRATOR:
mdcache->migrator->dispatch(m);
#define MDS_PORT_MAIN 0
#define MDS_PORT_SERVER 1
#define MDS_PORT_CACHE 2
-#define MDS_PORT_STORE 3
-#define MDS_PORT_BALANCER 4
-#define MDS_PORT_MIGRATOR 5
-#define MDS_PORT_RENAMER 6
+#define MDS_PORT_LOCKER 3
+#define MDS_PORT_STORE 4
+#define MDS_PORT_BALANCER 5
+#define MDS_PORT_MIGRATOR 6
+#define MDS_PORT_RENAMER 7
#define MDS_PORT_ANCHORCLIENT 10
#define MDS_PORT_ANCHORMGR 11
class Filer;
class Server;
+class Locker;
class AnchorTable;
class AnchorClient;
class MDCache;
// sub systems
Server *server;
MDCache *mdcache;
+ Locker *locker;
MDStore *mdstore;
MDLog *mdlog;
MDBalancer *balancer;
#include "CDir.h"
#include "CDentry.h"
#include "Migrator.h"
+#include "Locker.h"
#include "MDBalancer.h"
#include "MDLog.h"
in->hardlock.gather_set.erase(mds->get_nodeid());
in->hardlock.gather_set.erase(oldauth);
if (in->hardlock.gather_set.empty())
- cache->inode_hard_eval(in);
+ mds->locker->inode_hard_eval(in);
}
// caps
in->filelock.gather_set.erase(mds->get_nodeid());
in->filelock.gather_set.erase(oldauth);
if (in->filelock.gather_set.empty()) // necessary but not suffient...
- cache->inode_file_eval(in);
+ mds->locker->inode_file_eval(in);
}
// other
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "MDS.h"
+#include "Server.h"
+#include "Locker.h"
+#include "MDCache.h"
+#include "MDLog.h"
+#include "Migrator.h"
+#include "MDBalancer.h"
+#include "Renamer.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MClientMount.h"
+#include "messages/MClientMountAck.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientReply.h"
+#include "messages/MHashReaddir.h"
+#include "messages/MHashReaddirReply.h"
+
+#include "messages/MLock.h"
+
+#include "messages/MInodeLink.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+
+#include "include/filepath.h"
+#include "common/Timer.h"
+#include "common/Logger.h"
+#include "common/LogType.h"
+
+#include <errno.h>
+#include <fcntl.h>
+
+#include <list>
+#include <iostream>
+using namespace std;
+
+
+/*
+ * Entry point for messages handed to the Server.  While the MDS is not
+ * active the message is queued for retry; otherwise it is routed by type to
+ * the matching handler.  An unrecognized type is logged and hits assert(0).
+ */
+void Server::dispatch(Message *m)
+{
+ // active?  if not, queue a retry of this same message for when the MDS is active.
+ if (!mds->is_active()) {
+ dout(3) << "not active yet, waiting" << endl;
+ mds->queue_waitfor_active(new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+
+ switch (m->get_type()) {
+ case MSG_CLIENT_MOUNT:
+ handle_client_mount((MClientMount*)m);
+ return;
+ case MSG_CLIENT_UNMOUNT:
+ handle_client_unmount(m);
+ return;
+ }
+
+ // client requests and hashed-readdir traffic
+ switch (m->get_type()) {
+ case MSG_CLIENT_REQUEST:
+ handle_client_request((MClientRequest*)m);
+ return;
+
+ case MSG_MDS_HASHREADDIR:
+ handle_hash_readdir((MHashReaddir*)m);
+ return;
+ case MSG_MDS_HASHREADDIRREPLY:
+ handle_hash_readdir_reply((MHashReaddirReply*)m);
+ return;
+
+ }
+ // no case matched: log the type and abort.
+ dout(1) << " main unknown message " << m->get_type() << endl;
+ assert(0);
+}
+
+
+
+
+/*
+ * Handle a client mount: record the client (numbered from the message source)
+ * in the client map, send back an MClientMountAck built from the current
+ * mdsmap and osdmap, and delete the request.
+ */
+void Server::handle_client_mount(MClientMount *m)
+{
+ int n = MSG_ADDR_NUM(m->get_source());
+ dout(3) << "mount by client" << n << endl;
+ mds->clientmap.add_mount(n, m->get_source_inst());
+
+ assert(whoami == 0); // mds0 mounts/unmounts
+
+ // ack
+ messenger->send_message(new MClientMountAck(m, mds->mdsmap, mds->osdmap),
+ m->get_source(), m->get_source_inst());
+ delete m;
+}
+/*
+ * Handle a client unmount: drop the client from the client map, start MDS
+ * shutdown once the mount set is empty, and acknowledge the client.
+ */
+void Server::handle_client_unmount(Message *m)
+{
+ int n = MSG_ADDR_NUM(m->get_source());
+ dout(3) << "unmount by client" << n << endl;
+
+ assert(whoami == 0); // mds0 mounts/unmounts
+
+ mds->clientmap.rem_mount(n);
+
+ if (mds->clientmap.get_mount_set().empty()) {
+ dout(3) << "all clients done, initiating shutdown" << endl;
+ mds->shutdown_start();
+ }
+
+ // ack by sending back to client (the request message itself is re-sent, so it is not deleted here)
+ entity_inst_t srcinst = m->get_source_inst(); // make a copy!
+ messenger->send_message(m, m->get_source(), srcinst);
+}
+
+
+
+/*******
+ * some generic stuff for finishing off requests
+ */
+
+/** C_MDS_CommitRequest
+ * Completion context that finishes a client request.  A nonzero r is stored
+ * as the reply's result.  If an event was supplied the request goes through
+ * Server::commit_request(); otherwise the reply is sent directly via
+ * Server::reply_request().
+ */
+
+class C_MDS_CommitRequest : public Context {
+ Server *server;
+ MClientRequest *req;
+ MClientReply *reply;
+ CInode *tracei; // inode to include a trace for
+ LogEvent *event; // optional event to journal before replying (0 if none)
+
+public:
+ C_MDS_CommitRequest(Server *server,
+ MClientRequest *req, MClientReply *reply, CInode *tracei,
+ LogEvent *event=0) {
+ this->server = server;
+ this->req = req;
+ this->tracei = tracei;
+ this->reply = reply;
+ this->event = event;
+ }
+ void finish(int r) {
+ if (r != 0) {
+ // failure. set failure code and reply.
+ reply->set_result(r);
+ }
+ if (event) {
+ server->commit_request(req, reply, tracei, event);
+ } else {
+ // reply.
+ server->reply_request(req, reply, tracei);
+ }
+ }
+};
+
+
+/*
+ * send generic response (just an error code):
+ * wraps r in a new MClientReply and hands it to the reply-taking overload.
+ */
+void Server::reply_request(MClientRequest *req, int r, CInode *tracei)
+{
+ reply_request(req, new MClientReply(req, r), tracei);
+}
+
+
+/*
+ * send given reply
+ * include a trace to tracei (only when tracei is non-null)
+ *
+ * After sending, the request is finished via mdcache->request_finish() and
+ * the stat_ops counter is incremented.
+ */
+void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei) {
+ dout(10) << "reply_request r=" << reply->get_result() << " " << *req << endl;
+
+ // include trace
+ if (tracei) {
+ reply->set_trace_dist( tracei, whoami );
+ }
+
+ // send reply to the client address recorded on the request
+ messenger->send_message(reply,
+ MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst());
+
+ // discard request
+ mdcache->request_finish(req);
+
+ // stupid stats crap (FIXME)
+ stat_ops++;
+}
+
+
+/*
+ * commit event(s) to the metadata journal, then reply.
+ * or, be sloppy and do it concurrently (see g_conf.mds_log_before_reply)
+ *
+ * Both events, when non-null, are submitted to the log first.  In safe mode
+ * (mds_log_before_reply && mds_log && event) tracei is pinned to the request
+ * and the reply is deferred until the log reports a sync; in every other case
+ * the reply is sent right away.
+ */
+void Server::commit_request(MClientRequest *req,
+ MClientReply *reply,
+ CInode *tracei,
+ LogEvent *event,
+ LogEvent *event2)
+{
+ // log
+ if (event) mdlog->submit_entry(event);
+ if (event2) mdlog->submit_entry(event2);
+
+ if (g_conf.mds_log_before_reply && g_conf.mds_log && event) {
+ // SAFE mode!
+
+ // pin inode so it doesn't go away!
+ if (tracei) mdcache->request_pin_inode(req, tracei);
+
+ // wait for log sync (the context is built without an event, so its finish() only replies)
+ mdlog->wait_for_sync(new C_MDS_CommitRequest(this, req, reply, tracei));
+ return;
+ }
+ else {
+ // just reply
+ reply_request(req, reply, tracei);
+ }
+}
+
+
+
+/***
+ * process a client request
+ *
+ * Records the client's address on the request, resolves the inode the
+ * request refers to (by ino for fh-style ops, otherwise by traversing the
+ * request path), registers the request with the cache and finally passes it
+ * to dispatch_request().  Traversal errors ENOENT/ENOTDIR/EISDIR are answered
+ * here and the request is deleted.
+ */
+
+void Server::handle_client_request(MClientRequest *req)
+{
+ dout(4) << "req " << *req << endl;
+
+ // note original client addr
+ if (req->get_source().is_client())
+ req->set_client_inst( req->get_source_inst() );
+
+ if (!mds->is_active()) {
+ dout(5) << " not active, discarding client request." << endl;
+ delete req;
+ return;
+ }
+
+ if (!mdcache->get_root()) {
+ dout(5) << "need to open root" << endl;
+ mdcache->open_root(new C_MDS_RetryMessage(mds, req));
+ return;
+ }
+
+ // okay, i want
+ CInode *ref = 0;
+ vector<CDentry*> trace; // might be blank, for fh guys
+
+ bool follow_trailing_symlink = false;
+
+ // operations on fh's or other non-files
+ switch (req->get_op()) {
+ /*
+ case MDS_OP_FSTAT:
+ reply = handle_client_fstat(req, cur);
+ break; ****** fiX ME ***
+ */
+
+ case MDS_OP_TRUNCATE:
+ if (!req->get_ino()) break; // can be called w/ either fh OR path
+ // (with an ino set, TRUNCATE falls through to the ino lookup below)
+ case MDS_OP_RELEASE:
+ case MDS_OP_FSYNC:
+ ref = mdcache->get_inode(req->get_ino()); // fixme someday no ino needed?
+
+ if (!ref) {
+ int next = whoami + 1;
+ if (next >= mds->mdsmap->get_num_mds()) next = 0;
+ dout(10) << "got request on ino we don't have, passing buck to " << next << endl;
+ mds->send_message_mds(req, next, MDS_PORT_SERVER);
+ return;
+ }
+ }
+
+ if (!ref) {
+ // we need to traverse a path
+ filepath refpath = req->get_filepath();
+
+ // ops on non-existing files --> directory paths
+ switch (req->get_op()) {
+ case MDS_OP_OPEN:
+ if (!(req->get_iarg() & O_CREAT)) break;
+ // (OPEN with O_CREAT falls through and is treated like the creating ops below)
+ case MDS_OP_MKNOD:
+ case MDS_OP_MKDIR:
+ case MDS_OP_SYMLINK:
+ case MDS_OP_LINK:
+ case MDS_OP_UNLINK: // also wrt parent dir, NOT the unlinked inode!!
+ case MDS_OP_RMDIR:
+ case MDS_OP_RENAME:
+ // remove last bit of path
+ refpath = refpath.prefixpath(refpath.depth()-1);
+ break;
+ }
+ dout(10) << "refpath = " << refpath << endl;
+
+ Context *ondelay = new C_MDS_RetryMessage(mds, req);
+
+ if (req->get_op() == MDS_OP_LSTAT) {
+ follow_trailing_symlink = false; // NOTE(review): already false above, so this flag is never true here - confirm intent
+ }
+
+ // do trace
+ int r = mdcache->path_traverse(refpath, trace, follow_trailing_symlink,
+ req, ondelay,
+ MDS_TRAVERSE_FORWARD,
+ 0,
+ true); // is MClientRequest
+
+ if (r > 0) return; // delayed
+ if (r == -ENOENT ||
+ r == -ENOTDIR ||
+ r == -EISDIR) {
+ // error!
+ dout(10) << " path traverse error " << r << ", replying" << endl;
+
+ // send error
+ messenger->send_message(new MClientReply(req, r),
+ MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst());
+
+ // <HACK>  a failed lookup whose last path component is ".hash" triggers hashing of the parent dir
+ if (refpath.last_bit() == ".hash" &&
+ refpath.depth() > 1) {
+ dout(1) << "got explicit hash command " << refpath << endl;
+ CDir *dir = trace[trace.size()-1]->get_inode()->dir; // NOTE(review): indexes trace.size()-1 without checking that trace is non-empty
+ if (!dir->is_hashed() &&
+ !dir->is_hashing() &&
+ dir->is_auth())
+ mdcache->migrator->hash_dir(dir);
+ }
+ // </HACK>
+
+
+ delete req;
+ return;
+ }
+
+ if (trace.size())
+ ref = trace[trace.size()-1]->inode;
+ else
+ ref = mdcache->get_root();
+ }
+
+ dout(10) << "ref is " << *ref << endl;
+
+ // rename doesn't pin src path (initially)
+ if (req->get_op() == MDS_OP_RENAME) trace.clear();
+
+ // register; a false return from request_start ends processing here
+ if (!mdcache->request_start(req, ref, trace))
+ return;
+
+ // process
+ dispatch_request(req, ref);
+}
+
+
+/*
+ * Route an already-registered request to its operation handler.  MLock
+ * messages are handed to the Locker; MClientRequests are switched on their
+ * op code.  Any other message type, or an unknown op, hits assert(0).
+ */
+void Server::dispatch_request(Message *m, CInode *ref)
+{
+ MClientRequest *req = 0;
+
+ // MLock or MClientRequest?
+ /* this is a little weird.
+ client requests and mlocks both initial dentry xlocks, path pins, etc.,
+ and thus both make use of the context C_MDS_RetryRequest.
+ */
+ switch (m->get_type()) {
+ case MSG_CLIENT_REQUEST:
+ req = (MClientRequest*)m;
+ break; // continue below!
+
+ case MSG_MDS_LOCK:
+ mds->locker->handle_lock_dn((MLock*)m);
+ return; // done
+
+ default:
+ assert(0); // shouldn't get here
+ }
+
+ // MClientRequest.
+
+ switch(req->get_op()) {
+
+ // files
+ case MDS_OP_OPEN:
+ if (req->get_iarg() & O_CREAT)
+ handle_client_openc(req, ref);
+ else
+ handle_client_open(req, ref);
+ break;
+ case MDS_OP_TRUNCATE:
+ handle_client_truncate(req, ref);
+ break;
+ /*
+ case MDS_OP_FSYNC:
+ handle_client_fsync(req, ref);
+ break;
+ */
+ /*
+ case MDS_OP_RELEASE:
+ handle_client_release(req, ref);
+ break;
+ */
+ // NOTE(review): with the two cases above commented out, FSYNC and RELEASE ops reach the default branch below.
+ // inodes
+ case MDS_OP_STAT:
+ case MDS_OP_LSTAT:
+ handle_client_stat(req, ref);
+ break;
+ case MDS_OP_UTIME:
+ handle_client_utime(req, ref);
+ break;
+ case MDS_OP_CHMOD:
+ handle_client_chmod(req, ref);
+ break;
+ case MDS_OP_CHOWN:
+ handle_client_chown(req, ref);
+ break;
+
+ // namespace
+ case MDS_OP_READDIR:
+ handle_client_readdir(req, ref);
+ break;
+ case MDS_OP_MKNOD:
+ handle_client_mknod(req, ref);
+ break;
+ case MDS_OP_LINK:
+ handle_client_link(req, ref);
+ break;
+ case MDS_OP_UNLINK:
+ handle_client_unlink(req, ref);
+ break;
+ case MDS_OP_RENAME:
+ handle_client_rename(req, ref);
+ break;
+ case MDS_OP_RMDIR:
+ handle_client_unlink(req, ref); // rmdir shares the unlink handler, which checks the op itself
+ break;
+ case MDS_OP_MKDIR:
+ handle_client_mkdir(req, ref);
+ break;
+ case MDS_OP_SYMLINK:
+ handle_client_symlink(req, ref);
+ break;
+
+
+
+ default:
+ dout(1) << " unknown client op " << req->get_op() << endl;
+ assert(0);
+ }
+
+ return;
+}
+
+
+
+
+// STAT
+/*
+ * Answer a stat/lstat.  When the requested mask includes size or mtime a
+ * file read is started on the inode via the Locker (returning early if that
+ * cannot proceed yet) and finished immediately; otherwise no locking is done.
+ */
+void Server::handle_client_stat(MClientRequest *req,
+ CInode *ref)
+{
+ // do I need file info?
+ int mask = req->get_iarg();
+ if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) {
+ // yes. do a full stat.
+ if (!mds->locker->inode_file_read_start(ref, req))
+ return; // syncing
+ mds->locker->inode_file_read_finish(ref);
+ } else {
+ // nope! easy peasy.
+ }
+
+ mds->balancer->hit_inode(ref, META_POP_IRD);
+
+ // reply (the trace to ref is attached by reply_request)
+ dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl;
+ MClientReply *reply = new MClientReply(req);
+
+ reply_request(req, reply, ref);
+}
+
+
+
+// INODE UPDATES
+
+// utime
+/*
+ * Set mtime and atime on cur from the request's two time arguments, under a
+ * file write started/finished through the Locker, then commit with an
+ * EInodeUpdate event.
+ */
+void Server::handle_client_utime(MClientRequest *req,
+ CInode *cur)
+{
+ // write
+ if (!mds->locker->inode_file_write_start(cur, req))
+ return; // fw or (wait for) sync
+
+ // do update
+ cur->inode.mtime = req->get_targ();
+ cur->inode.atime = req->get_targ2();
+ if (cur->is_auth()) // only marked dirty when this node is auth for the inode
+ cur->mark_dirty();
+
+ mds->locker->inode_file_write_finish(cur);
+
+ mds->balancer->hit_inode(cur, META_POP_IWR);
+
+ // init reply
+ MClientReply *reply = new MClientReply(req, 0);
+ reply->set_result(0);
+
+ // commit
+ commit_request(req, reply, cur,
+ new EInodeUpdate(cur));
+}
+
+
+
+// HARD
+
+// chmod
+/*
+ * Replace the permission bits (mask 04777) of cur's mode with those from the
+ * request, under a hard write started/finished through the Locker, then
+ * commit with an EInodeUpdate event.
+ */
+void Server::handle_client_chmod(MClientRequest *req,
+ CInode *cur)
+{
+ // write
+ if (!mds->locker->inode_hard_write_start(cur, req))
+ return; // fw or (wait for) lock
+
+
+ // check permissions (nothing is checked here yet)
+
+ // do update: clear the old 04777 bits, then OR in the new ones
+ int mode = req->get_iarg();
+ cur->inode.mode &= ~04777;
+ cur->inode.mode |= (mode & 04777);
+ cur->mark_dirty();
+
+ mds->locker->inode_hard_write_finish(cur);
+
+ mds->balancer->hit_inode(cur, META_POP_IWR);
+
+ // start reply
+ MClientReply *reply = new MClientReply(req, 0);
+
+ // commit
+ commit_request(req, reply, cur,
+ new EInodeUpdate(cur));
+}
+
+// chown
+/*
+ * Set uid and gid on cur from the request's two integer arguments, under a
+ * hard write started/finished through the Locker, then commit with an
+ * EInodeUpdate event.
+ */
+void Server::handle_client_chown(MClientRequest *req,
+ CInode *cur)
+{
+ // write
+ if (!mds->locker->inode_hard_write_start(cur, req))
+ return; // fw or (wait for) lock
+
+ // check permissions (nothing is checked here yet)
+
+ // do update
+ int uid = req->get_iarg();
+ int gid = req->get_iarg2();
+ cur->inode.uid = uid;
+ cur->inode.gid = gid;
+ cur->mark_dirty();
+
+ mds->locker->inode_hard_write_finish(cur);
+
+ mds->balancer->hit_inode(cur, META_POP_IWR);
+
+ // start reply
+ MClientReply *reply = new MClientReply(req, 0);
+
+ // commit
+ commit_request(req, reply, cur,
+ new EInodeUpdate(cur));
+}
+
+
+/*
+ * Make sure in->dir is open.  Returns true after get_or_open_dir().  Returns
+ * false when the dir is not open and the inode is a frozen dir; in that case
+ * a retry of req is queued on the parent dir's unfreeze waiters.
+ */
+bool Server::try_open_dir(CInode *in, MClientRequest *req)
+{
+ if (!in->dir && in->is_frozen_dir()) {
+ // doh!
+ dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl;
+ assert(in->get_parent_dir());
+ in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE,
+ new C_MDS_RetryRequest(mds, req, in));
+ return false;
+ }
+
+ in->get_or_open_dir(mds);
+ return true;
+}
+
+
+// DIRECTORY and NAMESPACE OPS
+
+// READDIR
+/*
+ * Append one name to dnls and one newly allocated InodeStat to inls for each
+ * entry of dir that has an inode.  In a hashed dir, entries whose dentry hash
+ * is not this node are skipped.  Returns the number of entries appended.
+ * The InodeStat objects are allocated with new and not freed here.
+ */
+int Server::encode_dir_contents(CDir *dir,
+ list<InodeStat*>& inls,
+ list<string>& dnls)
+{
+ int numfiles = 0;
+
+ for (CDir_map_t::iterator it = dir->begin();
+ it != dir->end();
+ it++) {
+ CDentry *dn = it->second;
+
+ // hashed?  then only report dentries that hash to this node
+ if (dir->is_hashed() &&
+ whoami != mds->hash_dentry( dir->ino(), it->first ))
+ continue;
+
+ // is dentry readable?
+ if (dn->is_xlocked()) {
+ // ***** FIXME *****
+ // ?  (xlocked dentries are only logged, not skipped)
+ dout(10) << "warning, returning xlocked dentry, we _may_ be fudging on POSIX consistency" << endl;
+ }
+
+ CInode *in = dn->inode;
+ if (!in) continue; // null dentry?
+
+ dout(12) << "including inode " << *in << endl;
+
+ // add this item
+ // note: InodeStat makes note of whether inode data is readable.
+ dnls.push_back( it->first );
+ inls.push_back( new InodeStat(in, whoami) );
+ numfiles++;
+ }
+ return numfiles;
+}
+
+
+/*
+ * note: this is pretty sloppy, but should work just fine i think...
+ *
+ * Serve a peer's request for this node's share of a hashed directory:
+ * encode the local entries of the dir named by m->get_ino() and send them
+ * back to the requester in an MHashReaddirReply.  If the dir contents are
+ * incomplete, a fetch is started and the message is retried afterwards.
+ * The request message is deleted once the reply has been sent.
+ */
+void Server::handle_hash_readdir(MHashReaddir *m)
+{
+ CInode *cur = mdcache->get_inode(m->get_ino());
+ assert(cur);
+
+ if (!cur->dir ||
+ !cur->dir->is_hashed()) {
+ assert(0);
+ dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl;
+ delete m;
+ return;
+ }
+ CDir *dir = cur->dir;
+ assert(dir);
+ assert(dir->is_hashed());
+
+ // complete?
+ if (!dir->is_complete()) {
+ dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl;
+ mds->mdstore->fetch_dir(dir, new C_MDS_RetryMessage(mds, m));
+ return;
+ }
+
+ // get content
+ list<InodeStat*> inls;
+ list<string> dnls;
+ int num = encode_dir_contents(dir, inls, dnls);
+ // sent it back!
+ // NOTE(review): the reply targets MDS_PORT_CACHE while the request arrives on MDS_PORT_SERVER - confirm the reply port.
+ messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num),
+ m->get_source(), m->get_source_inst(), MDS_PORT_CACHE);
+ delete m; // done with the request; it was previously leaked on this path
+}
+
+/*
+ * Collect one peer's share of a hashed readdir.  The reply's inode and name
+ * lists are spliced into dir->hashed_readdir[from].  Once there is an entry
+ * for every MDS in the map, the THISHASHEDREADDIR waiters are run, the
+ * gathered InodeStats are deleted, the dir is auth-unpinned and the
+ * NEXTHASHEDREADDIR waiters are moved to the MDS finished queue.
+ */
+void Server::handle_hash_readdir_reply(MHashReaddirReply *m)
+{
+ CInode *cur = mdcache->get_inode(m->get_ino());
+ assert(cur);
+
+ if (!cur->dir ||
+ !cur->dir->is_hashed()) {
+ assert(0);
+ dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl;
+ delete m;
+ return;
+ }
+ CDir *dir = cur->dir;
+ assert(dir);
+ assert(dir->is_hashed());
+
+ // move items to hashed_readdir gather (each peer may contribute only once)
+ int from = MSG_ADDR_NUM(m->get_source());
+ assert(dir->hashed_readdir.count(from) == 0);
+ dir->hashed_readdir[from].first.splice(dir->hashed_readdir[from].first.begin(),
+ m->get_in());
+ dir->hashed_readdir[from].second.splice(dir->hashed_readdir[from].second.begin(),
+ m->get_dn());
+ delete m;
+
+ // gather finished?
+ if (dir->hashed_readdir.size() < (unsigned)mds->mdsmap->get_num_mds()) {
+ dout(7) << "still waiting for more hashed readdir bits" << endl;
+ return;
+ }
+
+ dout(7) << "got last bit! finishing waiters" << endl;
+
+ // do these finishers. they'll copy the results.
+ list<Context*> finished;
+ dir->take_waiting(CDIR_WAIT_THISHASHEDREADDIR, finished);
+ finish_contexts(finished);
+
+ // now discard these results
+ for (map<int, pair< list<InodeStat*>, list<string> > >::iterator it = dir->hashed_readdir.begin();
+ it != dir->hashed_readdir.end();
+ it++) {
+ for (list<InodeStat*>::iterator ci = it->second.first.begin();
+ ci != it->second.first.end();
+ ci++)
+ delete *ci;
+ }
+ dir->hashed_readdir.clear();
+
+ // unpin dir (we're done!)
+ dir->auth_unpin();
+
+ // trigger any waiters for next hashed readdir cycle
+ dir->take_waiting(CDIR_WAIT_NEXTHASHEDREADDIR, mds->finished_queue);
+}
+
+/* Context that calls Server::finish_hash_readdir(req, dir) when finished; the result code r is ignored. */
+class C_MDS_HashReaddir : public Context {
+ Server *server;
+ MClientRequest *req;
+ CDir *dir;
+public:
+ C_MDS_HashReaddir(Server *server, MClientRequest *req, CDir *dir) {
+ this->server = server;
+ this->req = req;
+ this->dir = dir;
+ }
+ void finish(int r) {
+ server->finish_hash_readdir(req, dir);
+ }
+};
+/* Build the client's readdir reply for a hashed dir by copying the gathered items of every MDS rank, then reply. */
+void Server::finish_hash_readdir(MClientRequest *req, CDir *dir)
+{
+ dout(7) << "finish_hash_readdir on " << *dir << endl;
+
+ assert(dir->is_hashed());
+ assert(dir->hashed_readdir.size() == (unsigned)mds->mdsmap->get_num_mds());
+
+ // reply!
+ MClientReply *reply = new MClientReply(req);
+ reply->set_result(0);
+
+ for (int i=0; i<mds->mdsmap->get_num_mds(); i++) {
+ reply->copy_dir_items(dir->hashed_readdir[i].first,
+ dir->hashed_readdir[i].second);
+ }
+
+ // ok!
+ reply_request(req, reply, dir->inode);
+}
+
+/*
+ * Handle a client readdir on cur.  Replies -ENOTDIR for non-directories,
+ * forwards to the dir authority when this node is not auth, and waits or
+ * fetches when the dir is frozen, unhashing or incomplete.  A hashed dir
+ * starts a gather across all MDS ranks and replies from a waiter; a
+ * non-hashed dir is encoded (plus a "." entry) and answered directly.
+ */
+void Server::handle_client_readdir(MClientRequest *req,
+ CInode *cur)
+{
+ // it's a directory, right?
+ if (!cur->is_dir()) {
+ // not a dir
+ dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl;
+ reply_request(req, -ENOTDIR);
+ return;
+ }
+
+ // auth?
+ if (!cur->dir_is_auth()) {
+ int dirauth = cur->authority();
+ if (cur->dir)
+ dirauth = cur->dir->authority();
+ assert(dirauth >= 0);
+ assert(dirauth != whoami);
+
+ // forward to authority
+ dout(10) << " forwarding readdir to authority " << dirauth << endl;
+ mdcache->request_forward(req, dirauth);
+ return;
+ }
+
+ if (!try_open_dir(cur, req))
+ return;
+ assert(cur->dir->is_auth());
+
+ // unhashing? wait!
+ if (cur->dir->is_hashed() &&
+ cur->dir->is_unhashing()) {
+ dout(10) << "unhashing, waiting" << endl;
+ cur->dir->add_waiter(CDIR_WAIT_UNFREEZE,
+ new C_MDS_RetryRequest(mds, req, cur));
+ return;
+ }
+
+ // check perm (the hard read is started and finished immediately)
+ if (!mds->locker->inode_hard_read_start(cur,req))
+ return;
+ mds->locker->inode_hard_read_finish(cur);
+
+ CDir *dir = cur->dir;
+ assert(dir);
+
+ if (!dir->is_complete()) {
+ // fetch
+ dout(10) << " incomplete dir contents for readdir on " << *cur->dir << ", fetching" << endl;
+ mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, cur));
+ return;
+ }
+
+ if (dir->is_hashed()) {
+ // HASHED
+ dout(7) << "hashed dir" << endl;
+ if (!dir->can_auth_pin()) {
+ dout(7) << "can't auth_pin dir " << *dir << " waiting" << endl;
+ dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur));
+ return;
+ }
+
+ if (!dir->hashed_readdir.empty()) {
+ dout(7) << "another readdir gather in progres, waiting" << endl;
+ dir->add_waiter(CDIR_WAIT_NEXTHASHEDREADDIR, new C_MDS_RetryRequest(mds, req, cur));
+ return;
+ }
+
+ // start new readdir gather
+ dout(7) << "staring new hashed readdir gather" << endl;
+
+ // pin auth for process!  (unpinned in handle_hash_readdir_reply once the gather completes)
+ dir->auth_pin();
+
+ // get local bits
+ encode_dir_contents(cur->dir,
+ dir->hashed_readdir[whoami].first,
+ dir->hashed_readdir[whoami].second);
+
+ // request other bits
+ for (int i=0; i<mds->mdsmap->get_num_mds(); i++) {
+ if (i == whoami) continue;
+ mds->send_message_mds(new MHashReaddir(dir->ino()), i, MDS_PORT_SERVER);
+ }
+
+ // wait
+ dir->add_waiter(CDIR_WAIT_THISHASHEDREADDIR,
+ new C_MDS_HashReaddir(this, req, dir));
+ } else {
+ // NON-HASHED
+ // build dir contents
+ list<InodeStat*> inls;
+ list<string> dnls;
+ int numfiles = encode_dir_contents(cur->dir, inls, dnls);
+
+ // . too
+ dnls.push_back(".");
+ inls.push_back(new InodeStat(cur, whoami));
+ ++numfiles;
+
+ // yay, reply
+ MClientReply *reply = new MClientReply(req);
+ reply->take_dir_items(inls, dnls, numfiles);
+
+ dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl;
+ reply->set_result(0);
+
+ //balancer->hit_dir(cur->dir);
+
+ // reply
+ reply_request(req, reply, cur);
+ }
+}
+
+
+// MKNOD
+/*
+ * Create a regular file: mknod() makes and links the inode (a null return
+ * means the request was already answered, forwarded or queued), then the mode
+ * is taken from the request with its type bits forced to INODE_MODE_FILE.
+ */
+void Server::handle_client_mknod(MClientRequest *req, CInode *ref)
+{
+ // make dentry and inode, link.
+ CInode *newi = mknod(req, ref);
+ if (!newi) return;
+
+ // it's a file!
+ newi->inode.mode = req->get_iarg();
+ newi->inode.mode &= ~INODE_TYPE_MASK;
+ newi->inode.mode |= INODE_MODE_FILE;
+
+ mds->balancer->hit_inode(newi, META_POP_IWR);
+
+ // commit
+ commit_request(req, new MClientReply(req, 0), ref,
+ new EInodeUpdate(newi)); // FIXME this is the wrong message
+}
+
+// mknod(): used by handle_client_mkdir, handle_client_mknod, which are mostly identical.
+/*
+ * Creates a new inode named by the last component of the request path inside
+ * directory inode diri and links it to a dentry there.
+ *
+ * Returns the new inode, or the existing inode when the name already exists
+ * and okexist is set.  Returns 0 when the request has been dealt with in some
+ * other way: an error reply was sent (-ENOTDIR, -EEXIST), the request was
+ * forwarded to another MDS, or a retry was queued behind a waiter or fetch.
+ */
+CInode *Server::mknod(MClientRequest *req, CInode *diri, bool okexist)
+{
+ dout(10) << "mknod " << req->get_filepath() << " in " << *diri << endl;
+
+ // get containing directory (without last bit)
+ filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1);
+ string name = req->get_filepath().last_bit();
+
+ // did we get to parent?
+ dout(10) << "dirpath is " << dirpath << " depth " << dirpath.depth() << endl;
+
+ // make sure parent is a dir?
+ if (!diri->is_dir()) {
+ dout(7) << "not a dir" << endl;
+ reply_request(req, -ENOTDIR);
+ return 0;
+ }
+
+ // am i not open, not auth?
+ if (!diri->dir && !diri->is_auth()) {
+ int dirauth = diri->authority();
+ dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl;
+ mdcache->request_forward(req, dirauth);
+ return 0;
+ }
+
+ if (!try_open_dir(diri, req)) return 0;
+ CDir *dir = diri->dir;
+
+ // make sure it's my dentry
+ int dnauth = dir->dentry_authority(name);
+ if (dnauth != whoami) {
+ // fw
+
+ dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir << " dn " << name << " not mine, fw to " << dnauth << endl;
+ mdcache->request_forward(req, dnauth);
+ return 0;
+ }
+ // ok, done passing buck.
+
+
+ // frozen?
+ if (dir->is_frozen()) {
+ dout(7) << "dir is frozen " << *dir << endl;
+ dir->add_waiter(CDIR_WAIT_UNFREEZE,
+ new C_MDS_RetryRequest(mds, req, diri));
+ return 0;
+ }
+
+ // make sure name doesn't already exist
+ CDentry *dn = dir->lookup(name);
+ if (dn) {
+ if (!dn->can_read(req)) {
+ dout(10) << "waiting on (existing!) dentry " << *dn << endl;
+ dir->add_waiter(CDIR_WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, diri));
+ return 0;
+ }
+
+ if (!dn->is_null()) {
+ // name already exists
+ if (okexist) {
+ dout(10) << "dentry " << name << " exists in " << *dir << endl;
+ return dn->inode;
+ } else {
+ dout(10) << "dentry " << name << " exists in " << *dir << endl;
+ reply_request(req, -EEXIST);
+ return 0;
+ }
+ }
+ }
+
+ // make sure dir is complete
+ if (!dir->is_complete()) {
+ dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl;
+ mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, diri));
+ return 0;
+ }
+
+ // create!  owner comes from the caller, all three timestamps are set to now
+ CInode *newi = mdcache->create_inode();
+ newi->inode.uid = req->get_caller_uid();
+ newi->inode.gid = req->get_caller_gid();
+ newi->inode.ctime = newi->inode.mtime = newi->inode.atime = g_clock.gettime(); // now
+
+ // link (reuse an existing null dentry if there is one)
+ if (!dn)
+ dn = dir->add_dentry(name, newi);
+ else
+ dir->link_inode(dn, newi);
+
+ // bump modify pop
+ mds->balancer->hit_dir(dir, META_POP_DWR);
+
+ // mark dirty
+ dn->mark_dirty();
+ newi->mark_dirty();
+
+ // journal it
+ mdlog->submit_entry(new EDirUpdate(dir)); // FIXME WRONG EVENT
+
+ // ok!
+ return newi;
+}
+
+
+// LINK
+/*
+ * Context used as the completion of the link-target path traversal: the
+ * traversal fills the public trace vector, and finish() passes the result
+ * code, request, ref and trace on to Server::handle_client_link_2().
+ */
+class C_MDS_LinkTraverse : public Context {
+ Server *server;
+ MClientRequest *req;
+ CInode *ref;
+public:
+ vector<CDentry*> trace; // filled in by the traversal started in handle_client_link
+ C_MDS_LinkTraverse(Server *server, MClientRequest *req, CInode *ref) {
+ this->server = server;
+ this->req = req;
+ this->ref = ref;
+ }
+ void finish(int r) {
+ server->handle_client_link_2(r, req, ref, trace);
+ }
+};
+/*
+ * First stage of a hard link.  ref is the directory that will hold the new
+ * name (the last component of the request path).  Validates the directory,
+ * forwards to the dentry authority if that is not this node, rejects an
+ * existing name with -EEXIST, pins the dir to the request and then starts a
+ * traversal of the link target (the request's string argument); the
+ * traversal completes in handle_client_link_2().
+ */
+void Server::handle_client_link(MClientRequest *req, CInode *ref)
+{
+ // figure out name
+ string dname = req->get_filepath().last_bit();
+ dout(7) << "dname is " << dname << endl;
+
+ // make sure parent is a dir?
+ if (!ref->is_dir()) {
+ dout(7) << "not a dir " << *ref << endl;
+ reply_request(req, -EINVAL);
+ return;
+ }
+
+ // am i not open, not auth?
+ if (!ref->dir && !ref->is_auth()) {
+ int dirauth = ref->authority();
+ dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl;
+ mdcache->request_forward(req, dirauth);
+ return;
+ }
+
+ if (!try_open_dir(ref, req)) return;
+ CDir *dir = ref->dir;
+ dout(7) << "handle_client_link dir is " << *dir << endl;
+
+ // make sure it's my dentry
+ int dauth = dir->dentry_authority(dname);
+ if (dauth != whoami) {
+ // fw
+ dout(7) << "link on " << req->get_path() << ", dn " << dname << " in " << *dir << " not mine, fw to " << dauth << endl;
+ mdcache->request_forward(req, dauth);
+ return;
+ }
+ // ok, done passing buck.
+
+
+ // exists?  (a non-null dentry, or one xlocked by another request, counts as existing)
+ CDentry *dn = dir->lookup(dname);
+ if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) {
+ dout(7) << "handle_client_link dn exists " << *dn << endl;
+ reply_request(req, -EEXIST);
+ return;
+ }
+
+ // keep src dir in memory
+ mdcache->request_pin_dir(req, dir);
+
+ // discover link target
+ filepath target = req->get_sarg();
+
+ dout(7) << "handle_client_link discovering target " << target << endl;
+
+ C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref);
+ Context *ondelay = new C_MDS_RetryRequest(mds, req, ref);
+
+ mdcache->path_traverse(target, onfinish->trace, false,
+ req, ondelay,
+ MDS_TRAVERSE_DISCOVER, //XLOCK,
+ onfinish);
+}
+
+/* Context queued while waiting on a remote inode's link count bump: r > 0 finishes the link, r < 0 replies with the error r, r == 0 is asserted against. */
+class C_MDS_RemoteLink : public Context {
+ Server *server;
+ MClientRequest *req;
+ CInode *ref;
+ CDentry *dn;
+ CInode *targeti;
+public:
+ C_MDS_RemoteLink(Server *server, MClientRequest *req, CInode *ref, CDentry *dn, CInode *targeti) {
+ this->server = server;
+ this->req = req;
+ this->ref = ref;
+ this->dn = dn;
+ this->targeti = targeti;
+ }
+ void finish(int r) {
+ if (r > 0) { // success
+ // yay
+ server->handle_client_link_finish(req, ref, dn, targeti);
+ }
+ else if (r == 0) {
+ // huh? retry!  (the retry below only runs if assert is compiled out)
+ assert(0);
+ server->dispatch_request(req, ref);
+ } else {
+ // link failed
+ server->reply_request(req, r);
+ }
+ }
+};
+/*
+ * Second stage of a hard link, run once the target traversal has finished
+ * with result r and the given trace.  A negative r is returned to the client.
+ * Otherwise the target inode (last trace element, or the root for an empty
+ * trace) must not be a directory; it is pinned, the new dentry is re-checked
+ * and xlocked, and the target's link count is bumped locally (after anchoring
+ * if needed) or via an MInodeLink message to the target's authority.
+ */
+void Server::handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector<CDentry*>& trace)
+{
+ // target dne?
+ if (r < 0) {
+ dout(7) << "target " << req->get_sarg() << " dne" << endl;
+ reply_request(req, r);
+ return;
+ }
+ assert(r == 0);
+
+ CInode *targeti = mdcache->get_root();
+ if (trace.size()) targeti = trace[trace.size()-1]->inode;
+ assert(targeti);
+
+ // dir?
+ dout(7) << "target is " << *targeti << endl;
+ if (targeti->is_dir()) {
+ dout(7) << "target is a dir, failing" << endl;
+ reply_request(req, -EINVAL);
+ return;
+ }
+
+ // keep target inode in memory
+ mdcache->request_pin_inode(req, targeti);
+
+ dout(7) << "dir is " << *ref << endl;
+
+ // xlock the dentry
+ CDir *dir = ref->dir;
+ assert(dir);
+
+ string dname = req->get_filepath().last_bit();
+ int dauth = dir->dentry_authority(dname);
+ if (whoami != dauth) {
+ // ugh, exported out from under us
+ dout(7) << "ugh, forwarded out from under us, dentry auth is " << dauth << endl;
+ mdcache->request_forward(req, dauth);
+ return;
+ }
+
+ CDentry *dn = dir->lookup(dname);
+ if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) {
+ dout(7) << "handle_client_link dn exists " << *dn << endl;
+ reply_request(req, -EEXIST);
+ return;
+ }
+
+ if (!dn) dn = dir->add_dentry(dname);
+
+ if (!dn->is_xlockedbyme(req)) {
+ if (!mds->locker->dentry_xlock_start(dn, req, ref)) {
+ if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn); // drop the dentry again if it is still clean, null and sync
+ return;
+ }
+ }
+
+
+ // ok xlocked!
+ if (targeti->is_auth()) {
+ // mine
+ if (targeti->is_anchored()) {
+ dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl;
+ } else {
+ assert(targeti->inode.nlink == 1);
+ dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl;
+
+ mdcache->anchor_inode(targeti,
+ new C_MDS_RetryRequest(mds, req, ref));
+ return;
+ }
+
+ // ok, inc link!
+ targeti->inode.nlink++;
+ dout(7) << "nlink++, now " << targeti->inode.nlink << " on " << *targeti << endl;
+ targeti->mark_dirty();
+
+ } else {
+ // remote: send nlink++ request, wait
+ dout(7) << "target is remote, sending InodeLink" << endl;
+ mds->send_message_mds(new MInodeLink(targeti->ino(), whoami), targeti->authority(), MDS_PORT_CACHE);
+
+ // wait
+ targeti->add_waiter(CINODE_WAIT_LINK,
+ new C_MDS_RemoteLink(this, req, ref, dn, targeti));
+ return;
+ }
+
+ handle_client_link_finish(req, ref, dn, targeti);
+}
+/* Final stage of a hard link: point dn at targeti as a remote link, mark dn dirty, and reply with result 0 (no event is journaled). */
+void Server::handle_client_link_finish(MClientRequest *req, CInode *ref,
+ CDentry *dn, CInode *targeti)
+{
+ // create remote link
+ dn->dir->link_inode(dn, targeti->ino());
+ dn->link_remote( targeti ); // since we have it
+ dn->mark_dirty();
+
+ mds->balancer->hit_dir(dn->dir, META_POP_DWR);
+
+ // done!
+ commit_request(req, new MClientReply(req, 0), ref,
+ 0); // FIXME i should log something
+}
+
+
+// UNLINK
+
+void Server::handle_client_unlink(MClientRequest *req,
+ CInode *diri)
+{
+ // rmdir or unlink
+ bool rmdir = false;
+ if (req->get_op() == MDS_OP_RMDIR) rmdir = true;
+
+ // find it
+ if (req->get_filepath().depth() == 0) {
+ dout(7) << "can't rmdir root" << endl;
+ reply_request(req, -EINVAL);
+ return;
+ }
+ string name = req->get_filepath().last_bit();
+
+ // make sure parent is a dir?
+ if (!diri->is_dir()) {
+ dout(7) << "not a dir" << endl;
+ reply_request(req, -ENOTDIR);
+ return;
+ }
+
+ // am i not open, not auth?
+ if (!diri->dir && !diri->is_auth()) {
+ int dirauth = diri->authority();
+ dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl;
+ mdcache->request_forward(req, dirauth);
+ return;
+ }
+
+ if (!try_open_dir(diri, req)) return;
+ CDir *dir = diri->dir;
+ int dnauth = dir->dentry_authority(name);
+
+ // does it exist?
+ CDentry *dn = dir->lookup(name);
+ if (!dn) {
+ if (dnauth == whoami) {
+ dout(7) << "handle_client_rmdir/unlink dne " << name << " in " << *dir << endl;
+ reply_request(req, -ENOENT);
+ } else {
+ // send to authority!
+ dout(7) << "handle_client_rmdir/unlink fw, don't have " << name << " in " << *dir << endl;
+ mdcache->request_forward(req, dnauth);
+ }
+ return;
+ }
+
+ // have it. locked?
+ if (!dn->can_read(req)) {
+ dout(10) << " waiting on " << *dn << endl;
+ dir->add_waiter(CDIR_WAIT_DNREAD,
+ name,
+ new C_MDS_RetryRequest(mds, req, diri));
+ return;
+ }
+
+ // null?
+ if (dn->is_null()) {
+ dout(10) << "unlink on null dn " << *dn << endl;
+ reply_request(req, -ENOENT);
+ return;
+ }
+
+ // ok!
+ CInode *in = dn->inode;
+ assert(in);
+ if (rmdir) {
+ dout(7) << "handle_client_rmdir on dir " << *in << endl;
+ } else {
+ dout(7) << "handle_client_unlink on non-dir " << *in << endl;
+ }
+
+ // dir stuff
+ if (in->is_dir()) {
+ if (rmdir) {
+ // rmdir
+
+ // open dir?
+ if (in->is_auth() && !in->dir) {
+ if (!try_open_dir(in, req)) return;
+ }
+
+ // not dir auth? (or not open, which implies the same!)
+ if (!in->dir) {
+ dout(7) << "handle_client_rmdir dir not open for " << *in << ", sending to dn auth " << dnauth << endl;
+ mdcache->request_forward(req, dnauth);
+ return;
+ }
+ if (!in->dir->is_auth()) {
+ int dirauth = in->dir->authority();
+ dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dnauth << endl;
+ mdcache->request_forward(req, dirauth);
+ return;
+ }
+
+ assert(in->dir);
+ assert(in->dir->is_auth());
+
+ // dir size check on dir auth (but not necessarily dentry auth)?
+
+ // should be empty
+ if (in->dir->get_size() == 0 && !in->dir->is_complete()) {
+ dout(7) << "handle_client_rmdir on dir " << *in->dir << ", empty but not complete, fetching" << endl;
+ mds->mdstore->fetch_dir(in->dir,
+ new C_MDS_RetryRequest(mds, req, diri));
+ return;
+ }
+ if (in->dir->get_size() > 0) {
+ dout(7) << "handle_client_rmdir on dir " << *in->dir << ", not empty" << endl;
+ reply_request(req, -ENOTEMPTY);
+ return;
+ }
+
+ dout(7) << "handle_client_rmdir dir is empty!" << endl;
+
+ // export sanity check
+ if (!in->is_auth()) {
+ // i should be exporting this now/soon, since the dir is empty.
+ dout(7) << "handle_client_rmdir dir is auth, but not inode." << endl;
+ if (!in->dir->is_freezing() && in->dir->is_frozen()) {
+ assert(in->dir->is_import());
+ mdcache->migrator->export_empty_import(in->dir);
+ } else {
+ dout(7) << "apparently already exporting" << endl;
+ }
+ in->dir->add_waiter(CDIR_WAIT_UNFREEZE,
+ new C_MDS_RetryRequest(mds, req, diri));
+ return;
+ }
+
+ } else {
+ // unlink
+ dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << endl;
+ reply_request(req, -EISDIR);
+ return;
+ }
+ } else {
+ if (rmdir) {
+ // unlink
+ dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << endl;
+ reply_request(req, -ENOTDIR);
+ return;
+ }
+ }
+
+ // am i dentry auth?
+ if (dnauth != whoami) {
+ // not auth; forward!
+ dout(7) << "handle_client_unlink not auth for " << *dir << " dn " << dn->name << ", fwd to " << dnauth << endl;
+ mdcache->request_forward(req, dnauth);
+ return;
+ }
+
+ dout(7) << "handle_client_unlink/rmdir on " << *in << endl;
+
+ // xlock dentry
+ if (!mds->locker->dentry_xlock_start(dn, req, diri))
+ return;
+
+ // is this a remote link?
+ if (dn->is_remote() && !dn->inode) {
+ CInode *in = mdcache->get_inode(dn->get_remote_ino());
+ if (in) {
+ dn->link_remote(in);
+ } else {
+ // open inode
+ dout(7) << "opening target inode first, ino is " << dn->get_remote_ino() << endl;
+ mdcache->open_remote_ino(dn->get_remote_ino(), req,
+ new C_MDS_RetryRequest(mds, req, diri));
+ return;
+ }
+ }
+
+
+ mds->balancer->hit_dir(dn->dir, META_POP_DWR);
+
+ // it's locked, unlink!
+ MClientReply *reply = new MClientReply(req,0);
+ mdcache->dentry_unlink(dn,
+ new C_MDS_CommitRequest(this, req, reply, diri,
+ new EInodeUpdate(diri))); // FIXME WRONG EVENT
+ return;
+}
+
+
+
+
+
+
+// RENAME
+
+/*
+ * Completion context for the destination-path traversal that
+ * Server::handle_client_rename() starts.  It remembers the already
+ * resolved source side of the rename and, when finished, passes it on
+ * (together with the trace and the result code) to
+ * Server::handle_client_rename_2().
+ */
+class C_MDS_RenameTraverseDst : public Context {
+  Server *server;
+  MClientRequest *req;
+  CInode *ref;
+  CInode *srcdiri;
+  CDir *srcdir;
+  CDentry *srcdn;
+  filepath destpath;
+public:
+  // public so the caller can hand it to path_traverse() as the trace to fill
+  vector<CDentry*> trace;
+
+  C_MDS_RenameTraverseDst(Server *server,
+                          MClientRequest *req,
+                          CInode *ref,
+                          CInode *srcdiri,
+                          CDir *srcdir,
+                          CDentry *srcdn,
+                          filepath& destpath) :
+    server(server),
+    req(req),
+    ref(ref),
+    srcdiri(srcdiri),
+    srcdir(srcdir),
+    srcdn(srcdn),
+    destpath(destpath) {
+  }
+
+  // r is the traversal result; it is forwarded untouched.
+  void finish(int r) {
+    server->handle_client_rename_2(req, ref, srcdiri, srcdir, srcdn, destpath, trace, r);
+  }
+};
+
+
+/*
+
+ weirdness with rename:
+ - ref inode is what was originally srcdiri, but that may change by the time
+ the rename actually happens. for all practical purposes, ref is useless except
+ for C_MDS_RetryRequest
+
+ */
+/*
+ * handle_client_rename: first stage of a client rename.
+ *
+ *  req - the rename request.  get_path()/get_filepath() give the source
+ *        path, get_sarg() gives the destination path.
+ *  ref - inode the request was dispatched on.  only handed to
+ *        C_MDS_RetryRequest and passed along to stage 2; the source
+ *        directory itself is re-traversed below.
+ *
+ * rejects obviously invalid renames, re-traverses to the source directory,
+ * forwards the request if this node is not the authority for the source
+ * dentry, pins the source inode, and finally starts a traversal of the
+ * destination path whose completion calls handle_client_rename_2().
+ *
+ * replies: -EINVAL for root / self / into-self renames and for a non-dir
+ * source parent, -ENOENT if the source dentry does not exist, or the
+ * (negative) traversal error.
+ */
+void Server::handle_client_rename(MClientRequest *req,
+                                  CInode *ref)
+{
+  dout(7) << "handle_client_rename on " << *req << endl;
+
+  // sanity checks
+  if (req->get_filepath().depth() == 0) {
+    dout(7) << "can't rename root" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+  // mv a/b a/b/c -- meaningless
+  // (dest begins with the entire source path, directly followed by a '/')
+  if (req->get_sarg().compare( 0, req->get_path().length(), req->get_path()) == 0 &&
+      req->get_sarg().c_str()[ req->get_path().length() ] == '/') {
+    dout(7) << "can't rename to underneath myself" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // mv blah blah -- also meaningless
+  if (req->get_sarg() == req->get_path()) {
+    dout(7) << "can't rename something to itself (or into itself)" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // traverse to source
+  /*
+    this is abnormal, just for rename.  since we don't pin source path
+    (because we don't want to screw up the lock ordering) the ref inode
+    (normally/initially srcdiri) may move, and this may fail.
+    -> so, re-traverse path.  and make sure we request_finish in the case of a forward!
+  */
+  filepath refpath = req->get_filepath();
+  string srcname = refpath.last_bit();
+  refpath = refpath.prefixpath(refpath.depth()-1);   // parent dir of the source
+
+  dout(7) << "handle_client_rename src traversing to srcdir " << refpath << endl;
+  vector<CDentry*> trace;
+  int r = mdcache->path_traverse(refpath, trace, true,
+                                 req, new C_MDS_RetryRequest(mds, req, ref),
+                                 MDS_TRAVERSE_FORWARD);
+  if (r == 2) {
+    dout(7) << "path traverse forwarded, ending request, doing manual request_cleanup" << endl;
+    dout(7) << "(pseudo) request_forward to 9999 req " << *req << endl;
+    mdcache->request_cleanup(req);  // not _finish (deletes) or _forward (path_traverse did that)
+    return;
+  }
+  if (r > 0) return;   // any other positive result: nothing more to do in this pass
+  if (r < 0) {   // dne or something.  got renamed out from under us, probably!
+    dout(7) << "traverse r=" << r << endl;
+    reply_request(req, r);
+    return;
+  }
+
+  // an empty trace means the source lives directly under the root
+  CInode *srcdiri;
+  if (trace.size())
+    srcdiri = trace[trace.size()-1]->inode;
+  else
+    srcdiri = mdcache->get_root();
+
+  dout(7) << "handle_client_rename srcdiri is " << *srcdiri << endl;
+
+  dout(7) << "handle_client_rename srcname is " << srcname << endl;
+
+  // make sure parent is a dir?
+  if (!srcdiri->is_dir()) {
+    dout(7) << "srcdiri not a dir " << *srcdiri << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // am i not open, not auth?
+  if (!srcdiri->dir && !srcdiri->is_auth()) {
+    int dirauth = srcdiri->authority();
+    dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl;
+    mdcache->request_forward(req, dirauth);
+    return;
+  }
+
+  if (!try_open_dir(srcdiri, req)) return;
+  CDir *srcdir = srcdiri->dir;
+  dout(7) << "handle_client_rename srcdir is " << *srcdir << endl;
+
+  // make sure it's my dentry
+  int srcauth = srcdir->dentry_authority(srcname);
+  if (srcauth != whoami) {
+    // fw
+    dout(7) << "rename on " << req->get_path() << ", dentry " << *srcdir << " dn " << srcname << " not mine, fw to " << srcauth << endl;
+    mdcache->request_forward(req, srcauth);
+    return;
+  }
+  // ok, done passing buck.
+
+  // src dentry
+  CDentry *srcdn = srcdir->lookup(srcname);
+
+  // xlocked?
+  if (srcdn && !srcdn->can_read(req)) {
+    dout(10) << " waiting on " << *srcdn << endl;
+    srcdir->add_waiter(CDIR_WAIT_DNREAD,
+                       srcname,
+                       new C_MDS_RetryRequest(mds, req, srcdiri));
+    return;
+  }
+
+  // source does not exist: we either hold the dentry with no inode linked,
+  // or the dir is complete and has no such dentry.
+  if ((srcdn && !srcdn->inode) ||
+      (!srcdn && srcdir->is_complete())) {
+    dout(10) << "handle_client_rename src dne " << endl;
+    reply_request(req, -ENOENT);   // a missing source is ENOENT, not EEXIST
+    return;
+  }
+
+  // dentry unknown but the dir is only partially loaded: fetch it and retry
+  if (!srcdn && !srcdir->is_complete()) {
+    dout(10) << "readding incomplete dir" << endl;
+    mds->mdstore->fetch_dir(srcdir,
+                            new C_MDS_RetryRequest(mds, req, srcdiri));
+    return;
+  }
+  assert(srcdn && srcdn->inode);
+
+
+  dout(10) << "handle_client_rename srcdn is " << *srcdn << endl;
+  dout(10) << "handle_client_rename srci is " << *srcdn->inode << endl;
+
+  // pin src in cache (so it won't expire)
+  mdcache->request_pin_inode(req, srcdn->inode);
+
+  // find the destination, normalize
+  // discover, etc. on the way... just get it on the local node.
+  filepath destpath = req->get_sarg();
+
+  C_MDS_RenameTraverseDst *onfinish = new C_MDS_RenameTraverseDst(this, req, ref, srcdiri, srcdir, srcdn, destpath);
+  Context *ondelay = new C_MDS_RetryRequest(mds, req, ref);
+
+  /*
+   * use DISCOVERXLOCK mode:
+   *   the dest may not exist, and may be xlocked from a remote host
+   *   we want to succeed if we find the xlocked dentry
+   * ??
+   */
+  mdcache->path_traverse(destpath, onfinish->trace, false,
+                         req, ondelay,
+                         MDS_TRAVERSE_DISCOVER, //XLOCK,
+                         onfinish);
+}
+
+/*
+ * handle_client_rename_2: second stage of a client rename, run when the
+ * destination traversal started by handle_client_rename() finishes.
+ *
+ *  req      - the rename request
+ *  ref      - original reference inode (only used for C_MDS_RetryRequest)
+ *  srcdiri, srcdir, srcdn
+ *           - source dir inode, source dir and source dentry from stage 1
+ *  destpath - destination path; the source's last component is appended
+ *             to it when the destination turns out to be a directory
+ *  trace    - dentries collected by the destination traversal; a trailing
+ *             dentry without an inode is dropped from it
+ *  r        - result code of the destination traversal
+ *
+ * works out the destination dir and name, validates an existing target,
+ * and then calls handle_client_rename_local() (for local and foreign
+ * renames alike).
+ *
+ * replies: -EINVAL if the destination cannot be resolved or equals the
+ * source, -EISDIR if the target exists and is a directory, -ENOTDIR if a
+ * directory would replace an existing non-directory.
+ */
+void Server::handle_client_rename_2(MClientRequest *req,
+                                    CInode *ref,
+                                    CInode *srcdiri,
+                                    CDir *srcdir,
+                                    CDentry *srcdn,
+                                    filepath& destpath,
+                                    vector<CDentry*>& trace,
+                                    int r)
+{
+  dout(7) << "handle_client_rename_2 on " << *req << endl;
+  dout(12) << " r = " << r << " trace depth " << trace.size() << " destpath depth " << destpath.depth() << endl;
+
+  CInode *srci = srcdn->inode;
+  assert(srci);
+  CDir* destdir = 0;
+  string destname;
+
+  // what is the dest? (dir or file or complete filename)
+  // note: trace includes root, destpath doesn't (include leading /)
+  if (trace.size() && trace[trace.size()-1]->inode == 0) {
+    dout(10) << "dropping null dentry from tail of trace" << endl;
+    trace.pop_back();    // drop it!
+  }
+
+  // d = deepest inode the traversal reached (root for an empty trace)
+  CInode *d;
+  if (trace.size())
+    d = trace[trace.size()-1]->inode;
+  else
+    d = mdcache->get_root();
+  assert(d);
+  dout(10) << "handle_client_rename_2 traced to " << *d << ", trace size = " << trace.size() << ", destpath = " << destpath.depth() << endl;
+
+  // make sure i can open the dir?
+  if (d->is_dir() && !d->dir_is_auth() && !d->dir) {
+    // discover it
+    mdcache->open_remote_dir(d,
+                             new C_MDS_RetryRequest(mds, req, ref));
+    return;
+  }
+
+  if (trace.size() == destpath.depth()) {
+    // the whole destination path exists
+    if (d->is_dir()) {
+      // mv /some/thing /to/some/dir
+      if (!try_open_dir(d, req)) return;
+      destdir = d->dir;                            // /to/some/dir
+      destname = req->get_filepath().last_bit();   // thing
+      destpath.add_dentry(destname);
+    } else {
+      // mv /some/thing /to/some/existing_filename
+      destdir = trace[trace.size()-1]->dir;        // /to/some
+      destname = destpath.last_bit();              // existing_filename
+    }
+  }
+  else if (trace.size() == destpath.depth()-1) {
+    // everything but the last component exists
+    if (d->is_dir()) {
+      // mv /some/thing /to/some/place_that_maybe_dne     (we might be replica)
+      if (!try_open_dir(d, req)) return;
+      destdir = d->dir;                  // /to/some
+      destname = destpath.last_bit();    // place_that_MAYBE_dne
+    } else {
+      dout(7) << "dest dne" << endl;
+      reply_request(req, -EINVAL);
+      return;
+    }
+  }
+  else {
+    assert(trace.size() < destpath.depth()-1);
+    // check traverse return value
+    if (r > 0) {
+      return;  // discover, readdir, etc.
+    }
+
+    // ??
+    assert(r < 0 || trace.size() == 0);  // musta been an error
+
+    // error out
+    dout(7) << " rename dest " << destpath << " dne" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  string srcpath = req->get_path();
+  dout(10) << "handle_client_rename_2 srcpath " << srcpath << endl;
+  dout(10) << "handle_client_rename_2 destpath " << destpath << endl;
+
+  // src == dest?
+  if (srcdn->get_dir() == destdir && srcdn->name == destname) {
+    dout(7) << "rename src=dest, same file " << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // does destination exist?  (is this an overwrite?)
+  CDentry *destdn = destdir->lookup(destname);
+  CInode *oldin = 0;
+  if (destdn) {
+    oldin = destdn->get_inode();
+
+    if (oldin) {
+      // make sure it's also a file!
+      // this can happen, e.g. "mv /some/thing /a/dir" where /a/dir/thing exists and is a dir.
+      if (oldin->is_dir()) {
+        // fail!
+        dout(7) << "dest exists and is dir" << endl;
+        reply_request(req, -EISDIR);
+        return;
+      }
+
+      // the target is a non-directory here; a directory may not replace it.
+      if (srcdn->inode->is_dir() &&
+          !oldin->is_dir()) {
+        dout(7) << "cannot overwrite non-directory with directory" << endl;
+        reply_request(req, -ENOTDIR);   // dir over non-dir is ENOTDIR, not EISDIR
+        return;
+      }
+    }
+
+    dout(7) << "dest exists " << *destdn << endl;
+    if (destdn->get_inode()) {
+      dout(7) << "destino is " << *destdn->get_inode() << endl;
+    } else {
+      dout(7) << "dest dn is a NULL stub" << endl;
+    }
+  } else {
+    dout(7) << "dest dn dne (yet)" << endl;
+  }
+
+
+  // local or remote?
+  int srcauth = srcdir->dentry_authority(srcdn->name);
+  int destauth = destdir->dentry_authority(destname);
+  dout(7) << "handle_client_rename_2 destname " << destname << " destdir " << *destdir << " auth " << destauth << endl;
+
+  // only logging differs between the two cases; both continue below
+  if (srcauth != whoami ||
+      destauth != whoami) {
+    dout(7) << "rename has remote dest " << destauth << endl;
+    dout(7) << "FOREIGN RENAME" << endl;
+
+    // punt?  (disabled by the leading 'false')
+    if (false && srcdn->inode->is_dir()) {
+      reply_request(req, -EINVAL);
+      return;
+    }
+
+  } else {
+    dout(7) << "rename is local" << endl;
+  }
+
+  handle_client_rename_local(req, ref,
+                             srcpath, srcdiri, srcdn,
+                             destpath.get_path(), destdir, destdn, destname);
+  return;
+}
+
+
+
+
+/*
+ * handle_client_rename_local: final stage of a client rename.
+ *
+ *  req      - the rename request
+ *  ref      - original reference inode (only used for retry contexts)
+ *  srcpath  - source path; only compared against destpath for lock ordering
+ *  srcdiri  - source dir inode (not used in this function)
+ *  srcdn    - source dentry; must be non-null
+ *  destpath - destination path; only compared against srcpath
+ *  destdir  - destination dir; must be non-null
+ *  destdn   - destination dentry, or null if it does not exist yet
+ *  destname - name of the destination dentry within destdir
+ *
+ * xlocks the source and destination dentries in lexicographic path order
+ * (locally via dentry_xlock_start, or by requesting the xlock from the
+ * dentry's authority), returning early whenever a lock is not yet held;
+ * the retry context re-dispatches the request later.  once both are
+ * xlocked by this request it hands off to Renamer::file_rename().
+ */
+void Server::handle_client_rename_local(MClientRequest *req,
+                                        CInode *ref,
+                                        string& srcpath,
+                                        CInode *srcdiri,
+                                        CDentry *srcdn,
+                                        string& destpath,
+                                        CDir *destdir,
+                                        CDentry *destdn,
+                                        string& destname)
+{
+  //bool everybody = false;
+  //if (true || srcdn->inode->is_dir()) {
+  /* overkill warning: lock w/ everyone for simplicity.  FIXME someday!  along with the foreign rename crap!
+     i could limit this to cases where something beneath me is exported.
+     could possibly limit the list.    (maybe.)
+     Underlying constraint is that, regardless of the order i do the xlocks, and whatever
+     imports/exports might happen in the process, the destdir _must_ exist on any node
+     importing something beneath me when rename finishes, or else mayhem ensues when
+     their import is dangling in the cache.
+  */
+  /*
+    having made a proper mess of this on the first pass, here is my plan:
+
+    - xlocks of src, dest are done in lex order
+    - xlock is optional.. if you have the dentry, lock it, if not, don't.
+    - if you discover an xlocked dentry, you get the xlock.
+
+    possible trouble:
+    - you have an import beneath the source, and don't have the dest dir.
+      - when the actual rename happens, you discover the dest
+      - actually, do this on any open dir, so we don't detach whole swaths
+        of our cache.
+
+    notes:
+    - xlocks are initiated from authority, as are discover_replies, so replicas are
+      guaranteed to either not have dentry, or to have it xlocked.
+    -
+    - foreign xlocks are eventually unraveled by the initiator on success or failure.
+
+    todo to make this work:
+    - hose bool everybody param crap
+    /- make handle_lock_dn not discover, clean up cases
+    /- put dest path in MRenameNotify
+    /- make rename_notify discover if its a dir
+    /  - this will catch nested imports too, obviously
+    /- notify goes to merged list on local rename
+    /- notify goes to everybody on a foreign rename
+    /- handle_notify needs to gracefully ignore spurious notifies
+  */
+  //dout(7) << "handle_client_rename_local: overkill?  doing xlocks with _all_ nodes" << endl;
+  //everybody = true;
+  //}
+
+  // both are dereferenced unconditionally below; make the contract explicit.
+  assert(srcdn);
+  assert(destdir);
+
+  bool srclocal = srcdn->dir->dentry_authority(srcdn->name) == whoami;
+  bool destlocal = destdir->dentry_authority(destname) == whoami;
+
+  dout(7) << "handle_client_rename_local: src local=" << srclocal << " " << *srcdn << endl;
+  if (destdn) {
+    dout(7) << "handle_client_rename_local: dest local=" << destlocal << " " << *destdn << endl;
+  } else {
+    dout(7) << "handle_client_rename_local: dest local=" << destlocal << " dn dne yet" << endl;
+  }
+
+  /* lock source and dest dentries, in lexicographic order.
+   * two passes: the lexicographically smaller path first, then the other.
+   */
+  bool dosrc = srcpath < destpath;
+  for (int i=0; i<2; i++) {
+    if (dosrc) {
+
+      // src
+      if (srclocal) {
+        if (!srcdn->is_xlockedbyme(req) &&
+            !mds->locker->dentry_xlock_start(srcdn, req, ref))
+          return;
+      } else {
+        // srcdn is never null here (asserted above), so only the lock owner matters
+        if (srcdn->xlockedby != req) {
+          mds->locker->dentry_xlock_request(srcdn->dir, srcdn->name, false, req, new C_MDS_RetryRequest(mds, req, ref));
+          return;
+        }
+      }
+      dout(7) << "handle_client_rename_local: srcdn is xlock " << *srcdn << endl;
+
+    } else {
+
+      if (destlocal) {
+        // dest
+        if (!destdn) destdn = destdir->add_dentry(destname);
+        if (!destdn->is_xlockedbyme(req) &&
+            !mds->locker->dentry_xlock_start(destdn, req, ref)) {
+          // could not lock: drop the dentry again if it is a clean, null, sync one
+          if (destdn->is_clean() && destdn->is_null() && destdn->is_sync()) destdir->remove_dentry(destdn);
+          return;
+        }
+      } else {
+        if (!destdn || destdn->xlockedby != req) {
+          /* NOTE: require that my xlocked item be a leaf/file, NOT a dir.  in case
+           * my traverse and determination of dest vs dest/srcfilename was out of date.
+           */
+          mds->locker->dentry_xlock_request(destdir, destname, true, req, new C_MDS_RetryRequest(mds, req, ref));
+          return;
+        }
+      }
+      dout(7) << "handle_client_rename_local: destdn is xlock " << *destdn << endl;
+
+    }
+
+    dosrc = !dosrc;
+  }
+
+
+  // final check: verify if dest exists that src is a file
+
+  // FIXME: is this necessary?
+
+  // destdn is non-null here: both branches above either set it or returned.
+  if (destdn->inode) {
+    if (destdn->inode->is_dir()) {
+      dout(7) << "handle_client_rename_local failing, dest exists and is a dir: " << *destdn->inode << endl;
+      assert(0);
+      reply_request(req, -EINVAL);
+      return;
+    }
+    if (srcdn->inode->is_dir()) {
+      dout(7) << "handle_client_rename_local failing, dest exists and src is a dir: " << *destdn->inode << endl;
+      assert(0);
+      reply_request(req, -EINVAL);
+      return;
+    }
+  } else {
+    // if destdn->inode is null, then we know it's a non-existent dest,
+    // why?  because if it's local, it dne.  and if it's remote, we xlocked with
+    // REQXLOCKC, which will only allow you to lock a file.
+    // so we know dest is a file, or non-existent
+    if (!destlocal) {
+      if (srcdn->inode->is_dir()) {
+        // help: maybe the dest exists and is a file?   ..... FIXME
+      } else {
+        // we're fine, src is file, dest is file|dne
+      }
+    }
+  }
+
+  mds->balancer->hit_dir(srcdn->dir, META_POP_DWR);
+  mds->balancer->hit_dir(destdn->dir, META_POP_DWR);
+
+  // we're golden.
+  // everything is xlocked by us, we rule, etc.
+  MClientReply *reply = new MClientReply(req, 0);
+  mdcache->renamer->file_rename( srcdn, destdn,
+                                 new C_MDS_CommitRequest(this, req, reply, srcdn->inode,
+                                                         new EInodeUpdate(srcdn->inode)) );  // FIXME WRONG EVENT
+}
+
+
+
+
+
+
+
+// MKDIR
+
+/*
+ * MKDIR: create a directory named by req underneath diri.
+ * the dentry/inode pair comes from mknod(); the inode is then turned into
+ * a directory with an open, complete (empty), dirty CDir, and the change
+ * is committed to the log together with the reply.
+ */
+void Server::handle_client_mkdir(MClientRequest *req, CInode *diri)
+{
+  // dentry + inode, linked.  mknod returning null ends this pass.
+  CInode *newi = mknod(req, diri);
+  if (!newi) return;
+
+  // take the requested mode, but force the type bits to "directory".
+  newi->inode.mode = req->get_iarg();
+  newi->inode.mode = (newi->inode.mode & ~INODE_TYPE_MASK) | INODE_MODE_DIR;
+
+  // directories get the dir layout
+  newi->inode.layout = g_OSD_MDDirLayout;
+
+  // open the (empty) dir
+  assert(!newi->is_frozen_dir());  // bc mknod worked
+  CDir *newdir = newi->get_or_open_dir(mds);
+  newdir->mark_complete();
+  newdir->mark_dirty();
+
+  mds->balancer->hit_dir(newdir, META_POP_DWR);
+
+  // parent dir is replicated and we are auth for both: export the new dir
+  // to a randomly picked mds (unless that pick is ourselves).
+  const bool spread =
+    diri->dir->is_auth() &&
+    diri->dir->is_rep() &&
+    newdir->is_auth() &&
+    !newdir->is_hashing();
+  if (spread) {
+    int target = rand() % mds->mdsmap->get_num_mds();
+    if (target != whoami) {
+      dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl;
+      mdcache->migrator->export_dir(newdir, target);
+    }
+  }
+
+  // commit to log (two events: the new inode and the new dir)
+  commit_request(req, new MClientReply(req, 0), diri,
+                 new EInodeUpdate(newi),
+                 new EDirUpdate(newdir));  // FIXME: weird performance regression here w/ double log; somewhat of a mystery!
+  return;
+}
+
+
+
+
+
+// SYMLINK
+
+/*
+ * SYMLINK: create a symlink named by req underneath diri.
+ * the link target is taken from req->get_sarg().
+ */
+void Server::handle_client_symlink(MClientRequest *req, CInode *diri)
+{
+  // dentry + inode, linked.  mknod returning null ends this pass.
+  CInode *newi = mknod(req, diri);
+  if (!newi) return;
+
+  // replace the type bits with "symlink", leaving the other mode bits alone
+  newi->inode.mode = (newi->inode.mode & ~INODE_TYPE_MASK) | INODE_MODE_SYMLINK;
+
+  // where the link points
+  newi->symlink = req->get_sarg();
+
+  mds->balancer->hit_dir(diri->dir, META_POP_DWR);
+
+  // log it and reply
+  commit_request(req, new MClientReply(req, 0), diri,
+                 new EInodeUpdate(newi));   // FIXME should be differnet log entry
+}
+
+
+
+
+
+
+
+// ===================================
+// TRUNCATE, FSYNC
+
+/*
+ * FIXME: this truncate implemention is WRONG WRONG WRONG
+ */
+
+/*
+ * TRUNCATE: set cur's size to req->get_sizearg() under the inode file
+ * write lock, then log the inode update and reply.
+ * no permission check is performed (see the placeholder below).
+ */
+void Server::handle_client_truncate(MClientRequest *req, CInode *cur)
+{
+  // need the file write lock first
+  if (!mds->locker->inode_file_write_start(cur, req)) {
+    return;  // fw or (wait for) lock
+  }
+
+  // check permissions
+
+  // apply the new size
+  cur->inode.size = req->get_sizearg();
+  cur->mark_dirty();
+
+  mds->locker->inode_file_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // build the reply, then commit (log event + reply)
+  MClientReply *ack = new MClientReply(req, 0);
+  commit_request(req, ack, cur, new EInodeUpdate(cur));
+}
+
+
+
+// ===========================
+// open, openc, close
+
+/*
+ * OPEN: open the file inode cur for the client.
+ *   req->get_iarg()  - open flags (only logged here)
+ *   req->get_iarg2() - file mode (FILE_MODE_*)
+ * anything other than read/lazy mode must be handled by the inode's
+ * authority, so such requests are forwarded from replicas.  on success
+ * the reply carries the issued caps, their seq and the file data version.
+ */
+void Server::handle_client_open(MClientRequest *req,
+                                CInode *cur)
+{
+  int flags = req->get_iarg();
+  int mode = req->get_iarg2();
+
+  dout(7) << "open " << flags << " on " << *cur << endl;
+  dout(10) << "open flags = " << flags << "  mode = " << mode << endl;
+
+  // only regular files can be opened
+  const bool is_file = (cur->inode.mode & INODE_MODE_FILE);
+  if (!is_file) {
+    dout(7) << "not a regular file" << endl;
+    reply_request(req, -EINVAL);                 // FIXME what error do we want?
+    return;
+  }
+
+  // auth for write access
+  const bool readonly_mode = (mode == FILE_MODE_R || mode == FILE_MODE_LAZY);
+  if (!readonly_mode && !cur->is_auth()) {
+    int auth = cur->authority();
+    assert(auth != whoami);
+    dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl;
+
+    mdcache->request_forward(req, auth);
+    return;
+  }
+
+
+  // hmm, check permissions or something.
+
+
+  // can we issue the caps they want?
+  version_t fdv = mds->locker->issue_file_data_version(cur);
+  Capability *cap = mds->locker->issue_new_caps(cur, mode, req);
+  if (!cap) {
+    return; // can't issue (yet), so wait!
+  }
+
+  dout(12) << "open gets caps " << cap_string(cap->pending()) << " for " << req->get_source() << " on " << *cur << endl;
+
+  mds->balancer->hit_inode(cur, META_POP_IRD);
+
+  // fill in and send the reply
+  MClientReply *ack = new MClientReply(req, 0);
+  ack->set_file_caps(cap->pending());
+  ack->set_file_caps_seq(cap->get_last_seq());
+  ack->set_file_data_version(fdv);
+  reply_request(req, ack, cur);
+}
+
+
+
+/*
+ * OPENC: open with O_CREAT.  mknod() is called with okexist=true, the
+ * resulting inode's mode is set to a regular file with 0644, and the
+ * request then continues as a plain open on it.
+ */
+void Server::handle_client_openc(MClientRequest *req, CInode *ref)
+{
+  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl;
+
+  CInode *in = mknod(req, ref, true);
+  if (!in) {
+    return;
+  }
+
+  in->inode.mode = 0644 | INODE_MODE_FILE;              // wtf FIXME
+
+  handle_client_open(req, in);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MDS_SERVER_H
+#define __MDS_SERVER_H
+
+#include "MDS.h"
+
+class LogEvent;
+
+class Server { // per-MDS handler for client mounts and client requests (see the handle_client_* methods)
+ MDS *mds; // owning MDS; the pointers below are copied from it in the constructor
+ MDCache *mdcache; // mds->mdcache
+ MDLog *mdlog; // mds->mdlog
+ Messenger *messenger; // mds->messenger
+ int whoami; // mds->get_nodeid(); compared against dentry/inode authority
+
+ __uint64_t stat_ops; // initialised to 0; NOTE(review): presumably a stat op counter -- its use is not visible here
+
+public:
+ Server(MDS *m) : // members are initialised in declaration order, so reading the member 'mds' below is safe
+ mds(m),
+ mdcache(mds->mdcache), mdlog(mds->mdlog),
+ messenger(mds->messenger), whoami(mds->get_nodeid()),
+ stat_ops(0) {
+ }
+
+ void dispatch(Message *m);
+
+ // generic request helpers: reply immediately, or commit log event(s) along with a reply
+ void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0);
+ void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei);
+ void commit_request(MClientRequest *req,
+ MClientReply *reply,
+ CInode *tracei,
+ LogEvent *event,
+ LogEvent *event2 = 0); // second event is optional
+
+ bool try_open_dir(CInode *in, MClientRequest *req); // callers return immediately when this yields false
+
+
+ // clients (mount / unmount, and the request entry points)
+ void handle_client_mount(class MClientMount *m);
+ void handle_client_unmount(Message *m);
+
+ void handle_client_request(MClientRequest *m);
+ void handle_client_request_2(MClientRequest *req,
+ vector<CDentry*>& trace,
+ int r);
+
+ // fs ops
+ void handle_client_fstat(MClientRequest *req);
+
+ // requests
+ void dispatch_request(Message *m, CInode *ref); // also the re-entry point used by C_MDS_RetryRequest
+
+ // inode request *req, CInode *ref;
+ void handle_client_stat(MClientRequest *req, CInode *ref);
+ void handle_client_utime(MClientRequest *req, CInode *ref);
+ void handle_client_inode_soft_update_2(MClientRequest *req,
+ MClientReply *reply,
+ CInode *ref);
+ void handle_client_chmod(MClientRequest *req, CInode *ref);
+ void handle_client_chown(MClientRequest *req, CInode *ref);
+ void handle_client_inode_hard_update_2(MClientRequest *req,
+ MClientReply *reply,
+ CInode *ref);
+
+ // readdir
+ void handle_client_readdir(MClientRequest *req, CInode *ref);
+ int encode_dir_contents(CDir *dir,
+ list<class InodeStat*>& inls,
+ list<string>& dnls);
+ void handle_hash_readdir(MHashReaddir *m);
+ void handle_hash_readdir_reply(MHashReaddirReply *m);
+ void finish_hash_readdir(MClientRequest *req, CDir *dir);
+
+ // namespace changes
+ void handle_client_mknod(MClientRequest *req, CInode *ref);
+ void handle_client_link(MClientRequest *req, CInode *ref);
+ void handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector<CDentry*>& trace);
+ void handle_client_link_finish(MClientRequest *req, CInode *ref,
+ CDentry *dn, CInode *targeti);
+
+ void handle_client_unlink(MClientRequest *req, CInode *ref);
+ void handle_client_rename(MClientRequest *req, CInode *ref); // stage 1: resolve source, start dest traversal
+ void handle_client_rename_2(MClientRequest *req, // stage 2: resolve destination dir/name, validate target
+ CInode *ref,
+ CInode *srcdiri,
+ CDir *srcdir,
+ CDentry *srcdn,
+ filepath& destpath,
+ vector<CDentry*>& trace,
+ int r);
+ void handle_client_rename_local(MClientRequest *req, CInode *ref, // stage 3: xlock src+dest dentries, then rename
+ string& srcpath, CInode *srcdiri, CDentry *srcdn,
+ string& destpath, CDir *destdir, CDentry *destdn, string& name);
+
+ void handle_client_mkdir(MClientRequest *req, CInode *ref);
+ void handle_client_rmdir(MClientRequest *req, CInode *ref);
+ void handle_client_symlink(MClientRequest *req, CInode *ref);
+
+ // file
+ void handle_client_open(MClientRequest *req, CInode *ref);
+ void handle_client_openc(MClientRequest *req, CInode *ref); // open with O_CREAT
+ void handle_client_release(MClientRequest *req, CInode *in);
+ void handle_client_truncate(MClientRequest *req, CInode *in);
+ void handle_client_fsync(MClientRequest *req, CInode *in);
+
+ CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false); // used by mknod, symlink, mkdir, openc; callers stop when it returns null
+
+
+};
+
+/*
+ * Context that re-dispatches a request once whatever it was waiting on
+ * completes: finish() simply calls Server::dispatch_request(req, ref).
+ * the completion code passed to finish() is ignored.
+ */
+class C_MDS_RetryRequest : public Context {
+  MDS *mds;
+  Message *req;   // MClientRequest or MLock
+  CInode *ref;
+ public:
+  C_MDS_RetryRequest(MDS *mds, Message *req, CInode *ref) :
+    mds(mds), req(req), ref(ref) {
+    // a reference inode is mandatory
+    assert(ref);
+  }
+  virtual void finish(int r) {
+    mds->server->dispatch_request(req, ref);
+  }
+};
+
+
+
+#endif
// live journal
bool can_expire(MDS *mds) {
- if (mds->idalloc->get_committed_version() <= table_version)
+ if (mds->idalloc->get_committed_version() < table_version)
return false; // still dirty
else
return true; // already flushed
#include <assert.h>
#include "config.h"
#include "include/types.h"
+#include "ETraced.h"
#include "../LogEvent.h"
#include "../CDir.h"
#include "../MDCache.h"
-class EDirUpdate : public LogEvent {
+class EDirUpdate : public ETraced {
protected:
inodeno_t dirino;
version_t version;
public:
- EDirUpdate(CDir *dir) :
- LogEvent(EVENT_DIRUPDATE) {
+ EDirUpdate(CDir *dir) : ETraced(EVENT_DIRUPDATE, dir->inode) {
this->dirino = dir->ino();
version = dir->get_version();
}
- EDirUpdate() :
- LogEvent(EVENT_DIRUPDATE) {
+ EDirUpdate() : ETraced(EVENT_DIRUPDATE) {
}
+ void print(ostream& out) {
+ out << "up dir " << dirino << " ";
+ ETraced::print(out);
+ out << "/ v " << version;
+ }
+
virtual void encode_payload(bufferlist& bl) {
+ encode_trace(bl);
bl.append((char*)&version, sizeof(version));
bl.append((char*)&dirino, sizeof(dirino));
}
void decode_payload(bufferlist& bl, int& off) {
+ decode_trace(bl, off);
bl.copy(off, sizeof(version), (char*)&version);
off += sizeof(version);
bl.copy(off, sizeof(dirino), (char*)&dirino);
#include <assert.h>
#include "config.h"
#include "include/types.h"
-#include "../LogEvent.h"
-#include "../CInode.h"
-#include "../MDCache.h"
+
+#include "ETraced.h"
#include "../MDStore.h"
-class EInodeUpdate : public LogEvent {
+class EInodeUpdate : public ETraced {
protected:
inode_t inode;
- __uint32_t version;
public:
- EInodeUpdate(CInode *in) :
- LogEvent(EVENT_INODEUPDATE) {
- this->inode = in->inode;
- version = in->get_version();
- }
- EInodeUpdate() :
- LogEvent(EVENT_INODEUPDATE) {
+ EInodeUpdate(CInode *in) : ETraced(EVENT_INODEUPDATE, in) {
+ this->inode = in->get_inode();
}
+ EInodeUpdate() : ETraced(EVENT_INODEUPDATE) { }
+ void print(ostream& out) {
+ out << "up inode " << inode.ino << " ";
+ ETraced::print(out);
+ out << " v " << inode.version;
+ }
+
virtual void encode_payload(bufferlist& bl) {
- bl.append((char*)&version, sizeof(version));
+ encode_trace(bl);
bl.append((char*)&inode, sizeof(inode));
}
void decode_payload(bufferlist& bl, int& off) {
- bl.copy(off, sizeof(version), (char*)&version);
- off += sizeof(version);
+ decode_trace(bl, off);
bl.copy(off, sizeof(inode), (char*)&inode);
off += sizeof(inode);
}
dout(7) << "EInodeUpdate obsolete? on " << *in << endl;
if (!in->is_auth())
return true; // not my inode anymore!
- if (in->get_version() != version)
+ if (in->get_version() != inode.version)
return true; // i'm obsolete! (another log entry follows)
CDir *parent = in->get_parent_dir();
using namespace std;
#include "../LogEvent.h"
+#include "../CInode.h"
+#include "../CDir.h"
+#include "../CDentry.h"
+#include "../MDCache.h"
// generic log event
class ETraced : public LogEvent {
- struct bit {
- inodeno_t dirino;
- version_t dirv;
- string dn;
- inodeno_t ino;
- version_t inov;
-
- bit() {}
- bit(inodeno_t di, version_t dv, string& d, inodeno_t i, version_t iv) :
- dirino(di), dirv(dv), dn(d), ino(i), inov(iv) {}
- void _encode(bufferlist& bl) {
- bl.append((char*)&dirino, sizeof(dirino));
- bl.append((char*)&dirv, sizeof(dirv));
- ::_encode(dn,bl);
- bl.append((char*)&ino, sizeof(ino));
- bl.append((char*)&inov, sizeof(inov));
- }
- void _decode(bufferlist& bl, int& off) {
- bl.copy(off, sizeof(dirino), (char*)&dirino); off += sizeof(dirino);
- bl.copy(off, sizeof(dirv), (char*)&dirv); off += sizeof(dirv);
- ::_decode(dn, bl, off);
- bl.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino);
- bl.copy(off, sizeof(inov), (char*)&inov); off += sizeof(inov);
- }
+ // <dir, dn, inode> segment.
+ struct bit {
+ inodeno_t dirino;
+ version_t dirv;
+ string dn;
+ inodeno_t ino;
+ version_t inov;
+
+ bit(bufferlist& bl, int& off) { _decode(bl,off); }
+ bit(inodeno_t di, version_t dv, const string& d, inodeno_t i, version_t iv) :
+ dirino(di), dirv(dv), dn(d), ino(i), inov(iv) {}
+
+ void _encode(bufferlist& bl) {
+ bl.append((char*)&dirino, sizeof(dirino));
+ bl.append((char*)&dirv, sizeof(dirv));
+ ::_encode(dn, bl);
+ bl.append((char*)&ino, sizeof(ino));
+ bl.append((char*)&inov, sizeof(inov));
+ }
+ void _decode(bufferlist& bl, int& off) {
+ bl.copy(off, sizeof(dirino), (char*)&dirino); off += sizeof(dirino);
+ bl.copy(off, sizeof(dirv), (char*)&dirv); off += sizeof(dirv);
+ ::_decode(dn, bl, off);
+ bl.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino);
+ bl.copy(off, sizeof(inov), (char*)&inov); off += sizeof(inov);
+ }
};
protected:
list<bit> trace;
- public:
- ETraced(int t) : LogEvent(t) { }
+public:
+ ETraced(int t, CInode *in = 0) : LogEvent(t) { // t: event type passed to LogEvent; in: optional inode whose ancestry is recorded in 'trace'
+ if (in) {
+ CDir *dir;
+ CDentry *dn;
+ do { // walk upward, recording one <dir, dn, inode> segment per step
+ dn = in->get_parent_dn();
+ if (!dn) break; // no parent dentry: nothing further to record
+ dir = dn->get_dir();
+ if (!dir) break; // dentry has no dir: stop
+
+ trace.push_front(bit(dir->ino(), dir->get_version(), // push_front: segments end up ordered from the top down
+ dn->get_name(),
+ in->ino(), in->get_version()));
+
+ in = dir->get_inode(); // continue from the dir's own inode
+ } while (!dir->is_import()); // the segment for an import dir is the last one recorded
+ }
+ }
void decode_trace(bufferlist& bl, int& off) {
- int n;
- bl.copy(off, sizeof(n), (char*)&n);
- off += n;
- for (int i=0; i<n; i++) {
- trace.push_back(bit());
- trace.back()._decode(bl,off);
- }
+ int n;
+ bl.copy(off, sizeof(n), (char*)&n);
+ off += sizeof(n);
+ for (int i=0; i<n; i++)
+ trace.push_back( bit(bl, off) );
}
void encode_trace(bufferlist& bl) {
- int n = trace.size();
- bl.append((char*)&n, sizeof(n));
- for (list<bit>::iterator i = trace.begin();
- i != trace.end();
- i++)
- i->_encode(bl);
+ int n = trace.size();
+ bl.append((char*)&n, sizeof(n));
+ for (list<bit>::iterator i = trace.begin();
+ i != trace.end();
+ i++)
+ i->_encode(bl);
}
-
+
void print(ostream& out) {
- out << '"' << event << '"';
+ for (list<bit>::iterator p = trace.begin();
+ p != trace.end();
+ p++) {
+ if (p != trace.begin()) out << "/";
+ out << p->dn;
+ }
}
};
// read
void set_read_pos(off_t p) {
assert(requested_pos == received_pos); // we can't cope w/ in-progress read right now.
- assert(read_bl == 0); // etc.
+ assert(read_bl == 0); // ...
read_pos = requested_pos = received_pos = p;
read_buf.clear();
}