From 0d362c636e2fb7455d4882f19dff21df2a266cbd Mon Sep 17 00:00:00 2001 From: sage Date: Thu, 5 May 2005 02:29:53 +0000 Subject: [PATCH] remote rename works! buggy tho. git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@203 29311d96-e01e-0410-9327-a35deaab8ce9 --- ceph/Makefile | 4 +- ceph/TODO | 2 + ceph/client/Client.cc | 5 +- ceph/fakefuse.cc | 1 + ceph/mds/CInode.h | 4 +- ceph/mds/MDCache.cc | 483 +++++++++++++++++++++++++---------- ceph/mds/MDCache.h | 41 +-- ceph/mds/MDS.cc | 6 +- ceph/messages/MRename.h | 22 +- ceph/messages/MRenameAck.h | 19 +- ceph/messages/MRenameReq.h | 11 +- ceph/msg/CheesySerializer.cc | 19 +- ceph/msg/CheesySerializer.h | 2 +- ceph/msg/Message.h | 2 +- ceph/msg/Messenger.cc | 12 + ceph/osd/OSDMap.h | 20 +- 16 files changed, 462 insertions(+), 191 deletions(-) diff --git a/ceph/Makefile b/ceph/Makefile index a586549344a4a..d6b1e284e6188 100644 --- a/ceph/Makefile +++ b/ceph/Makefile @@ -13,12 +13,12 @@ CFLAGS = -g -I. -pg -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT LIBS = -lpthread -lfuse MDS_OBJS= \ - mds/MDBalancer.o\ mds/MDS.o\ + mds/MDCache.o\ + mds/MDBalancer.o\ mds/CDentry.o\ mds/CDir.o\ mds/CInode.o\ - mds/MDCache.o\ mds/MDStore.o\ mds/LogStream.o\ mds/IdAllocator.o\ diff --git a/ceph/TODO b/ceph/TODO index 3cef99804a508..f4e0cbba64d55 100644 --- a/ceph/TODO +++ b/ceph/TODO @@ -1,6 +1,8 @@ me, soon: - foreign renames + -> what about setting up an inode proxy??? + - unwind foreign xlocks on failure etc. /- symlinks - fix logging model for data safety diff --git a/ceph/client/Client.cc b/ceph/client/Client.cc index 4b6336d681412..8399c7d26d8f8 100644 --- a/ceph/client/Client.cc +++ b/ceph/client/Client.cc @@ -42,11 +42,12 @@ Client::~Client() void Client::init() { - + } void Client::shutdown() { - + dout(1) << "shutdown" << endl; + messenger->shutdown(); } // ------------------- diff --git a/ceph/fakefuse.cc b/ceph/fakefuse.cc index 5c37cb5e44aac..5f99134fe6f4e 100644 --- a/ceph/fakefuse.cc +++ b/ceph/fakefuse.cc @@ -64,6 +64,7 @@ int main(int argc, char **argv) { cout << "starting fuse on pid " << getpid() << endl; ceph_fuse_main(client[i], argc, argv); cout << "fuse finished on pid " << getpid() << endl; + client[i]->shutdown(); } diff --git a/ceph/mds/CInode.h b/ceph/mds/CInode.h index effe73d1e92e7..437227ff19b66 100644 --- a/ceph/mds/CInode.h +++ b/ceph/mds/CInode.h @@ -136,9 +136,7 @@ static char *cinode_pin_names[CINODE_NUM_PINS] = { // waiters: handle_export_dir_warning // triggers: handle_export_dir_notify -#define CINODE_WAIT_RENAME 32768 - // waiters: file_rename - // triggers: file_rename_finish +#define CINODE_WAIT_RENAME (1<<16) #define CINODE_WAIT_HARDR (1<<17) // 131072 #define CINODE_WAIT_HARDW (1<<18) // 262... diff --git a/ceph/mds/MDCache.cc b/ceph/mds/MDCache.cc index e64b995036377..26742cfe99d14 100644 --- a/ceph/mds/MDCache.cc +++ b/ceph/mds/MDCache.cc @@ -40,7 +40,11 @@ #include "messages/MLock.h" #include "messages/MDentryUnlink.h" + #include "messages/MRenameNotify.h" +#include "messages/MRename.h" +#include "messages/MRenameAck.h" +#include "messages/MRenameReq.h" #include "messages/MClientRequest.h" @@ -69,14 +73,12 @@ MDCache::MDCache(MDS *m) { mds = m; root = NULL; - lru = new LRU(); - lru->lru_set_max(g_conf.mdcache_size); - lru->lru_set_midpoint(g_conf.mdcache_mid); + lru.lru_set_max(g_conf.mdcache_size); + lru.lru_set_midpoint(g_conf.mdcache_mid); } MDCache::~MDCache() { - if (lru) { delete lru; lru = NULL; } } @@ -84,7 +86,7 @@ MDCache::~MDCache() bool MDCache::shutdown() { - if (lru->lru_get_size() > 0) { + if (lru.lru_get_size() > 0) { dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl; show_cache(); show_imports(); @@ -121,11 +123,11 @@ void MDCache::destroy_inode(CInode *in) void MDCache::add_inode(CInode *in) { // add to lru, inode map - assert(inode_map.size() == lru->lru_get_size()); - lru->lru_insert_mid(in); + assert(inode_map.size() == lru.lru_get_size()); + lru.lru_insert_mid(in); assert(inode_map.count(in->ino()) == 0); // should be no dup inos! inode_map[ in->ino() ] = in; - assert(inode_map.size() == lru->lru_get_size()); + assert(inode_map.size() == lru.lru_get_size()); } void MDCache::remove_inode(CInode *o) @@ -141,7 +143,7 @@ void MDCache::remove_inode(CInode *o) dn->dir->unlink_inode(dn); // leave dentry } inode_map.erase(o->ino()); // remove from map - lru->lru_remove(o); // remove from lru + lru.lru_remove(o); // remove from lru } @@ -382,14 +384,14 @@ void MDCache::export_empty_import(CDir *dir) bool MDCache::trim(__int32_t max) { if (max < 0) { - max = lru->lru_get_max(); + max = lru.lru_get_max(); if (!max) return false; } map expiremap; - while (lru->lru_get_size() > max) { - CInode *in = (CInode*)lru->lru_expire(); + while (lru.lru_get_size() > max) { + CInode *in = (CInode*)lru.lru_expire(); if (!in) break; //return false; if (in->dir) { @@ -499,7 +501,7 @@ bool MDCache::shutdown_pass() dout(7) << "log is empty. flushing cache" << endl; trim(0); - dout(7) << "cache size now " << lru->lru_get_size() << endl; + dout(7) << "cache size now " << lru.lru_get_size() << endl; // send all imports back to 0. if (mds->get_nodeid() != 0) { @@ -517,7 +519,7 @@ bool MDCache::shutdown_pass() } // shut down root? - if (lru->lru_get_size() == 1) { + if (lru.lru_get_size() == 1) { if (root && root->dir && root->dir->is_import() && @@ -538,10 +540,10 @@ bool MDCache::shutdown_pass() } // sanity - assert(inode_map.size() == lru->lru_get_size()); + assert(inode_map.size() == lru.lru_get_size()); // done? - if (lru->lru_get_size() == 0) { + if (lru.lru_get_size() == 0) { if (mds->get_nodeid() != 0) { dout(7) << "done, sending shutdown_finish" << endl; mds->messenger->send_message(new MGenericMessage(MSG_MDS_SHUTDOWNFINISH), @@ -551,7 +553,7 @@ bool MDCache::shutdown_pass() } return true; } else { - dout(7) << "there's still stuff in the cache: " << lru->lru_get_size() << endl; + dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl; show_cache(); dump(); } @@ -705,6 +707,15 @@ int MDCache::proc_message(Message *m) case MSG_MDS_RENAMENOTIFY: handle_rename_notify((MRenameNotify*)m); break; + case MSG_MDS_RENAME: + handle_rename((MRename*)m); + break; + case MSG_MDS_RENAMEREQ: + handle_rename_req((MRenameReq*)m); + break; + case MSG_MDS_RENAMEACK: + handle_rename_ack((MRenameAck*)m); + break; // import @@ -905,6 +916,13 @@ int MDCache::path_traverse(filepath& origpath, // dentry CDentry *dn = cur->dir->lookup(path[depth]); + + // xlocked by me? + if (dn && !dn->inode && dn->xlockedby == req) { // hack? + trace.push_back(dn); + break; // done! + } + if (dn && dn->inode) { // have it. locked? if (!noperm && dn->is_xlockedbyother(req)) { @@ -979,7 +997,7 @@ int MDCache::path_traverse(filepath& origpath, // directory isn't complete; reload dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << endl; - lru->lru_touch(cur); // touch readdiree + lru.lru_touch(cur); // touch readdiree mds->mdstore->fetch_dir(cur->dir, ondelay); mds->logger->inc("cmiss"); @@ -1000,7 +1018,7 @@ int MDCache::path_traverse(filepath& origpath, } else { dout(7) << "traverse: discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; - lru->lru_touch(cur); // touch discoveree + lru.lru_touch(cur); // touch discoveree mds->messenger->send_message(new MDiscover(mds->get_nodeid(), cur->ino(), @@ -1986,16 +2004,50 @@ void MDCache::file_rename(CDentry *srcdn, CDentry *destdn, Context *c, bool ever CDir *srcdir = srcdn->dir; string srcname = srcdn->name; + bool srcauth = srcdir->dentry_authority(srcdn->name) == mds->get_nodeid(); CDir *destdir = destdn->dir; string destname = destdn->name; + bool destauth = destdir->dentry_authority(destdn->name) == mds->get_nodeid(); CInode *in = srcdn->inode; Message *req = srcdn->xlockedby; + // foreign rename? + if (srcauth && !destauth) { + dout(7) << "file_rename src auth, not dest auth. sending MRename" << endl; + + file_rename_foreign_src(srcdn, destdn, + mds->get_nodeid()); // i'm the initiator + + // set waiter on the inode (is this the best place?) + in->add_waiter(CINODE_WAIT_RENAME, c); + return; + } + else if (!srcauth) { + if (destauth) { + dout(7) << "file_rename dest auth, not src auth. sending MRenameReq" << endl; + } else { + dout(7) << "file_rename neither src auth nor dest auth. sending MRenameReq" << endl; + } + + MRenameReq *m = new MRenameReq(mds->get_nodeid(), // i'm the initiator + srcdir->ino(), srcname, destdir->ino(), destname); + int srcauth = srcdir->dentry_authority(srcdn->name); + mds->messenger->send_message(m, + MSG_ADDR_MDS(srcauth), MDS_PORT_CACHE, MDS_PORT_CACHE); + + // set waiter on the inode (is this the best place?) + in->add_waiter(CINODE_WAIT_RENAME, c); + return; + } + + assert(srcauth && destauth); + dout(7) << "file_rename src and dest auth, renaming locally (easy!)" << endl; + // update our cache rename_file(srcdn, destdn); - + // mark dentries dirty srcdn->mark_dirty(); destdn->mark_dirty(); @@ -2003,6 +2055,7 @@ void MDCache::file_rename(CDentry *srcdn, CDentry *destdn, Context *c, bool ever // update imports/exports? if (in->is_dir() && in->dir) fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change + // tell replicas (no need to wait for ack) to do the same, and un-xlock. // make list @@ -2051,7 +2104,141 @@ void MDCache::file_rename(CDentry *srcdn, CDentry *destdn, Context *c, bool ever destdir->take_waiting(CDIR_WAIT_DNREAD, destname, mds->finished_queue); } -void MDCache::handle_rename_notify(MRenameNotify*m) +void MDCache::file_rename_foreign_src(CDentry *srcdn, CDentry *destdn, int initiator) +{ + dout(7) << "file_rename_foreign_src " << *srcdn << " to " << *destdn << endl; + + CDir *srcdir = srcdn->dir; + CDir *destdir = destdn->dir; + string srcname = srcdn->name; + + // (we're basically exporting this inode) + CInode *in = srcdn->inode; + assert(in); + + // encode and export inode state + crope inode_state; + encode_export_inode(in, inode_state); + + MRename *m = new MRename(initiator, + srcdir->ino(), srcdn->name, destdir->ino(), destdn->name, + inode_state); + int destauth = destdir->dentry_authority(destdn->name); + mds->messenger->send_message(m, + MSG_ADDR_MDS(destauth), MDS_PORT_CACHE, MDS_PORT_CACHE); + + // update our cache + rename_file(srcdn, destdn); + srcdn->mark_dirty(); + + // update imports/exports? + if (in->is_dir() && in->dir) + fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change + + // drop xlock on src + dentry_xlock_finish(srcdn); + srcdir->take_waiting(CDIR_WAIT_ANY, srcname, mds->finished_queue); +} + +void MDCache::handle_rename_req(MRenameReq *m) +{ + CInode *srcdiri = get_inode(m->get_srcdirino()); + CDir *srcdir = srcdiri->dir; + CDentry *srcdn = srcdir->lookup(m->get_srcname()); + assert(srcdn); + + CInode *destdiri = get_inode(m->get_destdirino()); + CDir *destdir = destdiri->dir; + CDentry *destdn = destdir->lookup(m->get_destname()); + assert(destdn); + + file_rename_foreign_src(srcdn, destdn, m->get_initiator()); + delete m; +} + +void MDCache::handle_rename(MRename *m) +{ + CInode *srcdiri = get_inode(m->get_srcdirino()); + CDir *srcdir = srcdiri->dir; + CDentry *srcdn = srcdir->lookup(m->get_srcname()); + assert(srcdn); + + CInode *destdiri = get_inode(m->get_destdirino()); + CDir *destdir = destdiri->dir; + CDentry *destdn = destdir->lookup(m->get_destname()); + assert(destdn); + string destname = destdn->name; + + dout(7) << "handle_rename " << *srcdn << " to " << *destdn << endl; + + // decode + import inode (into _old_ location to start) + int off = 0; + decode_import_inode(srcdn, m->get_inode_state(), off, m->get_source()); + + CInode *in = srcdn->inode; + assert(in); + + // rename it + rename_file(srcdn, destdn); + destdn->mark_dirty(); + + // update imports/exports? + if (in->is_dir() && in->dir) + fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change + + // send notifies + // including src. but not me, or initiator. + set who; + for (int i=0; iget_cluster()->get_num_mds(); i++) + if (i != mds->get_nodeid() && + i != m->get_initiator()) who.insert(i); + + for (set::iterator it = who.begin(); + it != who.end(); + it++) { + mds->messenger->send_message(new MRenameNotify(srcdir->ino(), + srcdn->name, + destdir->ino(), + destdn->name), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, MDS_PORT_CACHE); + } + + // ok, tell the initiator + if (m->get_initiator() == mds->get_nodeid()) { + // it's me! + in->take_waiting(CINODE_WAIT_RENAME, mds->finished_queue); + } else { + MRenameAck *ack = new MRenameAck(srcdir->ino(), srcdn->name, destdir->ino(), destdn->name); + mds->messenger->send_message(ack, + MSG_ADDR_MDS(m->get_initiator()), MDS_PORT_CACHE, MDS_PORT_CACHE); + } + + + // drop xlock on dst + dentry_xlock_finish(destdn); + destdir->take_waiting(CDIR_WAIT_DNREAD, destname, mds->finished_queue); + + delete m; +} + + +void MDCache::handle_rename_ack(MRenameAck *m) +{ + CInode *destdiri = get_inode(m->get_destdirino()); + CDir *destdir = destdiri->dir; + CDentry *destdn = destdir->lookup(m->get_destname()); + assert(destdn); + CInode *in = destdn->inode; + + dout(7) << "handle_rename_ack on " << *in << endl; + + // all done! + in->take_waiting(CINODE_WAIT_RENAME, mds->finished_queue); + delete m; +} + + +void MDCache::handle_rename_notify(MRenameNotify *m) { dout(7) << "handle_rename_notify dir " << m->get_srcdirino() << " dn " << m->get_srcname() << " to dir " << m->get_destdirino() << " dname " << m->get_destname() << endl; @@ -3593,20 +3780,23 @@ public: } void finish(int r) { + cout << "xlockrequest->finish r = " << r << endl; if (r == 0) { CDentry *dn = dir->lookup(dname); if (dn && dn->xlockedby == 0) { + cout << "xlock request success, now xlocked by " << req << endl; // success dn->xlockedby = req; // our request was the winner // remember! mdc->active_requests[req].foreign_xlocks.insert(dn); - return; } } // retry request (or whatever) + cout << "doing finish on " << finisher << endl; finisher->finish(0); + delete finisher; } }; @@ -3616,7 +3806,7 @@ void MDCache::dentry_xlock_request(CDir *dir, string& dname, bool create, dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl; // send request int dauth = dir->dentry_authority(dname); - MLock *m = new MLock(create ? LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, dauth); + MLock *m = new MLock(create ? LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, mds->get_nodeid()); m->set_dn(dir->ino(), dname); mds->messenger->send_message(m, MSG_ADDR_MDS(dauth), MDS_PORT_CACHE, @@ -3649,8 +3839,9 @@ void MDCache::handle_lock_dn(MLock *m) // normally we have it always if (diri && dir) { int dauth = dir->dentry_authority(dname); - assert(dauth == mds->get_nodeid() || - dir->is_proxy()); + assert(dauth == mds->get_nodeid() || dir->is_proxy() || // mine or proxy, + m->get_action() == LOCK_AC_REQXLOCKACK || // or we did a REQXLOCK and this is our ack/nak + m->get_action() == LOCK_AC_REQXLOCKNAK); if (dir->is_proxy()) { @@ -4381,6 +4572,50 @@ void MDCache::export_dir_go(CDir *dir, } +/** encode_export_inode + * update our local state for this inode to export. + * encode relevant state to be sent over the wire. + * used by: export_dir_walk, file_rename (if foreign) + */ +void MDCache::encode_export_inode(CInode *in, crope& state_rope) +{ + in->version++; // so local log entries are ignored, etc. + + // relax locks + if (!in->is_cached_by_anyone()) + in->replicate_relax_locks(); + + // replica_writers? + if (in->is_open_write()) + in->add_replica_writer(mds->get_nodeid()); // i am now a replica writer! + + // add inode + CInodeExport istate( in ); + state_rope.append( istate._rope() ); + + // we're export this inode; fix inode state + dout(7) << "encode_export_inode " << *in << endl; + + if (in->is_dirty()) in->mark_clean(); + + // clear/unpin cached_by (we're no longer the authority) + in->cached_by_clear(); + + // don't need to know this anymore + in->clear_replica_writers(); + + // twiddle lock states + in->softlock.twiddle_export(); + in->hardlock.twiddle_export(); + + // mark auth + assert(in->is_auth()); + in->set_auth(false); + in->replica_nonce = CINODE_EXPORT_NONCE; + + // *** other state too? +} + void MDCache::export_dir_walk(MExportDir *req, C_MDS_ExportFinish *fin, @@ -4449,74 +4684,36 @@ void MDCache::export_dir_walk(MExportDir *req, // -- inode dir_rope.append((char)'I'); // inode dentry - in->version++; // so log entries are ignored, etc. + encode_export_inode(in, dir_rope); // encode, and (update state for) export - // relax locks - if (!in->is_cached_by_anyone()) - in->replicate_relax_locks(); - - // replica_writers? - if (in->is_open_write()) - in->add_replica_writer(mds->get_nodeid()); // i am now a replica writer! - - // add inode - CInodeExport istate( in ); - dir_rope.append( istate._rope() ); - - if (in->is_dir()) { - - // recurse? - if (in->dir) { - if (in->dir->is_auth()) { - // nested subdir - assert(in->dir->dir_auth == CDIR_AUTH_PARENT); - subdirs.push_back(in->dir); // it's ours, recurse (later) - - } else { - // nested export - assert(in->dir->dir_auth >= 0); - dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->dir_auth << "; removing from exports" << endl; - assert(exports.count(in->dir) == 1); - exports.erase(in->dir); // discard nested export (nested_exports updated above) - - in->dir->state_clear(CDIR_STATE_EXPORT); - in->dir->put(CDIR_PIN_EXPORT); - - // simplify dir_auth? - if (in->dir->dir_auth == newauth) - in->dir->dir_auth = CDIR_AUTH_PARENT; - } + // directory? + if (in->is_dir() && in->dir) { + if (in->dir->is_auth()) { + // nested subdir + assert(in->dir->dir_auth == CDIR_AUTH_PARENT); + subdirs.push_back(in->dir); // it's ours, recurse (later) + + } else { + // nested export + assert(in->dir->dir_auth >= 0); + dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->dir_auth << "; removing from exports" << endl; + assert(exports.count(in->dir) == 1); + exports.erase(in->dir); // discard nested export (nested_exports updated above) + + in->dir->state_clear(CDIR_STATE_EXPORT); + in->dir->put(CDIR_PIN_EXPORT); + + // simplify dir_auth? + if (in->dir->dir_auth == newauth) + in->dir->dir_auth = CDIR_AUTH_PARENT; } } - // we're export this inode; fix inode state - dout(7) << "export_dir_walk exporting " << *in << endl; - - if (in->is_dirty()) in->mark_clean(); - - // clear/unpin cached_by (we're no longer the authority) - in->cached_by_clear(); - - // don't need to know this anymore - in->clear_replica_writers(); - - // twiddle lock states - in->softlock.twiddle_export(); - in->hardlock.twiddle_export(); - - // mark auth - assert(in->is_auth()); - in->set_auth(false); - in->replica_nonce = CINODE_EXPORT_NONCE; - // add to proxy export_proxy_inos[basedir].insert(in->ino()); in->state_set(CINODE_STATE_PROXY); in->get(CINODE_PIN_PROXY); - - // *** other state too? - // waiters list waiters; in->take_waiting(CINODE_WAIT_ANY, waiters); @@ -5103,6 +5300,66 @@ void MDCache::handle_export_dir_finish(MExportDirFinish *m) } +void MDCache::decode_import_inode(CDentry *dn, crope& r, int& off, int oldauth) +{ + + CInodeExport istate; + off = istate._unrope(r, off); + dout(15) << "got a cinodeexport " << endl; + + bool added = false; + CInode *in = get_inode(istate.get_ino()); + if (!in) { + in = new CInode; + added = true; + } else { + in->set_auth(true); + } + + // state + istate.update_inode(in); + + // link + if (added) { + add_inode(in); + dn->dir->link_inode(dn, in); + dout(10) << "added " << *in << endl; + } else { + dout(10) << " had " << *in << endl; + + if (in->is_replica_writer(mds->get_nodeid())) + in->remove_replica_writer(mds->get_nodeid()); + } + + // cached_by + assert(!in->is_cached_by(oldauth)); + in->cached_by_add( oldauth, CINODE_EXPORT_NONCE ); + if (in->is_cached_by(mds->get_nodeid())) + in->cached_by_remove(mds->get_nodeid()); + + /* don't do this + if (!in->hardlock.is_stable() && + in->hardlock.is_gathering(mds->get_nodeid())) { + in->hardlock.gather_set.erase(mds->get_nodeid()); + if (in->hardlock.gather_set.size() == 0) + inode_hard_eval(in); + } + if (!in->softlock.is_stable() && + in->softlock.is_gathering(mds->get_nodeid())) { + in->softlock.gather_set.erase(mds->get_nodeid()); + if (in->softlock.gather_set.size() == 0) + inode_soft_eval(in); + } + */ + + // other + if (in->is_dirty()) { + dout(10) << "logging dirty import " << *in << endl; + mds->mdlog->submit_entry(new EInodeUpdate(in), + NULL); // FIXME pay attention to completion? + } +} + void MDCache::import_dir_block(crope& r, int& off, @@ -5179,61 +5436,7 @@ void MDCache::import_dir_block(crope& r, } else if (icode == 'I') { // inode - CInodeExport istate; - off = istate._unrope(r, off); - dout(15) << "got a cinodeexport " << endl; - - bool added = false; - CInode *in = get_inode(istate.get_ino()); - if (!in) { - in = new CInode; - added = true; - } else { - in->set_auth(true); - } - - // state - istate.update_inode(in); - - // link - if (added) { - add_inode(in); - dn->dir->link_inode(dn, in); - dout(10) << "added " << *in << endl; - } else { - dout(10) << " had " << *in << endl; - - if (in->is_replica_writer(mds->get_nodeid())) - in->remove_replica_writer(mds->get_nodeid()); - } - - // cached_by - assert(!in->is_cached_by(oldauth)); - in->cached_by_add( oldauth, CINODE_EXPORT_NONCE ); - if (in->is_cached_by(mds->get_nodeid())) - in->cached_by_remove(mds->get_nodeid()); - - /* don't do this - if (!in->hardlock.is_stable() && - in->hardlock.is_gathering(mds->get_nodeid())) { - in->hardlock.gather_set.erase(mds->get_nodeid()); - if (in->hardlock.gather_set.size() == 0) - inode_hard_eval(in); - } - if (!in->softlock.is_stable() && - in->softlock.is_gathering(mds->get_nodeid())) { - in->softlock.gather_set.erase(mds->get_nodeid()); - if (in->softlock.gather_set.size() == 0) - inode_soft_eval(in); - } - */ - - // other - if (in->is_dirty()) { - dout(10) << "logging dirty import " << *in << endl; - mds->mdlog->submit_entry(new EInodeUpdate(in), - NULL); // FIXME pay attention to completion? - } + decode_import_inode(dn, r, off, oldauth); } // mark dentry dirty? (only _after_ we link the inode) @@ -6159,7 +6362,7 @@ CInode* MDCache::hack_get_file(string& fn) { } //dump(); - lru->lru_status(); + lru.lru_status(); return cur; } diff --git a/ceph/mds/MDCache.h b/ceph/mds/MDCache.h index b787fe1f2c6d6..203780f625cd5 100644 --- a/ceph/mds/MDCache.h +++ b/ceph/mds/MDCache.h @@ -38,6 +38,10 @@ class MInodeWriterClosed; class MLock; class MRenameNotify; +class MRename; +class MRenameReq; +class MRenameAck; + class C_MDS_ExportFinish; class MClientRequest; @@ -71,7 +75,7 @@ class MDCache { protected: // the cache CInode *root; // root inode - LRU *lru; // lru for expiring items + LRU lru; // lru for expiring items inode_map_t inode_map; // map of inodes by ino MDS *mds; @@ -111,32 +115,28 @@ class MDCache { MDCache(MDS *m); ~MDCache(); - - // accessors + // root inode CInode *get_root() { return root; } void set_root(CInode *r) { root = r; add_inode(root); } - LRU *get_lru() { return lru; } - - // fn + // cache size_t set_cache_size(size_t max) { - lru->lru_set_max(max); + lru.lru_set_max(max); } - size_t get_cache_size() { lru->lru_get_size(); } + size_t get_cache_size() { lru.lru_get_size(); } bool trim(__int32_t max = -1); // trim cache + // shutdown void shutdown_start(); bool shutdown_pass(); bool shutdown(); // clear cache (ie at shutodwn) // have_inode? bool have_inode( inodeno_t ino ) { - inode_map_t::iterator it = inode_map.find(ino); - if (it == inode_map.end()) return false; - return true; + return inode_map.count(ino) ? true:false; } // return inode* or null @@ -145,7 +145,7 @@ class MDCache { return inode_map[ ino ]; return NULL; } - + protected: CDir *get_containing_import(CDir *in); CDir *get_containing_export(CDir *in); @@ -204,7 +204,11 @@ class MDCache { void handle_dentry_unlink(MDentryUnlink *m); void file_rename(CDentry *srcdn, CDentry *destdn, Context *c, bool everyone); - void handle_rename_notify(MRenameNotify *m); + void file_rename_foreign_src(CDentry *srcdn, CDentry *destdn, int initiator); + void handle_rename_notify(MRenameNotify *m); // -> bystanders (and source, rarely) + void handle_rename(MRename *m); // src -> dest + void handle_rename_req(MRenameReq *m); // init -> src (rarely) + void handle_rename_ack(MRenameAck *m); // dest -> init (almost always) @@ -244,13 +248,10 @@ class MDCache { void export_dir_finish(CDir *dir); void handle_export_dir_notify_ack(MExportDirNotifyAck *m); + void encode_export_inode(CInode *in, crope& r); + + // importer - /*CInode *import_dentry_inode(CDir *dir, - pchar& p, - int from, - CDir *import_root=0, - int *would_be_dir_auth = 0); // need for normal import - */ void handle_export_dir_discover(MExportDirDiscover *m); void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r); void handle_export_dir_prep(MExportDirPrep *m); @@ -266,6 +267,8 @@ class MDCache { inodeno_t dir_ino, inodeno_t replica_ino); + void decode_import_inode(CDentry *dn, crope& r, int &off, int oldauth); + // bystander void handle_export_dir_warning(MExportDirWarning *m); void handle_export_dir_notify(MExportDirNotify *m); diff --git a/ceph/mds/MDS.cc b/ceph/mds/MDS.cc index df038e52ba102..0aa40c72959dc 100644 --- a/ceph/mds/MDS.cc +++ b/ceph/mds/MDS.cc @@ -316,7 +316,7 @@ void MDS::dispatch(Message *m) balancer->send_heartbeat(); } */ - if (whoami == 0 && false) { + if (whoami == 0) { static bool didit = false; // 7 to 1 @@ -1560,7 +1560,7 @@ void MDS::handle_client_rename_local(MClientRequest *req, { bool everybody = false; if (true || srcdn->inode->is_dir()) { - /* overkill warning: lock w/ everyone for simplicity. + /* overkill warning: lock w/ everyone for simplicity. FIXME someday! along with the foreign rename crap! i could limit this to cases where something beneath me is exported. could possibly limit the list. (maybe.) Underlying constraint is that, regardless of the order i do the xlocks, and whatever @@ -1624,7 +1624,7 @@ void MDS::handle_client_rename_local(MClientRequest *req, dosrc = !dosrc; } - // we're golden (everything is xlocked by use, we rule, etc.) + // we're golden (everything is xlocked by us, we rule, etc.) mdcache->file_rename( srcdn, destdn, new C_MDS_RenameFinish(this, req, srcdn->inode), everybody ); diff --git a/ceph/messages/MRename.h b/ceph/messages/MRename.h index 665606d543a58..9ddcfcb8bfb2f 100644 --- a/ceph/messages/MRename.h +++ b/ceph/messages/MRename.h @@ -6,34 +6,39 @@ class MRename : public Message { string srcname; inodeno_t destdirino; string destname; + int initiator; crope inode_state; public: + int get_initiator() { return initiator; } inodeno_t get_srcdirino() { return srcdirino; } string& get_srcname() { return srcname; } inodeno_t get_destdirino() { return destdirino; } string& get_destname() { return destname; } + crope& get_inode_state() { return inode_state; } MRename() {} - MRename(inodeno_t srcdirino, - const string& srcname, - inodeno_t destdirino, - const string& destname, - crope& inode_state) : + MRename(int initiator, + inodeno_t srcdirino, + const string& srcname, + inodeno_t destdirino, + const string& destname, + crope& inode_state) : Message(MSG_MDS_RENAME) { + this->initiator = initiator; this->srcdirino = srcdirino; this->srcname = srcname; this->destdirino = destdirino; this->destname = destname; this->inode_state = inode_state; } - virtual char *get_type_name() { return "Rnot";} - - crope& get_inode_state() { return inode_state; } + virtual char *get_type_name() { return "Rn";} virtual void decode_payload(crope& s) { int off = 0; + s.copy(off, sizeof(initiator), (char*)&initiator); + off += sizeof(initiator); s.copy(off, sizeof(srcdirino), (char*)&srcdirino); off += sizeof(srcdirino); s.copy(off, sizeof(destdirino), (char*)&destdirino); @@ -45,6 +50,7 @@ class MRename : public Message { inode_state = s.substr(off, s.length()-off); } virtual void encode_payload(crope& s) { + s.append((char*)&initiator,sizeof(initiator)); s.append((char*)&srcdirino,sizeof(srcdirino)); s.append((char*)&destdirino,sizeof(destdirino)); s.append((char*)srcname.c_str()); diff --git a/ceph/messages/MRenameAck.h b/ceph/messages/MRenameAck.h index 95a924e48ca67..b363419e07c5c 100644 --- a/ceph/messages/MRenameAck.h +++ b/ceph/messages/MRenameAck.h @@ -1,7 +1,7 @@ #ifndef __MRENAMEACK_H #define __MRENAMEACK_H -class MRename : public Message { +class MRenameAck : public Message { inodeno_t srcdirino; string srcname; inodeno_t destdirino; @@ -10,27 +10,40 @@ class MRename : public Message { public: inodeno_t get_srcdirino() { return srcdirino; } string& get_srcname() { return srcname; } + inodeno_t get_destdirino() { return destdirino; } + string& get_destname() { return destname; } MRenameAck() {} MRenameAck(inodeno_t srcdirino, - const string& srcname) : + const string& srcname, + inodeno_t destdirino, + const string& destname) : Message(MSG_MDS_RENAMEACK) { this->srcdirino = srcdirino; this->srcname = srcname; + this->destdirino = destdirino; + this->destname = destname; } - virtual char *get_type_name() { return "RnotA";} + virtual char *get_type_name() { return "RnAck";} virtual void decode_payload(crope& s) { int off = 0; s.copy(off, sizeof(srcdirino), (char*)&srcdirino); off += sizeof(srcdirino); + s.copy(off, sizeof(destdirino), (char*)&destdirino); + off += sizeof(destdirino); srcname = s.c_str() + off; off += srcname.length() + 1; + destname = s.c_str() + off; + off += destname.length() + 1; } virtual void encode_payload(crope& s) { s.append((char*)&srcdirino,sizeof(srcdirino)); + s.append((char*)&destdirino,sizeof(destdirino)); s.append((char*)srcname.c_str()); s.append((char)0); + s.append((char*)destname.c_str()); + s.append((char)0); } }; diff --git a/ceph/messages/MRenameReq.h b/ceph/messages/MRenameReq.h index a8c8a318f767f..49750b68251d6 100644 --- a/ceph/messages/MRenameReq.h +++ b/ceph/messages/MRenameReq.h @@ -2,32 +2,38 @@ #define __MRENAMEREQ_H class MRenameReq : public Message { + int initiator; inodeno_t srcdirino; string srcname; inodeno_t destdirino; string destname; public: + int get_initiator() { return initiator; } inodeno_t get_srcdirino() { return srcdirino; } string& get_srcname() { return srcname; } inodeno_t get_destdirino() { return destdirino; } string& get_destname() { return destname; } MRenameReq() {} - MRenameReq(inodeno_t srcdirino, + MRenameReq(int initiator, + inodeno_t srcdirino, const string& srcname, inodeno_t destdirino, const string& destname) : Message(MSG_MDS_RENAMEREQ) { + this->initiator = initiator; this->srcdirino = srcdirino; this->srcname = srcname; this->destdirino = destdirino; this->destname = destname; } - virtual char *get_type_name() { return "Rnot";} + virtual char *get_type_name() { return "RnReq";} virtual void decode_payload(crope& s) { int off = 0; + s.copy(off, sizeof(initiator), (char*)&initiator); + off += sizeof(initiator); s.copy(off, sizeof(srcdirino), (char*)&srcdirino); off += sizeof(srcdirino); s.copy(off, sizeof(destdirino), (char*)&destdirino); @@ -38,6 +44,7 @@ class MRenameReq : public Message { off += destname.length() + 1; } virtual void encode_payload(crope& s) { + s.append((char*)&initiator,sizeof(initiator)); s.append((char*)&srcdirino,sizeof(srcdirino)); s.append((char*)&destdirino,sizeof(destdirino)); s.append((char*)srcname.c_str()); diff --git a/ceph/msg/CheesySerializer.cc b/ceph/msg/CheesySerializer.cc index 0e0e16256b24b..56bac33126d74 100644 --- a/ceph/msg/CheesySerializer.cc +++ b/ceph/msg/CheesySerializer.cc @@ -86,7 +86,7 @@ Message *CheesySerializer::sendrecv(Message *m, msg_addr_t dest, int port) // pick up reply Message *reply = call_reply[pcid]; - assert(reply); + //assert(reply); call_reply.erase(pcid); // remove from call map call_cond.erase(pcid); @@ -98,3 +98,20 @@ Message *CheesySerializer::sendrecv(Message *m, msg_addr_t dest, int port) return reply; } + +// ------------- + +int CheesySerializer::shutdown() +{ + dout(1) << "shutdown" << endl; + + // abort any pending sendrecv's + lock.Lock(); + for (map::iterator it = call_cond.begin(); + it != call_cond.end(); + it++) { + dout(1) << "shutdown waking up (hung) pcid " << it->first << endl; + it->second->Signal(); // wake up! + } + lock.Unlock(); +} diff --git a/ceph/msg/CheesySerializer.h b/ceph/msg/CheesySerializer.h index 934c9baf6e8d0..883710f06ca40 100644 --- a/ceph/msg/CheesySerializer.h +++ b/ceph/msg/CheesySerializer.h @@ -28,7 +28,7 @@ class CheesySerializer : public Messenger, this->messenger = msg; last_pcid = 1; } - int shutdown() {} + int shutdown(); // incoming messages void dispatch(Message *m); diff --git a/ceph/msg/Message.h b/ceph/msg/Message.h index 11fe081d154db..fef00dd11eff4 100644 --- a/ceph/msg/Message.h +++ b/ceph/msg/Message.h @@ -54,7 +54,7 @@ #define MSG_MDS_RENAMENOTIFY 300 // sent to bystanders #define MSG_MDS_RENAMEREQ 301 // sent from initiator to src auth (rare) #define MSG_MDS_RENAME 302 // sent from src to dest, includes inode -#define MSG_MDS_RENAMEACK 303 // sent from dest to src, to xlock_finish +#define MSG_MDS_RENAMEACK 303 // sent from dest to initiator, to xlock_finish #define MSG_MDS_LOCK 500 diff --git a/ceph/msg/Messenger.cc b/ceph/msg/Messenger.cc index 83ba467d18003..7bb174394aeab 100644 --- a/ceph/msg/Messenger.cc +++ b/ceph/msg/Messenger.cc @@ -37,6 +37,9 @@ using namespace std; #include "messages/MExportDirFinish.h" #include "messages/MRenameNotify.h" +#include "messages/MRename.h" +#include "messages/MRenameReq.h" +#include "messages/MRenameAck.h" #include "messages/MDentryUnlink.h" #include "messages/MHeartbeat.h" @@ -154,6 +157,15 @@ decode_message(crope& ser) case MSG_MDS_RENAMENOTIFY: m = new MRenameNotify(); break; + case MSG_MDS_RENAME: + m = new MRename(); + break; + case MSG_MDS_RENAMEREQ: + m = new MRenameReq(); + break; + case MSG_MDS_RENAMEACK: + m = new MRenameAck(); + break; case MSG_MDS_DENTRYUNLINK: m = new MDentryUnlink(); diff --git a/ceph/osd/OSDMap.h b/ceph/osd/OSDMap.h index 27e3de3d8e1f8..9959b9de6eab3 100644 --- a/ceph/osd/OSDMap.h +++ b/ceph/osd/OSDMap.h @@ -25,7 +25,7 @@ class OSDGroup { int size; // num disks in this group (aka num_disks_in_cluster[]) int weight; // weight (for data migration etc.) (aka weight_cluster[]) - vector osds; // the list of osd addrs + vector osds; // the list of actual osd's }; @@ -35,16 +35,24 @@ class OSDCluster { __uint64_t version; // what version of the osd cluster descriptor is this // RUSH disk groups - vector disk_groups; // RUSH disk groups - set failed_disks; // list of failed disks + vector osd_groups; // RUSH disk groups + set failed_osds; // list of failed disks public: OSDCluster() : version(0) { } // cluster state - bool is_failed(int osd) { return failed_disks.count(osd) ? true:false; } - + bool is_failed(int osd) { return failed_osds.count(osd) ? true:false; } + + int num_osds() { + int n = 0; + for (vector::iterator it = osd_groups.begin(); + it != osd_groups.end(); + it++) + n += it->size; + return n; + } // mapping facilities @@ -63,7 +71,7 @@ class OSDCluster { osds.clear(); for (int i=0; i