From 0bcbbf6c1fee08733b7eb89b35a020e5cffd38f6 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 12 Oct 2007 22:46:27 +0000 Subject: [PATCH] merged r1936 from branches/sage/mds back into trunk git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1937 29311d96-e01e-0410-9327-a35deaab8ce9 --- trunk/ceph/Makefile | 19 +- trunk/ceph/TODO | 108 +- trunk/ceph/client/Client.cc | 38 +- trunk/ceph/client/Client.h | 13 +- trunk/ceph/client/SyntheticClient.cc | 8 +- trunk/ceph/cmds.cc | 2 +- trunk/ceph/common/Logger.cc | 6 +- trunk/ceph/common/Logger.h | 8 +- trunk/ceph/config.cc | 36 +- trunk/ceph/config.h | 9 +- trunk/ceph/ebofs/BlockDevice.cc | 14 +- trunk/ceph/ebofs/Ebofs.cc | 1 + trunk/ceph/include/Context.h | 10 +- trunk/ceph/include/buffer.h | 9 +- trunk/ceph/include/encodable.h | 312 +++- trunk/ceph/include/frag.h | 8 +- trunk/ceph/include/lru.h | 9 + trunk/ceph/include/oldbuffer.h | 358 ----- trunk/ceph/include/oldbufferlist.h | 682 --------- trunk/ceph/include/triple.h | 28 + trunk/ceph/include/types.h | 6 +- trunk/ceph/include/xlist.h | 21 +- trunk/ceph/mds/AnchorClient.cc | 52 +- trunk/ceph/mds/AnchorClient.h | 20 +- trunk/ceph/mds/CDentry.cc | 33 +- trunk/ceph/mds/CDentry.h | 50 +- trunk/ceph/mds/CDir.cc | 258 ++-- trunk/ceph/mds/CDir.h | 183 +-- trunk/ceph/mds/CInode.cc | 197 ++- trunk/ceph/mds/CInode.h | 164 +- trunk/ceph/mds/ClientMap.cc | 11 +- trunk/ceph/mds/ClientMap.h | 12 +- trunk/ceph/mds/IdAllocator.h | 1 + trunk/ceph/mds/Locker.cc | 162 +- trunk/ceph/mds/Locker.h | 21 +- trunk/ceph/mds/LogEvent.h | 24 +- trunk/ceph/mds/LogSegment.h | 69 + trunk/ceph/mds/MDBalancer.cc | 12 +- trunk/ceph/mds/MDCache.cc | 1526 +++++++++++-------- trunk/ceph/mds/MDCache.h | 215 ++- trunk/ceph/mds/MDLog.cc | 461 +++--- trunk/ceph/mds/MDLog.h | 129 +- trunk/ceph/mds/MDS.cc | 172 ++- trunk/ceph/mds/Migrator.cc | 360 +++-- trunk/ceph/mds/Migrator.h | 31 +- trunk/ceph/mds/ScatterLock.h | 14 +- trunk/ceph/mds/Server.cc | 753 +++++---- trunk/ceph/mds/Server.h | 6 +- 
trunk/ceph/mds/SimpleLock.h | 10 +- trunk/ceph/mds/events/EAnchor.h | 6 +- trunk/ceph/mds/events/EAnchorClient.h | 2 - trunk/ceph/mds/events/EMetaBlob.h | 23 +- trunk/ceph/mds/events/EOpen.h | 3 +- trunk/ceph/mds/events/EPurgeFinish.h | 18 +- trunk/ceph/mds/events/ESession.h | 4 +- trunk/ceph/mds/events/ESlaveUpdate.h | 19 +- trunk/ceph/mds/events/ESubtreeMap.h | 4 +- trunk/ceph/mds/events/EUpdate.h | 3 +- trunk/ceph/mds/journal.cc | 539 ++++--- trunk/ceph/mds/mdstypes.h | 55 +- trunk/ceph/messages/MClientMount.h | 2 +- trunk/ceph/messages/MClientReply.h | 290 ++-- trunk/ceph/messages/MClientRequestForward.h | 4 +- trunk/ceph/messages/MDiscover.h | 5 +- trunk/ceph/messages/MDiscoverReply.h | 87 +- trunk/ceph/messages/MExportDir.h | 13 +- trunk/ceph/messages/MExportDirDiscover.h | 11 +- trunk/ceph/messages/MLock.h | 32 +- trunk/ceph/messages/MMDSCacheRejoin.h | 42 +- trunk/ceph/messages/MMDSSlaveRequest.h | 38 +- trunk/ceph/mon/ClientMonitor.cc | 7 +- trunk/ceph/mon/MonMap.h | 20 +- trunk/ceph/mon/MonitorStore.cc | 8 +- trunk/ceph/mon/OSDMonitor.cc | 4 +- trunk/ceph/mon/PGMap.h | 2 + trunk/ceph/msg/SimpleMessenger.cc | 34 +- trunk/ceph/msg/SimpleMessenger.h | 33 +- trunk/ceph/msg/msg_types.h | 3 +- trunk/ceph/newsyn.cc | 3 +- trunk/ceph/osd/OSD.cc | 6 +- trunk/ceph/osd/osd_types.h | 2 +- trunk/ceph/osdc/Objecter.cc | 21 +- trunk/ceph/osdc/Objecter.h | 1 + trunk/ceph/valgrind.supp | 37 + 84 files changed, 4051 insertions(+), 3981 deletions(-) delete mode 100644 trunk/ceph/include/oldbuffer.h delete mode 100644 trunk/ceph/include/oldbufferlist.h create mode 100644 trunk/ceph/include/triple.h create mode 100644 trunk/ceph/mds/LogSegment.h diff --git a/trunk/ceph/Makefile b/trunk/ceph/Makefile index 2f664db478747..f1dc7b3d4d60a 100644 --- a/trunk/ceph/Makefile +++ b/trunk/ceph/Makefile @@ -13,7 +13,10 @@ # on issdm, it's /usr/local/mpich2/bin. # Hook for extra -I options, etc. 
-EXTRA_CFLAGS = -O3 -g #-I${HOME}/include -L${HOME}/lib +EXTRA_CFLAGS = #-I${HOME}/include -L${HOME}/lib +EXTRA_CFLAGS += -g +EXTRA_CFLAGS += -pg +EXTRA_CFLAGS += -O3 # base CFLAGS = -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} @@ -142,22 +145,22 @@ mkmonmap: mkmonmap.cc common.o extractosdmaps: extractosdmaps.cc common.o osd.o mon.o ebofs.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cmon: cmon.cc mon.o msg/SimpleMessenger.o common.o +cmon: cmon.o mon.o msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cosd: cosd.cc osd.o ebofs.o msg/SimpleMessenger.o common.o +cosd: cosd.o osd.o ebofs.o msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o +cmds: cmds.o mds.o osdc.o msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -csyn: csyn.cc client.o osdc.o msg/SimpleMessenger.o common.o +csyn: csyn.o client.o osdc.o msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cfuse: cfuse.cc client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o +cfuse: cfuse.o client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ @@ -188,10 +191,10 @@ ipc_testclient: ceph_ipc/ipc_testclient.cc ceph_ipc/ipc_client.o # fake* -fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o +fakefuse: fakefuse.o mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ -fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o +fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ diff --git a/trunk/ceph/TODO b/trunk/ceph/TODO index 6d652f1b25de1..ed581ab0b350b 
100644 --- a/trunk/ceph/TODO +++ b/trunk/ceph/TODO @@ -52,47 +52,50 @@ mdsmon - per-mds, shared standby queues -sage mds +mds bugs +- open file rejournaling vs capped log... + - open files vs shutdown in general! need to export any caps on replicated metadata +- export caps to auth on unlinked inodes +- stray purge on shutdown + +- rename slave in-memory rollback on failure -bugs to fix -- fix server unlink .. needs to use slave_requests to clean up any failures during the resolve stage - fix purge_stray bug - try_remove_unlinked_dn thing -- emetablob playback with bad linkage.. from sloppy unlink? hmm + - client session open from locker.. doesn't work properly with delays + -> journal the session open _with_ the import(start) - proper handling of cache expire messages during rejoin phase? - verify once-per-segment jouranl context is working... +mds - extend/clean up filepath to allow paths relative to an ino - fix path_traverse - fix reconnect/rejoin open file weirdness +- get rid of replicate objects for replicate_to .. encode to bufferlists directly + - stray reintegration -- stray purge on shutdown - - need to export stray crap to another mds.. - verify stray is empty on shutdown +- real chdir (directory "open") + - relative metadata ops + - consistency points/snapshots - dentry versions vs dirfrags... - - detect and deal with client failure - failure during reconnect vs clientmap. although probalby the whole thing needs a larger overhaul... - inode.max_size - inode.allocated_size - -- real chdir (directory "open") - - relative metadata ops - + - osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. -- EMetablob should return 'expired' if they have higher versions (and are thus described by a newer journal entry) - -- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in teh current log epoch in CDir... 
+- could mark dir complete in EMetaBlob by counting how many dentries are dirtied in the current log epoch in CDir... - fix rmdir empty exported dirfrag race - export all frags <= 1 item? then we ensure freezing before empty, avoiding any last unlink + export vs rmdir race. @@ -103,10 +106,16 @@ bugs to fix - need to move state from replicas to auth. simplelock doesn't currently support that. - ScatterLock or something? hrm. -- FIXME how to journal root and stray inode content? +- FIXME how to journal/store root and stray inode content? - in particular, i care about dirfragtree.. get it on rejoin? - and dir sizes, if i add that... also on rejoin? +- efficient stat for single writers +- lstat vs stat? +- add FILE_CAP_EXTEND capability bit +- only share osdmap updates with clients holding capabilities +- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) + osdmon @@ -248,29 +257,6 @@ crush - allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set >) -mds -- distributed client management -- chdir (directory opens!) -- rewrite logstream - - clean up - - be smart about rados ack vs reread - - log locking? root log object - - trimming, rotation - -- efficient stat for single writers -- lstat vs stat -- add FILE_CAP_EXTEND capability bit -- only share osdmap updates with clients holding capabilities -- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) -- finish hard links! - - reclaim danglers from inode file on discover... - - fix rename wrt hard links -- interactive hash/unhash interface -- test hashed readdir -- make logstream.flush align itself to stripes - -- carefully define/document frozen wrt dir_auth vs hashing - client @@ -297,52 +283,6 @@ why qsync could be wrong (for very strict POSIX) : varying mds -> client message -> for correct result, need to _stop_ client writers while gathering metadata. -SAGE: - -- string table? 
- -- hard links - - fix MExportAck and others to use dir+dentry, not inode - (otherwise this all breaks with hard links.. altho it probably needs reworking already!) - -- do real permission checks? - - - - - - -ISSUES - - -- discover - - soft: authority selectively repicates, or sets a 'forward' flag in reply - - hard: authority always replicates (eg. discover for export) - - forward flag (see soft) - - error flag (if file not found, etc.) - - [what was i talking about?] make sure waiters are properly triggered, either upon dir_rep update, or (empty!) discover reply - - - -DOCUMENT -- cache, distributed cache structure and invariants -- export process -- hash/unhash process - - -TEST -- hashing - - test hash/unhash operation - - hash+export: encode list of replicated dir inodes so they can be discovered before import is procesed. - - test nauthitems (wrt hashing?) - - -IMPLEMENT - -- smarter balancing - - popularity calculation and management is inconsistent/wrong. - - does it work? - - dump active config in run output somewhere diff --git a/trunk/ceph/client/Client.cc b/trunk/ceph/client/Client.cc index b4bd9a5bffc05..4a7e6baacaf5a 100644 --- a/trunk/ceph/client/Client.cc +++ b/trunk/ceph/client/Client.cc @@ -409,25 +409,19 @@ Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) * * update MDS location cache for a single inode */ -void Client::update_inode_dist(Inode *in, InodeStat *st) +void Client::update_dir_dist(Inode *in, DirStat *dst) { // auth in->dir_auth = -1; - if (st->dirfrag_auth.size() == 1) { - in->dir_auth = st->dirfrag_auth.begin()->second; + if (dst->frag == frag_t()) { + in->dir_auth = dst->auth; } else { - for (map::iterator p = st->dirfrag_auth.begin(); - p != st->dirfrag_auth.end(); - ++p) { - dout(20) << "got dirfrag map for " << in->inode.ino << " frag " << p->first << " to mds " << p->second << dendl; - in->fragmap[p->first] = p->second; - } + dout(20) << "got dirfrag map for " << in->inode.ino << " frag " << dst->frag 
<< " to mds " << dst->auth << dendl; + in->fragmap[dst->frag] = dst->auth; } // replicated - in->dir_replicated = false; - if (!st->dirfrag_rep.empty()) - in->dir_replicated = true; // FIXME + in->dir_replicated = dst->is_rep; // FIXME that's just one frag! // dist /* @@ -457,6 +451,7 @@ Inode* Client::insert_trace(MClientReply *reply) dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << dendl; list::const_iterator pdn = reply->get_trace_dn().begin(); + list::const_iterator pdir = reply->get_trace_dir().begin(); for (list::const_iterator pin = reply->get_trace_in().begin(); pin != reply->get_trace_in().end(); @@ -470,10 +465,12 @@ Inode* Client::insert_trace(MClientReply *reply) cur = root = new Inode((*pin)->inode, objectcacher); dout(10) << "insert_trace new root is " << root << dendl; inode_map[root->inode.ino] = root; + root->dir_auth = 0; } } else { // not root. Dir *dir = cur->open_dir(); + assert(pdn != reply->get_trace_dn().end()); cur = this->insert_inode(dir, *pin, *pdn); dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << " -> " << cur << dendl; ++pdn; @@ -483,14 +480,16 @@ Inode* Client::insert_trace(MClientReply *reply) lru.lru_touch(cur->dn); } - // update dist info - update_inode_dist(cur, *pin); - // set cache ttl if (g_conf.client_cache_stat_ttl) { cur->valid_until = now; cur->valid_until += g_conf.client_cache_stat_ttl; } + + // update dir dist info + if (pdir == reply->get_trace_dir().end()) break; + update_dir_dist(cur, *pdir); + ++pdir; } return cur; @@ -980,10 +979,15 @@ void Client::handle_mds_map(MMDSMap* m) mount_cond.Signal(); // mount might be waiting for this. 
} + if (m->get_epoch() < mdsmap->get_epoch()) { + dout(1) << "handle_mds_map epoch " << m->get_epoch() << " is older than our " + << mdsmap->get_epoch() << dendl; + delete m; + return; + } + dout(1) << "handle_mds_map epoch " << m->get_epoch() << dendl; - epoch_t was = mdsmap->get_epoch(); mdsmap->decode(m->get_encoded()); - assert(mdsmap->get_epoch() >= was); // send reconnect? if (frommds >= 0 && diff --git a/trunk/ceph/client/Client.h b/trunk/ceph/client/Client.h index 69da16b6e84d5..727098906c617 100644 --- a/trunk/ceph/client/Client.h +++ b/trunk/ceph/client/Client.h @@ -252,10 +252,13 @@ class Inode { if (!dirfragtree.empty()) { __gnu_cxx::hash H; frag_t fg = dirfragtree[H(dname)]; - if (fragmap.count(fg) && - fragmap[fg] >= 0) { - //cout << "picked frag ino " << inode.ino << " dname " << dname << " fg " << fg << " mds" << fragmap[fg] << std::endl; - return fragmap[fg]; + while (fg != frag_t()) { + if (fragmap.count(fg) && + fragmap[fg] >= 0) { + //cout << "picked frag ino " << inode.ino << " dname " << dname << " fg " << fg << " mds" << fragmap[fg] << std::endl; + return fragmap[fg]; + } + fg = frag_t(fg.value(), fg.bits()-1); // try more general... 
} } return authority(); @@ -682,7 +685,7 @@ protected: // metadata cache Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); - void update_inode_dist(Inode *in, InodeStat *st); + void update_dir_dist(Inode *in, DirStat *st); Inode* insert_trace(MClientReply *reply); // ---------------------- diff --git a/trunk/ceph/client/SyntheticClient.cc b/trunk/ceph/client/SyntheticClient.cc index 68c90e90a6e9c..931ea790625bb 100644 --- a/trunk/ceph/client/SyntheticClient.cc +++ b/trunk/ceph/client/SyntheticClient.cc @@ -274,6 +274,8 @@ int SyntheticClient::run() run_until = utime_t(0,0); dout(5) << "run" << dendl; + int seq = 0; + for (list::iterator it = modes.begin(); it != modes.end(); it++) { @@ -411,7 +413,7 @@ int SyntheticClient::run() break; case SYNCLIENT_MODE_MAKEDIRS: { - string sarg1 = get_sarg(0); + string sarg1 = get_sarg(seq++); int iarg1 = iargs.front(); iargs.pop_front(); int iarg2 = iargs.front(); iargs.pop_front(); int iarg3 = iargs.front(); iargs.pop_front(); @@ -1549,13 +1551,13 @@ int SyntheticClient::make_files(int num, int count, int priv, bool more) } } else { // shared - if (whoami == 0) { + if (true || whoami == 0) { for (int c=0; cmkdir(d, 0755); } } else { - sleep(5); + sleep(2); } } diff --git a/trunk/ceph/cmds.cc b/trunk/ceph/cmds.cc index 2fe19901ff767..6e475ad4b588d 100644 --- a/trunk/ceph/cmds.cc +++ b/trunk/ceph/cmds.cc @@ -101,7 +101,7 @@ int main(int argc, char **argv) mds->mds_lock.Unlock(); // done - delete mds; + //delete mds; return 0; } diff --git a/trunk/ceph/common/Logger.cc b/trunk/ceph/common/Logger.cc index c21cf31a46a40..2e7db26027a57 100644 --- a/trunk/ceph/common/Logger.cc +++ b/trunk/ceph/common/Logger.cc @@ -212,8 +212,10 @@ void Logger::_flush() } else { if (fvals[i] > 0 && vals[i] == 0) out << "\t" << fvals[i]; - else + else { + //cout << this << " p " << i << " and size is " << vals.size() << std::endl; out << "\t" << vals[i]; + } } } out << std::endl; @@ -265,6 +267,7 @@ long Logger::set(const char *key, 
long v) if (i < 0) i = type->add_set(key); maybe_resize(i+1); + //cout << this << " set " << i << " to " << v << std::endl; long r = vals[i] = v; logger_lock.Unlock(); return r; @@ -279,6 +282,7 @@ double Logger::fset(const char *key, double v) if (i < 0) i = type->add_set(key); maybe_resize(i+1); + //cout << this << " fset " << i << " to " << v << std::endl; double r = fvals[i] = v; logger_lock.Unlock(); return r; diff --git a/trunk/ceph/common/Logger.h b/trunk/ceph/common/Logger.h index 9c63671633bc0..70fc1fa978024 100644 --- a/trunk/ceph/common/Logger.h +++ b/trunk/ceph/common/Logger.h @@ -37,10 +37,10 @@ class Logger { vector< vector > vals_to_avg; void maybe_resize(unsigned s) { - if (s >= vals.size()) { - vals.resize(s); - fvals.resize(s); - vals_to_avg.resize(s); + while (s >= vals.size()) { + vals.push_back(0); + fvals.push_back(0.0); + vals_to_avg.push_back(vector()); } } diff --git a/trunk/ceph/config.cc b/trunk/ceph/config.cc index 6e4d82ea609b1..f9dea43f893a1 100644 --- a/trunk/ceph/config.cc +++ b/trunk/ceph/config.cc @@ -95,6 +95,7 @@ md_config_t g_conf = { debug_mds: 1, debug_mds_balancer: 1, debug_mds_log: 1, + debug_mds_log_expire: 1, debug_mds_migrator: 1, debug_buffer: 0, debug_timer: 0, @@ -193,7 +194,7 @@ md_config_t g_conf = { journaler_batch_max: 16384, // max bytes we'll delay flushing // --- mds --- - mds_cache_size: MDS_CACHE_SIZE, + mds_cache_size: 300000, //MDS_CACHE_SIZE, mds_cache_mid: .7, mds_decay_halflife: 5, @@ -202,11 +203,10 @@ md_config_t g_conf = { mds_beacon_grace: 15, //60*60.0, mds_log: true, - mds_log_max_len: MDS_CACHE_SIZE / 3, - mds_log_max_trimming: 10000, - mds_log_read_inc: 1<<20, + mds_log_max_events: -1, //MDS_CACHE_SIZE / 3, + mds_log_max_segments: 100, + mds_log_max_expiring: 20, mds_log_pad_entry: 128,//256,//64, - mds_log_flush_on_shutdown: true, mds_log_eopen_size: 100, // # open inodes per log entry mds_bal_sample_interval: 3.0, // every 5 seconds @@ -219,7 +219,7 @@ md_config_t g_conf = { mds_bal_merge_rd: 
1000, mds_bal_merge_wr: 1000, mds_bal_interval: 10, // seconds - mds_bal_fragment_interval: 5, // seconds + mds_bal_fragment_interval: 2, // seconds mds_bal_idle_threshold: 0, //.1, mds_bal_max: -1, mds_bal_max_until: -1, @@ -233,7 +233,6 @@ md_config_t g_conf = { mds_bal_minchunk: .001, // never take anything smaller than this mds_trim_on_rejoin: true, - mds_commit_on_shutdown: true, mds_shutdown_check: 0, //30, mds_verify_export_dirauth: true, @@ -568,6 +567,11 @@ void parse_config_options(std::vector& args) g_conf.debug_mds_log = atoi(args[++i]); else g_debug_after_conf.debug_mds_log = atoi(args[++i]); + else if (strcmp(args[i], "--debug_mds_log_expire") == 0) + if (!g_conf.debug_after) + g_conf.debug_mds_log_expire = atoi(args[++i]); + else + g_debug_after_conf.debug_mds_log_expire = atoi(args[++i]); else if (strcmp(args[i], "--debug_mds_migrator") == 0) if (!g_conf.debug_after) g_conf.debug_mds_migrator = atoi(args[++i]); @@ -680,19 +684,15 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], "--mds_log") == 0) g_conf.mds_log = atoi(args[++i]); - else if (strcmp(args[i], "--mds_log_max_len") == 0) - g_conf.mds_log_max_len = atoi(args[++i]); - else if (strcmp(args[i], "--mds_log_read_inc") == 0) - g_conf.mds_log_read_inc = atoi(args[++i]); - else if (strcmp(args[i], "--mds_log_max_trimming") == 0) - g_conf.mds_log_max_trimming = atoi(args[++i]); - - else if (strcmp(args[i], "--mds_commit_on_shutdown") == 0) - g_conf.mds_commit_on_shutdown = atoi(args[++i]); + else if (strcmp(args[i], "--mds_log_max_events") == 0) + g_conf.mds_log_max_events = atoi(args[++i]); + else if (strcmp(args[i], "--mds_log_max_segments") == 0) + g_conf.mds_log_max_segments = atoi(args[++i]); + else if (strcmp(args[i], "--mds_log_max_expiring") == 0) + g_conf.mds_log_max_expiring = atoi(args[++i]); + else if (strcmp(args[i], "--mds_shutdown_check") == 0) g_conf.mds_shutdown_check = atoi(args[++i]); - else if (strcmp(args[i], "--mds_log_flush_on_shutdown") == 0) - 
g_conf.mds_log_flush_on_shutdown = atoi(args[++i]); else if (strcmp(args[i], "--mds_decay_halflife") == 0) g_conf.mds_decay_halflife = atoi(args[++i]); diff --git a/trunk/ceph/config.h b/trunk/ceph/config.h index 4faa5b605ed3e..3c56f6af20941 100644 --- a/trunk/ceph/config.h +++ b/trunk/ceph/config.h @@ -75,6 +75,7 @@ struct md_config_t { int debug_mds; int debug_mds_balancer; int debug_mds_log; + int debug_mds_log_expire; int debug_mds_migrator; int debug_buffer; int debug_timer; @@ -195,11 +196,10 @@ struct md_config_t { float mds_beacon_grace; bool mds_log; - int mds_log_max_len; - int mds_log_max_trimming; - int mds_log_read_inc; + int mds_log_max_events; + int mds_log_max_segments; + int mds_log_max_expiring; int mds_log_pad_entry; - bool mds_log_flush_on_shutdown; int mds_log_eopen_size; float mds_bal_sample_interval; @@ -226,7 +226,6 @@ struct md_config_t { float mds_bal_minchunk; bool mds_trim_on_rejoin; - bool mds_commit_on_shutdown; int mds_shutdown_check; bool mds_verify_export_dirauth; // debug flag diff --git a/trunk/ceph/ebofs/BlockDevice.cc b/trunk/ceph/ebofs/BlockDevice.cc index 355ae39daa5ac..94c108db2612c 100644 --- a/trunk/ceph/ebofs/BlockDevice.cc +++ b/trunk/ceph/ebofs/BlockDevice.cc @@ -155,24 +155,25 @@ int BlockDevice::ElevatorQueue::dequeue_io(list& biols, // allowed? (not already submitted to kernel?) if (block_lock.intersects(bio->start, bio->length)) { - // dout(20) << "dequeue_io " << bio->start << "~" << bio->length - // << " intersects block_lock " << block_lock << dendl; + dout(20) << "dequeue_io " << bio->start << "~" << bio->length + << " intersects block_lock " << block_lock << dendl; break; // stop, or go with what we've got so far } // add to biols int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist? - if (num_iovs + nv >= IOV_MAX) break; // to many //g_conf.bdev_iov_max) break; // too many! + if (num_bio && + num_iovs + nv >= IOV_MAX) break; // to many //g_conf.bdev_iov_max) break; // too many! 
num_iovs += nv; start = MIN(start, bio->start); length += bio->length; if (el_dir_forward) { - //dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << dendl; + dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << dendl; biols.push_back(bio); // add at back } else { - // dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << dendl; + dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << dendl; biols.push_front(bio); // add at front } num_bio++; @@ -689,7 +690,8 @@ int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) left -= iov[n].iov_len; n++; - if (left == 0) break; + if (left == 0 || + n == IOV_MAX) break; } int r = ::writev(fd, iov, n); diff --git a/trunk/ceph/ebofs/Ebofs.cc b/trunk/ceph/ebofs/Ebofs.cc index 97cba8285b463..b1f6ab7539467 100644 --- a/trunk/ceph/ebofs/Ebofs.cc +++ b/trunk/ceph/ebofs/Ebofs.cc @@ -410,6 +410,7 @@ void Ebofs::prepare_super(version_t epoch, bufferptr& bp) // put in a buffer bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); + bp.zero(); memcpy(bp.c_str(), (const char*)&sb, sizeof(sb)); } diff --git a/trunk/ceph/include/Context.h b/trunk/ceph/include/Context.h index 231683bc2fe70..e5c74de6cb6e5 100644 --- a/trunk/ceph/include/Context.h +++ b/trunk/ceph/include/Context.h @@ -70,17 +70,17 @@ public: * C_Contexts - set of Contexts */ class C_Contexts : public Context { - std::list clist; - public: + std::list contexts; + void add(Context* c) { - clist.push_back(c); + contexts.push_back(c); } void take(std::list& ls) { - clist.splice(clist.end(), ls); + contexts.splice(contexts.end(), ls); } void finish(int r) { - finish_contexts(clist, r); + finish_contexts(contexts, r); } }; diff --git a/trunk/ceph/include/buffer.h b/trunk/ceph/include/buffer.h index 0e165cec82b51..5e48b6ce91bf6 100644 --- a/trunk/ceph/include/buffer.h +++ b/trunk/ceph/include/buffer.h @@ -441,6 +441,10 @@ public: unsigned get_off() { return off; } + bool 
end() { + return p == ls.end(); + } + void advance(unsigned o) { //cout << this << " advance " << o << " from " << off << " (p_off " << p_off << " in " << p->length() << ")" << std::endl; p_off += o; @@ -712,6 +716,7 @@ public: unsigned gap = append_buffer.unused_tail_length(); if (gap > 0) { if (gap > len) gap = len; + //cout << "append first char is " << data[0] << ", last char is " << data[len-1] << std::endl; append_buffer.append(data, gap); append(append_buffer, append_buffer.end() - gap, gap); // add segment to the list len -= gap; @@ -946,7 +951,9 @@ inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { // ---------------------------------------------------------- -// new encoders +// encoders + +// DEPRECATED, please use _(en|de)code_(simple|complex) // raw template diff --git a/trunk/ceph/include/encodable.h b/trunk/ceph/include/encodable.h index 5d53c80adbda0..321361866ec9b 100644 --- a/trunk/ceph/include/encodable.h +++ b/trunk/ceph/include/encodable.h @@ -24,24 +24,276 @@ #include #include + +// ================================================================== +// simple + + +// raw +template +inline void _encode_raw(const T& t, bufferlist& bl) +{ + bl.append((char*)&t, sizeof(t)); +} +template +inline void _decode_raw(T& t, bufferlist::iterator &p) +{ + p.copy(sizeof(t), (char*)&t); +} + +#include +#include +#include +#include +#include +#include + +// list +template +inline void _encode_simple(const std::list& ls, bufferlist& bl) +{ + // should i pre- or post- count? + if (!ls.empty()) { + unsigned pos = bl.length(); + uint32_t n = 0; + _encode_raw(n, bl); + for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) { + n++; + _encode_simple(*p, bl); + } + bl.copy_in(pos, sizeof(n), (char*)&n); + } else { + uint32_t n = ls.size(); // FIXME: this is slow on a list. 
+ _encode_raw(n, bl); + for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) + _encode_simple(*p, bl); + } +} +template +inline void _decode_simple(std::list& ls, bufferlist::iterator& p) +{ + uint32_t n; + _decode_raw(n, p); + ls.clear(); + while (n--) { + T v; + _decode_simple(v, p); + ls.push_back(v); + } +} + +// deque +template +inline void _encode_simple(const std::deque& ls, bufferlist& bl) +{ + uint32_t n = ls.size(); + _encode_raw(n, bl); + for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) + _encode_simple(*p, bl); +} +template +inline void _decode_simple(std::deque& ls, bufferlist::iterator& p) +{ + uint32_t n; + _decode_raw(n, p); + ls.clear(); + while (n--) { + T v; + _decode_simple(v, p); + ls.push_back(v); + } +} + +// set +template +inline void _encode_simple(const std::set& s, bufferlist& bl) +{ + uint32_t n = s.size(); + _encode_raw(n, bl); + for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) + _encode_simple(*p, bl); +} +template +inline void _decode_simple(std::set& s, bufferlist::iterator& p) +{ + uint32_t n; + _decode_raw(n, p); + s.clear(); + while (n--) { + T v; + _decode_simple(v, p); + s.insert(v); + } +} + +// vector +template +inline void _encode_simple(const std::vector& v, bufferlist& bl) +{ + uint32_t n = v.size(); + _encode_raw(n, bl); + for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) + _encode_simple(*p, bl); +} +template +inline void _decode_simple(std::vector& v, bufferlist::iterator& p) +{ + uint32_t n; + _decode_raw(n, p); + v.resize(n); + for (uint32_t i=0; i +inline void _encode_simple(const std::map& m, bufferlist& bl) +{ + uint32_t n = m.size(); + _encode_raw(n, bl); + for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { + _encode_simple(p->first, bl); + _encode_simple(p->second, bl); + } +} +template +inline void _decode_simple(std::map& m, bufferlist::iterator& p) +{ + uint32_t n; + _decode_raw(n, p); + 
m.clear(); + while (n--) { + T k; + _decode_simple(k, p); + _decode_simple(m[k], p); + } +} + +// hash_map +template +inline void _encode_simple(const __gnu_cxx::hash_map& m, bufferlist& bl) +{ + uint32_t n = m.size(); + _encode_raw(n, bl); + for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { + _encode_simple(p->first, bl); + _encode_simple(p->second, bl); + } +} +template +inline void _decode_simple(__gnu_cxx::hash_map& m, bufferlist::iterator& p) +{ + uint32_t n; + _decode_raw(n, p); + m.clear(); + while (n--) { + T k; + _decode_simple(k, p); + _decode_simple(m[k], p); + } +} + +// string +inline void _encode_simple(const std::string& s, bufferlist& bl) +{ + uint32_t len = s.length(); + _encode_raw(len, bl); + bl.append(s.data(), len); +} +inline void _decode_simple(std::string& s, bufferlist::iterator& p) +{ + uint32_t len; + _decode_raw(len, p); + s.clear(); + p.copy(len, s); +} + +// const char* (encode only, string compatible) +inline void _encode_simple(const char *s, bufferlist& bl) +{ + uint32_t len = strlen(s); + _encode_raw(len, bl); + bl.append(s, len); +} + +// bufferptr (encapsulated) +inline void _encode_simple(const buffer::ptr& bp, bufferlist& bl) +{ + uint32_t len = bp.length(); + _encode_raw(len, bl); + bl.append(bp); +} +inline void _decode_simple(buffer::ptr& bp, bufferlist::iterator& p) +{ + uint32_t len; + _decode_raw(len, p); + + bufferlist s; + p.copy(len, s); + + if (s.buffers().size() == 1) + bp = s.buffers().front(); + else + bp = buffer::copy(s.c_str(), s.length()); +} + +// bufferlist (encapsulated) +inline void _encode_simple(const bufferlist& s, bufferlist& bl) +{ + uint32_t len = s.length(); + _encode_raw(len, bl); + bl.append(s); +} +inline void _encode_simple_destructively(bufferlist& s, bufferlist& bl) +{ + uint32_t len = s.length(); + _encode_raw(len, bl); + bl.claim_append(s); +} +inline void _decode_simple(bufferlist& s, bufferlist::iterator& p) +{ + uint32_t len; + _decode_raw(len, p); + 
s.clear(); + p.copy(len, s); +} + +// base +template +inline void _encode_simple(const T& t, bufferlist& bl) +{ + _encode_raw(t, bl); +} +template +inline void _decode_simple(T& t, bufferlist::iterator& p) +{ + _decode_raw(t, p); +} + + + + +// ================================================================== +// complex + // list template inline void _encode_complex(const std::list& ls, bufferlist& bl) { uint32_t n = ls.size(); - _encoderaw(n, bl); + _encode_raw(n, bl); for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) _encode_complex(*p, bl); } template -inline void _decode_complex(std::list& ls, bufferlist& bl, int& off) +inline void _decode_complex(std::list& ls, bufferlist::iterator& p) { uint32_t n; - _decoderaw(n, bl, off); + _decode_raw(n, p); ls.clear(); while (n--) { T v; - _decode_complex(v, bl, off); + _decode_complex(v, p); ls.push_back(v); } } @@ -51,19 +303,19 @@ template inline void _encode_complex(const std::deque& ls, bufferlist& bl) { uint32_t n = ls.size(); - _encoderaw(n, bl); + _encode_raw(n, bl); for (typename std::deque::const_iterator p = ls.begin(); p != ls.end(); ++p) _encode_complex(*p, bl); } template -inline void _decode_complex(std::deque& ls, bufferlist& bl, int& off) +inline void _decode_complex(std::deque& ls, bufferlist::iterator& p) { uint32_t n; - _decoderaw(n, bl, off); + _decode_raw(n, p); ls.clear(); while (n--) { T v; - _decode_complex(v, bl, off); + _decode_complex(v, p); ls.push_back(v); } } @@ -73,19 +325,19 @@ template inline void _encode_complex(const std::set& s, bufferlist& bl) { uint32_t n = s.size(); - _encoderaw(n, bl); + _encode_raw(n, bl); for (typename std::set::const_iterator p = s.begin(); p != s.end(); ++p) _encode_complex(*p, bl); } template -inline void _decode_complex(std::set& s, bufferlist& bl, int& off) +inline void _decode_complex(std::set& s, bufferlist::iterator& p) { uint32_t n; - _decoderaw(n, bl, off); + _decode_raw(n, p); s.clear(); while (n--) { T v; - 
_decode_complex(v, bl, off); + _decode_complex(v, p); s.insert(v); } } @@ -95,18 +347,18 @@ template inline void _encode_complex(const std::vector& v, bufferlist& bl) { uint32_t n = v.size(); - _encoderaw(n, bl); + _encode_raw(n, bl); for (typename std::vector::const_iterator p = v.begin(); p != v.end(); ++p) _encode_complex(*p, bl); } template -inline void _decode_complex(std::vector& v, bufferlist& bl, int& off) +inline void _decode_complex(std::vector& v, bufferlist::iterator& p) { uint32_t n; - _decoderaw(n, bl, off); + _decode_raw(n, p); v.resize(n); for (uint32_t i=0; i inline void _encode_complex(const std::map& m, bufferlist& bl) { uint32_t n = m.size(); - _encoderaw(n, bl); + _encode_raw(n, bl); for (typename std::map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); + _encode_simple(p->first, bl); _encode_complex(p->second, bl); } } template -inline void _decode_complex(std::map& m, bufferlist& bl, int& off) +inline void _decode_complex(std::map& m, bufferlist::iterator& p) { uint32_t n; - _decoderaw(n, bl, off); + _decode_raw(n, p); m.clear(); while (n--) { T k; - _decode(k, bl, off); - _decode_complex(m[k], bl, off); + _decode_simple(k, p); + _decode_complex(m[k], p); } } @@ -138,22 +390,22 @@ template inline void _encode_complex(const __gnu_cxx::hash_map& m, bufferlist& bl) { uint32_t n = m.size(); - _encoderaw(n, bl); + _encode_raw(n, bl); for (typename __gnu_cxx::hash_map::const_iterator p = m.begin(); p != m.end(); ++p) { - _encode(p->first, bl); + _encode_simple(p->first, bl); _encode_complex(p->second, bl); } } template -inline void _decode_complex(__gnu_cxx::hash_map& m, bufferlist& bl, int& off) +inline void _decode_complex(__gnu_cxx::hash_map& m, bufferlist::iterator& p) { uint32_t n; - _decoderaw(n, bl, off); + _decode_raw(n, p); m.clear(); while (n--) { T k; - _decode(k, bl, off); - _decode_complex(m[k], bl, off); + _decode_simple(k, p); + _decode_complex(m[k], p); } } @@ -164,9 +416,9 @@ inline void 
_encode_complex(const T& t, bufferlist& bl) t._encode(bl); } template -inline void _decode_complex(T& t, bufferlist& bl, int& off) +inline void _decode_complex(T& t, bufferlist::iterator& p) { - t._decode(bl, off); + t._decode(p); } #endif diff --git a/trunk/ceph/include/frag.h b/trunk/ceph/include/frag.h index a6f5fdbde4b52..eac9d5bfa9e36 100644 --- a/trunk/ceph/include/frag.h +++ b/trunk/ceph/include/frag.h @@ -20,6 +20,7 @@ #include #include #include "buffer.h" +#include "encodable.h" /* * @@ -164,7 +165,7 @@ class fragtree_t { // pairs : // frag_t f is split by b bits. // if child frag_t does not appear, it is not split. - std::map _splits; + std::map _splits; public: // ------------- @@ -456,12 +457,15 @@ class fragtree_t { } // encoding - void _encode(bufferlist& bl) { + void _encode(bufferlist& bl) const { ::_encode(_splits, bl); } void _decode(bufferlist& bl, int& off) { ::_decode(_splits, bl, off); } + void _decode(bufferlist::iterator& p) { + ::_decode_simple(_splits, p); + } void print(std::ostream& out) { out << "fragtree_t("; diff --git a/trunk/ceph/include/lru.h b/trunk/ceph/include/lru.h index 27edff8fdc00c..40dce1aa191ab 100644 --- a/trunk/ceph/include/lru.h +++ b/trunk/ceph/include/lru.h @@ -267,6 +267,15 @@ class LRU { return true; } + void lru_touch_entire_pintail() { + // promote entire pintail to the top lru + while (lru_pintail.get_length() > 0) { + LRUObject *o = lru_pintail.get_head(); + lru_pintail.remove(o); + lru_top.insert_tail(o); + } + } + // expire -- expire a single item LRUObject *lru_get_next_expire() { diff --git a/trunk/ceph/include/oldbuffer.h b/trunk/ceph/include/oldbuffer.h deleted file mode 100644 index 12ddf688934bc..0000000000000 --- a/trunk/ceph/include/oldbuffer.h +++ /dev/null @@ -1,358 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can 
redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __BUFFER_H -#define __BUFFER_H - -#include -#include - -#include -using namespace std; - -// bit masks -#define BUFFER_MODE_NOCOPY 0 -#define BUFFER_MODE_COPY 1 // copy on create, my buffer - -#define BUFFER_MODE_NOFREE 0 -#define BUFFER_MODE_FREE 2 - -#define BUFFER_MODE_CUSTOMFREE 4 - -#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE) - - -// debug crap -#include "config.h" -#define bdbout(x) if (x <= g_conf.debug_buffer) cout - -#include "common/Mutex.h" - -// HACK: in config.cc -/* - * WARNING: bufferlock placements are tricky for efficiency. note that only bufferptr and - * buffer ever use buffer._ref, and only bufferptr should call ~buffer(). - * - * So, I only need to protect: - * - buffer()'s modification of buffer_total_alloc - * - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc - * - * I don't protect - * - buffer._get() .. increment is atomic on any sane architecture - * - buffer._put() .. only called by ~bufferptr. - * - ~buffer .. only called by ~bufferptr *** I HOPE!! - */ -extern Mutex bufferlock; -extern long buffer_total_alloc; - - -typedef void (buffer_free_func_t)(void*,char*,unsigned); - - -/* - * buffer - the underlying buffer container. with a reference count. - * - * the buffer never shrinks. 
- * - * some invariants: - * _len never shrinks - * _len <= _alloc_len - */ -class buffer { - protected: - //wtf - //static Mutex bufferlock; - //static long buffer_total_alloc;// = 0; - - private: - // raw buffer alloc - char *_dataptr; - bool _myptr; - unsigned _len; - unsigned _alloc_len; - - // ref counts - unsigned _ref; - int _get() { - bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl; - return ++_ref; - } - int _put() { - bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl; - assert(_ref > 0); - return --_ref; - } - - // custom (de!)allocator - buffer_free_func_t *free_func; - void *free_func_arg; - - friend class bufferptr; - - public: - // constructors - buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) { - bdbout(1) << "buffer.cons " << *this << endl; - } - buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) { - bdbout(1) << "buffer.cons " << *this << endl; - _dataptr = new char[a]; - bufferlock.Lock(); - buffer_total_alloc += _alloc_len; - bufferlock.Unlock(); - bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; - } - ~buffer() { - bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl; - if (free_func) { - bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl; - free_func( free_func_arg, _dataptr, _alloc_len ); - } - else if (_dataptr && _myptr) { - bdbout(1) << "buffer.free " << (void*)_dataptr << endl; - delete[] _dataptr; - buffer_total_alloc -= _alloc_len; - } - } - - buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0, - buffer_free_func_t free_func=0, void* free_func_arg=0) : - _dataptr(0), - _myptr(false), - _len(l), - _ref(0), - free_func(0), free_func_arg(0) { - - if (alloc_len) - _alloc_len = alloc_len; - else - _alloc_len = l; - - _myptr = mode & BUFFER_MODE_FREE ? 
true:false; - bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl; - if (mode & BUFFER_MODE_COPY) { - _dataptr = new char[_alloc_len]; - bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; - bufferlock.Lock(); - buffer_total_alloc += _alloc_len; - bufferlock.Unlock(); - memcpy(_dataptr, p, l); - bdbout(1) << "buffer.copy " << *this << endl; - } else { - _dataptr = (char*)p; // ugly - bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl; - } - - if (mode & BUFFER_MODE_CUSTOMFREE && free_func) { - this->free_func = free_func; - this->free_func_arg = free_func_arg; - } - } - - // operators - buffer& operator=(buffer& other) { - assert(0); // not implemented, no reasonable assignment semantics. - return *this; - } - - char *c_str() { - return _dataptr; - } - - bool has_free_func() { return free_func != 0; } - - // accessor - unsigned alloc_length() { - return _alloc_len; - } - void set_length(unsigned l) { - assert(l <= _alloc_len); - _len = l; - } - unsigned length() { return _len; } - unsigned unused_tail_length() { return _alloc_len - _len; } - - friend ostream& operator<<(ostream& out, buffer& b); -}; - -inline ostream& operator<<(ostream& out, buffer& b) { - return out << "buffer(this=" << &b << " len=" << b._len << ", alloc=" << b._alloc_len << ", data=" << (void*)b._dataptr << " ref=" << b._ref << ")"; -} - - -/* - * smart pointer class for buffer - * - * we reference count the actual buffer. - * we also let you refer to a subset of a buffer. - * we implement the high-level buffer accessor methods. 
- * - * some invariants: - * _off < _buffer->_len - * _off + _len <= _buffer->_len - */ -class bufferptr { - private: - buffer *_buffer; - unsigned _len, _off; - - public: - // empty cons - bufferptr() : - _buffer(0), - _len(0), - _off(0) { } - // main cons - the entire buffer - bufferptr(buffer *b) : - _buffer(b), - _len(b->_len), - _off(0) { - assert(_buffer->_ref == 0); - _buffer->_get(); // this is always the first one. - } - // subset cons - a subset of another bufferptr (subset) - bufferptr(const bufferptr& bp, unsigned len, unsigned off) { - bufferlock.Lock(); - _buffer = bp._buffer; - _len = len; - _off = bp._off + off; - _buffer->_get(); - assert(_off < _buffer->_len); // sanity checks - assert(_off + _len <= _buffer->_len); - bufferlock.Unlock(); - } - - // copy cons - bufferptr(const bufferptr &other) { - bufferlock.Lock(); - _buffer = other._buffer; - _len = other._len; - _off = other._off; - if (_buffer) _buffer->_get(); - bufferlock.Unlock(); - } - - // assignment operator - bufferptr& operator=(const bufferptr& other) { - //assert(0); - // discard old - discard_buffer(); - - // point to other - bufferlock.Lock(); - _buffer = other._buffer; - _len = other._len; - _off = other._off; - if (_buffer) _buffer->_get(); - bufferlock.Unlock(); - return *this; - } - - ~bufferptr() { - discard_buffer(); - } - - void discard_buffer() { - if (_buffer) { - bufferlock.Lock(); - if (_buffer->_put() == 0) - delete _buffer; - _buffer = 0; - bufferlock.Unlock(); - } - } - - - // dereference to get the actual buffer - buffer& operator*() { - return *_buffer; - } - - - bool at_buffer_head() const { - return _off == 0; - } - bool at_buffer_tail() const { - return _off + _len == _buffer->_len; - } - - // accessors for my subset - char *c_str() { - return _buffer->c_str() + _off; - } - unsigned length() const { - return _len; - } - unsigned offset() const { - return _off; - } - unsigned unused_tail_length() { - if (!at_buffer_tail()) return 0; - return 
_buffer->unused_tail_length(); - } - - - - // modifiers - void set_offset(unsigned off) { - assert(off <= _buffer->_alloc_len); - _off = off; - } - void set_length(unsigned len) { - assert(len >= 0 && _off + len <= _buffer->_alloc_len); - if (_buffer->_len < _off + len) - _buffer->_len = _off + len; // set new buffer len (_IF_ i'm expanding it) - _len = len; // my len too - } - void zero() { - //bzero((void*)c_str(), _len); - memset((void*)c_str(), 0, _len); - } - - - // crope lookalikes - void append(const char *p, unsigned len) { - assert(len + _len + _off <= _buffer->_alloc_len); // FIXME later for auto-expansion? - - // copy - memcpy(c_str() + _len, p, len); - _buffer->_len += len; - _len += len; - } - void copy_out(unsigned off, unsigned len, char *dest) { - assert(off >= 0 && off <= _len); - assert(len >= 0 && off + len <= _len); - memcpy(dest, c_str() + off, len); - } - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0 && off <= _len); - assert(len >= 0 && off + len <= _len); - memcpy(c_str() + off, src, len); - } - - friend ostream& operator<<(ostream& out, bufferptr& bp); -}; - - -inline ostream& operator<<(ostream& out, bufferptr& bp) { - return out << "bufferptr(len=" << bp._len << " off=" << bp._off - << " cstr=" << (void*)bp.c_str() - << " buf=" << *bp._buffer - << ")"; -} - - - -#endif diff --git a/trunk/ceph/include/oldbufferlist.h b/trunk/ceph/include/oldbufferlist.h deleted file mode 100644 index d6447dd6f6d20..0000000000000 --- a/trunk/ceph/include/oldbufferlist.h +++ /dev/null @@ -1,682 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - - -#ifndef __BUFFERLIST_H -#define __BUFFERLIST_H - -#include "buffer.h" - -#include -#include -#include -#include -using namespace std; - -#include -using namespace __gnu_cxx; - - -// debug crap -#include "config.h" -#define bdbout(x) if (x <= g_conf.debug_buffer) cout - - - -class bufferlist { - private: - /* local state limited to _buffers, and _len. - * we maintain _len ourselves, so we must be careful when fiddling with buffers! - */ - list _buffers; - unsigned _len; - - public: - // cons/des - bufferlist() : _len(0) { - bdbout(1) << "bufferlist.cons " << this << endl; - } - bufferlist(const bufferlist& bl) : _len(0) { - //assert(0); // o(n) and stupid - bdbout(1) << "bufferlist.cons " << this << endl; - _buffers = bl._buffers; - _len = bl._len; - } - ~bufferlist() { - bdbout(1) << "bufferlist.des " << this << endl; - } - - bufferlist& operator=(bufferlist& bl) { - //assert(0); // actually, this should be fine, just slow (O(n)) and stupid. - bdbout(1) << "bufferlist.= " << this << endl; - _buffers = bl._buffers; - _len = bl._len; - return *this; - } - - - // accessors - list& buffers() { - return _buffers; - } - //list::iterator begin() { return _buffers.begin(); } - //list::iterator end() { return _buffers.end(); } - - unsigned length() const { -#if 0 - { // DEBUG: verify _len - int len = 0; - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - len += (*it).length(); - } - assert(len == _len); - } -#endif - return _len; - } - - void _rope(crope& r) { - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - r.append((*it).c_str(), (*it).length()); - } - - // modifiers - void clear() { - _buffers.clear(); - _len = 0; - } - void push_front(bufferptr& bp) { - _buffers.push_front(bp); - _len += bp.length(); - } - void push_front(buffer *b) { - bufferptr bp(b); - _buffers.push_front(bp); - _len += bp.length(); - } - void push_back(bufferptr& bp) { - _buffers.push_back(bp); - _len += bp.length(); - } - void 
push_back(buffer *b) { - bufferptr bp(b); - - _buffers.push_back(bp); - _len += bp.length(); - - } - void zero() { - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) - it->zero(); - } - - // sort-of-like-assignment-op - void claim(bufferlist& bl) { - // free my buffers - clear(); - claim_append(bl); - } - void claim_append(bufferlist& bl) { - // steal the other guy's buffers - _len += bl._len; - _buffers.splice( _buffers.end(), bl._buffers ); - bl._len = 0; - } - - - - - // crope lookalikes - void copy(unsigned off, unsigned len, char *dest) { - assert(off >= 0); - assert(off + len <= length()); - /*assert(off < length()); - if (off + len > length()) - len = length() - off; - */ - // advance to off - list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? - if (off + len <= (*curbuf).length()) { - (*curbuf).copy_out(off, len, dest); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_out(off, howmuch, dest); - - dest += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - - void copy_in(unsigned off, unsigned len, const char *src) { - assert(off >= 0); - assert(off + len <= length()); - - // advance to off - list::iterator curbuf = _buffers.begin(); - - // skip off - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - break; - } - } - - // copy - while (len > 0) { - // is the rest ALL in this buffer? 
- if (off + len <= (*curbuf).length()) { - (*curbuf).copy_in(off, len, src); // yup, last bit! - break; - } - - // get as much as we can from this buffer. - unsigned howmuch = (*curbuf).length() - off; - (*curbuf).copy_in(off, howmuch, src); - - src += howmuch; - len -= howmuch; - off = 0; - curbuf++; - assert(curbuf != _buffers.end()); - } - } - void copy_in(unsigned off, unsigned len, bufferlist& bl) { - unsigned left = len; - for (list::iterator i = bl._buffers.begin(); - i != bl._buffers.end(); - i++) { - unsigned l = (*i).length(); - if (left < l) l = left; - copy_in(off, l, (*i).c_str()); - left -= l; - if (left == 0) break; - off += l; - } - } - - - void append(const char *data, unsigned len) { - if (len == 0) return; - - unsigned alen = 0; - - // copy into the tail buffer? - if (!_buffers.empty()) { - unsigned avail = _buffers.back().unused_tail_length(); - if (avail > 0) { - //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl; - if (avail > len) - avail = len; - unsigned blen = _buffers.back().length(); - memcpy(_buffers.back().c_str() + blen, data, avail); - blen += avail; - _buffers.back().set_length(blen); - _len += avail; - data += avail; - len -= avail; - } - alen = _buffers.back().length(); - } - if (len == 0) return; - - // just add another buffer. - // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter! - if (alen < 1024) alen = 1024; - push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen)); - } - void append(bufferptr& bp) { - push_back(bp); - } - void append(bufferptr& bp, unsigned len, unsigned off) { - bufferptr tempbp(bp, len, off); - push_back(tempbp); - } - void append(const bufferlist& bl) { - bufferlist temp = bl; // copy list - claim_append(temp); // and append - } - - - /* - * return a contiguous ptr to whole bufferlist contents. - */ - char *c_str() { - if (_buffers.size() == 1) { - return _buffers.front().c_str(); // good, we're already contiguous. 
- } - else if (_buffers.size() == 0) { - return 0; // no buffers - } - else { - // make one new contiguous buffer. - bufferptr newbuf = new buffer(length()); - unsigned off = 0; - - for (list::iterator it = _buffers.begin(); - it != _buffers.end(); - it++) { - //assert((*(*it)).has_free_func() == false); // not allowed if there's a funky free_func.. -sage ...for debugging at least! - memcpy(newbuf.c_str() + off, - (*it).c_str(), (*it).length()); - off += (*it).length(); - } - assert(off == newbuf.length()); - - _buffers.clear(); - _buffers.push_back( newbuf ); - - // now it'll work. - return c_str(); - } - } - - - void substr_of(bufferlist& other, unsigned off, unsigned len) { - assert(off + len <= other.length()); - clear(); - - // skip off - list::iterator curbuf = other._buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "somewhere in " << *curbuf << endl; - break; - } - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "copying partial of " << *curbuf << endl; - _buffers.push_back( bufferptr( *curbuf, len, off ) ); - _len += len; - break; - } - - // through end - //cout << "copying end (all?) of " << *curbuf << endl; - unsigned howmuch = (*curbuf).length() - off; - _buffers.push_back( bufferptr( *curbuf, howmuch, off ) ); - _len += howmuch; - len -= howmuch; - off = 0; - curbuf++; - } - } - - // funky modifer - void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme? - assert(off < length()); - assert(len > 0); - //cout << "splice off " << off << " len " << len << " ... 
mylen = " << length() << endl; - - // skip off - list::iterator curbuf = _buffers.begin(); - while (off > 0) { - assert(curbuf != _buffers.end()); - if (off >= (*curbuf).length()) { - // skip this buffer - //cout << "off = " << off << " skipping over " << *curbuf << endl; - off -= (*curbuf).length(); - curbuf++; - } else { - // somewhere in this buffer! - //cout << "off = " << off << " somewhere in " << *curbuf << endl; - break; - } - } - assert(off >= 0); - - if (off) { - // add a reference to the front bit - // insert it before curbuf (which we'll hose) - //cout << "keeping front " << off << " of " << *curbuf << endl; - _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) ); - _len += off; - } - - while (len > 0) { - // partial? - if (off + len < (*curbuf).length()) { - //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl; - if (claim_by) - claim_by->append( *curbuf, len, off ); - (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big - (*curbuf).set_length( (*curbuf).length() - (len+off) ); - _len -= off+len; - //cout << " now " << *curbuf << endl; - break; - } - - // hose though the end - unsigned howmuch = (*curbuf).length() - off; - //cout << "discarding " << howmuch << " of " << *curbuf << endl; - if (claim_by) - claim_by->append( *curbuf, howmuch, off ); - _len -= (*curbuf).length(); - _buffers.erase( curbuf++ ); - len -= howmuch; - off = 0; - } - - // splice in *replace (implement me later?) 
- } - - friend ostream& operator<<(ostream& out, bufferlist& bl); - -}; - -inline ostream& operator<<(ostream& out, bufferlist& bl) { - out << "bufferlist(len=" << bl.length() << endl; - for (list::iterator it = bl._buffers.begin(); - it != bl._buffers.end(); - it++) - out << "\t" << *it << endl; - out << ")" << endl; - return out; -} - - - -// encoder/decode helpers - -// string -inline void _encode(const string& s, bufferlist& bl) -{ - bl.append(s.c_str(), s.length()+1); -} -inline void _decode(string& s, bufferlist& bl, int& off) -{ - s = bl.c_str() + off; - off += s.length() + 1; -} - -// bufferptr (encapsulated) -inline void _encode(bufferptr& bp, bufferlist& bl) -{ - size_t len = bp.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(bp); -} -inline void _decode(bufferptr& bp, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - bufferlist s; - s.substr_of(bl, off, len); - off += len; - - if (s.buffers().size() == 1) - bp = s.buffers().front(); - else - bp = new buffer(s.c_str(), s.length()); -} - -// bufferlist (encapsulated) -inline void _encode(const bufferlist& s, bufferlist& bl) -{ - size_t len = s.length(); - bl.append((char*)&len, sizeof(len)); - bl.append(s); -} -inline void _decode(bufferlist& s, bufferlist& bl, int& off) -{ - size_t len; - bl.copy(off, sizeof(len), (char*)&len); - off += sizeof(len); - s.substr_of(bl, off, len); - off += len; -} - - -// set -template -inline void _encode(set& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename set::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(set& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(vector& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, 
sizeof(n)); - for (typename vector::iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(vector& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - s = vector(n); - for (int i=0; i -template -inline void _encode(const list& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename list::const_iterator it = s.begin(); - it != s.end(); - it++) { - T v = *it; - bl.append((char*)&v, sizeof(v)); - n--; - } - assert(n==0); -} -template -inline void _decode(list& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -inline void _encode(map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (map::iterator it = s.begin(); - it != s.end(); - it++) { - _encode(it->first, bl); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - bl.append((char*)&k, sizeof(k)); - _encode(it->second, bl); - n--; - } - assert(n==0); -} -template -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i -template -inline void _encode(const map& s, bufferlist& bl) -{ - int n = s.size(); - bl.append((char*)&n, sizeof(n)); - for (typename map::const_iterator it = s.begin(); - it != s.end(); - it++) { - T k = it->first; - U v = it->second; - bl.append((char*)&k, sizeof(k)); - bl.append((char*)&v, sizeof(v)); - n--; - } - 
assert(n==0); -} -template -inline void _decode(map& s, bufferlist& bl, int& off) -{ - s.clear(); - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __TRIPLE_H +#define __TRIPLE_H + +template +class triple { + public: + A first; + B second; + C third; + triple() {} + triple(A f, B s, C t) : first(f), second(s), third(t) {} +}; + +#endif diff --git a/trunk/ceph/include/types.h b/trunk/ceph/include/types.h index 088417acc20ac..92bcb94c6dc5f 100644 --- a/trunk/ceph/include/types.h +++ b/trunk/ceph/include/types.h @@ -242,13 +242,13 @@ struct inode_t { // base (immutable) inodeno_t ino; FileLayout layout; // ?immutable? - dev_t rdev; // if special file + uint32_t rdev; // if special file // affected by any inode change... utime_t ctime; // inode change time // perm (namespace permissions) - mode_t mode; + uint32_t mode; uid_t uid; gid_t gid; @@ -257,7 +257,7 @@ struct inode_t { bool anchored; // auth only? // file (data access) - off_t size, max_size, allocated_size; + int64_t size, max_size, allocated_size; utime_t mtime; // file data modify time. utime_t atime; // file data access time. 
diff --git a/trunk/ceph/include/xlist.h b/trunk/ceph/include/xlist.h index 35099311ef666..2ea2cbec6c815 100644 --- a/trunk/ceph/include/xlist.h +++ b/trunk/ceph/include/xlist.h @@ -24,12 +24,15 @@ public: xlist *_head; item(T i) : _item(i), _prev(0), _next(0), _head(0) {} + ~item() { + remove_myself(); + } xlist* get_xlist() { return _head; } void remove_myself() { - if (_head) { + if (_head) _head->remove(this); - } + assert(_head == 0); } }; @@ -39,6 +42,11 @@ private: public: xlist() : _front(0), _back(0), _size(0) {} + ~xlist() { + assert(_size == 0); + assert(_front == 0); + assert(_back == 0); + } int size() { return _size; } bool empty() { @@ -84,6 +92,15 @@ public: T front() { return (T)_front->_item; } T back() { return (T)_back->_item; } + void pop_front() { + assert(!empty()); + remove(_front); + } + void pop_back() { + assert(!empty()); + remove(_back); + } + class iterator { private: item *cur; diff --git a/trunk/ceph/mds/AnchorClient.cc b/trunk/ceph/mds/AnchorClient.cc index 3ae9db25ffd2e..b2fb1fb50d7bd 100644 --- a/trunk/ceph/mds/AnchorClient.cc +++ b/trunk/ceph/mds/AnchorClient.cc @@ -25,6 +25,7 @@ using std::cerr; #include "MDS.h" #include "MDLog.h" +#include "LogSegment.h" #include "events/EAnchorClient.h" #include "messages/MAnchor.h" @@ -79,8 +80,6 @@ void AnchorClient::handle_anchor_reply(class MAnchor *m) *pending_create_prepare[ino].patid = atid; pending_create_prepare.erase(ino); - pending_commit.insert(atid); - if (onfinish) { onfinish->finish(0); delete onfinish; @@ -115,8 +114,6 @@ void AnchorClient::handle_anchor_reply(class MAnchor *m) *pending_destroy_prepare[ino].patid = atid; pending_destroy_prepare.erase(ino); - pending_commit.insert(atid); - if (onfinish) { onfinish->finish(0); delete onfinish; @@ -151,8 +148,6 @@ void AnchorClient::handle_anchor_reply(class MAnchor *m) *pending_update_prepare[ino].patid = atid; pending_update_prepare.erase(ino); - pending_commit.insert(atid); - if (onfinish) { onfinish->finish(0); delete onfinish; 
@@ -187,17 +182,11 @@ void AnchorClient::handle_anchor_reply(class MAnchor *m) // remove from committing list assert(pending_commit.count(atid)); - pending_commit.erase(atid); - + assert(pending_commit[atid]->pending_commit_atids.count(atid)); + // log ACK. - mds->mdlog->submit_entry(new EAnchorClient(ANCHOR_OP_ACK, atid)); - - // kick any waiters - if (ack_waiters.count(atid)) { - dout(15) << "kicking waiters on atid " << atid << dendl; - mds->queue_waiters(ack_waiters[atid]); - ack_waiters.erase(atid); - } + mds->mdlog->submit_entry(new EAnchorClient(ANCHOR_OP_ACK, atid), + new C_LoggedAck(this, atid)); } break; @@ -209,6 +198,24 @@ void AnchorClient::handle_anchor_reply(class MAnchor *m) } +void AnchorClient::_logged_ack(version_t atid) +{ + dout(10) << "_logged_ack" << dendl; + + assert(pending_commit.count(atid)); + assert(pending_commit[atid]->pending_commit_atids.count(atid)); + + pending_commit[atid]->pending_commit_atids.erase(atid); + pending_commit.erase(atid); + + // kick any waiters (LogSegment trim) + if (ack_waiters.count(atid)) { + dout(15) << "kicking ack waiters on atid " << atid << dendl; + mds->queue_waiters(ack_waiters[atid]); + ack_waiters.erase(atid); + } +} + /* * public async interface @@ -291,12 +298,13 @@ void AnchorClient::prepare_update(inodeno_t ino, vector& trace, // COMMIT -void AnchorClient::commit(version_t atid) +void AnchorClient::commit(version_t atid, LogSegment *ls) { dout(10) << "commit " << atid << dendl; - assert(pending_commit.count(atid)); - pending_commit.insert(atid); + assert(pending_commit.count(atid) == 0); + pending_commit[atid] = ls; + ls->pending_commit_atids.insert(atid); // send message MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, atid); @@ -318,11 +326,11 @@ void AnchorClient::finish_recovery() void AnchorClient::resend_commits() { - for (set::iterator p = pending_commit.begin(); + for (map::iterator p = pending_commit.begin(); p != pending_commit.end(); ++p) { - dout(10) << "resending commit on " << *p << 
dendl; - MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, *p); + dout(10) << "resending commit on " << p->first << dendl; + MAnchor *req = new MAnchor(ANCHOR_OP_COMMIT, 0, p->first); mds->send_message_mds(req, mds->mdsmap->get_anchortable(), MDS_PORT_ANCHORTABLE, MDS_PORT_ANCHORCLIENT); diff --git a/trunk/ceph/mds/AnchorClient.h b/trunk/ceph/mds/AnchorClient.h index 6ec5603b0bc7e..fd790f39c399d 100644 --- a/trunk/ceph/mds/AnchorClient.h +++ b/trunk/ceph/mds/AnchorClient.h @@ -27,6 +27,7 @@ using __gnu_cxx::hash_map; class Context; class MDS; +class LogSegment; class AnchorClient : public Dispatcher { MDS *mds; @@ -49,11 +50,22 @@ class AnchorClient : public Dispatcher { hash_map pending_update_prepare; // pending commits - set pending_commit; + map pending_commit; map > ack_waiters; void handle_anchor_reply(class MAnchor *m); + class C_LoggedAck : public Context { + AnchorClient *ac; + version_t atid; + public: + C_LoggedAck(AnchorClient *a, version_t t) : ac(a), atid(t) {} + void finish(int r) { + ac->_logged_ack(atid); + } + }; + void _logged_ack(version_t atid); + public: AnchorClient(MDS *m) : mds(m) {} @@ -66,7 +78,7 @@ public: void prepare_destroy(inodeno_t ino, version_t *atid, Context *onfinish); void prepare_update(inodeno_t ino, vector& trace, version_t *atid, Context *onfinish); - void commit(version_t atid); + void commit(version_t atid, LogSegment *ls); // for recovery (by other nodes) void handle_mds_recovery(int mds); // called when someone else recovers @@ -75,8 +87,8 @@ public: void resend_prepares(hash_map& prepares, int op); // for recovery (by me) - void got_journaled_agree(version_t atid) { - pending_commit.insert(atid); + void got_journaled_agree(version_t atid, LogSegment *ls) { + pending_commit[atid] = ls; } void got_journaled_ack(version_t atid) { pending_commit.erase(atid); diff --git a/trunk/ceph/mds/CDentry.cc b/trunk/ceph/mds/CDentry.cc index 415fe986da273..2b6bb3470e8a8 100644 --- a/trunk/ceph/mds/CDentry.cc +++ 
b/trunk/ceph/mds/CDentry.cc @@ -21,6 +21,7 @@ #include "MDS.h" #include "MDCache.h" +#include "LogSegment.h" #include "messages/MLock.h" @@ -119,7 +120,7 @@ pair CDentry::authority() void CDentry::add_waiter(int tag, Context *c) { // wait on the directory? - if (tag & (WAIT_AUTHPINNABLE|WAIT_SINGLEAUTH)) { + if (tag & (WAIT_UNFREEZE|WAIT_SINGLEAUTH)) { dir->add_waiter(tag, c); return; } @@ -135,27 +136,30 @@ version_t CDentry::pre_dirty(version_t min) } -void CDentry::_mark_dirty() +void CDentry::_mark_dirty(LogSegment *ls) { // state+pin if (!state_test(STATE_DIRTY)) { state_set(STATE_DIRTY); dir->inc_num_dirty(); get(PIN_DIRTY); + assert(ls); } + if (ls) + ls->dirty_dentries.push_back(&xlist_dirty); } -void CDentry::mark_dirty(version_t pv) +void CDentry::mark_dirty(version_t pv, LogSegment *ls) { dout(10) << " mark_dirty " << *this << dendl; // i now live in this new dir version assert(pv <= projected_version); version = pv; - _mark_dirty(); + _mark_dirty(ls); // mark dir too - dir->mark_dirty(pv); + dir->mark_dirty(pv, ls); } @@ -170,6 +174,8 @@ void CDentry::mark_clean() dir->dec_num_dirty(); put(PIN_DIRTY); + xlist_dirty.remove_myself(); + if (state_test(STATE_NEW)) state_clear(STATE_NEW); } @@ -262,6 +268,11 @@ void CDentry::auth_pin() if (auth_pins == 0) get(PIN_AUTHPIN); auth_pins++; + + dout(10) << "auth_pin on " << *this + << " now " << auth_pins << "+" << nested_auth_pins + << dendl; + dir->adjust_nested_auth_pins(1); } @@ -270,12 +281,24 @@ void CDentry::auth_unpin() auth_pins--; if (auth_pins == 0) put(PIN_AUTHPIN); + + dout(10) << "auth_unpin on " << *this + << " now " << auth_pins << "+" << nested_auth_pins + << dendl; + assert(auth_pins >= 0); + dir->adjust_nested_auth_pins(-1); } void CDentry::adjust_nested_auth_pins(int by) { nested_auth_pins += by; + + dout(15) << "adjust_nested_auth_pins by " << by + << " now " << auth_pins << "+" << nested_auth_pins + << dendl; + assert(nested_auth_pins >= 0); + dir->adjust_nested_auth_pins(by); } diff --git 
a/trunk/ceph/mds/CDentry.h b/trunk/ceph/mds/CDentry.h index d120a1a07ec9f..416792beb8778 100644 --- a/trunk/ceph/mds/CDentry.h +++ b/trunk/ceph/mds/CDentry.h @@ -25,6 +25,7 @@ using namespace std; #include "include/types.h" #include "include/buffer.h" #include "include/lru.h" +#include "include/xlist.h" #include "mdstypes.h" #include "SimpleLock.h" @@ -38,6 +39,8 @@ class CDentryDiscover; class Anchor; class CDentry; +class LogSegment; + // define an ordering bool operator<(const CDentry& l, const CDentry& r); @@ -83,6 +86,8 @@ class CDentry : public MDSCacheObject, public LRUObject { version_t version; // dir version when last touched. version_t projected_version; // what it will be when i unlock/commit. + xlist::item xlist_dirty; + off_t dir_offset; int auth_pins, nested_auth_pins; @@ -109,6 +114,7 @@ public: remote_ino(0), remote_d_type(0), inode(0), dir(0), version(0), projected_version(0), + xlist_dirty(this), dir_offset(0), auth_pins(0), nested_auth_pins(0), lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } @@ -117,6 +123,7 @@ public: remote_ino(0), remote_d_type(0), inode(in), dir(0), version(0), projected_version(0), + xlist_dirty(this), dir_offset(0), auth_pins(0), nested_auth_pins(0), lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } @@ -125,6 +132,7 @@ public: remote_ino(ino), remote_d_type(dt), inode(in), dir(0), version(0), projected_version(0), + xlist_dirty(this), dir_offset(0), auth_pins(0), nested_auth_pins(0), lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } @@ -189,8 +197,8 @@ public: pair authority(); version_t pre_dirty(version_t min=0); - void _mark_dirty(); - void mark_dirty(version_t projected_dirv); + void _mark_dirty(LogSegment *ls); + void mark_dirty(version_t projected_dirv, LogSegment *ls); void mark_clean(); void mark_new(); @@ -203,41 +211,41 @@ public: // -- exporting // note: this assumes the dentry already exists. // i.e., the name is already extracted... so we just need the other state. 
- void encode_export_state(bufferlist& bl) { - bl.append((char*)&state, sizeof(state)); - bl.append((char*)&version, sizeof(version)); - bl.append((char*)&projected_version, sizeof(projected_version)); + void encode_export(bufferlist& bl) { + ::_encode_simple(state, bl); + ::_encode_simple(version, bl); + ::_encode_simple(projected_version, bl); lock._encode(bl); - ::_encode(replica_map, bl); - + ::_encode_simple(replica_map, bl); + get(PIN_TEMPEXPORTING); + } + void finish_export() { // twiddle clear_replica_map(); replica_nonce = EXPORT_NONCE; state_clear(CDentry::STATE_AUTH); if (is_dirty()) mark_clean(); + put(PIN_TEMPEXPORTING); + } + void abort_export() { + put(PIN_TEMPEXPORTING); } - void decode_import_state(bufferlist& bl, int& off, int from, int to) { + void decode_import(bufferlist::iterator& blp, LogSegment *ls) { int nstate; - bl.copy(off, sizeof(nstate), (char*)&nstate); - off += sizeof(nstate); - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - bl.copy(off, sizeof(projected_version), (char*)&projected_version); - off += sizeof(projected_version); - lock._decode(bl, off); - ::_decode(replica_map, bl, off); + ::_decode_simple(nstate, blp); + ::_decode_simple(version, blp); + ::_decode_simple(projected_version, blp); + lock._decode(blp); + ::_decode_simple(replica_map, blp); // twiddle state = 0; state_set(CDentry::STATE_AUTH); if (nstate & STATE_DIRTY) - _mark_dirty(); + _mark_dirty(ls); if (!replica_map.empty()) get(PIN_REPLICATED); - add_replica(from, EXPORT_NONCE); - if (is_replica(to)) - remove_replica(to); } // -- locking -- diff --git a/trunk/ceph/mds/CDir.cc b/trunk/ceph/mds/CDir.cc index a582fb7aae942..b4663b269c659 100644 --- a/trunk/ceph/mds/CDir.cc +++ b/trunk/ceph/mds/CDir.cc @@ -22,6 +22,7 @@ #include "MDS.h" #include "MDCache.h" #include "MDSMap.h" +#include "LogSegment.h" #include "include/Context.h" #include "common/Clock.h" @@ -118,7 +119,8 @@ ostream& CDir::print_db_line_prefix(ostream& out) // 
------------------------------------------------------------------- // CDir -CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) +CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) : + xlist_dirty(this) { inode = in; frag = fg; @@ -495,7 +497,6 @@ void CDir::purge_stolen(list& waiters) if (is_auth()) clear_replica_map(); if (is_dirty()) mark_clean(); - if (state_test(STATE_EXPORT)) put(PIN_EXPORT); if (state_test(STATE_IMPORTBOUND)) put(PIN_IMPORTBOUND); if (state_test(STATE_EXPORTBOUND)) put(PIN_EXPORTBOUND); @@ -508,7 +509,6 @@ void CDir::init_fragment_pins() { if (!replica_map.empty()) get(PIN_REPLICATED); if (state_test(STATE_DIRTY)) get(PIN_DIRTY); - if (state_test(STATE_EXPORT)) get(PIN_EXPORT); if (state_test(STATE_EXPORTBOUND)) get(PIN_EXPORTBOUND); if (state_test(STATE_IMPORTBOUND)) get(PIN_IMPORTBOUND); } @@ -646,6 +646,43 @@ void CDir::take_dentry_waiting(const string& dname, list& ls) put(PIN_DNWAITER); } +void CDir::add_ino_waiter(inodeno_t ino, Context *c) +{ + if (waiting_on_ino.empty()) + get(PIN_INOWAITER); + waiting_on_ino[ino].push_back(c); + dout(10) << "add_ino_waiter ino " << ino << " " << c << " on " << *this << dendl; +} + +void CDir::take_ino_waiting(inodeno_t ino, list& ls) +{ + if (waiting_on_ino.empty()) return; + if (waiting_on_ino.count(ino) == 0) return; + dout(10) << "take_ino_waiting ino " << ino + << " x " << waiting_on_ino[ino].size() + << " on " << *this << dendl; + ls.splice(ls.end(), waiting_on_ino[ino]); + waiting_on_ino.erase(ino); + if (waiting_on_ino.empty()) + put(PIN_INOWAITER); +} + +void CDir::take_sub_waiting(list& ls) +{ + dout(10) << "take_sub_waiting" << dendl; + for (hash_map >::iterator p = waiting_on_dentry.begin(); + p != waiting_on_dentry.end(); + ++p) + ls.splice(ls.end(), p->second); + waiting_on_dentry.clear(); + for (hash_map >::iterator p = waiting_on_ino.begin(); + p != waiting_on_ino.end(); + ++p) + ls.splice(ls.end(), p->second); + waiting_on_ino.clear(); +} + + void 
CDir::add_waiter(int tag, Context *c) { @@ -700,8 +737,10 @@ void CDir::finish_waiting(int mask, int result) list finished; take_waiting(mask, finished); - //finish_contexts(finished, result); - cache->mds->queue_waiters(finished); + if (result < 0) + finish_contexts(finished, result); + else + cache->mds->queue_waiters(finished); } @@ -717,22 +756,25 @@ version_t CDir::pre_dirty(version_t min) return projected_version; } -void CDir::_mark_dirty() +void CDir::_mark_dirty(LogSegment *ls) { if (!state_test(STATE_DIRTY)) { state_set(STATE_DIRTY); dout(10) << "mark_dirty (was clean) " << *this << " version " << version << dendl; get(PIN_DIRTY); + assert(ls); } else { dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << dendl; } + if (ls) + ls->dirty_dirfrags.push_back(&xlist_dirty); } -void CDir::mark_dirty(version_t pv) +void CDir::mark_dirty(version_t pv, LogSegment *ls) { assert(version < pv); version = pv; - _mark_dirty(); + _mark_dirty(ls); } void CDir::mark_clean() @@ -741,6 +783,8 @@ void CDir::mark_clean() if (state_test(STATE_DIRTY)) { state_clear(STATE_DIRTY); put(PIN_DIRTY); + + xlist_dirty.remove_myself(); } } @@ -787,7 +831,7 @@ void CDir::fetch(Context *c, bool ignore_authpinnability) if (!can_auth_pin() && !ignore_authpinnability) { dout(7) << "fetch waiting for authpinnable" << dendl; - add_waiter(WAIT_AUTHPINNABLE, c); + add_waiter(WAIT_UNFREEZE, c); return; } @@ -1231,6 +1275,63 @@ void CDir::_committed(version_t v) +// IMPORT/EXPORT + +void CDir::encode_export(bufferlist& bl) +{ + ::_encode_simple(version, bl); + ::_encode_simple(committed_version, bl); + ::_encode_simple(committed_version_equivalent, bl); + + ::_encode_simple(state, bl); + ::_encode_simple(dir_rep, bl); + + ::_encode_simple(pop_me, bl); + ::_encode_simple(pop_auth_subtree, bl); + + ::_encode_simple(dir_rep_by, bl); + ::_encode_simple(replica_map, bl); + + get(PIN_TEMPEXPORTING); +} + +void CDir::finish_export(utime_t now) +{ + pop_auth_subtree_nested -= 
pop_auth_subtree; + pop_me.zero(now); + pop_auth_subtree.zero(now); + put(PIN_TEMPEXPORTING); +} + +void CDir::decode_import(bufferlist::iterator& blp) +{ + ::_decode_simple(version, blp); + ::_decode_simple(committed_version, blp); + ::_decode_simple(committed_version_equivalent, blp); + committing_version = committed_version; + projected_version = version; + + unsigned s; + ::_decode_simple(s, blp); + state &= MASK_STATE_IMPORT_KEPT; + state |= (s & MASK_STATE_EXPORTED); + if (is_dirty()) get(PIN_DIRTY); + + ::_decode_simple(dir_rep, blp); + + ::_decode_simple(pop_me, blp); + ::_decode_simple(pop_auth_subtree, blp); + pop_auth_subtree_nested += pop_auth_subtree; + + ::_decode_simple(dir_rep_by, blp); + ::_decode_simple(replica_map, blp); + if (!replica_map.empty()) get(PIN_REPLICATED); + + replica_nonce = 0; // no longer defined +} + + + /******************************** * AUTHORITY @@ -1354,7 +1455,7 @@ void CDir::auth_pin() get(PIN_AUTHPIN); auth_pins++; - dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; + dout(10) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; // nest pins? if (is_subtree_root()) return; // no. @@ -1369,14 +1470,10 @@ void CDir::auth_unpin() if (auth_pins == 0) put(PIN_AUTHPIN); - dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; + dout(10) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; assert(auth_pins >= 0); - // pending freeze? - if (state_test(STATE_FREEZINGTREE|STATE_FREEZINGDIR) && - auth_pins == 1 && - nested_auth_pins == 0) - finish_waiting(WAIT_FREEZEABLE); + maybe_finish_freeze(); // pending freeze? // nest? if (is_subtree_root()) return; // no. 
@@ -1389,15 +1486,11 @@ void CDir::adjust_nested_auth_pins(int inc) { nested_auth_pins += inc; - dout(10) << "adjust_nested_auth_pins " << inc << " on " << *this + dout(15) << "adjust_nested_auth_pins " << inc << " on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; assert(nested_auth_pins >= 0); - - // pending freeze? - if (state_test(STATE_FREEZINGTREE|STATE_FREEZINGDIR) && - auth_pins == 1 && - nested_auth_pins == 0) - finish_waiting(WAIT_FREEZEABLE); + + maybe_finish_freeze(); // pending freeze? // adjust my inode? if (is_subtree_root()) @@ -1415,45 +1508,26 @@ void CDir::adjust_nested_auth_pins(int inc) // FREEZE TREE -class C_MDS_FreezeTree : public Context { - CDir *dir; - Context *con; -public: - C_MDS_FreezeTree(CDir *dir, Context *c) { - this->dir = dir; - this->con = c; - } - virtual void finish(int r) { - dir->freeze_tree_finish(con); - } -}; - -void CDir::freeze_tree(Context *c) +bool CDir::freeze_tree() { assert(!is_frozen()); assert(!is_freezing()); auth_pin(); - - if (is_freezeable()) { + if (is_freezeable(true)) { _freeze_tree(); auth_unpin(); - if (c) { - c->finish(0); - delete c; - } + return true; } else { state_set(STATE_FREEZINGTREE); dout(10) << "freeze_tree waiting " << *this << dendl; - add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); - } + return false; + } } void CDir::_freeze_tree() { dout(10) << "_freeze_tree " << *this << dendl; - - // there shouldn't be any conflicting auth_pins (except the 'freezing' one) assert(is_freezeable(true)); // twiddle state @@ -1466,33 +1540,6 @@ void CDir::_freeze_tree() inode->auth_pin(); } -void CDir::freeze_tree_finish(Context *c) -{ - // still freezing? (we may have been canceled) - if (!is_freezing()) { - dout(10) << "freeze_tree_finish no longer freezing, done on " << *this << dendl; - c->finish(-1); - delete c; - return; - } - - // freezeable now? 
- if (!is_freezeable(true)) { - dout(10) << "freeze_tree_finish still waiting " << *this << dendl; - assert(state_test(STATE_FREEZINGTREE)); - add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); - return; - } - - dout(10) << "freeze_tree_finish " << *this << dendl; - _freeze_tree(); - auth_unpin(); - if (c) { - c->finish(0); - delete c; - } -} - void CDir::unfreeze_tree() { dout(10) << "unfreeze_tree " << *this << dendl; @@ -1509,14 +1556,14 @@ void CDir::unfreeze_tree() // waiters? finish_waiting(WAIT_UNFREEZE); } else { + finish_waiting(WAIT_FROZEN, -1); + // freezing. stop it. assert(state_test(STATE_FREEZINGTREE)); state_clear(STATE_FREEZINGTREE); auth_unpin(); - // cancel freeze waiters finish_waiting(WAIT_UNFREEZE); - finish_waiting(WAIT_FREEZEABLE, -1); } } @@ -1564,43 +1611,26 @@ CDir *CDir::get_frozen_tree_root() // FREEZE DIR -class C_MDS_FreezeDir : public Context { - CDir *dir; - Context *con; -public: - C_MDS_FreezeDir(CDir *dir, Context *c) { - this->dir = dir; - this->con = c; - } - virtual void finish(int r) { - dir->freeze_dir_finish(con); - } -}; - -void CDir::freeze_dir(Context *c) +bool CDir::freeze_dir() { assert(!is_frozen()); assert(!is_freezing()); auth_pin(); - if (is_freezeable_dir()) { + if (is_freezeable_dir(true)) { _freeze_dir(); auth_unpin(); - if (c) { - c->finish(0); - delete c; - } + return true; } else { state_set(STATE_FREEZINGDIR); dout(10) << "freeze_dir + wait " << *this << dendl; - add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); + return false; } } void CDir::_freeze_dir() -{ +{ dout(10) << "_freeze_dir " << *this << dendl; - assert(is_freezeable_dir(true)); state_clear(STATE_FREEZINGDIR); @@ -1611,33 +1641,6 @@ void CDir::_freeze_dir() inode->auth_pin(); // auth_pin for duration of freeze } -void CDir::freeze_dir_finish(Context *c) -{ - // still freezing? 
(we may have been canceled) - if (!is_freezing()) { - dout(10) << "freeze_dir_finish no longer freezing, done on " << *this << dendl; - c->finish(-1); - delete c; - return; - } - - // freezeable now? - if (!is_freezeable_dir(true)) { - dout(10) << "freeze_dir_finish still waiting " << *this << dendl; - state_set(STATE_FREEZINGDIR); - add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); - return; - } - - // freeze now - dout(10) << "freeze_dir_finish " << *this << dendl; - _freeze_dir(); - auth_unpin(); - if (c) { - c->finish(0); - delete c; - } -} void CDir::unfreeze_dir() { @@ -1651,17 +1654,16 @@ void CDir::unfreeze_dir() if (is_auth() && !is_subtree_root()) inode->auth_unpin(); - // waiters? finish_waiting(WAIT_UNFREEZE); } else { + finish_waiting(WAIT_FROZEN, -1); + // still freezing. stop. assert(state_test(STATE_FREEZINGDIR)); state_clear(STATE_FREEZINGDIR); auth_unpin(); - // cancel freeze waiters finish_waiting(WAIT_UNFREEZE); - finish_waiting(WAIT_FREEZEABLE, -1); } } diff --git a/trunk/ceph/mds/CDir.h b/trunk/ceph/mds/CDir.h index b8bf4e6e3920a..99bad3801e130 100644 --- a/trunk/ceph/mds/CDir.h +++ b/trunk/ceph/mds/CDir.h @@ -45,37 +45,34 @@ class Context; class CDirDiscover; - ostream& operator<<(ostream& out, class CDir& dir); -// CDir - - - class CDir : public MDSCacheObject { public: // -- pins -- static const int PIN_DNWAITER = 1; - static const int PIN_CHILD = 2; - static const int PIN_FROZEN = 3; - static const int PIN_EXPORT = 5; + static const int PIN_INOWAITER = 2; + static const int PIN_CHILD = 3; + static const int PIN_FROZEN = 4; + static const int PIN_SUBTREE = 5; static const int PIN_IMPORTING = 7; - static const int PIN_EXPORTING = 8; static const int PIN_IMPORTBOUND = 9; static const int PIN_EXPORTBOUND = 10; static const int PIN_STICKY = 11; + static const int PIN_SUBTREETEMP = 12; // used by MDCache::trim_non_auth() const char *pin_name(int p) { switch (p) { case PIN_DNWAITER: return "dnwaiter"; + case PIN_INOWAITER: return 
"inowaiter"; case PIN_CHILD: return "child"; case PIN_FROZEN: return "frozen"; - case PIN_EXPORT: return "export"; - case PIN_EXPORTING: return "exporting"; + case PIN_SUBTREE: return "subtree"; case PIN_IMPORTING: return "importing"; case PIN_IMPORTBOUND: return "importbound"; case PIN_EXPORTBOUND: return "exportbound"; case PIN_STICKY: return "sticky"; + case PIN_SUBTREETEMP: return "subtreetemp"; default: return generic_pin_name(p); } } @@ -88,8 +85,6 @@ class CDir : public MDSCacheObject { static const unsigned STATE_FREEZINGDIR = (1<< 5); static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit static const unsigned STATE_FETCHING = (1<< 7); // currenting fetching - static const unsigned STATE_DELETED = (1<< 8); - static const unsigned STATE_EXPORT = (1<< 9); static const unsigned STATE_IMPORTBOUND = (1<<10); static const unsigned STATE_EXPORTBOUND = (1<<11); static const unsigned STATE_EXPORTING = (1<<12); @@ -107,8 +102,8 @@ class CDir : public MDSCacheObject { static const unsigned MASK_STATE_EXPORTED = (STATE_COMPLETE|STATE_DIRTY); static const unsigned MASK_STATE_IMPORT_KEPT = - (STATE_EXPORT - |STATE_IMPORTING + ( + STATE_IMPORTING |STATE_IMPORTBOUND|STATE_EXPORTBOUND |STATE_FROZENTREE |STATE_STICKY); @@ -117,12 +112,10 @@ class CDir : public MDSCacheObject { |STATE_IMPORTBOUND|STATE_EXPORTBOUND |STATE_FROZENTREE |STATE_FROZENDIR - |STATE_EXPORT |STATE_STICKY); static const unsigned MASK_STATE_FRAGMENT_KEPT = (STATE_DIRTY | STATE_COMPLETE | - STATE_EXPORT | STATE_EXPORTBOUND | STATE_IMPORTBOUND); @@ -138,12 +131,12 @@ class CDir : public MDSCacheObject { // -- wait masks -- static const int WAIT_DENTRY = (1<<0); // wait for item to be in cache static const int WAIT_COMPLETE = (1<<1); // wait for complete dir contents - static const int WAIT_FREEZEABLE = (1<<2); // auth pins removed + static const int WAIT_FROZEN = (1<<2); // auth pins removed static const int WAIT_DNLOCK_OFFSET = 4; static const int WAIT_ANY = (0xffffffff); - static const int 
WAIT_ATFREEZEROOT = (WAIT_AUTHPINNABLE|WAIT_UNFREEZE); + static const int WAIT_ATFREEZEROOT = (WAIT_UNFREEZE); static const int WAIT_ATSUBTREEROOT = (WAIT_SINGLEAUTH); @@ -163,8 +156,8 @@ class CDir : public MDSCacheObject { //int hack_num_accessed; public: - typedef hash_map map_t; // there is a bug somewhere, valgrind me. - //typedef map map_t; + //typedef hash_map map_t; // there is a bug somewhere, valgrind me. + typedef map map_t; protected: // contents map_t items; // non-null AND null @@ -182,6 +175,8 @@ protected: version_t committed_version_equivalent; // in case of, e.g., temporary file version_t projected_version; + xlist::item xlist_dirty; + // lock nesting, freeze int auth_pins; int nested_auth_pins; @@ -313,9 +308,6 @@ private: // for giving to clients void get_dist_spec(set& ls, int auth) { - //if (( pop_auth_subtree.get(META_POP_IRD).get() > - //g_conf.mds_bal_replicate_threshold)) { - //if (!cached_by.empty() && inode.ino > 1) generic_dout(1) << "distributed spec for " << *this << endl; if (is_rep()) { for (map::iterator p = replicas_begin(); p != replicas_end(); @@ -364,8 +356,8 @@ private: void set_committed_version(version_t v) { committed_version = v; } version_t pre_dirty(version_t min=0); - void _mark_dirty(); - void mark_dirty(version_t pv); + void _mark_dirty(LogSegment *ls); + void mark_dirty(version_t pv, LogSegment *ls); void mark_clean(); void mark_complete() { state_set(STATE_COMPLETE); } @@ -387,6 +379,7 @@ private: // -- waiters -- protected: hash_map< string, list > waiting_on_dentry; + hash_map< inodeno_t, list > waiting_on_ino; public: bool is_waiting_for_dentry(const string& dn) { @@ -395,11 +388,28 @@ public: void add_dentry_waiter(const string& dentry, Context *c); void take_dentry_waiting(const string& dentry, list& ls); + bool is_waiting_for_ino(inodeno_t ino) { + return waiting_on_ino.count(ino); + } + void add_ino_waiter(inodeno_t ino, Context *c); + void take_ino_waiting(inodeno_t ino, list& ls); + + void 
take_sub_waiting(list& ls); // dentry or ino + void add_waiter(int mask, Context *c); void take_waiting(int mask, list& ls); // may include dentry waiters void finish_waiting(int mask, int result = 0); // ditto + // -- import/export -- + void encode_export(bufferlist& bl); + void finish_export(utime_t now); + void abort_export() { + put(PIN_TEMPEXPORTING); + } + void decode_import(bufferlist::iterator& blp); + + // -- auth pins -- bool can_auth_pin() { return is_auth() && !(is_frozen() || is_freezing()); } int is_auth_pinned() { return auth_pins; } @@ -411,16 +421,29 @@ public: void adjust_nested_auth_pins(int inc); // -- freezing -- - void freeze_tree(Context *c); - void freeze_tree_finish(Context *c); - void unfreeze_tree(); + bool freeze_tree(); void _freeze_tree(); + void unfreeze_tree(); - void freeze_dir(Context *c); - void freeze_dir_finish(Context *c); + bool freeze_dir(); void _freeze_dir(); void unfreeze_dir(); + void maybe_finish_freeze() { + if (auth_pins != 1 || nested_auth_pins != 0) + return; + if (state_test(STATE_FREEZINGTREE)) { + _freeze_tree(); + auth_unpin(); + finish_waiting(WAIT_FROZEN); + } + if (state_test(STATE_FREEZINGDIR)) { + _freeze_dir(); + auth_unpin(); + finish_waiting(WAIT_FROZEN); + } + } + bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } bool is_freezing_tree(); bool is_freezing_tree_root() { return state & STATE_FREEZINGTREE; } @@ -493,7 +516,6 @@ class CDirDiscover { dirfrag_t get_dirfrag() { return dirfrag; } - void _encode(bufferlist& bl) { bl.append((char*)&dirfrag, sizeof(dirfrag)); bl.append((char*)&nonce, sizeof(nonce)); @@ -514,102 +536,5 @@ class CDirDiscover { }; -// export - -class CDirExport { - struct { - dirfrag_t dirfrag; - uint32_t nden; // num dentries (including null ones) - version_t version; - version_t committed_version; - version_t committed_version_equivalent; - uint32_t state; - dirfrag_load_vec_t pop_me; - dirfrag_load_vec_t pop_auth_subtree; - int32_t dir_rep; - } st; - map 
replicas; - set rep_by; - - public: - CDirExport() {} - CDirExport(CDir *dir, utime_t now) { - memset(&st, 0, sizeof(st)); - - assert(dir->get_version() == dir->get_projected_version()); - - st.dirfrag = dir->dirfrag(); - st.nden = dir->items.size(); - st.version = dir->version; - st.committed_version = dir->committed_version; - st.committed_version_equivalent = dir->committed_version_equivalent; - st.state = dir->state; - st.dir_rep = dir->dir_rep; - - st.pop_me = dir->pop_me; - st.pop_auth_subtree = dir->pop_auth_subtree; - dir->pop_auth_subtree_nested -= dir->pop_auth_subtree; - dir->pop_me.zero(now); - dir->pop_auth_subtree.zero(now); - - rep_by = dir->dir_rep_by; - replicas = dir->replica_map; - } - - dirfrag_t get_dirfrag() { return st.dirfrag; } - uint32_t get_nden() { return st.nden; } - - void update_dir(CDir *dir) { - assert(dir->dirfrag() == st.dirfrag); - - // set committed_version at old version - dir->committing_version = - dir->committed_version = st.committed_version; - dir->committed_version_equivalent = st.committed_version_equivalent; - dir->projected_version = - dir->version = st.version; - - // twiddle state - dir->state = (dir->state & CDir::MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. 
- (st.state & CDir::MASK_STATE_EXPORTED); - dir->dir_rep = st.dir_rep; - - dir->pop_me = st.pop_me; - dir->pop_auth_subtree = st.pop_auth_subtree; - dir->pop_auth_subtree_nested += dir->pop_auth_subtree; - - dir->replica_nonce = 0; // no longer defined - - if (!dir->replica_map.empty()) - generic_dout(0) << "replicas not empty non import, " << *dir << ", " << dir->replica_map << dendl; - - dir->dir_rep_by = rep_by; - dir->replica_map = replicas; - generic_dout(12) << "replicas in export is " << replicas << ", dir now " << dir->replica_map << dendl; - if (!replicas.empty()) - dir->get(CDir::PIN_REPLICATED); - if (dir->is_dirty()) { - dir->get(CDir::PIN_DIRTY); - } - } - - - void _encode(bufferlist& bl) { - bl.append((char*)&st, sizeof(st)); - ::_encode(replicas, bl); - ::_encode(rep_by, bl); - } - - int _decode(bufferlist& bl, int off = 0) { - bl.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - ::_decode(replicas, bl, off); - ::_decode(rep_by, bl, off); - return off; - } - -}; - - #endif diff --git a/trunk/ceph/mds/CInode.cc b/trunk/ceph/mds/CInode.cc index 87acc962409d3..3bdfc89e3f1fa 100644 --- a/trunk/ceph/mds/CInode.cc +++ b/trunk/ceph/mds/CInode.cc @@ -22,6 +22,8 @@ #include "MDCache.h" #include "AnchorTable.h" +#include "LogSegment.h" + #include "common/Clock.h" #include "messages/MLock.h" @@ -62,6 +64,10 @@ ostream& operator<<(ostream& out, CInode& in) out << " v" << in.get_version(); + if (in.state_test(CInode::STATE_AMBIGUOUSAUTH)) out << " AMBIGAUTH"; + if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; + if (in.is_frozen_inode()) out << " FROZEN"; + // locks out << " " << in.authlock; out << " " << in.linklock; @@ -109,12 +115,12 @@ inode_t *CInode::project_inode() return projected_inode.back(); } -void CInode::pop_and_dirty_projected_inode() +void CInode::pop_and_dirty_projected_inode(LogSegment *ls) { assert(!projected_inode.empty()); dout(15) << "pop_and_dirty_projected_inode " << projected_inode.front() << " v" 
<< projected_inode.front()->version << dendl; - mark_dirty(projected_inode.front()->version); + mark_dirty(projected_inode.front()->version, ls); inode = *projected_inode.front(); delete projected_inode.front(); projected_inode.pop_front(); @@ -393,15 +399,20 @@ version_t CInode::pre_dirty() return parent->pre_dirty(); } -void CInode::_mark_dirty() +void CInode::_mark_dirty(LogSegment *ls) { if (!state_test(STATE_DIRTY)) { state_set(STATE_DIRTY); get(PIN_DIRTY); + assert(ls); } + + // move myself to this segment's dirty list + if (ls) + ls->dirty_inodes.push_back(&xlist_dirty); } -void CInode::mark_dirty(version_t pv) { +void CInode::mark_dirty(version_t pv, LogSegment *ls) { dout(10) << "mark_dirty " << *this << dendl; @@ -420,10 +431,10 @@ void CInode::mark_dirty(version_t pv) { // touch my private version assert(inode.version < pv); inode.version = pv; - _mark_dirty(); + _mark_dirty(ls); // mark dentry too - parent->mark_dirty(pv); + parent->mark_dirty(pv, ls); } @@ -433,6 +444,9 @@ void CInode::mark_clean() if (state_test(STATE_DIRTY)) { state_clear(STATE_DIRTY); put(PIN_DIRTY); + + // remove myself from ls dirty list + xlist_dirty.remove_myself(); } } @@ -551,7 +565,12 @@ void CInode::decode_lock_state(int type, bufferlist& bl) _decode(tm, bl, off); if (inode.mtime < tm) { inode.mtime = tm; - dirlock.set_updated(); + if (is_auth()) { + dout(10) << "decode_lock_state auth got mtime " << tm << " > my " << inode.mtime + << ", setting dirlock updated flag on " << *this + << dendl; + dirlock.set_updated(); + } } if (0) { map dfsz; @@ -565,6 +584,17 @@ void CInode::decode_lock_state(int type, bufferlist& bl) } } +void CInode::clear_dirty_scattered(int type) +{ + dout(10) << "clear_dirty_scattered " << type << " on " << *this << dendl; + switch (type) { + case LOCK_OTYPE_IDIR: + xlist_dirty_inode_mtime.remove_myself(); + break; + default: + assert(0); + } +} @@ -572,38 +602,78 @@ void CInode::decode_lock_state(int type, bufferlist& bl) bool CInode::is_frozen() { - if 
(parent && parent->dir->is_frozen()) - return true; + if (is_frozen_inode()) return true; + if (parent && parent->dir->is_frozen()) return true; return false; } bool CInode::is_frozen_dir() { - if (parent && parent->dir->is_frozen_dir()) - return true; + if (parent && parent->dir->is_frozen_dir()) return true; return false; } bool CInode::is_freezing() { - if (parent && parent->dir->is_freezing()) - return true; + if (is_freezing_inode()) return true; + if (parent && parent->dir->is_freezing()) return true; return false; } void CInode::add_waiter(int tag, Context *c) { + dout(10) << "add_waiter tag " << tag + << " !ambig " << !state_test(STATE_AMBIGUOUSAUTH) + << " !frozen " << !is_frozen_inode() + << " !freezing " << !is_freezing_inode() + << dendl; // wait on the directory? - if (tag & (WAIT_AUTHPINNABLE|WAIT_SINGLEAUTH)) { + // make sure its not the inode that is explicitly ambiguous|freezing|frozen + if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) || + ((tag & WAIT_UNFREEZE) && !is_frozen_inode() && !is_freezing_inode())) { parent->dir->add_waiter(tag, c); return; } MDSCacheObject::add_waiter(tag, c); } +bool CInode::freeze_inode(int auth_pin_allowance) +{ + assert(auth_pin_allowance > 0); // otherwise we need to adjust parent's nested_auth_pins + assert(auth_pins >= auth_pin_allowance); + if (auth_pins > auth_pin_allowance) { + dout(10) << "freeze_inode - waiting for auth_pins to drop to " << auth_pin_allowance << dendl; + auth_pin_freeze_allowance = auth_pin_allowance; + get(PIN_FREEZING); + state_set(STATE_FREEZING); + return false; + } + + dout(10) << "freeze_inode - frozen" << dendl; + assert(auth_pins == auth_pin_allowance); + get(PIN_FROZEN); + state_set(STATE_FROZEN); + return true; +} + +void CInode::unfreeze_inode(list& finished) +{ + dout(10) << "unfreeze_inode" << dendl; + if (state_test(STATE_FREEZING)) { + state_clear(STATE_FREEZING); + put(PIN_FREEZING); + } else if (state_test(STATE_FROZEN)) { + state_clear(STATE_FROZEN); + 
put(PIN_FROZEN); + } else + assert(0); + take_waiting(WAIT_UNFREEZE, finished); +} + // auth_pins bool CInode::can_auth_pin() { + if (is_freezing_inode() || is_frozen_inode()) return false; if (parent) return parent->can_auth_pin(); return true; @@ -615,7 +685,9 @@ void CInode::auth_pin() get(PIN_AUTHPIN); auth_pins++; - dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; + dout(10) << "auth_pin on " << *this + << " now " << auth_pins << "+" << nested_auth_pins + << dendl; if (parent) parent->adjust_nested_auth_pins( 1 ); @@ -627,18 +699,36 @@ void CInode::auth_unpin() if (auth_pins == 0) put(PIN_AUTHPIN); - dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << dendl; + dout(10) << "auth_unpin on " << *this + << " now " << auth_pins << "+" << nested_auth_pins + << dendl; assert(auth_pins >= 0); - + if (parent) parent->adjust_nested_auth_pins( -1 ); + + if (is_freezing_inode() && + auth_pins == auth_pin_freeze_allowance) { + dout(10) << "auth_unpin freezing!" 
<< dendl; + get(PIN_FROZEN); + put(PIN_FREEZING); + state_clear(STATE_FREEZING); + state_set(STATE_FROZEN); + finish_waiting(WAIT_FROZEN); + } } void CInode::adjust_nested_auth_pins(int a) { if (!parent) return; nested_auth_pins += a; + + dout(15) << "adjust_nested_auth_pins by " << a + << " now " << auth_pins << "+" << nested_auth_pins + << dendl; + assert(nested_auth_pins >= 0); + parent->adjust_nested_auth_pins(a); } @@ -673,3 +763,76 @@ CInodeDiscover* CInode::replicate_to( int rep ) + +// IMPORT/EXPORT + +void CInode::encode_export(bufferlist& bl) +{ + ::_encode_simple(inode, bl); + ::_encode_simple(symlink, bl); + dirfragtree._encode(bl); + + bool dirty = is_dirty(); + ::_encode_simple(dirty, bl); + + ::_encode_simple(pop, bl); + + ::_encode_simple(replica_map, bl); + + map cap_map; + export_client_caps(cap_map); + ::_encode_simple(cap_map, bl); + + authlock._encode(bl); + linklock._encode(bl); + dirfragtreelock._encode(bl); + filelock._encode(bl); + dirlock._encode(bl); + + get(PIN_TEMPEXPORTING); +} + +void CInode::finish_export(utime_t now) +{ + pop.zero(now); + + // just in case! 
+ dirlock.clear_updated(); + + put(PIN_TEMPEXPORTING); +} + +void CInode::decode_import(bufferlist::iterator& p, + set& new_client_caps, + LogSegment *ls) +{ + utime_t old_mtime = inode.mtime; + ::_decode_simple(inode, p); + if (old_mtime > inode.mtime) { + assert(dirlock.is_updated()); + inode.mtime = old_mtime; // preserve our mtime, if it is larger + } + + ::_decode_simple(symlink, p); + dirfragtree._decode(p); + + bool dirty; + ::_decode_simple(dirty, p); + if (dirty) + _mark_dirty(ls); + + ::_decode_simple(pop, p); + + ::_decode_simple(replica_map, p); + if (!replica_map.empty()) get(PIN_REPLICATED); + + map cap_map; + ::_decode_simple(cap_map, p); + merge_client_caps(cap_map, new_client_caps); + + authlock._decode(p); + linklock._decode(p); + dirfragtreelock._decode(p); + filelock._decode(p); + dirlock._decode(p); +} diff --git a/trunk/ceph/mds/CInode.h b/trunk/ceph/mds/CInode.h index 1af9cc8cdf662..8f453472a0477 100644 --- a/trunk/ceph/mds/CInode.h +++ b/trunk/ceph/mds/CInode.h @@ -46,7 +46,7 @@ class Message; class CInode; class CInodeDiscover; class MDCache; - +class LogSegment; ostream& operator<<(ostream& out, CInode& in); @@ -65,6 +65,9 @@ class CInode : public MDSCacheObject { static const int PIN_BATCHOPENJOURNAL = 9; static const int PIN_SCATTERED = 10; static const int PIN_STICKYDIRS = 11; + static const int PIN_PURGING = -12; + static const int PIN_FREEZING = 13; + static const int PIN_FROZEN = 14; const char *pin_name(int p) { switch (p) { @@ -78,6 +81,8 @@ class CInode : public MDSCacheObject { case PIN_BATCHOPENJOURNAL: return "batchopenjournal"; case PIN_SCATTERED: return "scattered"; case PIN_STICKYDIRS: return "stickydirs"; + case PIN_FREEZING: return "freezing"; + case PIN_FROZEN: return "frozen"; default: return generic_pin_name(p); } } @@ -88,6 +93,9 @@ class CInode : public MDSCacheObject { static const int STATE_UNANCHORING = (1<<4); static const int STATE_OPENINGDIR = (1<<5); static const int STATE_REJOINUNDEF = (1<<6); // inode 
contents undefined. + static const int STATE_FREEZING = (1<<7); + static const int STATE_FROZEN = (1<<8); + static const int STATE_AMBIGUOUSAUTH = (1<<9); // -- waiters -- //static const int WAIT_SLAVEAGREE = (1<<0); @@ -95,6 +103,7 @@ class CInode : public MDSCacheObject { static const int WAIT_ANCHORED = (1<<2); static const int WAIT_UNANCHORED = (1<<3); static const int WAIT_CAPS = (1<<4); + static const int WAIT_FROZEN = (1<<5); static const int WAIT_AUTHLOCK_OFFSET = 5; static const int WAIT_LINKLOCK_OFFSET = 5 + SimpleLock::WAIT_BITS; @@ -137,7 +146,7 @@ class CInode : public MDSCacheObject { } inode_t *project_inode(); - void pop_and_dirty_projected_inode(); + void pop_and_dirty_projected_inode(LogSegment *ls); // -- cache infrastructure -- private: @@ -184,10 +193,19 @@ protected: utime_t replica_caps_wanted_keep_until; - private: + // LogSegment xlists i (may) belong to + xlist::item xlist_dirty; +public: + xlist::item xlist_open_file; + xlist::item xlist_dirty_inode_mtime; + xlist::item xlist_purging_inode; + +private: // auth pin int auth_pins; int nested_auth_pins; +public: + int auth_pin_freeze_allowance; public: inode_load_vec_t pop; @@ -210,6 +228,8 @@ protected: stickydir_ref(0), parent(0), force_auth(CDIR_AUTH_DEFAULT), replica_caps_wanted(0), + xlist_dirty(this), xlist_open_file(this), + xlist_dirty_inode_mtime(this), xlist_purging_inode(this), auth_pins(0), nested_auth_pins(0), versionlock(this, LOCK_OTYPE_IVERSION, WAIT_VERSIONLOCK_OFFSET), authlock(this, LOCK_OTYPE_IAUTH, WAIT_AUTHLOCK_OFFSET), @@ -237,6 +257,13 @@ protected: bool is_root() { return inode.ino == MDS_INO_ROOT; } bool is_stray() { return MDS_INO_IS_STRAY(inode.ino); } + bool is_base() { return inode.ino < MDS_INO_BASE; } + + // note: this overloads MDSCacheObject + bool is_ambiguous_auth() { + return state_test(STATE_AMBIGUOUSAUTH) || + MDSCacheObject::is_ambiguous_auth(); + } inodeno_t ino() const { return inode.ino; } @@ -260,8 +287,8 @@ protected: version_t get_version() { 
return inode.version; } version_t pre_dirty(); - void _mark_dirty(); - void mark_dirty(version_t projected_dirv); + void _mark_dirty(LogSegment *ls); + void mark_dirty(version_t projected_dirv, LogSegment *ls); void mark_clean(); @@ -272,6 +299,17 @@ protected: void add_waiter(int tag, Context *c); + // -- import/export -- + void encode_export(bufferlist& bl); + void finish_export(utime_t now); + void abort_export() { + put(PIN_TEMPEXPORTING); + } + void decode_import(bufferlist::iterator& p, + set& new_client_caps, + LogSegment *ls); + + // -- locks -- public: LocalLock versionlock; @@ -281,6 +319,7 @@ public: FileLock filelock; ScatterLock dirlock; + SimpleLock* get_lock(int type) { switch (type) { case LOCK_OTYPE_IFILE: return &filelock; @@ -295,6 +334,7 @@ public: void encode_lock_state(int type, bufferlist& bl); void decode_lock_state(int type, bufferlist& bl); + void clear_dirty_scattered(int type); // -- caps -- (new) // client caps @@ -338,15 +378,17 @@ public: client_caps = cl; } */ - void take_client_caps(map& cl) { + void clear_client_caps() { if (!client_caps.empty()) put(PIN_CAPS); + client_caps.clear(); + } + void export_client_caps(map& cl) { for (map::iterator it = client_caps.begin(); it != client_caps.end(); it++) { cl[it->first] = it->second.make_export(); } - client_caps.clear(); } void merge_client_caps(map& cl, set& new_client_caps) { if (client_caps.empty() && !cl.empty()) @@ -425,10 +467,15 @@ public: // -- freeze -- + bool is_freezing_inode() { return state_test(STATE_FREEZING); } + bool is_frozen_inode() { return state_test(STATE_FROZEN); } bool is_frozen(); bool is_frozen_dir(); bool is_freezing(); + bool freeze_inode(int auth_pin_allowance=0); + void unfreeze_inode(list& finished); + // -- reference counting -- void bad_put(int by) { @@ -561,108 +608,5 @@ class CInodeDiscover { }; -// export - -class CInodeExport { - - struct st_ { - inode_t inode; - - inode_load_vec_t pop; - - bool is_dirty; // dirty inode? 
- - int num_caps; - } st; - - string symlink; - fragtree_t dirfragtree; - - map replicas; - map cap_map; - - bufferlist locks; - -public: - CInodeExport() {} - CInodeExport(CInode *in, utime_t now) { - st.inode = in->inode; - symlink = in->symlink; - dirfragtree = in->dirfragtree; - - st.is_dirty = in->is_dirty(); - replicas = in->replica_map; - - in->authlock._encode(locks); - in->linklock._encode(locks); - in->dirfragtreelock._encode(locks); - in->filelock._encode(locks); - in->dirlock._encode(locks); - - st.pop = in->pop; - in->pop.zero(now); - - // steal WRITER caps from inode - in->take_client_caps(cap_map); - //remaining_issued = in->get_caps_issued(); - } - - inodeno_t get_ino() { return st.inode.ino; } - - void update_inode(CInode *in, set& new_client_caps) { - // treat scatterlocked mtime special, since replica may have newer info - if (in->dirlock.get_state() == LOCK_SCATTER || - in->dirlock.get_state() == LOCK_GLOCKC || - in->dirlock.get_state() == LOCK_GTEMPSYNCC) - st.inode.mtime = MAX(in->inode.mtime, st.inode.mtime); - - in->inode = st.inode; - in->symlink = symlink; - in->dirfragtree = dirfragtree; - - in->pop = st.pop; - - if (st.is_dirty) - in->_mark_dirty(); - - in->replica_map = replicas; - if (!replicas.empty()) - in->get(CInode::PIN_REPLICATED); - - int off = 0; - in->authlock._decode(locks, off); - in->linklock._decode(locks, off); - in->dirfragtreelock._decode(locks, off); - in->filelock._decode(locks, off); - in->dirlock._decode(locks, off); - - // caps - in->merge_client_caps(cap_map, new_client_caps); - } - - void _encode(bufferlist& bl) { - st.num_caps = cap_map.size(); - - ::_encode(st, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); - ::_encode(replicas, bl); - ::_encode(locks, bl); - ::_encode(cap_map, bl); - } - - int _decode(bufferlist& bl, int off = 0) { - ::_decode(st, bl, off); - ::_decode(symlink, bl, off); - dirfragtree._decode(bl, off); - ::_decode(replicas, bl, off); - ::_decode(locks, bl, off); - ::_decode(cap_map, 
bl, off); - - return off; - } -}; - - #endif diff --git a/trunk/ceph/mds/ClientMap.cc b/trunk/ceph/mds/ClientMap.cc index 26f105d7ae1a6..5170f3fe9b3eb 100644 --- a/trunk/ceph/mds/ClientMap.cc +++ b/trunk/ceph/mds/ClientMap.cc @@ -96,10 +96,15 @@ public: void ClientMap::save(Context *onsave, version_t needv) { dout(10) << "save needv " << needv << ", v " << version << dendl; + + if (needv && committing >= needv) { + assert(committing > committed); + commit_waiters[committing].push_back(onsave); + return; + } + commit_waiters[version].push_back(onsave); - if (needv && committing >= needv) return; - bufferlist bl; init_inode(); @@ -108,7 +113,7 @@ void ClientMap::save(Context *onsave, version_t needv) mds->filer->write(inode, 0, bl.length(), bl, 0, - 0, new C_CM_Save(this, version)); + 0, new C_CM_Save(this, version)); } void ClientMap::_save_finish(version_t v) diff --git a/trunk/ceph/mds/ClientMap.h b/trunk/ceph/mds/ClientMap.h index 6fa68e207f5a4..c36e66d240a33 100644 --- a/trunk/ceph/mds/ClientMap.h +++ b/trunk/ceph/mds/ClientMap.h @@ -48,9 +48,6 @@ private: map > commit_waiters; public: - ClientMap(MDS *m) : mds(m), - version(0), projected(0), committing(0), committed(0) {} - version_t get_version() { return version; } version_t get_projected() { return projected; } version_t get_committing() { return committing; } @@ -119,10 +116,12 @@ private: // client id -> tid -> result code map > completed_requests; // completed client requests map > waiting_for_trim; + version_t requestmapv; public: void add_completed_request(metareqid_t ri) { completed_requests[ri.client].insert(ri.tid); + requestmapv++; } void trim_completed_requests(int client, tid_t mintid) { // zero means trim all! 
@@ -159,6 +158,12 @@ public: } + + ClientMap(MDS *m) : mds(m), + version(0), projected(0), committing(0), committed(0), + requestmapv(0) {} + + // -- encoding -- void encode(bufferlist& bl) { bl.append((char*)&version, sizeof(version)); @@ -174,6 +179,7 @@ public: projected = committing = committed = version; } + // -- loading, saving -- inode_t inode; list waiting_for_load; diff --git a/trunk/ceph/mds/IdAllocator.h b/trunk/ceph/mds/IdAllocator.h index e8a0f5436938f..51001f2236627 100644 --- a/trunk/ceph/mds/IdAllocator.h +++ b/trunk/ceph/mds/IdAllocator.h @@ -55,6 +55,7 @@ class IdAllocator { version_t get_version() { return version; } version_t get_committed_version() { return committed_version; } + version_t get_committing_version() { return committing_version; } // load/save from disk (hack) bool is_undef() { return state == STATE_UNDEF; } diff --git a/trunk/ceph/mds/Locker.cc b/trunk/ceph/mds/Locker.cc index 87d624ec30bf8..55f38cd799b5f 100644 --- a/trunk/ceph/mds/Locker.cc +++ b/trunk/ceph/mds/Locker.cc @@ -202,7 +202,7 @@ bool Locker::acquire_locks(MDRequest *mdr, if (!object->can_auth_pin()) { // wait dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl; - object->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); mds->locker->drop_locks(mdr); mdr->drop_local_auth_pins(); return false; @@ -241,8 +241,8 @@ bool Locker::acquire_locks(MDRequest *mdr, mds->send_message_mds(req, p->first, MDS_PORT_SERVER); // put in waiting list - assert(mdr->waiting_on_slave.count(p->first) == 0); - mdr->waiting_on_slave.insert(p->first); + assert(mdr->more()->waiting_on_slave.count(p->first) == 0); + mdr->more()->waiting_on_slave.insert(p->first); } return false; } @@ -566,6 +566,7 @@ class C_MDL_RequestInodeFileCaps : public Context { public: C_MDL_RequestInodeFileCaps(Locker *l, CInode *i) : locker(l), in(i) {} void 
finish(int r) { + in->put(CInode::PIN_PTRWAITER); if (!in->is_auth()) locker->request_inode_file_caps(in); } @@ -607,6 +608,7 @@ void Locker::request_inode_file_caps(CInode *in) // wait for single auth if (in->is_ambiguous_auth()) { + in->get(CInode::PIN_PTRWAITER); in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDL_RequestInodeFileCaps(this, in)); return; @@ -718,6 +720,8 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) if (cap->is_null()) { dout(7) << " cap for client" << client << " is now null, removing from " << *in << dendl; in->remove_client_cap(client); + if (!in->is_any_caps()) + in->xlist_open_file.remove_myself(); // unpin logsegment if (!in->is_auth()) request_inode_file_caps(in); @@ -979,7 +983,7 @@ void Locker::try_simple_eval(SimpleLock *lock) if (!lock->get_parent()->can_auth_pin()) { dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_Locker_SimpleEval(this, lock)); + lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_SimpleEval(this, lock)); return; } @@ -1105,7 +1109,7 @@ bool Locker::simple_rdlock_try(SimpleLock *lock, Context *con) // wait! 
dout(7) << "simple_rdlock_try waiting on " << *lock << " on " << *lock->get_parent() << dendl; - lock->add_waiter(SimpleLock::WAIT_RD, con); + if (con) lock->add_waiter(SimpleLock::WAIT_RD, con); return false; } @@ -1192,7 +1196,7 @@ bool Locker::simple_xlock_start(SimpleLock *lock, MDRequest *mdr) // send lock request int auth = lock->get_parent()->authority().first; - mdr->slaves.insert(auth); + mdr->more()->slaves.insert(auth); MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_XLOCK); r->set_lock_type(lock->get_type()); lock->get_parent()->set_object_info(r->get_object_info()); @@ -1413,6 +1417,7 @@ class C_Locker_ScatterEval : public Context { public: C_Locker_ScatterEval(Locker *l, ScatterLock *lk) : locker(l), lock(lk) {} void finish(int r) { + lock->get_parent()->put(CInode::PIN_PTRWAITER); locker->try_scatter_eval(lock); } }; @@ -1423,8 +1428,9 @@ void Locker::try_scatter_eval(ScatterLock *lock) // unstable and ambiguous auth? if (!lock->is_stable() && lock->get_parent()->is_ambiguous_auth()) { - dout(7) << "scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; + dout(7) << "try_scatter_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + lock->get_parent()->get(CInode::PIN_PTRWAITER); lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_ScatterEval(this, lock)); return; } @@ -1437,7 +1443,8 @@ void Locker::try_scatter_eval(ScatterLock *lock) if (!lock->get_parent()->can_auth_pin()) { dout(7) << "try_scatter_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - lock->get_parent()->add_waiter(MDSCacheObject::WAIT_AUTHPINNABLE, new C_Locker_ScatterEval(this, lock)); + lock->get_parent()->get(CInode::PIN_PTRWAITER); + lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new 
C_Locker_ScatterEval(this, lock)); return; } @@ -1464,7 +1471,6 @@ void Locker::scatter_eval_gather(ScatterLock *lock) auth, MDS_PORT_LOCKER); } lock->set_state(LOCK_LOCK); - //lock->get_parent()->put(CInode::PIN_SCATTERED); } } else { @@ -1492,7 +1498,6 @@ void Locker::scatter_eval_gather(ScatterLock *lock) dout(7) << "scatter_eval finished lock gather/un-wrlock on " << *lock << " on " << *lock->get_parent() << dendl; lock->set_state(LOCK_LOCK); - //lock->get_parent()->put(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_STABLE); lock->get_parent()->auth_unpin(); } @@ -1528,7 +1533,6 @@ void Locker::scatter_eval_gather(ScatterLock *lock) send_lock_message(lock, LOCK_AC_SCATTER, data); } lock->set_state(LOCK_SCATTER); - //lock->get_parent()->get(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); lock->get_parent()->auth_unpin(); } @@ -1559,10 +1563,11 @@ void Locker::scatter_eval_gather(ScatterLock *lock) void Locker::scatter_writebehind(ScatterLock *lock) { CInode *in = (CInode*)lock->get_parent(); - dout(10) << "scatter_writebehind on " << *lock << " on " << *in << dendl; + dout(10) << "scatter_writebehind " << in->inode.mtime << " on " << *lock << " on " << *in << dendl; // journal write-behind. 
inode_t *pi = in->project_inode(); + pi->mtime = in->inode.mtime; // make sure an intermediate version isn't goofing us up pi->version = in->pre_dirty(); EUpdate *le = new EUpdate(mds->mdlog, "scatter writebehind"); @@ -1570,14 +1575,14 @@ void Locker::scatter_writebehind(ScatterLock *lock) le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_Locker_ScatterWB(this, lock)); + mds->mdlog->wait_for_sync(new C_Locker_ScatterWB(this, lock, mds->mdlog->get_current_segment())); } -void Locker::scatter_writebehind_finish(ScatterLock *lock) +void Locker::scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls) { CInode *in = (CInode*)lock->get_parent(); dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl; - in->pop_and_dirty_projected_inode(); + in->pop_and_dirty_projected_inode(ls); lock->clear_updated(); scatter_eval_gather(lock); } @@ -1589,23 +1594,57 @@ void Locker::scatter_eval(ScatterLock *lock) assert(lock->get_parent()->is_auth()); assert(lock->is_stable()); - if (((CInode*)lock->get_parent())->has_subtree_root_dirfrag()) { + CInode *in = (CInode*)lock->get_parent(); + if (in->has_subtree_root_dirfrag() && !in->is_base()) { // i _should_ be scattered. if (!lock->is_rdlocked() && - !lock->is_xlocked()) { + !lock->is_xlocked() && + lock->get_state() != LOCK_SCATTER) { dout(10) << "scatter_eval no rdlocks|xlocks, am subtree root inode, scattering" << dendl; scatter_scatter(lock); + autoscattered.push_back(&lock->xlistitem_autoscattered); } } else { // i _should_ be sync. 
+ lock->xlistitem_autoscattered.remove_myself(); if (!lock->is_wrlocked() && - !lock->is_xlocked()) { + !lock->is_xlocked() && + lock->get_state() != LOCK_SYNC) { dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl; scatter_sync(lock); } } } +void Locker::note_autoscattered(ScatterLock *lock) +{ + dout(10) << "note_autoscattered " << *lock << " on " << *lock->get_parent() << dendl; + autoscattered.push_back(&lock->xlistitem_autoscattered); +} + + +/* + * this is called by LogSegment::try_to_trim() when trying to + * flush dirty scattered data (e.g. inode->dirlock mtime) back + * to the auth node. + */ +void Locker::scatter_try_unscatter(ScatterLock *lock, Context *c) +{ + dout(10) << "scatter_try_unscatter " << *lock << " on " << *lock->get_parent() << dendl; + assert(!lock->get_parent()->is_auth()); + assert(!lock->get_parent()->is_ambiguous_auth()); + + // request unscatter? + int auth = lock->get_parent()->authority().first; + if (lock->get_state() == LOCK_SCATTER && + mds->mdsmap->get_state(auth) >= MDSMap::STATE_ACTIVE) + mds->send_message_mds(new MLock(lock, LOCK_AC_REQUNSCATTER, mds->get_nodeid()), + auth, MDS_PORT_LOCKER); + + // wait... 
+ lock->add_waiter(SimpleLock::WAIT_STABLE, c); +} + void Locker::scatter_sync(ScatterLock *lock) { @@ -1636,7 +1675,6 @@ void Locker::scatter_sync(ScatterLock *lock) lock->init_gather(); } else { if (!lock->is_wrlocked()) { - //lock->get_parent()->put(CInode::PIN_SCATTERED); break; // do it now, we're fine } } @@ -1667,6 +1705,8 @@ void Locker::scatter_scatter(ScatterLock *lock) assert(lock->get_parent()->is_auth()); assert(lock->is_stable()); + lock->set_last_scatter(g_clock.now()); + switch (lock->get_state()) { case LOCK_SYNC: if (!lock->is_rdlocked() && @@ -1708,7 +1748,6 @@ void Locker::scatter_scatter(ScatterLock *lock) send_lock_message(lock, LOCK_AC_SCATTER, data); } lock->set_state(LOCK_SCATTER); - //lock->get_parent()->get(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); } @@ -1739,7 +1778,6 @@ void Locker::scatter_lock(ScatterLock *lock) case LOCK_SCATTER: if (!lock->is_wrlocked() && !lock->get_parent()->is_replicated()) { - //lock->get_parent()->put(CInode::PIN_SCATTERED); break; // do it. } @@ -1788,7 +1826,6 @@ void Locker::scatter_tempsync(ScatterLock *lock) case LOCK_SCATTER: if (!lock->is_wrlocked() && !lock->get_parent()->is_replicated()) { - //lock->get_parent()->put(CInode::PIN_SCATTERED); break; // do it. } @@ -1812,8 +1849,6 @@ void Locker::scatter_tempsync(ScatterLock *lock) - - void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) { int from = m->get_asker(); @@ -1832,7 +1867,6 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) // -- replica -- case LOCK_AC_SYNC: assert(lock->get_state() == LOCK_LOCK); - lock->set_state(LOCK_SYNC); lock->decode_locked_state(m->get_data()); lock->clear_updated(); @@ -1842,7 +1876,7 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) case LOCK_AC_LOCK: assert(lock->get_state() == LOCK_SCATTER || lock->get_state() == LOCK_SYNC); - + // wait for wrlocks to close? 
if (lock->is_wrlocked()) { assert(lock->get_state() == LOCK_SCATTER); @@ -1855,9 +1889,9 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) << " on " << *lock->get_parent() << dendl; lock->set_state(LOCK_GLOCKS); } else { - //if (lock->get_state() == LOCK_SCATTER) - //lock->get_parent()->put(CInode::PIN_SCATTERED); - + dout(7) << "handle_scatter_lock has no rd|wrlocks, sending lockack for " << *lock + << " on " << *lock->get_parent() << dendl; + // encode and reply bufferlist data; lock->encode_locked_state(data); @@ -1872,7 +1906,6 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) lock->decode_locked_state(m->get_data()); lock->clear_updated(); lock->set_state(LOCK_SCATTER); - //lock->get_parent()->get(CInode::PIN_SCATTERED); lock->finish_waiters(ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE); break; @@ -1900,15 +1933,33 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) case LOCK_AC_REQSCATTER: if (lock->is_stable()) { - dout(7) << "handle_scatter_lock got scatter request on " << *lock << " on " << *lock->get_parent() - << dendl; + /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing) + * because the replica should be holding an auth_pin if they're + * doing this (and thus, we are freezing, not frozen, and indefinite + * starvation isn't an issue). 
+ */ + dout(7) << "handle_scatter_lock got scatter request on " << *lock + << " on " << *lock->get_parent() << dendl; scatter_scatter(lock); } else { - dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock << " on " << *lock->get_parent() - << dendl; + dout(7) << "handle_scatter_lock ignoring scatter request on " << *lock + << " on " << *lock->get_parent() << dendl; } break; + case LOCK_AC_REQUNSCATTER: + if (!lock->is_stable()) { + dout(7) << "handle_scatter_lock ignoring now-unnecessary unscatter request on " << *lock + << " on " << *lock->get_parent() << dendl; + } else if (lock->get_parent()->can_auth_pin()) { + dout(7) << "handle_scatter_lock got unscatter request on " << *lock + << " on " << *lock->get_parent() << dendl; + scatter_lock(lock); + } else { + dout(7) << "handle_scatter_lock DROPPING unscatter request on " << *lock + << " on " << *lock->get_parent() << dendl; + /* FIXME: if we can't auth_pin here, this request is effectively lost... */ + } } delete m; @@ -1916,6 +1967,44 @@ void Locker::handle_scatter_lock(ScatterLock *lock, MLock *m) +void Locker::scatter_unscatter_autoscattered() +{ + /* + * periodically unscatter autoscattered locks + */ + + dout(10) << "scatter_unscatter_autoscattered" << dendl; + + utime_t now = g_clock.now(); + int n = autoscattered.size(); + while (!autoscattered.empty()) { + ScatterLock *lock = autoscattered.front(); + + // stop? + if (lock->get_state() == LOCK_SCATTER && + now - lock->get_last_scatter() < 10.0) + break; + + autoscattered.pop_front(); + + if (lock->get_state() == LOCK_SCATTER && + lock->get_parent()->is_replicated()) { + if (((CInode*)lock->get_parent())->is_frozen() || + ((CInode*)lock->get_parent())->is_freezing()) { + // hrm.. requeue. 
+ dout(10) << "last_scatter " << lock->get_last_scatter() + << ", now " << now << ", but frozen|freezing, requeueing" << dendl; + autoscattered.push_back(&lock->xlistitem_autoscattered); + } else { + dout(10) << "last_scatter " << lock->get_last_scatter() + << ", now " << now << ", locking" << dendl; + scatter_lock(lock); + } + } + if (--n == 0) break; + } +} + // ========================================================================== @@ -2138,6 +2227,7 @@ class C_Locker_FileEval : public Context { public: C_Locker_FileEval(Locker *l, FileLock *lk) : locker(l), lock(lk) {} void finish(int r) { + lock->get_parent()->put(CInode::PIN_PTRWAITER); locker->try_file_eval(lock); } }; @@ -2151,6 +2241,7 @@ void Locker::try_file_eval(FileLock *lock) in->is_ambiguous_auth()) { dout(7) << "try_file_eval not stable and ambiguous auth, waiting on " << *in << dendl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) + in->get(CInode::PIN_PTRWAITER); in->add_waiter(CInode::WAIT_SINGLEAUTH, new C_Locker_FileEval(this, lock)); return; } @@ -2163,7 +2254,8 @@ void Locker::try_file_eval(FileLock *lock) if (!lock->get_parent()->can_auth_pin()) { dout(7) << "try_file_eval can't auth_pin, waiting on " << *in << dendl; //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH)) - in->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_Locker_FileEval(this, lock)); + in->get(CInode::PIN_PTRWAITER); + in->add_waiter(CInode::WAIT_UNFREEZE, new C_Locker_FileEval(this, lock)); return; } diff --git a/trunk/ceph/mds/Locker.h b/trunk/ceph/mds/Locker.h index 1a90b1a033107..a69055f49449e 100644 --- a/trunk/ceph/mds/Locker.h +++ b/trunk/ceph/mds/Locker.h @@ -40,9 +40,9 @@ class MLock; class MClientRequest; - class Anchor; class Capability; +class LogSegment; class SimpleLock; class FileLock; @@ -62,6 +62,7 @@ private: void dispatch(Message *m); void handle_lock(MLock *m); + protected: void send_lock_message(SimpleLock *lock, int msg); void send_lock_message(SimpleLock 
*lock, int msg, const bufferlist &data); @@ -109,15 +110,24 @@ public: void dentry_anon_rdlock_trace_finish(vector& trace); // scatter +protected: + xlist autoscattered; + public: void try_scatter_eval(ScatterLock *lock); void scatter_eval(ScatterLock *lock); // public for MDCache::adjust_subtree_auth() void scatter_eval_gather(ScatterLock *lock); + void scatter_unscatter_autoscattered(); + void scatter_try_unscatter(ScatterLock *lock, Context *c); + void note_autoscattered(ScatterLock *lock); + + void scatter_lock(ScatterLock *lock); // called by LogSegment::try_to_expire + protected: void handle_scatter_lock(ScatterLock *lock, MLock *m); + void _scatter_replica_lock(ScatterLock *lock, int auth); void scatter_sync(ScatterLock *lock); - void scatter_lock(ScatterLock *lock); void scatter_scatter(ScatterLock *lock); void scatter_tempsync(ScatterLock *lock); bool scatter_rdlock_start(ScatterLock *lock, MDRequest *mdr); @@ -129,13 +139,14 @@ protected: class C_Locker_ScatterWB : public Context { Locker *locker; ScatterLock *lock; + LogSegment *ls; public: - C_Locker_ScatterWB(Locker *l, ScatterLock *sl) : locker(l), lock(sl) {} + C_Locker_ScatterWB(Locker *l, ScatterLock *sl, LogSegment *s) : locker(l), lock(sl), ls(s) {} void finish(int r) { - locker->scatter_writebehind_finish(lock); + locker->scatter_writebehind_finish(lock, ls); } }; - void scatter_writebehind_finish(ScatterLock *lock); + void scatter_writebehind_finish(ScatterLock *lock, LogSegment *ls); // local protected: diff --git a/trunk/ceph/mds/LogEvent.h b/trunk/ceph/mds/LogEvent.h index fb2ccf2664fb2..8f2f55f342bb3 100644 --- a/trunk/ceph/mds/LogEvent.h +++ b/trunk/ceph/mds/LogEvent.h @@ -43,6 +43,7 @@ using namespace std; #include "include/Context.h" class MDS; +class LogSegment; // generic log event class LogEvent { @@ -53,8 +54,10 @@ class LogEvent { friend class MDLog; public: + LogSegment *_segment; + LogEvent(int t) : - _type(t), _start_off(0), _end_off(0) { } + _type(t), _start_off(0), _end_off(0), 
_segment(0) { } virtual ~LogEvent() { } int get_type() { return _type; } @@ -72,29 +75,16 @@ class LogEvent { } /*** live journal ***/ - - /* obsolete() - is this entry committed to primary store, such that - * we can expire it from the journal? - */ - virtual bool has_expired(MDS *m) { - return true; - } - - /* expire() - prod MDS into committing the relevant state so that this - * entry can be expired from the jorunal. + /* update_segment() - adjust any state we need to in the LogSegment */ - virtual void expire(MDS *m, Context *c) { - assert(0); - c->finish(0); - delete c; - } + virtual void update_segment() { } - /*** recovery ***/ /* replay() - replay given event. this is idempotent. */ virtual void replay(MDS *m) { assert(0); } + }; inline ostream& operator<<(ostream& out, LogEvent& le) { diff --git a/trunk/ceph/mds/LogSegment.h b/trunk/ceph/mds/LogSegment.h new file mode 100644 index 0000000000000..e73f5f8b61b9c --- /dev/null +++ b/trunk/ceph/mds/LogSegment.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __LOGSEGMENT_H +#define __LOGSEGMENT_H + +#include "include/xlist.h" +#include "include/interval_set.h" +#include "include/Context.h" + +#include +using __gnu_cxx::hash_set; + +class CDir; +class CInode; +class CDentry; +class MDS; +class MDSlaveUpdate; + +class LogSegment { + public: + off_t offset; + int num_events; + + // dirty items + xlist dirty_dirfrags; + xlist dirty_inodes; + xlist dirty_dentries; + + xlist open_files; + xlist dirty_inode_mtimes; + + xlist slave_updates; + + //xlist purging_inodes; + map > purging_inodes; + + // committed anchor transactions + hash_set pending_commit_atids; + + // client request ids + map last_client_tids; + + // table version + version_t allocv; + version_t clientmapv; + version_t anchortablev; + + // try to expire + C_Gather *try_to_expire(MDS *mds); + + // cons + LogSegment(off_t off) : offset(off), num_events(0), + allocv(0), clientmapv(0), anchortablev(0) + { } +}; + +#endif diff --git a/trunk/ceph/mds/MDBalancer.cc b/trunk/ceph/mds/MDBalancer.cc index a3bfe5526245a..8e9d0e2dd46fa 100644 --- a/trunk/ceph/mds/MDBalancer.cc +++ b/trunk/ceph/mds/MDBalancer.cc @@ -175,7 +175,7 @@ void MDBalancer::send_heartbeat() if (mds->get_nodeid() == 0) beat_epoch++; - // load + // my load mds_load_t load = get_load(); mds_load[ mds->get_nodeid() ] = load; @@ -243,7 +243,7 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m) //dout(0) << " load is " << load << " have " << mds_load.size() << dendl; - unsigned cluster_size = mds->get_mds_map()->get_num_mds(); + unsigned cluster_size = mds->get_mds_map()->get_num_in_mds(); if (mds_load.size() == cluster_size) { // let's go! //export_empties(); // no! 
@@ -317,7 +317,7 @@ void MDBalancer::do_fragmenting() if (!dir->is_auth()) continue; dout(0) << "do_fragmenting splitting " << *dir << dendl; - mds->mdcache->split_dir(dir, 3); + mds->mdcache->split_dir(dir, 4); } split_queue.clear(); } @@ -339,6 +339,8 @@ void MDBalancer::do_rebalance(int beat) dout(5) << " do_rebalance: cluster loads are" << dendl; + mds->mdcache->migrator->clear_export_queue(); + // rescale! turn my mds_load back into meta_load units double load_fac = 1.0; if (mds_load[whoami].mds_load() > 0) { @@ -771,8 +773,8 @@ void MDBalancer::hit_inode(utime_t now, CInode *in, int type, int who) // hit inode in->pop.get(type).hit(now); - if (in->get_parent_dir()) - hit_dir(now, in->get_parent_dir(), type, who); + if (in->get_parent_dn()) + hit_dir(now, in->get_parent_dn()->get_dir(), type, who); } /* // hit me diff --git a/trunk/ceph/mds/MDCache.cc b/trunk/ceph/mds/MDCache.cc index d58fe3db47373..32201986d9f40 100644 --- a/trunk/ceph/mds/MDCache.cc +++ b/trunk/ceph/mds/MDCache.cc @@ -100,7 +100,6 @@ MDCache::MDCache(MDS *m) lru.lru_set_midpoint(g_conf.mds_cache_mid); did_shutdown_log_cap = false; - shutdown_commits = 0; } MDCache::~MDCache() @@ -167,6 +166,14 @@ void MDCache::add_inode(CInode *in) // add to lru, inode map assert(inode_map.count(in->ino()) == 0); // should be no dup inos! 
inode_map[ in->ino() ] = in; + + if (in->ino() < MDS_INO_BASE) { + base_inodes.insert(in); + if (in->ino() == MDS_INO_ROOT) + set_root(in); + if (in->ino() == MDS_INO_STRAY(mds->get_nodeid())) + stray = in; + } } void MDCache::remove_inode(CInode *o) @@ -183,11 +190,16 @@ void MDCache::remove_inode(CInode *o) // remove from inode map inode_map.erase(o->ino()); + if (o->ino() < MDS_INO_BASE) { + assert(base_inodes.count(o)); + base_inodes.erase(o); + + if (o == root) root = 0; + if (o == stray) stray = 0; + } + // delete it delete o; - - if (o == root) root = 0; - if (o == stray) stray = 0; } @@ -209,7 +221,6 @@ CInode *MDCache::create_root_inode() root->force_auth = pair(0, CDIR_AUTH_UNKNOWN); - set_root( root ); add_inode( root ); return root; @@ -238,44 +249,30 @@ void MDCache::open_root(Context *c) } } else { // request inode from root mds - if (waiting_for_root.empty()) { - dout(7) << "discovering root" << dendl; - - filepath want; - MDiscover *req = new MDiscover(whoami, - MDS_INO_ROOT, - want, - false); // there _is_ no base dir for the root inode - mds->send_message_mds(req, 0, MDS_PORT_CACHE); - } else { - dout(7) << "waiting for root" << dendl; - } - - // wait - waiting_for_root.push_back(c); - + discover_base_ino(MDS_INO_ROOT, c, 0); } } CInode *MDCache::create_stray_inode(int whose) { if (whose < 0) whose = mds->get_nodeid(); - stray = new CInode(this, whose == mds->get_nodeid()); - memset(&stray->inode, 0, sizeof(inode_t)); - stray->inode.ino = MDS_INO_STRAY(whose); + + CInode *in = new CInode(this, whose == mds->get_nodeid()); + memset(&in->inode, 0, sizeof(inode_t)); + in->inode.ino = MDS_INO_STRAY(whose); // make it up (FIXME) - stray->inode.mode = 0755 | INODE_MODE_DIR; - stray->inode.size = 0; - stray->inode.ctime = - stray->inode.mtime = g_clock.now(); + in->inode.mode = 0755 | INODE_MODE_DIR; + in->inode.size = 0; + in->inode.ctime = + in->inode.mtime = g_clock.now(); - stray->inode.nlink = 1; - stray->inode.layout = g_OSD_MDDirLayout; + 
in->inode.nlink = 1; + in->inode.layout = g_OSD_MDDirLayout; - add_inode( stray ); + add_inode( in ); - return stray; + return in; } void MDCache::open_local_stray() @@ -291,16 +288,7 @@ void MDCache::open_foreign_stray(int who, Context *c) dout(10) << "open_foreign_stray mds" << who << " " << ino << dendl; assert(!have_inode(ino)); - // discover - filepath want; - MDiscover *req = new MDiscover(mds->get_nodeid(), - ino, - want, - false); // there _is_ no base dir for the stray inode - mds->send_message_mds(req, who, MDS_PORT_CACHE); - - // wait - waiting_for_stray[ino].push_back(c); + discover_base_ino(ino, c, who); } @@ -369,8 +357,10 @@ void MDCache::adjust_subtree_auth(CDir *dir, pair auth) CDir *root; if (dir->ino() < MDS_INO_BASE) { root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) + if (subtrees.count(root) == 0) { subtrees[root].clear(); + root->get(CDir::PIN_SUBTREE); + } } else { root = get_subtree_root(dir); // subtree root } @@ -386,6 +376,7 @@ void MDCache::adjust_subtree_auth(CDir *dir, pair auth) dout(10) << " new subtree at " << *dir << dendl; assert(subtrees.count(dir) == 0); subtrees[dir].clear(); // create empty subtree bounds list for me. + dir->get(CDir::PIN_SUBTREE); // set dir_auth dir->set_dir_auth(auth); @@ -423,49 +414,10 @@ void MDCache::adjust_subtree_auth(CDir *dir, pair auth) eval_subtree_root(dir); } - // adjust export pins - adjust_export_state(dir); - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - adjust_export_state(*p); - show_subtrees(); } -/* - * any "export" point must be pinned in cache to ensure a proper - * chain of delegation. we do this by pinning when a dir is nonauth - * but the inode is auth. - * - * import points don't need to be pinned the same way simply because the - * exporting mds is pinning the exprot (as above) thus the dir is - * always open on the importer. 
- */ -void MDCache::adjust_export_state(CDir *dir) -{ - // be auth bit agnostic, so that we work during recovery - // (before recalc_auth_bits) - if (dir->authority().first != mds->get_nodeid() && - dir->inode->authority().first == mds->get_nodeid()) { - // export. - if (!dir->state_test(CDir::STATE_EXPORT)) { - dout(10) << "adjust_export_state pinning new export " << *dir << dendl; - dir->state_set(CDir::STATE_EXPORT); - dir->get(CDir::PIN_EXPORT); - } - } - else { - // not export. - if (dir->state_test(CDir::STATE_EXPORT)) { - dout(10) << "adjust_export_state unpinning old export " << *dir << dendl; - dir->state_clear(CDir::STATE_EXPORT); - dir->put(CDir::PIN_EXPORT); - } - } -} - void MDCache::try_subtree_merge(CDir *dir) { dout(7) << "try_subtree_merge " << *dir << dendl; @@ -485,10 +437,11 @@ void MDCache::try_subtree_merge(CDir *dir) class C_MDC_SubtreeMergeWB : public Context { MDCache *mdcache; CInode *in; + LogSegment *ls; public: - C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i) : mdcache(mdc), in(i) {} + C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, LogSegment *s) : mdcache(mdc), in(i), ls(s) {} void finish(int r) { - mdcache->subtree_merge_writebehind_finish(in); + mdcache->subtree_merge_writebehind_finish(in, ls); } }; @@ -517,6 +470,7 @@ void MDCache::try_subtree_merge_at(CDir *dir) subtrees[parent].insert(*p); // we are no longer a subtree or bound + dir->put(CDir::PIN_SUBTREE); subtrees.erase(dir); subtrees[parent].erase(dir); @@ -552,17 +506,18 @@ void MDCache::try_subtree_merge_at(CDir *dir) le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in)); + mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in, + mds->mdlog->get_current_segment())); } } show_subtrees(15); } -void MDCache::subtree_merge_writebehind_finish(CInode *in) +void MDCache::subtree_merge_writebehind_finish(CInode *in, LogSegment *ls) { dout(10) << 
"subtree_merge_writebehind_finish on " << in << dendl; - in->pop_and_dirty_projected_inode(); + in->pop_and_dirty_projected_inode(ls); in->auth_unpin(); } @@ -594,8 +549,10 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pairino() < MDS_INO_BASE) { root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) + if (subtrees.count(root) == 0) { subtrees[root].clear(); + root->get(CDir::PIN_SUBTREE); + } } else { root = get_subtree_root(dir); // subtree root } @@ -613,6 +570,7 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pairget(CDir::PIN_SUBTREE); // set dir_auth dir->set_dir_auth(auth); @@ -685,13 +643,6 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - adjust_export_state(*p); - // bound should now match. verify_subtree_bounds(dir, bounds); @@ -767,6 +718,7 @@ void MDCache::remove_subtree(CDir *dir) assert(subtrees.count(dir)); assert(subtrees[dir].empty()); subtrees.erase(dir); + dir->put(CDir::PIN_SUBTREE); if (dir->get_parent_dir()) { CDir *p = get_subtree_root(dir->get_parent_dir()); assert(subtrees[p].count(dir)); @@ -909,8 +861,10 @@ void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir) CDir *dir = *p; // un-force dir to subtree root - if (dir->dir_auth == pair(dir->dir_auth.first, dir->dir_auth.first)) + if (dir->dir_auth == pair(dir->dir_auth.first, dir->dir_auth.first)) { adjust_subtree_auth(dir, dir->dir_auth.first); + try_subtree_merge_at(dir); + } } show_subtrees(); @@ -981,24 +935,10 @@ int MDCache::num_subtrees_fullnonauth() // ==================================================================== // import map, recovery -/* - * take note of where we write import_maps in the log, as we need - * to take care not to expire them until an updated map is safely flushed. 
- */ -class C_MDS_WroteSubtreeMap : public Context { - MDCache *mdcache; - off_t end_off; -public: - C_MDS_WroteSubtreeMap(MDCache *mc, off_t eo) : mdcache(mc), end_off(eo) { } - void finish(int r) { - mdcache->_logged_subtree_map(end_off); - } -}; - -void MDCache::log_subtree_map(Context *onsync) +ESubtreeMap *MDCache::create_subtree_map() { - dout(10) << "log_subtree_map " << num_subtrees() << " subtrees, " + dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, " << num_subtrees_fullauth() << " fullauth" << dendl; @@ -1030,22 +970,7 @@ void MDCache::log_subtree_map(Context *onsync) } //le->metablob.print(cout); - - Context *fin = new C_MDS_WroteSubtreeMap(this, mds->mdlog->get_write_pos()); - mds->mdlog->writing_subtree_map = true; - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(fin); - if (onsync) - mds->mdlog->wait_for_sync(onsync); -} - -void MDCache::_logged_subtree_map(off_t off) -{ - dout(10) << "_logged_subtree_map at " << off << dendl; - mds->mdlog->subtree_maps.insert(off); - mds->mdlog->writing_subtree_map = false; - - mds->mdlog->kick_subtree_map(); // just in case the last segment was empty. + return le; } @@ -1157,7 +1082,7 @@ void MDCache::send_resolve_now(int who) } // [resolving] if (uncommitted_slave_updates.count(who)) { - for (map::iterator p = uncommitted_slave_updates[who].begin(); + for (map::iterator p = uncommitted_slave_updates[who].begin(); p != uncommitted_slave_updates[who].end(); ++p) { dout(10) << " including uncommitted " << p->first << dendl; @@ -1179,41 +1104,28 @@ void MDCache::handle_mds_failure(int who) // make note of recovery set mds->mdsmap->get_recovery_mds_set(recovery_set); recovery_set.erase(mds->get_nodeid()); - dout(1) << "my recovery peers will be " << recovery_set << dendl; + dout(1) << "handle_mds_failure mds" << who << " : recovery peers are " << recovery_set << dendl; // adjust my recovery lists wants_resolve.erase(who); // MDS will ask again got_resolve.erase(who); // i'll get another. 
+ + rejoin_sent.erase(who); // i need to send another rejoin_ack_gather.erase(who); // i'll need/get another. + + dout(10) << " wants_resolve " << wants_resolve << dendl; + dout(10) << " got_resolve " << got_resolve << dendl; + dout(10) << " rejoin_sent " << rejoin_sent << dendl; + dout(10) << " rejoin_gather " << rejoin_gather << dendl; + dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl; + // tell the migrator too. migrator->handle_mds_failure_or_stop(who); - // kick any dir discovers that are waiting - hash_map >::iterator p = dir_discovers.begin(); - while (p != dir_discovers.end()) { - hash_map >::iterator n = p; - n++; - - // waiting on this mds? - if (p->second.count(who)) { - CInode *in = get_inode(p->first); - assert(in); - - // take waiters - list waiters; - in->take_waiting(CInode::WAIT_DIR, waiters); - mds->queue_waiters(waiters); - dout(10) << "kicking WAIT_DIR on " << *in << dendl; - - // remove from mds list - p->second.erase(who); - if (p->second.empty()) - dir_discovers.erase(p); - } - p = n; - } - + // kick any discovers that are waiting + kick_discovers(who); + // clean up any requests slave to/from this node list finish; for (hash_map::iterator p = active_requests.begin(); @@ -1234,18 +1146,18 @@ void MDCache::handle_mds_failure(int who) // failed node is slave? 
if (!p->second->committing) { - if (p->second->witnessed.count(who)) { + if (p->second->more()->witnessed.count(who)) { dout(10) << " master request " << *p->second << " no longer witnessed by slave mds" << who << dendl; // discard this peer's prepare (if any) - p->second->witnessed.erase(who); + p->second->more()->witnessed.erase(who); } - if (p->second->waiting_on_slave.count(who)) { + if (p->second->more()->waiting_on_slave.count(who)) { dout(10) << " master request " << *p->second << " waiting for slave mds" << who << " to recover" << dendl; // retry request when peer recovers - p->second->waiting_on_slave.erase(who); + p->second->more()->waiting_on_slave.erase(who); mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second)); } } @@ -1398,6 +1310,7 @@ void MDCache::handle_resolve(MMDSResolve *m) dout(7) << "ambiguous import succeeded on " << *dir << dendl; migrator->import_finish(dir); } + my_ambiguous_imports.erase(p); // no longer ambiguous. } p = next; } @@ -1454,7 +1367,6 @@ void MDCache::maybe_resolve_finish() else { dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." 
<< dendl; disambiguate_imports(); - if (mds->is_resolve()) { recalc_auth_bits(); trim_non_auth(); @@ -1476,7 +1388,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) if (mds->is_resolve()) { // replay assert(uncommitted_slave_updates[from].count(*p)); - uncommitted_slave_updates[from][*p].replay(mds); + uncommitted_slave_updates[from][*p].commit.replay(mds); uncommitted_slave_updates[from].erase(*p); // log commit mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_COMMIT)); @@ -1494,14 +1406,15 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) if (mds->is_resolve()) { assert(uncommitted_slave_updates[from].count(*p)); + uncommitted_slave_updates[from][*p].rollback.replay(mds); uncommitted_slave_updates[from].erase(*p); mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_ROLLBACK)); } else { MDRequest *mdr = request_get(*p); - if (mdr->slave_commit) { - mdr->slave_commit->finish(-1); - delete mdr->slave_commit; - mdr->slave_commit = 0; + if (mdr->more()->slave_commit) { + mdr->more()->slave_commit->finish(-1); + delete mdr->more()->slave_commit; + mdr->more()->slave_commit = 0; } if (mdr->slave_request) mdr->aborted = true; @@ -1569,14 +1482,18 @@ void MDCache::disambiguate_imports() } assert(my_ambiguous_imports.empty()); - // verify all my subtrees are unambiguous! - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (dir->is_ambiguous_dir_auth()) - dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl; - assert(!dir->is_ambiguous_dir_auth()); + if (mds->is_resolve()) { + // verify all my subtrees are unambiguous! 
+ for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = p->first; + if (dir->is_ambiguous_dir_auth()) { + dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl; + show_subtrees(); + } + assert(!dir->is_ambiguous_dir_auth()); + } } show_subtrees(); @@ -1711,6 +1628,10 @@ void MDCache::recalc_auth_bits() /* * rejoin phase! + * + * this initiates rejoin. it shoudl be called before we get any + * rejoin or rejoin_ack messages (or else mdsmap distribution is broken). + * * we start out by sending rejoins to everyone in the recovery set. * * if we are rejoin, send for all regions in our cache. @@ -1735,6 +1656,7 @@ void MDCache::rejoin_send_rejoins() p != recovery_set.end(); ++p) { if (*p == mds->get_nodeid()) continue; // nothing to myself! + if (rejoin_sent.count(*p)) continue; // already sent a rejoin to this node! if (mds->is_rejoin()) { rejoin_gather.insert(*p); rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK); @@ -1762,9 +1684,44 @@ void MDCache::rejoin_send_rejoins() rejoin_walk(dir, rejoins[auth]); } + + // rejoin root inodes, too + for (map::iterator p = rejoins.begin(); + p != rejoins.end(); + ++p) { + if (mds->is_rejoin()) { + // weak + if (p->first == 0 && root) + p->second->add_weak_inode(root->ino()); + if (get_inode(MDS_INO_STRAY(p->first))) + p->second->add_weak_inode(MDS_INO_STRAY(p->first)); + } else { + // strong + if (p->first == 0 && root) { + p->second->add_weak_inode(root->ino()); + p->second->add_strong_inode(root->ino(), root->get_replica_nonce(), + root->get_caps_wanted(), + root->authlock.get_state(), + root->linklock.get_state(), + root->dirfragtreelock.get_state(), + root->filelock.get_state(), + root->dirlock.get_state()); + } + if (CInode *in = get_inode(MDS_INO_STRAY(p->first))) { + p->second->add_weak_inode(in->ino()); + p->second->add_strong_inode(in->ino(), in->get_replica_nonce(), + in->get_caps_wanted(), + in->authlock.get_state(), + 
in->linklock.get_state(), + in->dirfragtreelock.get_state(), + in->filelock.get_state(), + in->dirlock.get_state()); + } + } + } if (!mds->is_rejoin()) { - // strong. + // i am survivor. send strong rejoin. // note request authpins, xlocks for (hash_map::iterator p = active_requests.begin(); p != active_requests.end(); @@ -1809,17 +1766,19 @@ void MDCache::rejoin_send_rejoins() } // send the messages - assert(rejoin_ack_gather.empty()); for (map::iterator p = rejoins.begin(); p != rejoins.end(); ++p) { - mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); + assert(rejoin_sent.count(p->first) == 0); + assert(rejoin_ack_gather.count(p->first) == 0); + rejoin_sent.insert(p->first); rejoin_ack_gather.insert(p->first); + mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE); } // nothing? if (mds->is_rejoin() && rejoins.empty()) { - dout(10) << "nothing left to rejoin" << dendl; + dout(10) << "nothing to rejoin" << dendl; mds->rejoin_done(); } } @@ -1855,10 +1814,19 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) p != dir->items.end(); ++p) { CDentry *dn = p->second; - assert(dn->is_primary()); dout(15) << " add_weak_primary_dentry " << *dn << dendl; + assert(dn->is_primary()); + assert(dn->inode->is_dir()); rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); dn->get_inode()->get_nested_dirfrags(nested); + + if (dn->get_inode()->dirlock.is_updated()) { + // include full inode to shed any dirtyscattered state + rejoin->add_full_inode(dn->get_inode()->inode, + dn->get_inode()->symlink, + dn->get_inode()->dirfragtree); + dn->get_inode()->dirlock.clear_updated(); + } } } else { // STRONG @@ -1944,6 +1912,7 @@ void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) * the sender * - is recovering from their journal. 
* - may have incorrect (out of date) inode contents + * - will include full inodes IFF they contain dirty scatterlock content * * if the sender didn't trim_non_auth(), they * - may have incorrect (out of date) dentry/inode linkage @@ -2002,11 +1971,24 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) } } + // full inodes? + // dirty scatterlock content! + for (list::iterator p = weak->full_inodes.begin(); + p != weak->full_inodes.end(); + ++p) { + CInode *in = get_inode(p->inode.ino); + if (!in) continue; + if (p->inode.mtime > in->inode.mtime) in->inode.mtime = p->inode.mtime; + dout(10) << " got dirty inode scatterlock content " << *in << dendl; + in->dirlock.set_updated(); + } + // walk weak map for (map >::iterator p = weak->weak.begin(); p != weak->weak.end(); ++p) { CDir *dir = get_dirfrag(p->first); + if (!dir) dout(0) << " missing dirfrag " << p->first << dendl; assert(dir); int nonce = dir->add_replica(from); @@ -2022,7 +2004,8 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) assert(dn); assert(dn->is_primary()); - if (survivor) dentry_remove_replica(dn, from); + if (survivor && dn->is_replica(from)) + dentry_remove_replica(dn, from); // this induces a lock gather completion int dnonce = dn->add_replica(from); dout(10) << " have " << *dn << dendl; if (ack) @@ -2034,7 +2017,8 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) CInode *in = dn->get_inode(); assert(in); - if (survivor) inode_remove_replica(in, from); + if (survivor && in->is_replica(from)) + inode_remove_replica(in, from); // this induces a lock gather completion int inonce = in->add_replica(from); dout(10) << " have " << *in << dendl; @@ -2056,14 +2040,35 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) } } - if (survivor) - rejoin_scour_survivor_replicas(from, ack); + // weak base inodes? (root, stray, etc.) 
+ for (set::iterator p = weak->weak_inodes.begin(); + p != weak->weak_inodes.end(); + ++p) { + CInode *in = get_inode(*p); + assert(in); // hmm fixme wrt stray? + if (survivor && in->is_replica(from)) + inode_remove_replica(in, from); // this induces a lock gather completion + int inonce = in->add_replica(from); + dout(10) << " have base " << *in << dendl; + + if (ack) + ack->add_strong_inode(in->ino(), + inonce, + 0, + in->authlock.get_replica_state(), + in->linklock.get_replica_state(), + in->dirfragtreelock.get_replica_state(), + in->filelock.get_replica_state(), + in->dirlock.get_replica_state()); + } if (survivor) { - // send ack + // survivor. do everything now. + rejoin_scour_survivor_replicas(from, ack); mds->send_message_mds(ack, from, MDS_PORT_CACHE); } else { // done? + assert(rejoin_gather.count(from)); rejoin_gather.erase(from); if (rejoin_gather.empty()) { rejoin_gather_finish(); @@ -2079,11 +2084,11 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) * * @pathmap - map of inodeno to full pathnames. we remove items from this map * as we discover we have them. - * @retry - non-completion callback context. called when a pass of fetches - * completes. deleted if we are done (i.e. pathmap is empty). + * + * returns a C_Gather* is there is work to do. caller is responsible for setting + * the C_Gather completer. 
*/ -bool MDCache::parallel_fetch(map& pathmap, - Context *retry) +C_Gather *MDCache::parallel_fetch(map& pathmap) { dout(10) << "parallel_fetch on " << pathmap.size() << " paths" << dendl; @@ -2110,12 +2115,11 @@ bool MDCache::parallel_fetch(map& pathmap, if (pathmap.empty()) { dout(10) << "parallel_fetch done" << dendl; assert(fetch_queue.empty()); - delete retry; - return true; + return false; } // do a parallel fetch - C_Gather *gather = new C_Gather(retry); + C_Gather *gather = new C_Gather; for (set::iterator p = fetch_queue.begin(); p != fetch_queue.end(); ++p) { @@ -2123,7 +2127,7 @@ bool MDCache::parallel_fetch(map& pathmap, (*p)->fetch(gather->new_sub()); } - return false; + return gather; } @@ -2340,11 +2344,22 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) } } + // base inodes? (root, stray, etc.) + for (set::iterator p = strong->weak_inodes.begin(); + p != strong->weak_inodes.end(); + ++p) { + CInode *in = get_inode(*p); + dout(10) << " have base " << *in << dendl; + in->add_replica(from); + } + // send missing? if (missing) { + // we expect a FULL soon. mds->send_message_mds(missing, from, MDS_PORT_CACHE); } else { // done? + assert(rejoin_gather.count(from)); rejoin_gather.erase(from); if (rejoin_gather.empty()) { rejoin_gather_finish(); @@ -2437,6 +2452,7 @@ void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) } // done? + assert(rejoin_ack_gather.count(from)); rejoin_ack_gather.erase(from); if (mds->is_rejoin() && rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. @@ -2501,6 +2517,7 @@ void MDCache::handle_cache_rejoin_full(MMDSCacheRejoin *full) } // done? + assert(rejoin_gather.count(from)); rejoin_gather.erase(from); if (rejoin_gather.empty()) { rejoin_gather_finish(); @@ -2576,6 +2593,8 @@ public: } }; + + void MDCache::rejoin_gather_finish() { dout(10) << "rejoin_gather_finish" << dendl; @@ -2584,9 +2603,15 @@ void MDCache::rejoin_gather_finish() rejoin_trim_undef_inodes(); // fetch paths? 
- if (!cap_import_paths.empty() && - !parallel_fetch(cap_import_paths, new C_MDC_RejoinGatherFinish(this))) - return; + // do this before ack, since some inodes we may have already gotten + // from surviving MDSs. + if (!cap_import_paths.empty()) { + C_Gather *gather = parallel_fetch(cap_import_paths); + if (gather) { + gather->set_finisher(new C_MDC_RejoinGatherFinish(this)); + return; + } + } // process cap imports // ino -> client -> frommds -> capex @@ -2710,6 +2735,32 @@ void MDCache::rejoin_send_acks() } } + // root inodes too + if (root) + for (map::iterator r = root->replicas_begin(); + r != root->replicas_end(); + ++r) { + ack[r->first]->add_full_inode(root->inode, root->symlink, root->dirfragtree); + ack[r->first]->add_strong_inode(root->ino(), r->second, 0, + root->authlock.get_replica_state(), + root->linklock.get_replica_state(), + root->dirfragtreelock.get_replica_state(), + root->filelock.get_replica_state(), + root->dirlock.get_replica_state()); + } + if (stray) + for (map::iterator r = stray->replicas_begin(); + r != stray->replicas_end(); + ++r) { + ack[r->first]->add_full_inode(stray->inode, stray->symlink, stray->dirfragtree); + ack[r->first]->add_strong_inode(stray->ino(), r->second, 0, + stray->authlock.get_replica_state(), + stray->linklock.get_replica_state(), + stray->dirfragtreelock.get_replica_state(), + stray->filelock.get_replica_state(), + stray->dirlock.get_replica_state()); + } + // send acks for (map::iterator p = ack.begin(); p != ack.end(); @@ -2727,6 +2778,7 @@ void MDCache::set_root(CInode *in) { assert(root == 0); root = in; + base_inodes.insert(in); } @@ -2739,22 +2791,24 @@ void MDCache::set_root(CInode *in) class C_MDC_PurgeFinish : public Context { MDCache *mdc; - inodeno_t ino; - off_t newsize; + CInode *in; + off_t newsize, oldsize; public: - C_MDC_PurgeFinish(MDCache *c, inodeno_t i, off_t s) : mdc(c), ino(i), newsize(s) {} + C_MDC_PurgeFinish(MDCache *c, CInode *i, off_t ns, off_t os) : + mdc(c), in(i), newsize(ns), 
oldsize(os) {} void finish(int r) { - mdc->purge_inode_finish(ino, newsize); + mdc->purge_inode_finish(in, newsize, oldsize); } }; class C_MDC_PurgeFinish2 : public Context { MDCache *mdc; - inodeno_t ino; - off_t newsize; + CInode *in; + off_t newsize, oldsize; public: - C_MDC_PurgeFinish2(MDCache *c, inodeno_t i, off_t s) : mdc(c), ino(i), newsize(s) {} + C_MDC_PurgeFinish2(MDCache *c, CInode *i, off_t ns, off_t os) : + mdc(c), in(i), newsize(ns), oldsize(os) {} void finish(int r) { - mdc->purge_inode_finish_2(ino, newsize); + mdc->purge_inode_finish_2(in, newsize, oldsize); } }; @@ -2762,85 +2816,108 @@ public: * will be called by on unlink or rmdir or truncate * caller responsible for journaling an appropriate EUpdate */ -void MDCache::purge_inode(inode_t *inode, off_t newsize) +void MDCache::purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls) { - dout(10) << "purge_inode " << inode->ino << " size " << inode->size - << " -> " << newsize + dout(10) << "purge_inode " << oldsize << " -> " << newsize + << " on " << *in << dendl; - // take note - assert(purging[inode->ino].count(newsize) == 0); - purging[inode->ino][newsize] = *inode; + assert(oldsize >= newsize); - assert(inode->size >= newsize); - _do_purge_inode(inode, newsize); + purging[in][newsize] = oldsize; + purging_ls[in][newsize] = ls; + ls->purging_inodes[in][newsize] = oldsize; + + _do_purge_inode(in, newsize, oldsize); } -void MDCache::_do_purge_inode(inode_t *inode, off_t newsize) +void MDCache::_do_purge_inode(CInode *in, off_t newsize, off_t oldsize) { + in->get(CInode::PIN_PURGING); + // remove - if (inode->size > 0) { - mds->filer->remove(*inode, newsize, inode->size, - 0, new C_MDC_PurgeFinish(this, inode->ino, newsize)); + if (in->inode.size > 0) { + mds->filer->remove(in->inode, newsize, oldsize, + 0, new C_MDC_PurgeFinish(this, in, newsize, oldsize)); } else { // no need, empty file, just log it - purge_inode_finish(inode->ino, newsize); + purge_inode_finish(in, newsize, 
oldsize); } } -void MDCache::purge_inode_finish(inodeno_t ino, off_t newsize) +void MDCache::purge_inode_finish(CInode *in, off_t newsize, off_t oldsize) { - dout(10) << "purge_inode_finish " << ino << " to " << newsize - << " - logging our completion" << dendl; + dout(10) << "purge_inode_finish " << oldsize << " -> " << newsize + << " on " << *in << dendl; // log completion - mds->mdlog->submit_entry(new EPurgeFinish(ino, newsize), - new C_MDC_PurgeFinish2(this, ino, newsize)); + mds->mdlog->submit_entry(new EPurgeFinish(in->ino(), newsize, oldsize), + new C_MDC_PurgeFinish2(this, in, newsize, oldsize)); } -void MDCache::purge_inode_finish_2(inodeno_t ino, off_t newsize) +void MDCache::purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize) { - dout(10) << "purge_inode_finish_2 " << ino << " to " << newsize << dendl; + dout(10) << "purge_inode_finish_2 " << oldsize << " -> " << newsize + << " on " << *in << dendl; // remove from purging list - purging[ino].erase(newsize); - if (purging[ino].empty()) - purging.erase(ino); + LogSegment *ls = purging_ls[in][newsize]; + purging[in].erase(newsize); + purging_ls[in].erase(newsize); + if (purging[in].empty()) { + purging.erase(in); + purging_ls.erase(in); + } + + assert(ls->purging_inodes.count(in)); + assert(ls->purging_inodes[in].count(newsize)); + assert(ls->purging_inodes[in][newsize] == oldsize); + ls->purging_inodes[in].erase(newsize); + if (ls->purging_inodes[in].empty()) + ls->purging_inodes.erase(in); + in->put(CInode::PIN_PURGING); + // tell anyone who cares (log flusher?) 
- list ls; - ls.swap(waiting_for_purge[ino][newsize]); - waiting_for_purge[ino].erase(newsize); - if (waiting_for_purge[ino].empty()) - waiting_for_purge.erase(ino); - finish_contexts(ls, 0); + if (purging.count(in) == 0 || + purging[in].rbegin()->first < newsize) { + list ls; + ls.swap(waiting_for_purge[in][newsize]); + waiting_for_purge[in].erase(newsize); + if (waiting_for_purge[in].empty()) + waiting_for_purge.erase(in); + finish_contexts(ls, 0); + } } -void MDCache::add_recovered_purge(const inode_t& inode, off_t newsize) +void MDCache::add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls) { - assert(purging[inode.ino].count(newsize) == 0); - purging[inode.ino][newsize] = inode; + assert(purging[in].count(newsize) == 0); + purging[in][newsize] = oldsize; + purging_ls[in][newsize] = ls; + ls->purging_inodes[in][newsize] = oldsize; } -void MDCache::remove_recovered_purge(inodeno_t ino, off_t newsize) +void MDCache::remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize) { - purging[ino].erase(newsize); + purging[in].erase(newsize); } void MDCache::start_recovered_purges() { dout(10) << "start_recovered_purges (" << purging.size() << " purges)" << dendl; - for (map >::iterator p = purging.begin(); + for (map >::iterator p = purging.begin(); p != purging.end(); ++p) { - for (map::iterator q = p->second.begin(); + for (map::iterator q = p->second.begin(); q != p->second.end(); ++q) { - dout(10) << "start_recovered_purges " << p->first - << " size " << q->second.size - << " to " << q->first << dendl; - _do_purge_inode(&q->second, q->first); + dout(10) << "start_recovered_purges " + << q->second << " -> " << q->first + << " on " << *p->first + << dendl; + _do_purge_inode(p->first, q->first, q->second); } } } @@ -2862,41 +2939,31 @@ bool MDCache::trim(int max) map expiremap; - // DENTRIES from the LRU - + // trim dentries from the LRU while (lru.lru_get_size() > (unsigned)max) { CDentry *dn = (CDentry*)lru.lru_expire(); if (!dn) break; 
trim_dentry(dn, expiremap); } - // trim root inode+dir? - if (max == 0 && // only if we're trimming everything! - lru.lru_get_size() == 0) { - hash_map::iterator p = inode_map.begin(); - while (p != inode_map.end()) { - hash_map::iterator n = p; - n++; - - CInode *in = p->second; - + // trim base inodes? + if (max == 0) { + set::iterator p = base_inodes.begin(); + while (p != base_inodes.end()) { + CInode *in = *p++; list ls; in->get_dirfrags(ls); - for (list::iterator q = ls.begin(); - q != ls.end(); - ++q) - if ((*q)->get_num_ref() == 0) - trim_dirfrag(*q, *q, expiremap); - - // root inode? + for (list::iterator p = ls.begin(); p != ls.end(); ++p) { + CDir *dir = *p; + if (dir->get_num_ref() == 1) // subtree pin + trim_dirfrag(dir, 0, expiremap); + } if (in->get_num_ref() == 0) - trim_inode(0, in, 0, expiremap); // hrm, FIXME - - p = n; - } + trim_inode(0, in, 0, expiremap); + } } - // send! + // send any expire messages send_expire_messages(expiremap); return true; @@ -2981,10 +3048,15 @@ void MDCache::trim_dentry(CDentry *dn, map& expiremap) void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expiremap) { - assert(dir->get_num_ref() == 0); - dout(15) << "trim_dirfrag " << *dir << dendl; + if (dir->is_subtree_root()) { + assert(!dir->is_auth() || + (!dir->is_replicated() && dir->inode->is_base())); + remove_subtree(dir); // remove from subtree map + } + assert(dir->get_num_ref() == 0); + CInode *in = dir->get_inode(); if (!dir->is_auth()) { @@ -3016,8 +3088,6 @@ void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expire } } - if (dir->is_subtree_root()) - remove_subtree(dir); // remove from subtree map in->close_dirfrag(dir->dirfrag().frag); } @@ -3087,6 +3157,8 @@ void MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map >::iterator p = subtrees.begin(); + p != subtrees.end(); + p++) + p->first->get(CDir::PIN_SUBTREETEMP); + // note first auth item we see. // when we see it the second time, stop. 
CDentry *first_auth = 0; @@ -3172,6 +3250,15 @@ void MDCache::trim_non_auth() } } + // move everything in the pintail to the top bit of the lru. + lru.lru_touch_entire_pintail(); + + // unpin all subtrees + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + p++) + p->first->put(CDir::PIN_SUBTREETEMP); + show_subtrees(); } @@ -3388,17 +3475,6 @@ void MDCache::dentry_remove_replica(CDentry *dn, int from) // ========================================================================================= // shutdown -class C_MDC_ShutdownCommit : public Context { - MDCache *mdc; -public: - C_MDC_ShutdownCommit(MDCache *mdc) { - this->mdc = mdc; - } - void finish(int r) { - mdc->shutdown_commits--; - } -}; - class C_MDC_ShutdownCheck : public Context { MDCache *mdc; public: @@ -3434,6 +3510,8 @@ void MDCache::shutdown_start() if (g_conf.mds_shutdown_check) mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); + + // g_conf.debug_mds = 10; } @@ -3449,57 +3527,27 @@ bool MDCache::shutdown_pass() return true; } - // commit dirs? - if (g_conf.mds_commit_on_shutdown) { - - if (shutdown_commits < 0) { - dout(1) << "shutdown_pass committing all dirty dirs" << dendl; - shutdown_commits = 0; - - for (hash_map::iterator it = inode_map.begin(); - it != inode_map.end(); - it++) { - CInode *in = it->second; - if (!in->is_dir()) continue; - - // commit any dirty dirfrag that's ours - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - if (dir->is_auth() && dir->is_dirty()) { - dir->commit(0, new C_MDC_ShutdownCommit(this)); - shutdown_commits++; - } - } - } - } + // flush batching eopens, so that we can properly expire them. + mds->server->journal_opens(); // hrm, this is sort of a hack. - // commits? 
- if (shutdown_commits > 0) { - dout(7) << "shutdown_commits still waiting for " << shutdown_commits << dendl; - return false; - } + // flush what we can from the log + mds->mdlog->set_max_events(0); + mds->mdlog->trim(); + + if (mds->mdlog->get_num_segments() > 1) { + dout(7) << "still >1 segments, waiting for log to trim" << dendl; + return false; } - // flush anything we can from the cache trim(0); dout(5) << "lru size now " << lru.lru_get_size() << dendl; - // flush batching eopens, so that we can properly expire them. - mds->server->journal_opens(); // hrm, this is sort of a hack. - - // flush what we can from the log - mds->mdlog->trim(0); - // SUBTREES - // send all imports back to 0. if (!subtrees.empty() && mds->get_nodeid() != 0 && !migrator->is_exporting() //&& //!migrator->is_importing() ) { - // export to root dout(7) << "looking for subtrees to export to mds0" << dendl; list ls; for (map >::iterator it = subtrees.begin(); @@ -3511,20 +3559,26 @@ bool MDCache::shutdown_pass() if (!dir->is_full_dir_auth()) continue; ls.push_back(dir); } + int max = 5; // throttle shutdown exports.. hack! for (list::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *dir = *p; - dout(7) << "sending " << *dir << " back to mds0" << dendl; - migrator->export_dir(dir, 0); + int dest = dir->get_inode()->authority().first; + if (dest > 0 && !mds->mdsmap->is_active(dest)) dest = 0; + dout(7) << "sending " << *dir << " back to mds" << dest << dendl; + migrator->export_dir(dir, dest); + if (--max == 0) break; } } + // subtrees map not empty yet? 
if (!subtrees.empty()) { dout(7) << "still have " << num_subtrees() << " subtrees" << dendl; show_subtrees(); migrator->show_importing(); migrator->show_exporting(); - //show_cache(); + if (!migrator->is_importing() && !migrator->is_exporting()) + show_cache(); return false; } assert(subtrees.empty()); @@ -3532,44 +3586,32 @@ bool MDCache::shutdown_pass() assert(!migrator->is_importing()); + // empty out stray contents // FIXME dout(7) << "FIXME: i need to empty out stray dir contents..." << dendl; - // (wait for) flush log? - if (g_conf.mds_log_flush_on_shutdown) { - if (mds->mdlog->get_non_subtreemap_events()) { - dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() - << " (" << mds->mdlog->get_non_subtreemap_events() << ")" << dendl; - return false; - } + // (only do this once!) + if (!mds->mdlog->is_capped()) { + dout(7) << "capping the log" << dendl; + mds->mdlog->cap(); + mds->mdlog->trim(); } - - // cap log? - if (g_conf.mds_log_flush_on_shutdown) { - - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << dendl; - mds->mdlog->cap(); - // note that this won't flush right away, so we'll make at least one more pass - } - - if (mds->mdlog->get_num_events()) { - dout(7) << "waiting for log to flush (including subtree_map, now) .. " << mds->mdlog->get_num_events() - << " (" << mds->mdlog->get_non_subtreemap_events() << ")" << dendl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << dendl; - assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. - did_shutdown_log_cap = true; - return false; - } + + if (!mds->mdlog->empty()) { + dout(7) << "waiting for log to flush.. 
" << mds->mdlog->get_num_events() + << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; + return false; + } + + if (!did_shutdown_log_cap) { + // flush journal header + dout(7) << "writing header for (now-empty) journal" << dendl; + assert(mds->mdlog->empty()); + mds->mdlog->write_head(0); + // NOTE: filer active checker below will block us until this completes. + did_shutdown_log_cap = true; + return false; } // filer active? @@ -3578,8 +3620,7 @@ bool MDCache::shutdown_pass() return false; } - - // done? + // trim what we can from the cache if (lru.lru_get_size() > 0) { dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << dendl; show_cache(); @@ -3679,10 +3720,13 @@ void MDCache::dispatch(Message *m) Context *MDCache::_get_waiter(MDRequest *mdr, Message *req) { - if (mdr) + if (mdr) { + dout(20) << "_get_waiter retryrequest" << dendl; return new C_MDS_RetryRequest(this, mdr); - else + } else { + dout(20) << "_get_waiter retrymessage" << dendl; return new C_MDS_RetryMessage(mds, req); + } } int MDCache::path_traverse(MDRequest *mdr, Message *req, // who @@ -3736,34 +3780,15 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, // who // parent dir frozen_dir? if (cur->is_frozen_dir()) { dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << dendl; - cur->get_parent_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); + cur->get_parent_dn()->get_dir()->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req)); return 1; } - curdir = cur->get_or_open_dirfrag(this, fg); } else { // discover? 
- assert(!cur->is_auth()); - if (cur->is_ambiguous_auth()) { - dout(10) << "traverse: need dirfrag " << fg << ", waiting for single auth on " << *cur << dendl; - cur->add_waiter(CInode::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); - return 1; - } else if (dir_discovers.count(cur->ino())) { - dout(10) << "traverse: need dirfrag " << fg << ", already doing discover for " << *cur << dendl; - assert(cur->is_waiter_for(CInode::WAIT_DIR)); - } else { - filepath want = path.postfixpath(depth); - dout(10) << "traverse: need dirfrag " << fg << ", doing discover, want " << want.get_path() - << " from " << *cur << dendl; - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - true, // need this dir! - onfail == MDS_TRAVERSE_DISCOVERXLOCK), - cur->authority().first, MDS_PORT_CACHE); - dir_discovers[cur->ino()].insert(cur->authority().first); - } - cur->add_waiter(CInode::WAIT_DIR, _get_waiter(mdr, req)); + dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl; + discover_path(cur, path.postfixpath(depth), _get_waiter(mdr, req), + onfail == MDS_TRAVERSE_DISCOVERXLOCK); if (mds->logger) mds->logger->inc("tdis"); return 1; } @@ -3784,8 +3809,11 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, // who // must read directory hard data (permissions, x bit) to traverse if (!noperm && - !mds->locker->simple_rdlock_try(&cur->authlock, _get_waiter(mdr, req))) + !mds->locker->simple_rdlock_try(&cur->authlock, 0)) { + dout(7) << "traverse: waiting on authlock rdlock on " << *cur << dendl; + cur->authlock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req)); return 1; + } // check permissions? 
// XXX @@ -3895,7 +3923,7 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, // who << req->get_source() << " dn " << *dn << dendl; } else { dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << dendl; - MDiscoverReply *reply = new MDiscoverReply(curdir->ino()); + MDiscoverReply *reply = new MDiscoverReply(curdir->dirfrag()); reply->add_dentry( dn->replicate_to( from ) ); if (dn->is_primary()) reply->add_inode( dn->inode->replicate_to( from ) ); @@ -3934,32 +3962,9 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, // who if ((onfail == MDS_TRAVERSE_DISCOVER || onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { - // discover? - filepath want = path.postfixpath(depth); - - if (curdir->is_waiting_for_dentry(path[depth])) { - dout(7) << "traverse: already waiting for discover " << want.get_path() - << " from " << *curdir << dendl; - } - else if (curdir->is_ambiguous_auth()) { - dout(7) << "traverse: waiting for single auth on " << *curdir << dendl; - curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); - return 1; - } - else { - dout(7) << "traverse: discover " << want << " from " << *curdir << dendl; - touch_inode(cur); - - mds->send_message_mds(new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - false, - onfail == MDS_TRAVERSE_DISCOVERXLOCK), - dauth.first, MDS_PORT_CACHE); - } - - // delay processing of current request. 
- curdir->add_dentry_waiter(path[depth], _get_waiter(mdr, req)); + dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl; + discover_path(curdir, path.postfixpath(depth), _get_waiter(mdr, req), + onfail == MDS_TRAVERSE_DISCOVERXLOCK); if (mds->logger) mds->logger->inc("tdis"); return 1; } @@ -3972,29 +3977,28 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, // who dout(7) << "traverse: waiting for single auth in " << *curdir << dendl; curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); return 1; - } else { - dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl; - - // request replication? - if (mdr && mdr->client_request && curdir->is_rep()) { - dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " - << *curdir << " req " << *(MClientRequest*)req << dendl; - mdr->client_request->set_mds_wants_replica_in_dirino(curdir->ino()); - req->clear_payload(); // reencode! - } - - if (mdr) - request_forward(mdr, dauth.first, req->get_dest_port()); - else - mds->forward_message_mds(req, dauth.first, req->get_dest_port()); - - if (mds->logger) mds->logger->inc("tfw"); - return 2; + } + + dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl; + + // request replication? + if (mdr && mdr->client_request && curdir->is_rep()) { + dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " + << *curdir << " req " << *(MClientRequest*)req << dendl; + mdr->client_request->set_mds_wants_replica_in_dirino(curdir->ino()); + req->clear_payload(); // reencode! } + + if (mdr) + request_forward(mdr, dauth.first, req->get_dest_port()); + else + mds->forward_message_mds(req, dauth.first, req->get_dest_port()); + + if (mds->logger) mds->logger->inc("tfw"); + return 2; } - if (onfail == MDS_TRAVERSE_FAIL) { + if (onfail == MDS_TRAVERSE_FAIL) return -ENOENT; // not necessarily exactly true.... 
- } } assert(0); // i shouldn't get here @@ -4070,16 +4074,7 @@ void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, Context *fin) int auth = diri->authority().first; if (mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) { - // discover it - filepath want; // no dentries, i just want the dir open - MDiscover *dis = new MDiscover(mds->get_nodeid(), - diri->ino(), - want, - true); // need the base dir open - dis->set_base_dir_frag(approxfg); - mds->send_message_mds(dis, auth, MDS_PORT_CACHE); - dir_discovers[diri->ino()].insert(auth); - diri->add_waiter(CInode::WAIT_DIR, fin); + discover_dir_frag(diri, approxfg, fin); } else { // mds is down or recovering. forge a replica! forge_replica_dir(diri, approxfg, auth); @@ -4206,7 +4201,9 @@ void MDCache::open_remote_ino_2(inodeno_t ino, return; } - if (!in->is_auth()) { + CDir *dir = in->get_dirfrag(frag); + + if (!dir && !in->is_auth()) { dout(10) << "opening remote dirfrag " << frag << " under " << *in << dendl; /* FIXME: we re-query the anchortable just to avoid a fragtree update race */ open_remote_dirfrag(in, frag, @@ -4214,7 +4211,9 @@ void MDCache::open_remote_ino_2(inodeno_t ino, return; } - CDir *dir = in->get_or_open_dirfrag(this, frag); + if (!dir && in->is_auth()) + dir = in->get_or_open_dirfrag(this, frag); + assert(dir); if (dir->is_auth()) { if (dir->is_complete()) { @@ -4234,12 +4233,8 @@ void MDCache::open_remote_ino_2(inodeno_t ino, // hmm, discover. dout(10) << "have remote dirfrag " << *dir << ", discovering " << anchortrace[i].ino << dendl; - - MDiscover *dis = new MDiscover(mds->get_nodeid(), - dir->dirfrag(), - anchortrace[i].ino, - true); // being conservative here. - mds->send_message_mds(dis, dir->authority().first, MDS_PORT_CACHE); + discover_ino(dir, anchortrace[i].ino, + new C_MDC_OpenRemoteIno(this, ino, anchortrace, mdr, onfinish)); } } @@ -4304,10 +4299,10 @@ void MDCache::request_finish(MDRequest *mdr) dout(7) << "request_finish " << *mdr << dendl; // slave finisher? 
- if (mdr->slave_commit) { - mdr->slave_commit->finish(0); - delete mdr->slave_commit; - mdr->slave_commit = 0; + if (mdr->more()->slave_commit) { + mdr->more()->slave_commit->finish(0); + delete mdr->more()->slave_commit; + mdr->more()->slave_commit = 0; } if (mdr->client_request && mds->logger) { @@ -4373,8 +4368,8 @@ void MDCache::request_cleanup(MDRequest *mdr) // clean up slaves // (will implicitly drop remote dn pins) - for (set::iterator p = mdr->slaves.begin(); - p != mdr->slaves.end(); + for (set::iterator p = mdr->more()->slaves.begin(); + p != mdr->more()->slaves.end(); ++p) { MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_FINISH); mds->send_message_mds(r, *p, MDS_PORT_SERVER); @@ -4389,6 +4384,12 @@ void MDCache::request_cleanup(MDRequest *mdr) // drop (local) auth pins mdr->drop_local_auth_pins(); + // drop stickydirs + for (set::iterator p = mdr->stickydirs.begin(); + p != mdr->stickydirs.end(); + ++p) + (*p)->put_stickydirs(); + // drop cache pins for (set::iterator it = mdr->pins.begin(); it != mdr->pins.end(); @@ -4461,7 +4462,7 @@ void MDCache::anchor_create(MDRequest *mdr, CInode *in, Context *onfinish) if (!in->can_auth_pin() && !mdr->is_auth_pinned(in)) { dout(7) << "anchor_create not authpinnable, waiting on " << *in << dendl; - in->add_waiter(CInode::WAIT_AUTHPINNABLE, onfinish); + in->add_waiter(CInode::WAIT_UNFREEZE, onfinish); return; } @@ -4494,12 +4495,12 @@ class C_MDC_AnchorCreateLogged : public Context { MDCache *cache; CInode *in; version_t atid; - version_t pdv; + LogSegment *ls; public: - C_MDC_AnchorCreateLogged(MDCache *c, CInode *i, version_t t, version_t v) : - cache(c), in(i), atid(t), pdv(v) {} + C_MDC_AnchorCreateLogged(MDCache *c, CInode *i, version_t t, LogSegment *s) : + cache(c), in(i), atid(t), ls(s) {} void finish(int r) { - cache->_anchor_create_logged(in, atid, pdv); + cache->_anchor_create_logged(in, atid, ls); } }; @@ -4508,28 +4509,24 @@ void MDCache::_anchor_create_prepared(CInode 
*in, version_t atid) dout(10) << "_anchor_create_prepared " << *in << " atid " << atid << dendl; assert(in->inode.anchored == false); - // predirty, prepare log entry - version_t pdv = in->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "anchor_create"); - le->metablob.add_dir_context(in->get_parent_dir()); - // update the logged inode copy - inode_t *pi = le->metablob.add_dentry(in->parent, true); + inode_t *pi = in->project_inode(); pi->anchored = true; - pi->version = pdv; + pi->version = in->pre_dirty(); // note anchor transaction + EUpdate *le = new EUpdate(mds->mdlog, "anchor_create"); + le->metablob.add_dir_context(in->get_parent_dir()); + le->metablob.add_primary_dentry(in->parent, true, 0, pi); le->metablob.add_anchor_transaction(atid); - - // log + wait - mds->mdlog->submit_entry(le, new C_MDC_AnchorCreateLogged(this, in, atid, pdv)); + mds->mdlog->submit_entry(le, new C_MDC_AnchorCreateLogged(this, in, atid, + mds->mdlog->get_current_segment())); } -void MDCache::_anchor_create_logged(CInode *in, version_t atid, version_t pdv) +void MDCache::_anchor_create_logged(CInode *in, version_t atid, LogSegment *ls) { - dout(10) << "_anchor_create_logged pdv " << pdv << " on " << *in << dendl; + dout(10) << "_anchor_create_logged on " << *in << dendl; // unpin assert(in->state_test(CInode::STATE_ANCHORING)); @@ -4538,11 +4535,10 @@ void MDCache::_anchor_create_logged(CInode *in, version_t atid, version_t pdv) in->auth_unpin(); // apply update to cache - in->inode.anchored = true; - in->mark_dirty(pdv); + in->pop_and_dirty_projected_inode(ls); // tell the anchortable we've committed - mds->anchorclient->commit(atid); + mds->anchorclient->commit(atid, ls); // trigger waiters in->finish_waiting(CInode::WAIT_ANCHORED, 0); @@ -4570,7 +4566,7 @@ void MDCache::anchor_destroy(CInode *in, Context *onfinish) if (!in->can_auth_pin()/* && !mdr->is_auth_pinned(in)*/) { dout(7) << "anchor_destroy not authpinnable, waiting on " << *in << dendl; - 
in->add_waiter(CInode::WAIT_AUTHPINNABLE, onfinish); + in->add_waiter(CInode::WAIT_UNFREEZE, onfinish); return; } @@ -4600,12 +4596,12 @@ class C_MDC_AnchorDestroyLogged : public Context { MDCache *cache; CInode *in; version_t atid; - version_t pdv; + LogSegment *ls; public: - C_MDC_AnchorDestroyLogged(MDCache *c, CInode *i, version_t t, version_t v) : - cache(c), in(i), atid(t), pdv(v) {} + C_MDC_AnchorDestroyLogged(MDCache *c, CInode *i, version_t t, LogSegment *l) : + cache(c), in(i), atid(t), ls(l) {} void finish(int r) { - cache->_anchor_destroy_logged(in, atid, pdv); + cache->_anchor_destroy_logged(in, atid, ls); } }; @@ -4615,28 +4611,23 @@ void MDCache::_anchor_destroy_prepared(CInode *in, version_t atid) assert(in->inode.anchored == true); - // predirty, prepare log entry - version_t pdv = in->pre_dirty(); - - EUpdate *le = new EUpdate(mds->mdlog, "anchor_destroy"); - le->metablob.add_dir_context(in->get_parent_dir()); - // update the logged inode copy - inode_t *pi = le->metablob.add_dentry(in->parent, true); + inode_t *pi = in->project_inode(); pi->anchored = true; - pi->version = pdv; - - // note anchor transaction - le->metablob.add_anchor_transaction(atid); + pi->version = in->pre_dirty(); // log + wait - mds->mdlog->submit_entry(le, new C_MDC_AnchorDestroyLogged(this, in, atid, pdv)); + EUpdate *le = new EUpdate(mds->mdlog, "anchor_destroy"); + le->metablob.add_dir_context(in->get_parent_dir()); + le->metablob.add_primary_dentry(in->parent, true, 0, pi); + le->metablob.add_anchor_transaction(atid); + mds->mdlog->submit_entry(le, new C_MDC_AnchorDestroyLogged(this, in, atid, mds->mdlog->get_current_segment())); } -void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, version_t pdv) +void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls) { - dout(10) << "_anchor_destroy_logged pdv " << pdv << " on " << *in << dendl; + dout(10) << "_anchor_destroy_logged on " << *in << dendl; // unpin 
assert(in->state_test(CInode::STATE_UNANCHORING)); @@ -4645,11 +4636,10 @@ void MDCache::_anchor_destroy_logged(CInode *in, version_t atid, version_t pdv) in->auth_unpin(); // apply update to cache - in->inode.anchored = false; - in->inode.version = pdv; - + in->pop_and_dirty_projected_inode(ls); + // tell the anchortable we've committed - mds->anchorclient->commit(atid); + mds->anchorclient->commit(atid, ls); // trigger waiters in->finish_waiting(CInode::WAIT_UNANCHORED, 0); @@ -4696,10 +4686,12 @@ class C_MDC_PurgeStray : public Context { MDCache *cache; CDentry *dn; version_t pdv; + LogSegment *ls; public: - C_MDC_PurgeStray(MDCache *c, CDentry *d, version_t v) : cache(c), dn(d), pdv(v) { } + C_MDC_PurgeStray(MDCache *c, CDentry *d, version_t v, LogSegment *s) : + cache(c), dn(d), pdv(v), ls(s) { } void finish(int r) { - cache->_purge_stray_logged(dn, pdv); + cache->_purge_stray_logged(dn, pdv, ls); } }; @@ -4714,22 +4706,25 @@ void MDCache::_purge_stray(CDentry *dn) EUpdate *le = new EUpdate(mds->mdlog, "purge_stray"); le->metablob.add_dir_context(dn->dir); le->metablob.add_null_dentry(dn, true); - le->metablob.add_inode_truncate(dn->inode->inode, 0); - mds->mdlog->submit_entry(le, new C_MDC_PurgeStray(this, dn, pdv)); + le->metablob.add_inode_truncate(dn->inode->ino(), 0, dn->inode->inode.size); + + mds->mdlog->submit_entry(le, new C_MDC_PurgeStray(this, dn, pdv, mds->mdlog->get_current_segment())); + + } -void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv) +void MDCache::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls) { dout(10) << "_purge_stray_logged " << *dn << " " << *dn->inode << dendl; CInode *in = dn->inode; // dirty+unlink dentry - dn->dir->mark_dirty(pdv); + dn->dir->mark_dirty(pdv, ls); dn->dir->unlink_inode(dn); dn->dir->remove_dentry(dn); // purge+remove inode - purge_inode(&in->inode, 0); + purge_inode(in, 0, in->inode.size, ls); remove_inode(in); } @@ -4751,7 +4746,191 @@ void MDCache::migrate_stray(CDentry *dn, int 
dest) -// REPLICAS +// ======================================================================================== +// DISCOVER +/* + + - for all discovers (except base_inos, e.g. root, stray), waiters are attached + to the parent metadata object in the cache (pinning it). + + - the discover is also registered under the per-mds discover_ hashes, so that + waiters can be kicked in the event of a failure. that is, every discover will + be followed by a reply, unless the remote node fails.. + + - each discover_reply must reliably decrement the discover_ counts. + + - base_inos are the exception. those waiters are under waiting_for_base_ino. + +*/ + +void MDCache::discover_base_ino(inodeno_t want_ino, + Context *onfinish, + int from) +{ + dout(7) << "discover_base_ino " << want_ino << " from mds" << from << dendl; + if (waiting_for_base_ino[from].count(want_ino) == 0) { + filepath want_path; + MDiscover *dis = new MDiscover(mds->get_nodeid(), + want_ino, + want_path, + false); + mds->send_message_mds(dis, from, MDS_PORT_CACHE); + } + + waiting_for_base_ino[from][want_ino].push_back(onfinish); +} + + +void MDCache::discover_dir_frag(CInode *base, + frag_t approx_fg, + Context *onfinish, + int from) +{ + if (from < 0) from = base->authority().first; + + dout(7) << "discover_dir_frag " << base->ino() << " " << approx_fg + << " from mds" << from << dendl; + + if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative + filepath want_path; + MDiscover *dis = new MDiscover(mds->get_nodeid(), + base->ino(), + want_path, + true); // need the base dir open + dis->set_base_dir_frag(approx_fg); + mds->send_message_mds(dis, from, MDS_PORT_CACHE); + } + + // register + wait + if (onfinish) + base->add_waiter(CInode::WAIT_DIR, onfinish); + discover_dir[from][base->ino()]++; +} + +void MDCache::discover_path(CInode *base, + filepath want_path, + Context *onfinish, + bool want_xlocked, + int from) +{ + if (from < 0) from = base->authority().first; + + 
dout(7) << "discover_path " << base->ino() << " " << want_path << " from mds" << from + << (want_xlocked ? " want_xlocked":"") + << dendl; + + if (base->is_ambiguous_auth()) { + dout(10) << " waiting for single auth on " << *base << dendl; + base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish); + return; + } + + if (!base->is_waiter_for(CInode::WAIT_DIR) || !onfinish) { // this is overly conservative + MDiscover *dis = new MDiscover(mds->get_nodeid(), + base->ino(), + want_path, + true, // we want the base dir; we are relative to ino. + want_xlocked); + mds->send_message_mds(dis, from, MDS_PORT_CACHE); + } + + // register + wait + if (onfinish) base->add_waiter(CInode::WAIT_DIR, onfinish); + discover_dir[from][base->ino()]++; +} + +void MDCache::discover_path(CDir *base, + filepath want_path, + Context *onfinish, + bool want_xlocked) +{ + int from = base->authority().first; + + dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " from mds" << from + << (want_xlocked ? " want_xlocked":"") + << dendl; + + if (base->is_ambiguous_auth()) { + dout(7) << " waiting for single auth on " << *base << dendl; + base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish); + return; + } + + if (!base->is_waiting_for_dentry(want_path[0]) || !onfinish) { + MDiscover *dis = new MDiscover(mds->get_nodeid(), + base->ino(), + want_path, + false, // no base dir; we are relative to dir + want_xlocked); + mds->send_message_mds(dis, from, MDS_PORT_CACHE); + } + + // register + wait + if (onfinish) base->add_dentry_waiter(want_path[0], onfinish); + discover_dir_sub[from][base->dirfrag()]++; +} + +void MDCache::discover_ino(CDir *base, + inodeno_t want_ino, + Context *onfinish, + bool want_xlocked) +{ + int from = base->authority().first; + + dout(7) << "discover_ino " << base->dirfrag() << " " << want_ino << " from mds" << from + << (want_xlocked ? 
" want_xlocked":"") + << dendl; + + if (!base->is_waiting_for_ino(want_ino)) { + MDiscover *dis = new MDiscover(mds->get_nodeid(), + base->dirfrag(), + want_ino, + want_xlocked); + mds->send_message_mds(dis, from, MDS_PORT_CACHE); + } + + // register + wait + base->add_ino_waiter(want_ino, onfinish); + discover_dir_sub[from][base->dirfrag()]++; +} + + + +void MDCache::kick_discovers(int who) +{ + list waiters; + + for (hash_map >::iterator p = waiting_for_base_ino[who].begin(); + p != waiting_for_base_ino[who].end(); + ++p) { + dout(10) << "kick_discovers on base ino " << p->first << dendl; + mds->queue_waiters(p->second); + } + waiting_for_base_ino.erase(who); + + for (hash_map::iterator p = discover_dir[who].begin(); + p != discover_dir[who].end(); + ++p) { + CInode *in = get_inode(p->first); + if (!in) continue; + dout(10) << "kick_discovers dir waiters on " << *in << dendl; + in->take_waiting(CInode::WAIT_DIR, waiters); + } + discover_dir.erase(who); + + for (hash_map::iterator p = discover_dir_sub[who].begin(); + p != discover_dir_sub[who].end(); + ++p) { + CDir *dir = get_dirfrag(p->first); + if (!dir) continue; + dout(10) << "kick_discovers dentry+ino waiters on " << *dir << dendl; + dir->take_sub_waiting(waiters); + } + discover_dir_sub.erase(who); + + mds->queue_waiters(waiters); +} + void MDCache::handle_discover(MDiscover *dis) @@ -4770,7 +4949,7 @@ void MDCache::handle_discover(MDiscover *dis) CInode *cur = 0; - MDiscoverReply *reply = new MDiscoverReply(dis->get_base_ino()); + MDiscoverReply *reply = new MDiscoverReply(dis); // get started. 
if (dis->get_base_ino() == MDS_INO_ROOT) { @@ -4804,31 +4983,29 @@ void MDCache::handle_discover(MDiscover *dis) if (!cur) { dout(7) << "handle_discover mds" << dis->get_asker() << " don't have base ino " << dis->get_base_ino() - << ", dropping" << dendl; - delete reply; - return; + << dendl; + reply->set_flag_error_dir(); } if (dis->wants_base_dir()) { dout(7) << "handle_discover mds" << dis->get_asker() - << " has " << *cur << " wants basedir+" << dis->get_want().get_path() + << " has " << *cur << dendl; } else { dout(7) << "handle_discover mds" << dis->get_asker() - << " has " << *cur << " wants " << dis->get_want().get_path() + << " has " << *cur << dendl; } } assert(reply); - assert(cur); // add content // do some fidgeting to include a dir if they asked for the base dir, or just root. for (unsigned i = 0; - i < dis->get_want().depth() || dis->get_want().depth() == 0; + cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0); i++) { // -- figure out the dir @@ -4848,24 +5025,41 @@ void MDCache::handle_discover(MDiscover *dis) } else { // requester explicity specified the frag fg = dis->get_base_dir_frag(); - assert(dis->wants_base_dir() || dis->get_base_ino() < MDS_INO_BASE); + assert(dis->wants_base_dir() || dis->get_want_ino() || dis->get_base_ino() < MDS_INO_BASE); } CDir *curdir = cur->get_dirfrag(fg); if ((!curdir && !cur->is_auth()) || (curdir && !curdir->is_auth())) { - if (curdir) { - dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl; - reply->set_dir_auth_hint(curdir->authority().first); - } else { - dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " << *cur << dendl; - reply->set_dir_auth_hint(cur->authority().first); + + /* before: + * ONLY set flag if empty!! 
+ * otherwise requester will wake up waiter(s) _and_ continue with discover, + * resulting in duplicate discovers in flight, + * which can wreak havoc when discovering rename srcdn (which may move) + */ + + if (reply->is_empty()) { + // only hint if empty. + // someday this could be better, but right now the waiter logic isn't smart enough. + + // hint + if (curdir) { + dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl; + reply->set_dir_auth_hint(curdir->authority().first); + } else { + dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " + << *cur << dendl; + reply->set_dir_auth_hint(cur->authority().first); + } + + // note error dentry, if any + // NOTE: important, as it allows requester to issue an equivalent discover + // to whomever we hint at. + if (dis->get_want().depth() > i) + reply->set_error_dentry(dis->get_dentry(i)); } - reply->set_wanted_xlocks_hint(dis->wants_xlocked()); - - // set hint (+ dentry, if there is one) - if (dis->get_want().depth() > i) - reply->set_error_dentry(dis->get_dentry(i)); + break; } @@ -4896,19 +5090,20 @@ void MDCache::handle_discover(MDiscover *dis) reply->add_dir( curdir->replicate_to(dis->get_asker()) ); dout(7) << "handle_discover added dir " << *curdir << dendl; } - if (dis->get_want().depth() == 0) break; - - // lookup inode? + + // lookup CDentry *dn = 0; if (dis->get_want_ino()) { + // lookup by ino CInode *in = get_inode(dis->get_want_ino()); if (in && in->is_auth() && in->get_parent_dn()->get_dir() == curdir) dn = in->get_parent_dn(); - } else { + } else if (dis->get_want().depth() > 0) { // lookup dentry dn = curdir->lookup( dis->get_dentry(i) ); - } - + } else + break; // done! + // incomplete dir? if (!dn) { if (!curdir->is_complete()) { @@ -4957,6 +5152,20 @@ void MDCache::handle_discover(MDiscover *dis) } } + // frozen inode? 
+ if (dn->is_primary() && + dn->inode->is_frozen()) { + if (reply->is_empty()) { + dout(7) << *dn->inode << " is frozen, empty reply, waiting" << dendl; + dn->inode->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); + delete reply; + return; + } else { + dout(7) << *dn->inode << " is frozen, non-empty reply, stopping" << dendl; + break; + } + } + // add dentry reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); dout(7) << "handle_discover added dentry " << *dn << dendl; @@ -4976,15 +5185,10 @@ void MDCache::handle_discover(MDiscover *dis) } // how did we do? - if (reply->is_empty()) { - dout(7) << "handle_discover dropping this empty reply)." << dendl; - delete reply; - } else { - dout(7) << "handle_discover sending result back to asker mds" << dis->get_asker() << dendl; - mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE); - } + assert(!reply->is_empty()); + dout(7) << "handle_discover sending result back to asker mds" << dis->get_asker() << dendl; + mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE); - // done. delete dis; } @@ -4999,49 +5203,42 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) } */ - // starting point list finished, error; - - // grab base inode + int from = m->get_source().num(); + + // starting point CInode *cur = get_inode(m->get_base_ino()); - - if (cur) { - dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << dendl; - } - else if (m->get_base_ino() == MDS_INO_ROOT) { - // it's the root inode. 
- assert(!root); + + if (m->has_base_inode()) { + assert(m->get_base_ino() < MDS_INO_BASE); assert(!m->has_base_dentry()); assert(!m->has_base_dir()); - - dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << dendl; - - // add in root - cur = add_replica_inode(m->get_inode(0), NULL); - cur->force_auth = pair(m->get_source().num(), CDIR_AUTH_UNKNOWN); - set_root(cur); - dout(7) << "discover_reply got root " << *cur << dendl; - - // take root waiters - finished.swap(waiting_for_root); - } - else if (MDS_INO_IS_STRAY(m->get_base_ino())) { - dout(7) << "discover_reply stray + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << dendl; - - // add - cur = add_replica_inode(m->get_inode(0), NULL); + + // add base inode + cur = add_replica_inode(m->get_inode(0), NULL, finished); cur->force_auth = pair(m->get_source().num(), CDIR_AUTH_UNKNOWN); - dout(7) << "discover_reply got stray " << *cur << dendl; + dout(7) << "discover_reply got base inode " << *cur << dendl; // take waiters - finished.swap(waiting_for_stray[cur->ino()]); - waiting_for_stray.erase(cur->ino()); + finished.swap(waiting_for_base_ino[from][cur->ino()]); + waiting_for_base_ino[from].erase(cur->ino()); } + assert(cur); + dout(7) << "discover_reply " << *cur + << " + " << m->get_num_dentries() << " dn, " + << m->get_num_inodes() << " inodes" + << dendl; + // fyi - if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << dendl; - if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl; + if (m->is_flag_error_dir()) + dout(7) << " flag error, dir" << dendl; + if (m->is_flag_error_dn()) + dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl; + if (m->is_flag_error_ino()) + dout(7) << " flag error, ino = " << m->get_wanted_ino() << dendl; + dout(10) << "depth = " << m->get_depth() << ", has base_dir/base_dn/root = " << m->has_base_dir() << " / " << m->has_base_dentry() << " / " << 
m->has_base_inode() @@ -5049,10 +5246,22 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) << m->get_num_dirs() << " / " << m->get_num_dentries() << " / " << m->get_num_inodes() << dendl; + // decrement discover counters + if (m->get_wanted_base_dir()) { + inodeno_t ino = m->get_base_ino(); + assert(discover_dir[from].count(ino)); + if (--discover_dir[from][ino] == 0) + discover_dir[from].erase(ino); + } else if (m->get_base_ino() >= MDS_INO_BASE) { + dirfrag_t df(m->get_base_ino(), m->get_base_dir_frag()); + assert(discover_dir_sub[from].count(df)); + if (--discover_dir_sub[from][df] == 0) + discover_dir_sub[from].erase(df); + } + // loop over discover results. - // indexese follow each ([[dir] dentry] inode) + // indexes follow each ([[dir] dentry] inode) // can start, end with any type. - for (int i=m->has_base_inode(); iget_depth(); i++) { dout(10) << "discover_reply i=" << i << " cur " << *cur << dendl; @@ -5062,8 +5271,6 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) if (i > 0 || m->has_base_dir()) { assert(m->get_dir(i).get_dirfrag().ino == cur->ino()); fg = m->get_dir(i).get_dirfrag().frag; - - // add/update the dir replica curdir = add_replica_dir(cur, fg, m->get_dir(i), m->get_source().num(), finished); @@ -5074,18 +5281,23 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) } // dentry error? - if (i == m->get_depth()-1 && - m->is_flag_error_dn()) { + if (i == m->get_depth()-1 && (m->is_flag_error_dn() || m->is_flag_error_ino())) { // error! assert(cur->is_dir()); if (curdir) { - dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" 
<< dendl; - curdir->take_dentry_waiting(m->get_error_dentry(), - error); + if (m->get_error_dentry().length()) { + dout(7) << " flag_error on dentry " << m->get_error_dentry() + << ", triggering dentry" << dendl; + curdir->take_dentry_waiting(m->get_error_dentry(), error); + } else { + dout(7) << " flag_error on ino " << m->get_wanted_ino() + << ", triggering ino" << dendl; + curdir->take_ino_waiting(m->get_wanted_ino(), error); + } } else { - dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << dendl; + dout(7) << " flag_error on dentry " << m->get_error_dentry() + << ", triggering dir?" << dendl; cur->take_waiting(CInode::WAIT_DIR, error); - dir_discovers.erase(cur->ino()); } break; } @@ -5095,49 +5307,58 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) // dentry CDentry *dn = 0; if (i >= m->get_last_dentry()) break; - if (i > 0 || m->has_base_dentry()) { + if (i > 0 || m->has_base_dentry()) dn = add_replica_dentry(curdir, m->get_dentry(i), finished); - } // inode if (i >= m->get_last_inode()) break; - cur = add_replica_inode(m->get_inode(i), dn); + cur = add_replica_inode(m->get_inode(i), dn, finished); } - // dir_auth hint? - if (m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN && - m->get_dir_auth_hint() != mds->get_nodeid()) { - dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl; - - // try again. include dentry _and_ dirfrag, just in case. - int hint = m->get_dir_auth_hint(); - filepath want; - want.push_dentry(m->get_error_dentry()); - MDiscover *dis = new MDiscover(mds->get_nodeid(), - cur->ino(), - want, - true, - m->get_wanted_xlocks_hint()); - frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); - dis->set_base_dir_frag(fg); - mds->send_message_mds(dis, hint, MDS_PORT_CACHE); - - // note the dangling discover... but only if it's already noted in dir_discovers (i.e. 
someone is waiting) - if (dir_discovers.count(cur->ino())) { - dir_discovers[cur->ino()].insert(hint); - assert(cur->is_waiter_for(CInode::WAIT_DIR)); - } - } - else if (m->is_flag_error_dir()) { - // dir error at the end there? - dout(7) << " flag_error on dir " << *cur << dendl; - assert(!cur->is_dir()); + // dir error? + // or dir_auth hint? + if (m->is_flag_error_dir() && !cur->is_dir()) { + // not a dir. cur->take_waiting(CInode::WAIT_DIR, error); - dir_discovers.erase(cur->ino()); + } else if (m->is_flag_error_dir() || + (m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN && + m->get_dir_auth_hint() != mds->get_nodeid())) { + int who = m->get_dir_auth_hint(); + if (who == mds->get_nodeid()) who = -1; + if (who >= 0) + dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl; + + // try again? + if (m->get_error_dentry().length()) { + // wanted a dentry + frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); + CDir *dir = cur->get_dirfrag(fg); + if (dir) { + // don't actaully need the hint, now + if (dir->lookup(m->get_error_dentry()) == 0 && + dir->is_waiting_for_dentry(m->get_error_dentry())) + discover_path(dir, m->get_error_dentry(), 0, m->get_wanted_xlocked()); + else + dout(7) << " doing nothing, have dir but nobody is waiting on dentry " + << m->get_error_dentry() << dendl; + } else { + if (cur->is_waiter_for(CInode::WAIT_DIR)) + discover_path(cur, m->get_error_dentry(), 0, m->get_wanted_xlocked(), who); + else + dout(7) << " doing nothing, nobody is waiting for dir" << dendl; + } + } else { + // wanted just the dir + frag_t fg = m->get_base_dir_frag(); + if (cur->get_dirfrag(fg) == 0 && cur->is_waiter_for(CInode::WAIT_DIR)) + discover_dir_frag(cur, fg, 0, who); + else + dout(7) << " doing nothing, nobody is waiting for dir" << dendl; + } } - - // finish errors directly - finish_contexts(error, -ENOENT); + + // waiters + finish_contexts(error, -ENOENT); // finish errors directly mds->queue_waiters(finished); // done @@ -5145,6 +5366,10 @@ void 
MDCache::handle_discover_reply(MDiscoverReply *m) } + +// ---------------------------- +// REPLICAS + CDir *MDCache::add_replica_dir(CInode *diri, frag_t fg, CDirDiscover &dis, int from, list& finished) @@ -5178,7 +5403,6 @@ CDir *MDCache::add_replica_dir(CInode *diri, // get waiters diri->take_waiting(CInode::WAIT_DIR, finished); - dir_discovers.erase(diri->ino()); } return dir; @@ -5228,7 +5452,7 @@ CDentry *MDCache::add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished) { CInode *in = get_inode(dis.get_ino()); if (!in) { @@ -5247,6 +5471,8 @@ CInode *MDCache::add_replica_inode(CInodeDiscover& dis, CDentry *dn) if (dn) { assert(dn->is_primary()); assert(dn->inode == in); + + dn->get_dir()->take_ino_waiting(in->ino(), finished); } return in; @@ -5261,7 +5487,7 @@ CDentry *MDCache::add_replica_stray(bufferlist &bl, CInode *in, int from) // inode CInodeDiscover indis; indis._decode(bl, off); - CInode *strayin = add_replica_inode(indis, NULL); + CInode *strayin = add_replica_inode(indis, NULL, finished); strayin->force_auth = pair(from, CDIR_AUTH_UNKNOWN); dout(15) << "strayin " << *strayin << dendl; @@ -5444,7 +5670,7 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m) CDentry *straydn = 0; if (m->strayin) { list finished; - CInode *in = add_replica_inode(*m->strayin, NULL); + CInode *in = add_replica_inode(*m->strayin, NULL, finished); CDir *dir = add_replica_dir(in, m->straydir->get_dirfrag().frag, *m->straydir, m->get_source().num(), finished); straydn = add_replica_dentry(dir, *m->straydn, finished); @@ -5604,7 +5830,7 @@ void MDCache::split_dir(CDir *dir, int bits) */ void MDCache::fragment_freeze(CInode *diri, list& frags, frag_t basefrag, int bits) { - C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits)); + C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits)); // freeze the dirs for (list::iterator p = frags.begin(); @@ -5612,7 +5838,9 @@ void 
MDCache::fragment_freeze(CInode *diri, list& frags, frag_t basefrag, ++p) { CDir *dir = *p; dir->auth_pin(); // this will block the freeze - dir->freeze_dir(gather->new_sub()); + dir->freeze_dir(); + assert(dir->is_freezing_dir()); + dir->add_waiter(CDir::WAIT_FROZEN, gather->new_sub()); } } @@ -5715,16 +5943,19 @@ class C_MDC_FragmentLogged : public Context { int bits; list resultfrags; vector pvs; + LogSegment *ls; public: C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b, - list& rf, vector& p) : - mdcache(m), diri(di), basefrag(bf), bits(b) { + list& rf, vector& p, + LogSegment *s) : + mdcache(m), diri(di), basefrag(bf), bits(b), ls(s) { resultfrags.swap(rf); pvs.swap(p); } virtual void finish(int r) { mdcache->fragment_logged(diri, basefrag, bits, - resultfrags, pvs); + resultfrags, pvs, + ls); } }; @@ -5761,7 +5992,7 @@ void MDCache::fragment_stored(CInode *diri, frag_t basefrag, int bits, mds->mdlog->submit_entry(le, new C_MDC_FragmentLogged(this, diri, basefrag, bits, - resultfrags, pvs)); + resultfrags, pvs, mds->mdlog->get_current_segment())); // announcelist& resultfrags, for (set::iterator p = peers.begin(); @@ -5782,7 +6013,8 @@ void MDCache::fragment_stored(CInode *diri, frag_t basefrag, int bits, void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, - vector& pvs) + vector& pvs, + LogSegment *ls) { dout(10) << "fragment_logged " << basefrag << " bits " << bits << " on " << *diri << dendl; @@ -5799,7 +6031,7 @@ void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, // dirty, unpin, unfreeze dir->state_clear(CDir::STATE_FRAGMENTING); - dir->mark_dirty(*pv); + dir->mark_dirty(*pv, ls); pv++; for (CDir::map_t::iterator p = dir->items.begin(); @@ -5859,15 +6091,18 @@ void MDCache::show_subtrees(int dbl) return; // i won't print anything. 
if (subtrees.empty()) { - dout(dbl) << "no subtrees" << dendl; + dout(dbl) << "show_subtrees - no subtrees" << dendl; return; } // root frags - list rootfrags; - if (root) root->get_dirfrags(rootfrags); - if (stray) stray->get_dirfrags(rootfrags); - dout(15) << "rootfrags " << rootfrags << dendl; + list basefrags; + for (set::iterator p = base_inodes.begin(); + p != base_inodes.end(); + ++p) + (*p)->get_dirfrags(basefrags); + //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl; + dout(15) << "show_subtrees" << dendl; // queue stuff list > q; @@ -5875,9 +6110,11 @@ void MDCache::show_subtrees(int dbl) set seen; // calc max depth - for (list::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p) + for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) q.push_back(pair(*p, 0)); + set subtrees_seen; + int depth = 0; while (!q.empty()) { CDir *dir = q.front().first; @@ -5886,6 +6123,8 @@ void MDCache::show_subtrees(int dbl) if (subtrees.count(dir) == 0) continue; + subtrees_seen.insert(dir); + if (d > depth) depth = d; // sanity check @@ -5907,7 +6146,7 @@ void MDCache::show_subtrees(int dbl) // print tree - for (list::iterator p = rootfrags.begin(); p != rootfrags.end(); ++p) + for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) q.push_back(pair(*p, 0)); while (!q.empty()) { @@ -5962,6 +6201,17 @@ void MDCache::show_subtrees(int dbl) q.push_front(pair(*p, d+2)); } } + + // verify there isn't stray crap in subtree map + int lost = 0; + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + if (subtrees_seen.count(p->first)) continue; + dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl; + lost++; + } + assert(lost == 0); } diff --git a/trunk/ceph/mds/MDCache.h b/trunk/ceph/mds/MDCache.h index 10073e3e19a95..86e3b894c6c8d 100644 --- a/trunk/ceph/mds/MDCache.h +++ b/trunk/ceph/mds/MDCache.h @@ -57,6 +57,9 @@ class MMDSSlaveRequest; class MMDSFragmentNotify; +class 
ESubtreeMap; + + // MDCache //typedef const char* pchar; @@ -80,8 +83,6 @@ struct MDRequest { // -- i am a client (master) request MClientRequest *client_request; // client request (if any) - set slaves; // mds nodes that have slave requests to me (implies client_request) - set waiting_on_slave; // peers i'm waiting for slavereq replies from. vector trace; // original path traversal. CInode *ref; // reference inode. if there is only one, and its path is pinned. @@ -90,12 +91,17 @@ struct MDRequest { MMDSSlaveRequest *slave_request; // slave request (if one is pending; implies slave == true) int slave_to_mds; // this is a slave request if >= 0. + // -- misc -- + LogSegment *ls; // the log segment i'm committing to + utime_t now; + // -- my pins and locks -- // cache pins (so things don't expire) set< MDSCacheObject* > pins; set stickydirs; // auth pins + set< MDSCacheObject* > remote_auth_pins; set< MDSCacheObject* > auth_pins; // held locks @@ -110,48 +116,67 @@ struct MDRequest { bool committing; bool aborted; - // for rename/link/unlink - utime_t now; - set witnessed; // nodes who have journaled a RenamePrepare - map pvmap; - - // for rename - set extra_witnesses; // replica list from srcdn auth (rename) - version_t src_reanchor_atid; // src->dst - version_t dst_reanchor_atid; // dst->stray - bufferlist inode_import; - version_t inode_import_v; - CDentry *srcdn; // srcdn, if auth, on slave - - // called when slave commits - Context *slave_commit; + struct More { + set slaves; // mds nodes that have slave requests to me (implies client_request) + set waiting_on_slave; // peers i'm waiting for slavereq replies from. 
+ + // for rename/link/unlink + set witnessed; // nodes who have journaled a RenamePrepare + map pvmap; + + // for rename + set extra_witnesses; // replica list from srcdn auth (rename) + version_t src_reanchor_atid; // src->dst + version_t dst_reanchor_atid; // dst->stray + bufferlist inode_import; + version_t inode_import_v; + CInode* destdn_was_remote_inode; + bool was_link_merge; + + // called when slave commits or aborts + Context *slave_commit; + + More() : + src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), + destdn_was_remote_inode(0), was_link_merge(false), + slave_commit(0) { } + } *_more; // --------------------------------------------------- MDRequest() : client_request(0), ref(0), slave_request(0), slave_to_mds(-1), + ls(0), done_locking(false), committing(false), aborted(false), - src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - slave_commit(0) { } + _more(0) {} MDRequest(metareqid_t ri, MClientRequest *req) : reqid(ri), client_request(req), ref(0), slave_request(0), slave_to_mds(-1), + ls(0), done_locking(false), committing(false), aborted(false), - src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - slave_commit(0) { } + _more(0) {} MDRequest(metareqid_t ri, int by) : reqid(ri), client_request(0), ref(0), slave_request(0), slave_to_mds(by), + ls(0), done_locking(false), committing(false), aborted(false), - src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - slave_commit(0) { } + _more(0) {} + ~MDRequest() { + delete _more; + } bool is_master() { return slave_to_mds < 0; } bool is_slave() { return slave_to_mds >= 0; } - bool slave_did_prepare() { return slave_commit; } + More* more() { + if (!_more) _more = new More(); + return _more; + } + + bool slave_did_prepare() { return more()->slave_commit; } + // pin items in cache void pin(MDSCacheObject *o) { if (pins.count(o) == 0) { @@ -168,7 +193,7 @@ struct MDRequest { // auth pins bool is_auth_pinned(MDSCacheObject *object) { - return 
auth_pins.count(object); + return auth_pins.count(object) || remote_auth_pins.count(object); } void auth_pin(MDSCacheObject *object) { if (!is_auth_pinned(object)) { @@ -176,15 +201,17 @@ struct MDRequest { auth_pins.insert(object); } } + void auth_unpin(MDSCacheObject *object) { + assert(is_auth_pinned(object)); + object->auth_unpin(); + auth_pins.erase(object); + } void drop_local_auth_pins() { - set::iterator it = auth_pins.begin(); - while (it != auth_pins.end()) { - if ((*it)->is_auth()) { - (*it)->auth_unpin(); - auth_pins.erase(it++); - } else { - it++; - } + for (set::iterator it = auth_pins.begin(); + it != auth_pins.end(); + it++) { + assert((*it)->is_auth()); + (*it)->auth_unpin(); } auth_pins.clear(); } @@ -201,22 +228,61 @@ inline ostream& operator<<(ostream& out, MDRequest &mdr) return out; } +struct MDSlaveUpdate { + EMetaBlob commit; + EMetaBlob rollback; + xlist::item xlistitem; + Context *waiter; + MDSlaveUpdate() : xlistitem(this), waiter(0) {} + MDSlaveUpdate(EMetaBlob c, EMetaBlob r, xlist &list) : + commit(c), rollback(r), + xlistitem(this), + waiter(0) { + list.push_back(&xlistitem); + } + ~MDSlaveUpdate() { + if (waiter) waiter->finish(0); + delete waiter; + } +}; + + class MDCache { public: // my master MDS *mds; - LRU lru; // dentry lru for expiring items from cache - + // -- my cache -- + LRU lru; // dentry lru for expiring items from cache protected: - // the cache - CInode *root; // root inode - hash_map inode_map; // map of inodes by ino - CInode *stray; // my stray dir + hash_map inode_map; // map of inodes by ino + CInode *root; // root inode + CInode *stray; // my stray dir + + set base_inodes; // inodes < MDS_INO_BASE (root, stray, etc.) + + // -- discover -- + // waiters + map > > waiting_for_base_ino; + + // in process discovers, by mds. + // this is just enough info to kick any waiters in the event of a failure. + // FIXME: use pointers here instead of identifiers? 
+ map > discover_dir; + map > discover_dir_sub; + + void discover_base_ino(inodeno_t want_ino, Context *onfinish, int from=-1); + void discover_dir_frag(CInode *base, frag_t approx_fg, Context *onfinish, + int from=-1); + void discover_path(CInode *base, filepath want_path, Context *onfinish, + bool want_xlocked=false, int from=-1); + void discover_path(CDir *base, filepath want_path, Context *onfinish, + bool want_xlocked=false); + void discover_ino(CDir *base, inodeno_t want_ino, Context *onfinish, + bool want_xlocked=false); + + void kick_discovers(int who); // after a failure. - // root - list waiting_for_root; - map > waiting_for_stray; public: int get_num_inodes() { return inode_map.size(); } @@ -247,10 +313,9 @@ public: adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); } void map_dirfrag_set(list& dfs, set& result); - void adjust_export_state(CDir *dir); void try_subtree_merge(CDir *root); void try_subtree_merge_at(CDir *root); - void subtree_merge_writebehind_finish(CInode *in); + void subtree_merge_writebehind_finish(CInode *in, LogSegment *ls); void eval_subtree_root(CDir *dir); CDir *get_subtree_root(CDir *dir); void remove_subtree(CDir *dir); @@ -273,17 +338,11 @@ protected: // delayed cache expire map > delayed_expire; // subtree root -> expire msg - // -- discover -- - hash_map > dir_discovers; // dirino -> mds set i'm trying to discover. 
- // -- requests -- -public: - - protected: hash_map active_requests; - + public: MDRequest* request_start(MClientRequest *req); MDRequest* request_start_slave(metareqid_t rid, int by); @@ -300,14 +359,10 @@ public: // inode purging - map > purging; - map > > waiting_for_purge; + map > purging; // inode -> newsize -> oldsize + map > purging_ls; + map > > waiting_for_purge; - // shutdown crap - int shutdown_commits; - bool did_shutdown_log_cap; - friend class C_MDC_ShutdownCommit; - // -- recovery -- protected: set recovery_set; @@ -324,7 +379,7 @@ protected: // from MMDSResolves map > > other_ambiguous_imports; - map > uncommitted_slave_updates; // for replay. + map > uncommitted_slave_updates; // for replay. map ambiguous_slave_updates; // for log trimming. map waiting_for_slave_update_commit; friend class ESlaveUpdate; @@ -351,12 +406,14 @@ public: void send_resolve_now(int who); void send_resolve_later(int who); void maybe_send_pending_resolves(); - void log_subtree_map(Context *onsync=0); - void _logged_subtree_map(off_t off); + + ESubtreeMap *create_subtree_map(); + protected: // [rejoin] set rejoin_gather; // nodes from whom i need a rejoin + set rejoin_sent; // nodes i sent a rejoin to set rejoin_ack_gather; // nodes from whom i need a rejoin ack map > cap_exports; // ino -> client -> capex @@ -438,6 +495,8 @@ public: bool shutdown_pass(); bool shutdown(); // clear cache (ie at shutodwn) + bool did_shutdown_log_cap; + // inode_map bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? 
true:false; } CInode* get_inode( inodeno_t ino ) { @@ -489,19 +548,19 @@ public: public: // inode purging - void purge_inode(inode_t *inode, off_t newsize); - void _do_purge_inode(inode_t *inode, off_t newsize); - void purge_inode_finish(inodeno_t ino, off_t newsize); - void purge_inode_finish_2(inodeno_t ino, off_t newsize); - bool is_purging(inodeno_t ino, off_t newsize) { - return purging.count(ino) && purging[ino].count(newsize); + void purge_inode(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls); + void _do_purge_inode(CInode *in, off_t newsize, off_t oldsize); + void purge_inode_finish(CInode *in, off_t newsize, off_t oldsize); + void purge_inode_finish_2(CInode *in, off_t newsize, off_t oldsize); + bool is_purging(CInode *in, off_t newsize, off_t oldsize) { + return purging.count(in) && purging[in].count(newsize); } - void wait_for_purge(inodeno_t ino, off_t newsize, Context *c) { - waiting_for_purge[ino][newsize].push_back(c); + void wait_for_purge(CInode *in, off_t newsize, Context *c) { + waiting_for_purge[in][newsize].push_back(c); } - void add_recovered_purge(const inode_t& inode, off_t newsize); - void remove_recovered_purge(inodeno_t ino, off_t newsize); + void add_recovered_purge(CInode *in, off_t newsize, off_t oldsize, LogSegment *ls); + void remove_recovered_purge(CInode *in, off_t newsize, off_t oldsize); void start_recovered_purges(); @@ -538,8 +597,7 @@ public: vector& anchortrace, Context *onfinish); - bool parallel_fetch(map& pathmap, - Context *c); + C_Gather *parallel_fetch(map& pathmap); void make_trace(vector& trace, CInode *in); @@ -549,9 +607,9 @@ public: void anchor_destroy(CInode *in, Context *onfinish); protected: void _anchor_create_prepared(CInode *in, version_t atid); - void _anchor_create_logged(CInode *in, version_t atid, version_t pdv); + void _anchor_create_logged(CInode *in, version_t atid, LogSegment *ls); void _anchor_destroy_prepared(CInode *in, version_t atid); - void _anchor_destroy_logged(CInode *in, version_t 
atid, version_t pdv); + void _anchor_destroy_logged(CInode *in, version_t atid, LogSegment *ls); friend class C_MDC_AnchorCreatePrepared; friend class C_MDC_AnchorCreateLogged; @@ -563,7 +621,7 @@ public: void eval_stray(CDentry *dn); protected: void _purge_stray(CDentry *dn); - void _purge_stray_logged(CDentry *dn, version_t pdv); + void _purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls); friend class C_MDC_PurgeStray; void reintegrate_stray(CDentry *dn, CDentry *rlink); void migrate_stray(CDentry *dn, int dest); @@ -585,7 +643,8 @@ protected: CDir* forge_replica_dir(CInode *diri, frag_t fg, int from); CDentry *add_replica_dentry(CDir *dir, CDentryDiscover &dis, list& finished); - CInode *add_replica_inode(CInodeDiscover& dis, CDentry *dn); +public: // for Server::handle_slave_rename_prep + CInode *add_replica_inode(CInodeDiscover& dis, CDentry *dn, list& finished); public: CDentry *add_replica_stray(bufferlist &bl, CInode *strayin, int from); @@ -611,7 +670,7 @@ private: void fragment_mark_and_complete(CInode *diri, list& startfrags, frag_t basefrag, int bits); void fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits); void fragment_stored(CInode *diri, frag_t basefrag, int bits, list& resultfrags); - void fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, vector& pvs); + void fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, vector& pvs, LogSegment *ls); friend class C_MDC_FragmentGo; friend class C_MDC_FragmentMarking; friend class C_MDC_FragmentStored; diff --git a/trunk/ceph/mds/MDLog.cc b/trunk/ceph/mds/MDLog.cc index 11c1f8f835dfc..fc7cdffbe6e10 100644 --- a/trunk/ceph/mds/MDLog.cc +++ b/trunk/ceph/mds/MDLog.cc @@ -22,6 +22,8 @@ #include "common/LogType.h" #include "common/Logger.h" +#include "events/ESubtreeMap.h" + #include "config.h" #define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << 
".log " @@ -50,14 +52,23 @@ void MDLog::reopen_logger(utime_t start, bool append) static bool didit = false; if (!didit) { didit = true; - mdlog_logtype.add_inc("add"); - mdlog_logtype.add_inc("obs"); - mdlog_logtype.add_inc("trims"); - mdlog_logtype.add_inc("trimf"); - mdlog_logtype.add_inc("trimng"); - mdlog_logtype.add_set("size"); - mdlog_logtype.add_set("rdpos"); + mdlog_logtype.add_inc("evadd"); + mdlog_logtype.add_inc("evex"); + mdlog_logtype.add_inc("evtrm"); + mdlog_logtype.add_set("ev"); + mdlog_logtype.add_set("evexg"); + mdlog_logtype.add_set("evexd"); + + mdlog_logtype.add_inc("segadd"); + mdlog_logtype.add_inc("segex"); + mdlog_logtype.add_inc("segtrm"); + mdlog_logtype.add_set("seg"); + mdlog_logtype.add_set("segexg"); + mdlog_logtype.add_set("segexd"); + + mdlog_logtype.add_set("expos"); mdlog_logtype.add_set("wrpos"); + mdlog_logtype.add_avg("jlat"); } @@ -78,13 +89,32 @@ void MDLog::init_journaler() journaler = new Journaler(log_inode, mds->objecter, logger, &mds->mds_lock); } +void MDLog::write_head(Context *c) +{ + journaler->write_head(c); +} + +off_t MDLog::get_read_pos() +{ + return journaler->get_read_pos(); +} + +off_t MDLog::get_write_pos() +{ + return journaler->get_write_pos(); +} + -void MDLog::reset() +void MDLog::create(Context *c) { - dout(5) << "reset to empty log" << dendl; + dout(5) << "create empty log" << dendl; init_journaler(); journaler->reset(); + write_head(c); + + logger->set("expos", journaler->get_expire_pos()); + logger->set("wrpos", journaler->get_write_pos()); } void MDLog::open(Context *c) @@ -92,6 +122,8 @@ void MDLog::open(Context *c) dout(5) << "open discovering log bounds" << dendl; init_journaler(); journaler->recover(c); + + // either append() or replay() will follow. 
} void MDLog::append() @@ -99,77 +131,71 @@ void MDLog::append() dout(5) << "append positioning at end" << dendl; journaler->set_read_pos(journaler->get_write_pos()); journaler->set_expire_pos(journaler->get_write_pos()); -} - -void MDLog::write_head(Context *c) -{ - journaler->write_head(c); -} - -off_t MDLog::get_read_pos() -{ - return journaler->get_read_pos(); + logger->set("expos", journaler->get_write_pos()); } -off_t MDLog::get_write_pos() -{ - return journaler->get_write_pos(); -} +// ------------------------------------------------- void MDLog::submit_entry( LogEvent *le, Context *c ) { - if (g_conf.mds_log) { - dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << dendl; - - // encode it, with event type - { - bufferlist bl; - bl.append((char*)&le->_type, sizeof(le->_type)); - le->encode_payload(bl); - - // journal it. - journaler->append_entry(bl); // bl is destroyed. - } - - assert(!capped); - - delete le; - num_events++; - - if (logger) { - logger->inc("add"); - logger->set("size", num_events); - logger->set("wrpos", journaler->get_write_pos()); - } - - if (c) { - unflushed = 0; - journaler->flush(c); - } - else - unflushed++; - - // should we log a new import_map? - // FIXME: should this go elsewhere? - if (!writing_subtree_map && - (journaler->get_write_pos() / log_inode.layout.period()) != - (get_last_subtree_map_offset() / log_inode.layout.period()) && - (journaler->get_write_pos() - get_last_subtree_map_offset() > log_inode.layout.period()/2)) { - // log import map - dout(10) << "submit_entry also logging subtree map: last = " << get_last_subtree_map_offset() - << ", cur pos = " << journaler->get_write_pos() << dendl; - mds->mdcache->log_subtree_map(); - } - - } else { + if (!g_conf.mds_log) { // hack: log is disabled. 
if (c) { c->finish(0); delete c; } + return; + } + + dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << dendl; + + // let the event register itself in the segment + assert(!segments.empty()); + le->_segment = segments.rbegin()->second; + le->_segment->num_events++; + le->update_segment(); + + num_events++; + assert(!capped); + + // encode it, with event type + { + bufferlist bl; + ::_encode(le->_type, bl); + le->encode_payload(bl); + + // journal it. + journaler->append_entry(bl); // bl is destroyed. + } + + delete le; + + if (logger) { + logger->inc("evadd"); + logger->set("ev", num_events); + logger->set("wrpos", journaler->get_write_pos()); + } + + if (c) { + unflushed = 0; + journaler->flush(c); + } + else + unflushed++; + + // start a new segment? + // FIXME: should this go elsewhere? + off_t last_seg = get_last_segment_offset(); + if (!segments.empty() && + !writing_subtree_map && + (journaler->get_write_pos() / log_inode.layout.period()) != (last_seg / log_inode.layout.period()) && + (journaler->get_write_pos() - last_seg > log_inode.layout.period()/2)) { + dout(10) << "submit_entry also starting new segment: last = " << last_seg + << ", cur pos = " << journaler->get_write_pos() << dendl; + start_new_segment(); } } @@ -192,211 +218,168 @@ void MDLog::flush() unflushed = 0; // trim - trim(NULL); + trim(); } void MDLog::cap() { dout(5) << "cap" << dendl; capped = true; - kick_subtree_map(); } -// trim +// ----------------------------- +// segments -class C_MDL_Trimmed : public Context { -public: - MDLog *mdl; - LogEvent *le; +void MDLog::start_new_segment(Context *onsync) +{ + dout(7) << "start_new_segment at " << journaler->get_write_pos() << dendl; + assert(!writing_subtree_map); - C_MDL_Trimmed(MDLog *mdl, LogEvent *le) { - this->mdl = mdl; - this->le = le; - } - void finish(int res) { - mdl->_trimmed(le); - } -}; + segments[journaler->get_write_pos()] = new LogSegment(journaler->get_write_pos()); -class C_MDL_Reading : public 
Context { -public: - MDLog *mdl; - C_MDL_Reading(MDLog *m) { - mdl = m; - } - void finish(int res) { - mdl->_did_read(); - } -}; + writing_subtree_map = true; + ESubtreeMap *le = mds->mdcache->create_subtree_map(); + submit_entry(le, new C_MDL_WroteSubtreeMap(this, mds->mdlog->get_write_pos())); + if (onsync) + wait_for_sync(onsync); + + logger->inc("segadd"); + logger->set("seg", segments.size()); +} -void MDLog::_did_read() +void MDLog::_logged_subtree_map(off_t off) { - dout(5) << "_did_read()" << dendl; - waiting_for_read = false; - trim(0); + dout(10) << "_logged_subtree_map at " << off << dendl; + writing_subtree_map = false; + + /* + list ls; + take_subtree_map_expire_waiters(ls); + mds->queue_waiters(ls); + */ } -void MDLog::_trimmed(LogEvent *le) + + +void MDLog::trim() { - // successful trim? - if (!le->has_expired(mds)) { - dout(7) << "retrimming : " << le->get_start_off() << " : " << *le << dendl; - le->expire(mds, new C_MDL_Trimmed(this, le)); - return; - } + // trim! + dout(10) << "trim " + << segments.size() << " / " << max_segments << " segments, " + << num_events << " / " << max_events << " events" + << ", " << expiring_segments.size() << " (" << expiring_events << ") expiring" + << ", " << expired_segments.size() << " (" << expired_events << ") expired" + << dendl; - dout(7) << "trimmed : " << le->get_start_off() << " : " << *le << dendl; + if (segments.empty()) return; - bool kick = false; + // hack: only trim for a few seconds at a time + utime_t stop = g_clock.now(); + stop += 2.0; - map::iterator p = trimming.begin(); - if (p->first == le->_start_off) { - // we trimmed off the front! it must have been a segment head. 
- assert(!subtree_maps.empty()); - assert(p->first == *subtree_maps.begin()); - subtree_maps.erase(subtree_maps.begin()); + map::iterator p = segments.begin(); + int left = num_events; + while (p != segments.end() && + ((max_events >= 0 && left-expiring_events-expired_events > max_events) || + (max_segments >= 0 && (int)(segments.size()-expiring_segments.size()-expired_segments.size()) > max_segments))) { - // we can expire the log a bit. - off_t to = get_trimmed_to(); - journaler->set_expire_pos(to); - journaler->trim(); + if (stop < g_clock.now()) + break; + + if ((int)expiring_segments.size() >= g_conf.mds_log_max_expiring) + break; + + // look at first segment + LogSegment *ls = p->second; + assert(ls); - kick = true; - } else { p++; - // is the next one us? - if (le->_start_off == p->first) { - p++; - - // did we empty a segment? - if (subtree_maps.size() >= 2) { - set::iterator segp = subtree_maps.begin(); - assert(*segp < le->_end_off); - segp++; - dout(20) << "i ended at " << le->get_end_off() - << ", next seg starts at " << *segp - << ", next trimming is " << (p == trimming.end() ? 
0:p->first) - << dendl; - if (*segp >= le->_end_off && - (p == trimming.end() || - p->first >= *segp)) { - dout(10) << "_trimmed segment looks empty" << dendl; - kick = true; - } - } else if (capped && trimming.size() < 3) { - kick = true; // blech, imprecise - } + left -= ls->num_events; + + if (expiring_segments.count(ls)) { + dout(5) << "trim already expiring segment " << ls->offset << ", " << ls->num_events << " events" << dendl; + } else if (expired_segments.count(ls)) { + dout(5) << "trim already expired segment " << ls->offset << ", " << ls->num_events << " events" << dendl; + } else { + try_expire(ls); } } +} - trimming.erase(le->_start_off); - delete le; - if (kick) - kick_subtree_map(); - - if (logger) { - logger->inc("trimf"); - logger->set("trimng", trimming.size()); - logger->set("rdpos", journaler->get_read_pos()); +void MDLog::try_expire(LogSegment *ls) +{ + C_Gather *exp = ls->try_to_expire(mds); + if (exp) { + assert(expiring_segments.count(ls) == 0); + expiring_segments.insert(ls); + expiring_events += ls->num_events; + dout(5) << "try_expire expiring segment " << ls->offset << dendl; + exp->set_finisher(new C_MaybeExpiredSegment(this, ls)); + } else { + dout(10) << "try_expire expired segment " << ls->offset << dendl; + _expired(ls); } - - trim(0); + + logger->set("segexg", expiring_segments.size()); + logger->set("evexg", expiring_events); } - - -void MDLog::trim(Context *c) +void MDLog::_maybe_expired(LogSegment *ls) { - // add waiter - if (c) - trim_waiters.push_back(c); - - // trim! 
- dout(10) << "trim " << num_events << " events / " << max_events << " max" << dendl; + dout(10) << "_maybe_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; + assert(expiring_segments.count(ls)); + expiring_segments.erase(ls); + expiring_events -= ls->num_events; + try_expire(ls); +} - // hack: only trim for a few seconds at a time - utime_t stop = g_clock.now(); - stop += 2.0; +void MDLog::_expired(LogSegment *ls) +{ + dout(5) << "_expired segment " << ls->offset << " " << ls->num_events << " events" << dendl; - while (num_events > max_events) { - // don't check the clock on _every_ event, here! - if (num_events % 100 == 0 && - stop < g_clock.now()) - break; + if (!capped && ls == get_current_segment()) { + dout(5) << "_expired not expiring " << ls->offset << ", last one and !capped" << dendl; + } else { + // expired. + expired_segments.insert(ls); + expired_events += ls->num_events; - off_t gap = journaler->get_write_pos() - journaler->get_read_pos(); - dout(5) << "trim num_events " << num_events << " > max " << max_events - << ", trimming " << trimming.size() - << ", byte gap " << gap - << dendl; - - if ((int)trimming.size() >= g_conf.mds_log_max_trimming) { - dout(7) << "trim already trimming max, waiting" << dendl; - return; - } + logger->inc("evex", ls->num_events); + logger->inc("segex"); - bufferlist bl; - off_t so = journaler->get_read_pos(); - if (journaler->try_read_entry(bl)) { - // decode logevent - LogEvent *le = LogEvent::decode(bl); - le->_start_off = so; - le->_end_off = journaler->get_read_pos(); - num_events--; - - // we just read an event. - if (le->has_expired(mds)) { - // obsolete - dout(7) << "trim obsolete : " << le->get_start_off() << " : " << *le << dendl; - delete le; - if (logger) logger->inc("obs"); - } else { - assert ((int)trimming.size() < g_conf.mds_log_max_trimming); - - // trim! 
- dout(7) << "trim expiring : " << le->get_start_off() << " : " << *le << dendl; - trimming[le->_start_off] = le; - le->expire(mds, new C_MDL_Trimmed(this, le)); - if (logger) { - logger->inc("trims"); - logger->set("trimng", trimming.size()); - } - } - if (logger) { - logger->set("rdpos", journaler->get_read_pos()); - logger->set("size", num_events); - } - } else { - // need to read! - if (!waiting_for_read) { - waiting_for_read = true; - dout(7) << "trim waiting for read" << dendl; - journaler->wait_for_readable(new C_MDL_Reading(this)); - } else { - dout(7) << "trim already waiting for read" << dendl; - } - return; + // trim expired segments? + while (!segments.empty()) { + ls = segments.begin()->second; + if (!expired_segments.count(ls)) break; + + expired_events -= ls->num_events; + expired_segments.erase(ls); + num_events -= ls->num_events; + + journaler->set_expire_pos(ls->offset); // this was the oldest segment, adjust expire pos + journaler->write_head(0); + + logger->set("expos", ls->offset); + logger->inc("segtrm"); + logger->inc("evtrm", ls->num_events); + + segments.erase(ls->offset); + delete ls; } } - dout(10) << "trim num_events " << num_events << " <= max " << max_events - << ", trimming " << trimming.size() - << ", done for now." - << dendl; - - // trimmed! 
- std::list finished; - finished.swap(trim_waiters); - finish_contexts(finished, 0); + logger->set("ev", num_events); + logger->set("evexd", expired_events); + logger->set("seg", segments.size()); + logger->set("segexd", expired_segments.size()); } - void MDLog::replay(Context *c) { assert(journaler->is_active()); @@ -425,7 +408,6 @@ void MDLog::replay(Context *c) assert(num_events == 0); replay_thread.create(); - //_replay(); } class C_MDL_Replay : public Context { @@ -434,7 +416,6 @@ public: C_MDL_Replay(MDLog *l) : mdlog(l) {} void finish(int r) { mdlog->replay_cond.Signal(); - //mdlog->_replay(); } }; @@ -471,25 +452,32 @@ void MDLog::_replay_thread() // unpack event LogEvent *le = LogEvent::decode(bl); + // new segment? + if (le->get_type() == EVENT_SUBTREEMAP) { + segments[pos] = new LogSegment(pos); + logger->set("seg", segments.size()); + } + // have we seen an import map yet? - if (!seen_subtree_map && - le->get_type() != EVENT_SUBTREEMAP) { + if (segments.empty()) { dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() << " -- waiting for subtree_map. (skipping " << *le << ")" << dendl; } else { dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() << " : " << *le << dendl; + le->_segment = get_current_segment(); // replay may need this + le->_segment->num_events++; + num_events++; + le->replay(mds); - num_events++; if (!new_expire_pos) new_expire_pos = pos; - - if (le->get_type() == EVENT_SUBTREEMAP) - seen_subtree_map = true; } delete le; + logger->set("rdpos", pos); + // drop lock for a second, so other events/messages (e.g. beacon timer!) 
can go off mds->mds_lock.Unlock(); mds->mds_lock.Lock(); @@ -502,6 +490,7 @@ void MDLog::_replay_thread() // move read pointer _back_ to first subtree map we saw, for eventual trimming journaler->set_read_pos(new_expire_pos); journaler->set_expire_pos(new_expire_pos); + logger->set("expos", new_expire_pos); // kick waiter(s) list ls; diff --git a/trunk/ceph/mds/MDLog.h b/trunk/ceph/mds/MDLog.h index 73f24fa9bb28b..f7bdcd21a5303 100644 --- a/trunk/ceph/mds/MDLog.h +++ b/trunk/ceph/mds/MDLog.h @@ -22,6 +22,8 @@ #include "common/Thread.h" #include "common/Cond.h" +#include "LogSegment.h" + #include //#include @@ -30,25 +32,21 @@ class Journaler; class LogEvent; class MDS; +class LogSegment; +class ESubtreeMap; class Logger; -/* -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const LogEvent *p) const { - static hash H; - return H((unsigned long)p); - } - }; -} -*/ +#include +using std::map; + class MDLog { protected: MDS *mds; - size_t num_events; // in events - size_t max_events; + int num_events; // in events + int max_events; + int max_segments; int unflushed; @@ -59,22 +57,6 @@ class MDLog { Logger *logger; - // -- trimming -- - map trimming; - std::list trim_waiters; // contexts waiting for trim - bool trim_reading; - - bool waiting_for_read; - friend class C_MDL_Reading; - - - off_t get_trimmed_to() { - if (trimming.empty()) - return get_read_pos(); - else - return trimming.begin()->first; - } - // -- replay -- Cond replay_cond; @@ -98,29 +80,36 @@ class MDLog { void _replay_thread(); // new way + // -- segments -- + map segments; + set expiring_segments; + set expired_segments; + int expiring_events; + int expired_events; + + class C_MDL_WroteSubtreeMap : public Context { + MDLog *mdlog; + off_t off; + public: + C_MDL_WroteSubtreeMap(MDLog *l, off_t o) : mdlog(l), off(o) { } + void finish(int r) { + mdlog->_logged_subtree_map(off); + } + }; + void _logged_subtree_map(off_t off); + // -- subtreemaps -- - set subtree_maps; - map > 
subtree_map_expire_waiters; bool writing_subtree_map; // one is being written now - bool seen_subtree_map; // for recovery friend class ESubtreeMap; friend class C_MDS_WroteImportMap; friend class MDCache; - void kick_subtree_map() { - if (subtree_map_expire_waiters.empty()) return; - list ls; - ls.swap(subtree_map_expire_waiters.begin()->second); - subtree_map_expire_waiters.erase(subtree_map_expire_waiters.begin()); - finish_contexts(ls); - } - public: - off_t get_last_subtree_map_offset() { - assert(!subtree_maps.empty()); - return *subtree_maps.rbegin(); + off_t get_last_segment_offset() { + assert(!segments.empty()); + return segments.rbegin()->first; } @@ -137,28 +126,36 @@ public: public: MDLog(MDS *m) : mds(m), - num_events(0), max_events(g_conf.mds_log_max_len), + num_events(0), + max_events(g_conf.mds_log_max_events), + max_segments(g_conf.mds_log_max_segments), unflushed(0), capped(false), journaler(0), logger(0), - trim_reading(false), waiting_for_read(false), replay_thread(this), - writing_subtree_map(false), seen_subtree_map(false) { + expiring_events(0), expired_events(0), + writing_subtree_map(false) { } ~MDLog(); - void set_max_events(size_t max) { max_events = max; } - size_t get_max_events() { return max_events; } - size_t get_num_events() { return num_events + trimming.size(); } - size_t get_non_subtreemap_events() { return num_events + trimming.size() - subtree_map_expire_waiters.size(); } + void start_new_segment(Context *onsync=0); + LogSegment *get_current_segment() { + return segments.empty() ? 
0:segments.rbegin()->second; + } + + + void flush_logger(); + + size_t get_num_events() { return num_events; } + void set_max_events(int m) { max_events = m; } + size_t get_num_segments() { return segments.size(); } + void set_max_segments(int m) { max_segments = m; } off_t get_read_pos(); off_t get_write_pos(); - bool empty() { - return get_read_pos() == get_write_pos(); - } + bool empty() { return segments.empty(); } bool is_capped() { return capped; } void cap(); @@ -167,15 +164,31 @@ public: void wait_for_sync( Context *c ); void flush(); - void trim(Context *c); - void _did_read(); - void _trimmed(LogEvent *le); +private: + class C_MaybeExpiredSegment : public Context { + MDLog *mdlog; + LogSegment *ls; + public: + C_MaybeExpiredSegment(MDLog *mdl, LogSegment *s) : mdlog(mdl), ls(s) {} + void finish(int res) { + mdlog->_maybe_expired(ls); + } + }; + + void try_expire(LogSegment *ls); + void _maybe_expired(LogSegment *ls); + void _expired(LogSegment *ls); - void reset(); // fresh, empty log! - void open(Context *onopen); - void append(); +public: + void trim(); + +private: void write_head(Context *onfinish); +public: + void create(Context *onfinish); // fresh, empty log! + void open(Context *onopen); // append() or replay() to follow! 
+ void append(); void replay(Context *onfinish); }; diff --git a/trunk/ceph/mds/MDS.cc b/trunk/ceph/mds/MDS.cc index 30a2ce2b0ba5b..6fc8ef46d9039 100644 --- a/trunk/ceph/mds/MDS.cc +++ b/trunk/ceph/mds/MDS.cc @@ -339,9 +339,6 @@ int MDS::init(bool standby) // schedule tick reset_tick(); - // init logger - //reopen_logger(g_clock.now()); - mds_lock.Unlock(); return 0; } @@ -369,14 +366,17 @@ void MDS::tick() if (logger) { req_rate = logger->get("req"); - logger->set("l", (int)load.mds_load()); + logger->fset("l", (int)load.mds_load()); logger->set("q", messenger->get_dispatch_queue_len()); logger->set("buf", buffer_total_alloc); logger->set("sm", mdcache->num_subtrees()); - + mdcache->log_stat(logger); } - + + if (is_active() || is_stopping()) + locker->scatter_unscatter_autoscattered(); + // booted? if (is_active()) { @@ -498,22 +498,13 @@ void MDS::handle_mds_map(MMDSMap *m) return; } - // note some old state + // keep old map, for a moment + MDSMap *oldmap = mdsmap; int oldwhoami = whoami; int oldstate = state; - set oldresolve; - mdsmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); - bool wasrejoining = mdsmap->is_rejoining(); - set oldfailed; - mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); - set oldactive; - mdsmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); - set oldcreating; - mdsmap->get_mds_set(oldcreating, MDSMap::STATE_CREATING); - set oldstopped; - mdsmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED); // decode and process + mdsmap = new MDSMap; mdsmap->decode(m->get_encoded()); // see who i am @@ -524,9 +515,13 @@ void MDS::handle_mds_map(MMDSMap *m) return; } - if (oldwhoami != whoami || !logger) // fakesyn/newsyn starts knowing who they are - reopen_logger(mdsmap->get_create()); - + // open logger? 
+ // note that fakesyn/newsyn starts knowing who they are + if (whoami >= 0 && + mdsmap->is_up(whoami) && !mdsmap->is_standby(whoami) && + (oldwhoami != whoami || !logger)) + reopen_logger(mdsmap->get_create()); // adopt mds cluster timeline + if (oldwhoami != whoami) { // update messenger. dout(1) << "handle_mds_map i am now mds" << whoami @@ -541,7 +536,6 @@ void MDS::handle_mds_map(MMDSMap *m) messenger->send_message(new MOSDGetMap(0), monmap->get_inst(mon)); } - } // tell objecter my incarnation @@ -591,20 +585,20 @@ void MDS::handle_mds_map(MMDSMap *m) return; } } - + // RESOLVE // is someone else newly resolving? if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { - set resolve; + set oldresolve, resolve; + oldmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); if (oldresolve != resolve) { dout(10) << "resolve set is " << resolve << ", was " << oldresolve << dendl; - for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) { - if (*p == whoami) continue; - if (oldresolve.count(*p)) continue; - mdcache->send_resolve(*p); // now or later. - } + for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) + if (*p != whoami && + oldresolve.count(*p) == 0) + mdcache->send_resolve(*p); // now or later. } } @@ -612,53 +606,56 @@ void MDS::handle_mds_map(MMDSMap *m) // is everybody finally rejoining? if (is_rejoin() || is_active() || is_stopping()) { // did we start? - if (!wasrejoining && mdsmap->is_rejoining()) + if (!oldmap->is_rejoining() && mdsmap->is_rejoining()) rejoin_joint_start(); // did we finish? if (g_conf.mds_dump_cache_after_rejoin && - wasrejoining && !mdsmap->is_rejoining()) + oldmap->is_rejoining() && !mdsmap->is_rejoining()) mdcache->dump_cache(); // for DEBUG only } - + if (oldmap->is_degraded() && !mdsmap->is_degraded() && state >= MDSMap::STATE_ACTIVE) + dout(1) << "cluster recovered." << dendl; + // did someone go active? 
if (is_active() || is_stopping()) { - set active; + set oldactive, active; + oldmap->get_mds_set(oldactive, MDSMap::STATE_ACTIVE); mdsmap->get_mds_set(active, MDSMap::STATE_ACTIVE); - for (set::iterator p = active.begin(); p != active.end(); ++p) { - if (*p == whoami) continue; // not me - if (oldactive.count(*p)) continue; // newly so? - handle_mds_recovery(*p); - } + for (set::iterator p = active.begin(); p != active.end(); ++p) + if (*p != whoami && // not me + oldactive.count(*p) == 0) // newly so? + handle_mds_recovery(*p); } + // did someone fail or stop? if (is_active() || is_stopping()) { - // did anyone go down? - set failed; + // new failed? + set oldfailed, failed; + oldmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED); - for (set::iterator p = failed.begin(); p != failed.end(); ++p) { - if (oldfailed.count(*p)) continue; // newly so? - mdcache->handle_mds_failure(*p); - } + for (set::iterator p = failed.begin(); p != failed.end(); ++p) + if (oldfailed.count(*p) == 0) + mdcache->handle_mds_failure(*p); + + // or down then up? + // did their addr/inst change? + set up; + mdsmap->get_up_mds_set(up); + for (set::iterator p = up.begin(); p != up.end(); ++p) + if (oldmap->have_inst(*p) && + oldmap->get_inst(*p) != mdsmap->get_inst(*p)) + mdcache->handle_mds_failure(*p); // did anyone stop? - set stopped; + set oldstopped, stopped; + oldmap->get_mds_set(oldstopped, MDSMap::STATE_STOPPED); mdsmap->get_mds_set(stopped, MDSMap::STATE_STOPPED); - for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) { - if (oldstopped.count(*p)) continue; // newly so? - mdcache->migrator->handle_mds_failure_or_stop(*p); - } + for (set::iterator p = stopped.begin(); p != stopped.end(); ++p) + if (oldstopped.count(*p) == 0) // newly so? + mdcache->migrator->handle_mds_failure_or_stop(*p); } - - // in set set changed? - /* - if (state >= MDSMap::STATE_ACTIVE && // only if i'm active+. 
otherwise they'll get map during reconnect. - mdsmap->get_same_in_set_since() > last_client_mdsmap_bcast) { - bcast_mds_map(); - } - */ - // just got mdsmap+osdmap? if (hadepoch == 0 && mdsmap->get_epoch() > 0 && @@ -670,6 +667,7 @@ void MDS::handle_mds_map(MMDSMap *m) } delete m; + delete oldmap; } void MDS::bcast_mds_map() @@ -734,6 +732,7 @@ void MDS::boot_create() C_Gather *fin = new C_Gather(new C_MDS_CreateFinish(this)); + CDir *rootdir = 0; if (whoami == 0) { dout(3) << "boot_create since i am also mds0, creating root inode and dir" << dendl; @@ -743,33 +742,35 @@ void MDS::boot_create() assert(root); // force empty root dir - CDir *dir = root->get_dirfrag(frag_t()); - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); - - // save it - dir->commit(0, fin->new_sub()); + rootdir = root->get_dirfrag(frag_t()); + rootdir->mark_complete(); } // create my stray dir + CDir *straydir; { dout(10) << "boot_create creating local stray dir" << dendl; mdcache->open_local_stray(); CInode *stray = mdcache->get_stray(); - CDir *dir = stray->get_dirfrag(frag_t()); - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); - dir->commit(0, fin->new_sub()); + straydir = stray->get_dirfrag(frag_t()); + straydir->mark_complete(); } // start with a fresh journal dout(10) << "boot_create creating fresh journal" << dendl; - mdlog->reset(); - mdlog->write_head(fin->new_sub()); + mdlog->create(fin->new_sub()); // write our first subtreemap - mdcache->log_subtree_map(fin->new_sub()); + mdlog->start_new_segment(fin->new_sub()); + // dirty, commit (root and) stray dir(s) + if (whoami == 0) { + rootdir->mark_dirty(rootdir->pre_dirty(), mdlog->get_current_segment()); + rootdir->commit(0, fin->new_sub()); + } + straydir->mark_dirty(straydir->pre_dirty(), mdlog->get_current_segment()); + straydir->commit(0, fin->new_sub()); + // fixme: fake out idalloc (reset, pretend loaded) dout(10) << "boot_create creating fresh idalloc table" << dendl; idalloc->reset(); @@ -830,12 +831,12 @@ 
void MDS::boot_start(int step) if (is_replay()) { dout(2) << "boot_start " << step << ": replaying mds log" << dendl; mdlog->replay(new C_MDS_BootStart(this, 3)); + break; } else { dout(2) << "boot_start " << step << ": positioning at end of old mds log" << dendl; mdlog->append(); - mdcache->log_subtree_map(new C_MDS_BootStart(this, 3)); + step++; } - break; case 3: if (is_replay()) { @@ -866,6 +867,9 @@ void MDS::starting_done() dout(3) << "starting_done" << dendl; assert(is_starting()); set_want_state(MDSMap::STATE_ACTIVE); + + // start new segment + mdlog->start_new_segment(0); } @@ -898,6 +902,9 @@ void MDS::replay_done() dout(2) << "i am not alone, moving to state resolve" << dendl; set_want_state(MDSMap::STATE_RESOLVE); } + + // start new segment + mdlog->start_new_segment(0); } @@ -994,17 +1001,14 @@ void MDS::handle_mds_recovery(int who) void MDS::stopping_start() { dout(2) << "stopping_start" << dendl; - + // start cache shutdown mdcache->shutdown_start(); // terminate client sessions server->terminate_sessions(); - - // flush log - mdlog->set_max_events(0); - mdlog->trim(NULL); } + void MDS::stopping_done() { dout(2) << "stopping_done" << dendl; @@ -1066,14 +1070,18 @@ void MDS::my_dispatch(Message *m) mdsmap->get_inst(from) != m->get_source_inst() || mdsmap->is_down(from)) { // bogus mds? 
- if (m->get_type() != MSG_MDS_MAP) { + if (m->get_type() == MSG_MDS_MAP) { + dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() + << ", but it's an mdsmap, looking at it" << dendl; + } else if (m->get_type() == MSG_MDS_CACHEEXPIRE && + mdsmap->get_inst(from) == m->get_source_inst()) { + dout(5) << "got " << *m << " from down mds " << m->get_source() + << ", but it's a cache_expire, looking at it" << dendl; + } else { dout(5) << "got " << *m << " from down/old/bad/imposter mds " << m->get_source() << ", dropping" << dendl; delete m; return; - } else { - dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() - << ", but it's an mdsmap, looking at it" << dendl; } } } diff --git a/trunk/ceph/mds/Migrator.cc b/trunk/ceph/mds/Migrator.cc index a7a219d7dda4f..ac02938ddbe88 100644 --- a/trunk/ceph/mds/Migrator.cc +++ b/trunk/ceph/mds/Migrator.cc @@ -112,39 +112,36 @@ public: void Migrator::export_empty_import(CDir *dir) { dout(7) << "export_empty_import " << *dir << dendl; - - if (dir->inode->is_auth()) return; - if (!dir->is_auth()) return; - - if (dir->inode->is_freezing() || dir->inode->is_frozen()) return; - if (dir->is_freezing() || dir->is_frozen()) return; - - if (dir->get_size() > 0) { - dout(7) << "not actually empty" << dendl; + assert(dir->is_subtree_root()); + + if (dir->inode->is_auth()) { + dout(7) << " inode is auth" << dendl; return; } - - if (dir->inode->is_root()) { - dout(7) << "root" << dendl; + if (!dir->is_auth()) { + dout(7) << " not auth" << dendl; return; } - - // is it really empty? - if (!dir->is_complete()) { - dout(7) << "not complete, fetching." 
<< dendl; - dir->fetch(new C_MDC_EmptyImport(this,dir)); + if (dir->is_freezing() || dir->is_frozen()) { + dout(7) << " freezing or frozen" << dendl; + return; + } + if (dir->get_size() > 0) { + dout(7) << " not actually empty" << dendl; + return; + } + if (dir->inode->is_root()) { + dout(7) << " root" << dendl; return; } int dest = dir->inode->authority().first; - - // comment this out ot wreak havoc? //if (mds->is_shutting_down()) dest = 0; // this is more efficient. - dout(7) << "really empty, exporting to " << dest << dendl; + dout(7) << " really empty, exporting to " << dest << dendl; assert (dest != mds->get_nodeid()); - dout(-7) << "exporting to mds" << dest + dout(7) << "exporting to mds" << dest << " empty import " << *dir << dendl; export_dir( dir, dest ); } @@ -181,7 +178,6 @@ void Migrator::handle_mds_failure_or_stop(int who) dir->auth_unpin(); export_state.erase(dir); // clean up dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); if (export_peer[dir] != who) // tell them. mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); break; @@ -191,7 +187,6 @@ void Migrator::handle_mds_failure_or_stop(int who) dir->unfreeze_tree(); // cancel the freeze export_state.erase(dir); // clean up dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); if (export_peer[dir] != who) // tell them. 
mds->send_message_mds(new MExportDirCancel(dir->dirfrag()), export_peer[dir], MDS_PORT_MIGRATOR); break; @@ -222,7 +217,6 @@ void Migrator::handle_mds_failure_or_stop(int who) cache->try_subtree_merge(dir); export_state.erase(dir); // clean up dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); break; case EXPORT_EXPORTING: @@ -230,7 +224,6 @@ void Migrator::handle_mds_failure_or_stop(int who) export_reverse(dir); export_state.erase(dir); // clean up dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); break; case EXPORT_LOGGINGFINISH: @@ -578,14 +571,15 @@ void Migrator::export_dir(CDir *dir, int dest) export_peer[dir] = dest; dir->state_set(CDir::STATE_EXPORTING); - dir->get(CDir::PIN_EXPORTING); // send ExportDirDiscover (ask target) - mds->send_message_mds(new MExportDirDiscover(dir), export_peer[dir], MDS_PORT_MIGRATOR); + mds->send_message_mds(new MExportDirDiscover(dir), dest, MDS_PORT_MIGRATOR); // start the freeze, but hold it up with an auth_pin. 
dir->auth_pin(); - dir->freeze_tree(new C_MDC_ExportFreeze(this, dir)); + dir->freeze_tree(); + assert(dir->is_freezing_tree()); + dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir)); } @@ -775,7 +769,6 @@ void Migrator::export_go(CDir *dir) export_warning_ack_waiting.erase(dir); export_state[dir] = EXPORT_EXPORTING; - assert(export_data.count(dir) == 0); assert(dir->get_cum_auth_pins() == 0); // set ambiguous auth @@ -786,22 +779,20 @@ void Migrator::export_go(CDir *dir) // fill export message with cache data utime_t now = g_clock.now(); - C_Contexts *fin = new C_Contexts; // collect all the waiters map exported_client_map; - int num_exported_inodes = encode_export_dir( export_data[dir], - fin, - dir, // base + bufferlist export_data; + int num_exported_inodes = encode_export_dir( export_data, dir, // recur start point - dest, exported_client_map, now ); bufferlist bl; ::_encode(exported_client_map, bl); - export_data[dir].push_front(bl); + bl.claim_append(export_data); + export_data.claim(bl); // send the export data! MExportDir *req = new MExportDir(dir->dirfrag()); - req->set_dirstate(export_data[dir]); + req->take_dirstate(export_data); // add bounds to message set bounds; @@ -811,12 +802,9 @@ void Migrator::export_go(CDir *dir) ++p) req->add_export((*p)->dirfrag()); - //s end + // send mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); - // queue up the finisher - dir->add_waiter( CDir::WAIT_UNFREEZE, fin ); - // stats if (mds->logger) mds->logger->inc("ex"); if (mds->logger) mds->logger->inc("iex", num_exported_inodes); @@ -829,38 +817,53 @@ void Migrator::export_go(CDir *dir) * update our local state for this inode to export. * encode relevant state to be sent over the wire. * used by: encode_export_dir, file_rename (if foreign) + * + * FIXME: the separation between CInode.encode_export and these methods + * is pretty arbitrary and dumb. 
*/ -void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth, - map& exported_client_map, - utime_t now) +void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, + map& exported_client_map) +{ + dout(7) << "encode_export_inode " << *in << dendl; + assert(!in->is_replica(mds->get_nodeid())); + + ::_encode_simple(in->inode.ino, enc_state); + in->encode_export(enc_state); + + // make note of clients named by exported capabilities + for (map::iterator it = in->client_caps.begin(); + it != in->client_caps.end(); + it++) + exported_client_map[it->first] = mds->clientmap.get_inst(it->first); +} + +void Migrator::finish_export_inode(CInode *in, utime_t now, list& finished) { + dout(12) << "finish_export_inode " << *in << dendl; + + in->finish_export(now); + // tell (all) clients about migrating caps.. mark STALE for (map::iterator it = in->client_caps.begin(); it != in->client_caps.end(); it++) { - dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << dendl; + dout(7) << "finish_export_inode telling client" << it->first + << " stale caps on " << *in << dendl; MClientFileCaps *m = new MClientFileCaps(MClientFileCaps::OP_STALE, in->inode, it->second.get_last_seq(), it->second.pending(), it->second.wanted()); entity_inst_t inst = mds->clientmap.get_inst(it->first); - exported_client_map[it->first] = inst; mds->send_message_client_maybe_open(m, inst); } + in->clear_client_caps(); // relax locks? 
if (!in->is_replicated()) in->replicate_relax_locks(); - // add inode - assert(!in->is_replica(mds->get_nodeid())); - CInodeExport istate(in, now); - istate._encode( enc_state ); - - // we're export this inode; fix inode state - dout(7) << "encode_export_inode " << *in << dendl; - + // clean if (in->is_dirty()) in->mark_clean(); // clear/unpin cached_by (we're no longer the authority) @@ -878,19 +881,19 @@ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_au in->state_clear(CInode::STATE_AUTH); in->replica_nonce = CInode::EXPORT_NONCE; + // waiters + in->take_waiting(CInode::WAIT_ANY, finished); + // *** other state too? // move to end of LRU so we drop out of cache quickly! if (in->get_parent_dn()) cache->lru.lru_bottouch(in->get_parent_dn()); -} +} -int Migrator::encode_export_dir(list& dirstatelist, - C_Contexts *fin, - CDir *basedir, +int Migrator::encode_export_dir(bufferlist& exportbl, CDir *dir, - int newauth, map& exported_client_map, utime_t now) { @@ -901,33 +904,15 @@ int Migrator::encode_export_dir(list& dirstatelist, assert(dir->get_projected_version() == dir->get_version()); // dir - bufferlist enc_dir; - - CDirExport dstate(dir, now); - dstate._encode( enc_dir ); - - // release open_by - dir->clear_replica_map(); - - // mark - assert(dir->is_auth()); - dir->state_clear(CDir::STATE_AUTH); - dir->replica_nonce = CDir::NONCE_EXPORT; - - list subdirs; - - if (dir->is_dirty()) - dir->mark_clean(); - - // discard most dir state - dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. 
+ dirfrag_t df = dir->dirfrag(); + ::_encode_simple(df, exportbl); + dir->encode_export(exportbl); - // suck up all waiters - list waiting; - dir->take_waiting(CDir::WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); + long nden = dir->items.size(); + ::_encode_simple(nden, exportbl); // dentries + list subdirs; CDir::map_t::iterator it; for (it = dir->begin(); it != dir->end(); it++) { CDentry *dn = it->second; @@ -938,36 +923,36 @@ int Migrator::encode_export_dir(list& dirstatelist, // -- dentry dout(7) << "encode_export_dir exporting " << *dn << dendl; - // name - ::_encode(it->first, enc_dir); + // dn name + ::_encode(it->first, exportbl); // state - it->second->encode_export_state(enc_dir); + dn->encode_export(exportbl); // points to... // null dentry? if (dn->is_null()) { - enc_dir.append("N", 1); // null dentry + exportbl.append("N", 1); // null dentry continue; } if (dn->is_remote()) { // remote link - enc_dir.append("L", 1); // remote link + exportbl.append("L", 1); // remote link inodeno_t ino = dn->get_remote_ino(); unsigned char d_type = dn->get_remote_d_type(); - ::_encode(ino, enc_dir); - ::_encode(d_type, enc_dir); + ::_encode(ino, exportbl); + ::_encode(d_type, exportbl); continue; } // primary link // -- inode - enc_dir.append("I", 1); // inode dentry + exportbl.append("I", 1); // inode dentry - encode_export_inode(in, enc_dir, newauth, exported_client_map, now); // encode, and (update state for) export + encode_export_inode(in, exportbl, exported_client_map); // encode, and (update state for) export // directory? 
list dfs; @@ -980,26 +965,62 @@ int Migrator::encode_export_dir(list& dirstatelist, subdirs.push_back(dir); // it's ours, recurse (later) } } - - // waiters - list waiters; - in->take_waiting(CInode::WAIT_ANY, waiters); - fin->take(waiters); } - // add to dirstatelist - bufferlist bl; - dirstatelist.push_back( bl ); - dirstatelist.back().claim( enc_dir ); - // subdirs for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - num_exported += encode_export_dir(dirstatelist, fin, basedir, *it, newauth, - exported_client_map, now); + num_exported += encode_export_dir(exportbl, *it, exported_client_map, now); return num_exported; } +void Migrator::finish_export_dir(CDir *dir, list& finished, utime_t now) +{ + dout(10) << "finish_export_dir " << *dir << dendl; + + // release open_by + dir->clear_replica_map(); + + // mark + assert(dir->is_auth()); + dir->state_clear(CDir::STATE_AUTH); + dir->replica_nonce = CDir::NONCE_EXPORT; + + if (dir->is_dirty()) + dir->mark_clean(); + + // discard most dir state + dir->state &= CDir::MASK_STATE_EXPORT_KEPT; // i only retain a few things. + + // suck up all waiters + dir->take_waiting(CDir::WAIT_ANY, finished); // all dir waiters + + // pop + dir->finish_export(now); + + // dentries + list subdirs; + CDir::map_t::iterator it; + for (it = dir->begin(); it != dir->end(); it++) { + CDentry *dn = it->second; + CInode *in = dn->get_inode(); + + // dentry + dn->finish_export(); + + // inode? + if (dn->is_primary()) { + finish_export_inode(in, now, finished); + + // subdirs? 
+ in->get_nested_dirfrags(subdirs); + } + } + + // subdirs + for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) + finish_export_dir(*it, finished, now); +} class C_MDS_ExportFinishLogged : public Context { Migrator *migrator; @@ -1027,12 +1048,12 @@ void Migrator::handle_export_ack(MExportDirAck *m) export_warning_ack_waiting.erase(dir); export_state[dir] = EXPORT_LOGGINGFINISH; - export_data.erase(dir); set bounds; cache->get_subtree_bounds(dir, bounds); - // log completion + // log completion. + // include export bounds, to ensure they're in the journal. EExport *le = new EExport(mds->mdlog, dir); le->metablob.add_dir_context(dir); le->metablob.add_dir( dir, false ); @@ -1054,6 +1075,8 @@ void Migrator::handle_export_ack(MExportDirAck *m) + + /* * this happens if hte dest failes after i send teh export data but before it is acked * that is, we don't know they safely received and logged it, so we reverse our changes @@ -1064,7 +1087,6 @@ void Migrator::export_reverse(CDir *dir) dout(7) << "export_reverse " << *dir << dendl; assert(export_state[dir] == EXPORT_EXPORTING); - assert(export_data.count(dir)); set bounds; cache->get_subtree_bounds(dir, bounds); @@ -1073,6 +1095,23 @@ void Migrator::export_reverse(CDir *dir) cache->adjust_subtree_auth(dir, mds->get_nodeid()); cache->try_subtree_merge(dir); + // remove exporting pins + list rq; + rq.push_back(dir); + while (!rq.empty()) { + CDir *dir = rq.front(); + rq.pop_front(); + dir->abort_export(); + for (CDir::map_t::iterator p = dir->items.begin(); p != dir->items.end(); ++p) { + p->second->abort_export(); + if (!p->second->is_primary()) continue; + CInode *in = p->second->get_inode(); + in->abort_export(); + if (in->is_dir()) + in->get_nested_dirfrags(rq); + } + } + // unpin bounds for (set::iterator p = bounds.begin(); p != bounds.end(); @@ -1082,26 +1121,10 @@ void Migrator::export_reverse(CDir *dir) bd->state_clear(CDir::STATE_EXPORTBOUND); } - // re-import the metadata - map 
imported_client_map; - int off = 0; - ::_decode(imported_client_map, export_data[dir].front(), off); - export_data[dir].pop_front(); - - while (!export_data[dir].empty()) { - decode_import_dir(export_data[dir].front(), - export_peer[dir], - dir, // import root - 0, - imported_client_map); - export_data[dir].pop_front(); - } - // process delayed expires cache->process_delayed_expire(dir); // some clean up - export_data.erase(dir); export_warning_ack_waiting.erase(dir); export_notify_ack_waiting.erase(dir); @@ -1225,6 +1248,11 @@ void Migrator::export_finish(CDir *dir) dout(7) << "not sending MExportDirFinish, dest has failed" << dendl; } + // finish export (adjust local cache state) + C_Contexts *fin = new C_Contexts; + finish_export_dir(dir, fin->contexts, g_clock.now()); + dir->add_waiter(CDir::WAIT_UNFREEZE, fin); + // unfreeze dout(7) << "export_finish unfreezing" << dendl; dir->unfreeze_tree(); @@ -1256,7 +1284,6 @@ void Migrator::export_finish(CDir *dir) // remove from exporting list, clean up state dir->state_clear(CDir::STATE_EXPORTING); - dir->put(CDir::PIN_EXPORTING); export_state.erase(dir); export_peer.erase(dir); export_notify_ack_waiting.erase(dir); @@ -1572,19 +1599,19 @@ void Migrator::handle_export_dir(MExportDir *m) // add this crap to my cache map imported_client_map; - int off = 0; - ::_decode(imported_client_map, m->get_dirstate().front(), off); - m->get_dirstate().pop_front(); + bufferlist::iterator blp = m->get_dirstate().begin(); + ::_decode_simple(imported_client_map, blp); int num_imported_inodes = 0; - while (!m->get_dirstate().empty()) { + while (!blp.end()) { num_imported_inodes += - decode_import_dir(m->get_dirstate().front(), + decode_import_dir(blp, oldauth, dir, // import root le, - imported_client_map); - m->get_dirstate().pop_front(); + imported_client_map, + mds->mdlog->get_current_segment(), + import_updated_scatterlocks[dir]); } dout(10) << " " << m->get_bounds().size() << " imported bounds" << dendl; @@ -1762,6 +1789,7 @@ void 
Migrator::import_reverse_final(CDir *dir) import_peer.erase(dir->dirfrag()); import_bystanders.erase(dir); import_bound_ls.erase(dir); + import_updated_scatterlocks.erase(dir); // send pending import_maps? mds->mdcache->maybe_send_pending_resolves(); @@ -1803,6 +1831,12 @@ void Migrator::import_finish(CDir *dir) // log finish mds->mdlog->submit_entry(new EImportFinish(dir, true)); + // clear updated scatterlocks + for (list::iterator p = import_updated_scatterlocks[dir].begin(); + p != import_updated_scatterlocks[dir].end(); + ++p) + (*p)->clear_updated(); + // remove pins set bounds; cache->get_subtree_bounds(dir, bounds); @@ -1817,6 +1851,7 @@ void Migrator::import_finish(CDir *dir) import_peer.erase(dir->dirfrag()); import_bystanders.erase(dir); import_bound_ls.erase(dir); + import_updated_scatterlocks.erase(dir); // process delayed expires cache->process_delayed_expire(dir); @@ -1839,16 +1874,18 @@ void Migrator::import_finish(CDir *dir) } -void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth, - map& imported_client_map) +void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth, + map& imported_client_map, + LogSegment *ls, + list& updated_scatterlocks) { dout(15) << "decode_import_inode on " << *dn << dendl; - CInodeExport istate; - off = istate._decode(bl, off); + inodeno_t ino; + ::_decode_simple(ino, blp); bool added = false; - CInode *in = cache->get_inode(istate.get_ino()); + CInode *in = cache->get_inode(ino); if (!in) { in = new CInode(mds->mdcache); added = true; @@ -1858,7 +1895,7 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol // state after link -- or not! -sage set merged_client_caps; - istate.update_inode(in, merged_client_caps); + in->decode_import(blp, merged_client_caps, ls); // link before state -- or not! 
-sage if (dn->inode != in) { @@ -1874,6 +1911,15 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol dout(10) << " had " << *in << dendl; } + // clear if dirtyscattered, since we're going to journal this + // but not until we _actually_ finish the import... + if (in->dirlock.is_updated()) + updated_scatterlocks.push_back(&in->dirlock); + + // put in autoscatter list? + // this is conservative, but safe. + if (in->dirlock.get_state() == LOCK_SCATTER) + mds->locker->note_autoscattered(&in->dirlock); // adjust replica list //assert(!in->is_replica(oldauth)); // not true on failed export @@ -1897,27 +1943,27 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol } -int Migrator::decode_import_dir(bufferlist& bl, +int Migrator::decode_import_dir(bufferlist::iterator& blp, int oldauth, CDir *import_root, EImportStart *le, - map& imported_client_map) + map& imported_client_map, + LogSegment *ls, + list& updated_scatterlocks) { - int off = 0; - // set up dir - CDirExport dstate; - off = dstate._decode(bl, off); - - CInode *diri = cache->get_inode(dstate.get_dirfrag().ino); + dirfrag_t df; + ::_decode_simple(df, blp); + + CInode *diri = cache->get_inode(df.ino); assert(diri); - CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, dstate.get_dirfrag().frag); + CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag); assert(dir); dout(7) << "decode_import_dir " << *dir << dendl; // assimilate state - dstate.update_dir( dir ); + dir->decode_import(blp); // mark (may already be marked from get_or_open_dir() above) if (!dir->is_auth()) @@ -1951,27 +1997,31 @@ int Migrator::decode_import_dir(bufferlist& bl, dout(15) << "doing contents" << dendl; // contents - long nden = dstate.get_nden(); + long nden; + ::_decode_simple(nden, blp); for (; nden>0; nden--) { num_imported++; // dentry string dname; - ::_decode(dname, bl, off); + ::_decode_simple(dname, blp); CDentry *dn = dir->lookup(dname); if (!dn) dn = 
dir->add_null_dentry(dname); - // decode state - dn->decode_import_state(bl, off, oldauth, mds->get_nodeid()); + dn->decode_import(blp, ls); + + dn->add_replica(oldauth, CDentry::EXPORT_NONCE); + if (dn->is_replica(mds->get_nodeid())) + dn->remove_replica(mds->get_nodeid()); + dout(15) << "decode_import_dir got " << *dn << dendl; // points to... char icode; - bl.copy(off, 1, &icode); - off++; + ::_decode_simple(icode, blp); if (icode == 'N') { // null dentry @@ -1983,8 +2033,8 @@ int Migrator::decode_import_dir(bufferlist& bl, // remote link inodeno_t ino; unsigned char d_type; - ::_decode(ino, bl, off); - ::_decode(d_type, bl, off); + ::_decode_simple(ino, blp); + ::_decode_simple(d_type, blp); if (dn->is_remote()) { assert(dn->get_remote_ino() == ino); } else { @@ -1993,7 +2043,7 @@ int Migrator::decode_import_dir(bufferlist& bl, } else if (icode == 'I') { // inode - decode_import_inode(dn, bl, off, oldauth, imported_client_map); + decode_import_inode(dn, blp, oldauth, imported_client_map, ls, updated_scatterlocks); } // add dentry to journal entry diff --git a/trunk/ceph/mds/Migrator.h b/trunk/ceph/mds/Migrator.h index 5af7a42ce38f6..07a8731868a92 100644 --- a/trunk/ceph/mds/Migrator.h +++ b/trunk/ceph/mds/Migrator.h @@ -78,7 +78,7 @@ protected: // export fun map export_state; map export_peer; - map > export_data; // only during EXPORTING state + //map > export_data; // only during EXPORTING state map > export_warning_ack_waiting; map > export_notify_ack_waiting; @@ -113,7 +113,7 @@ protected: map import_peer; map > import_bystanders; map > import_bound_ls; - + map > import_updated_scatterlocks; /* // -- hashing madness -- @@ -182,17 +182,18 @@ public: void export_dir_nicely(CDir *dir, int dest); void maybe_do_queued_export(); + void clear_export_queue() { + export_queue.clear(); + } - void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth, - map& exported_client_map, - utime_t now); - int encode_export_dir(list& dirstatelist, - class C_Contexts 
*fin, - CDir *basedir, + void encode_export_inode(CInode *in, bufferlist& enc_state, + map& exported_client_map); + void finish_export_inode(CInode *in, utime_t now, list& finished); + int encode_export_dir(bufferlist& exportbl, CDir *dir, - int newauth, map& exported_client_map, utime_t now); + void finish_export_dir(CDir *dir, list& finished, utime_t now); void add_export_finish_waiter(CDir *dir, Context *c) { export_finish_waiters[dir].push_back(c); @@ -221,13 +222,17 @@ public: void handle_export_dir(MExportDir *m); public: - void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth, - map& imported_client_map); - int decode_import_dir(bufferlist& bl, + void decode_import_inode(CDentry *dn, bufferlist::iterator& blp, int oldauth, + map& imported_client_map, + LogSegment *ls, + list& updated_scatterlocks); + int decode_import_dir(bufferlist::iterator& blp, int oldauth, CDir *import_root, EImportStart *le, - map& imported_client_map); + map& imported_client_map, + LogSegment *ls, + list& updated_scatterlocks); public: void import_reverse(CDir *dir); diff --git a/trunk/ceph/mds/ScatterLock.h b/trunk/ceph/mds/ScatterLock.h index 5f5085e59b82a..24a1361f82d68 100644 --- a/trunk/ceph/mds/ScatterLock.h +++ b/trunk/ceph/mds/ScatterLock.h @@ -65,12 +65,16 @@ inline const char *get_scatterlock_state_name(int s) { class ScatterLock : public SimpleLock { int num_wrlock; bool updated; + utime_t last_scatter; public: + xlist::item xlistitem_autoscattered; + ScatterLock(MDSCacheObject *o, int t, int wo) : SimpleLock(o, t, wo), num_wrlock(0), - updated(false) {} + updated(false), + xlistitem_autoscattered(this) {} int get_replica_state() { switch (state) { @@ -109,13 +113,15 @@ public: if (updated) { parent->put(MDSCacheObject::PIN_DIRTYSCATTERED); updated = false; + parent->clear_dirty_scattered(type); } } bool is_updated() { return updated; } + void set_last_scatter(utime_t t) { last_scatter = t; } + utime_t get_last_scatter() { return last_scatter; } + void 
replicate_relax() { - //if (state == LOCK_SYNC && !is_rdlocked()) - //state = LOCK_SCATTER; } void export_twiddle() { @@ -167,6 +173,8 @@ public: out << " x=" << get_xlocked_by(); if (is_wrlocked()) out << " wr=" << get_num_wrlocks(); + if (updated) + out << " updated"; out << ")"; } diff --git a/trunk/ceph/mds/Server.cc b/trunk/ceph/mds/Server.cc index 5d5b8dd1298a1..3be92948cf0b3 100644 --- a/trunk/ceph/mds/Server.cc +++ b/trunk/ceph/mds/Server.cc @@ -534,7 +534,7 @@ void Server::dispatch_client_request(MDRequest *mdr) } // we shouldn't be waiting on anyone. - assert(mdr->waiting_on_slave.empty()); + assert(mdr->more()->waiting_on_slave.empty()); switch (req->get_op()) { @@ -620,7 +620,7 @@ void Server::handle_slave_request(MMDSSlaveRequest *m) SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(), m->get_object_info()); MDRequest *mdr = mdcache->request_get(m->get_reqid()); - mdr->slaves.insert(from); + mdr->more()->slaves.insert(from); dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl; mdr->xlocks.insert(lock); mdr->locks.insert(lock); @@ -650,13 +650,6 @@ void Server::handle_slave_request(MMDSSlaveRequest *m) } break; - case MMDSSlaveRequest::OP_RENAMEGETINODEACK: - { - MDRequest *mdr = mdcache->request_get(m->get_reqid()); - handle_slave_rename_get_inode_ack(mdr, m); - } - break; - default: assert(0); } @@ -771,10 +764,6 @@ void Server::dispatch_slave_request(MDRequest *mdr) handle_slave_rename_prep(mdr); break; - case MMDSSlaveRequest::OP_RENAMEGETINODE: - handle_slave_rename_get_inode(mdr); - break; - case MMDSSlaveRequest::OP_FINISH: // finish off request. 
mdcache->request_finish(mdr); @@ -821,7 +810,7 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) !(*p)->can_auth_pin()) { // wait dout(10) << " waiting for authpinnable on " << **p << dendl; - (*p)->add_waiter(CDir::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); mdr->drop_local_auth_pins(); return; } @@ -874,36 +863,36 @@ void Server::handle_slave_auth_pin_ack(MDRequest *mdr, MMDSSlaveRequest *ack) assert(object); // we pinned it dout(10) << " remote has pinned " << *object << dendl; if (!mdr->is_auth_pinned(object)) - mdr->auth_pins.insert(object); + mdr->remote_auth_pins.insert(object); pinned.insert(object); } // removed auth pins? - set::iterator p = mdr->auth_pins.begin(); - while (p != mdr->auth_pins.end()) { + set::iterator p = mdr->remote_auth_pins.begin(); + while (p != mdr->remote_auth_pins.end()) { if ((*p)->authority().first == from && pinned.count(*p) == 0) { dout(10) << " remote has unpinned " << **p << dendl; set::iterator o = p; ++p; - mdr->auth_pins.erase(o); + mdr->remote_auth_pins.erase(o); } else { ++p; } } // note slave - mdr->slaves.insert(from); + mdr->more()->slaves.insert(from); // clear from waiting list - assert(mdr->waiting_on_slave.count(from)); - mdr->waiting_on_slave.erase(from); + assert(mdr->more()->waiting_on_slave.count(from)); + mdr->more()->waiting_on_slave.erase(from); // go again? 
- if (mdr->waiting_on_slave.empty()) + if (mdr->more()->waiting_on_slave.empty()) dispatch_client_request(mdr); else - dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << dendl; + dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; } @@ -1112,7 +1101,7 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, bool want_auth) if (want_auth) { if (ref->is_frozen()) { dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl; - ref->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); return 0; } mdr->auth_pin(ref); @@ -1156,7 +1145,7 @@ CDentry* Server::rdlock_path_xlock_dentry(MDRequest *mdr, bool okexist, bool mus // make sure we can auth_pin (or have already authpinned) dir if (dir->is_frozen()) { dout(7) << "waiting for !frozen/authpinnable on " << *dir << dendl; - dir->add_waiter(CInode::WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mdcache, mdr)); + dir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); return 0; } @@ -1282,7 +1271,7 @@ version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob) inode_t *pi = diri->project_inode(); if (dirpv) pi->version = dirpv; pi->ctime = pi->mtime = mdr->now; - blob->add_dir_context(diri->get_parent_dir()); + blob->add_dir_context(diri->get_parent_dn()->get_dir()); blob->add_primary_dentry(diri->get_parent_dn(), true, 0, pi); } else { // journal the mtime change anyway. 
@@ -1292,6 +1281,8 @@ version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob) dout(10) << "predirty_dn_diri (non-auth) ctime/mtime " << mdr->now << " on " << *diri << dendl; blob->add_dirtied_inode_mtime(diri->ino(), mdr->now); + assert(mdr->ls); + mdr->ls->dirty_inode_mtimes.push_back(&diri->xlist_dirty_inode_mtime); } return dirpv; @@ -1300,7 +1291,7 @@ version_t Server::predirty_dn_diri(MDRequest *mdr, CDentry *dn, EMetaBlob *blob) /** dirty_dn_diri * follow-up with actual dirty of inode after journal entry commits. */ -void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime) +void Server::dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv) { CInode *diri = dn->dir->inode; @@ -1309,13 +1300,13 @@ void Server::dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime) if (dirpv) { // we journaled and predirtied. assert(diri->is_auth() && !diri->is_root()); - diri->pop_and_dirty_projected_inode(); - dout(10) << "dirty_dn_diri ctime/mtime " << mtime << " v " << diri->inode.version << " on " << *diri << dendl; + diri->pop_and_dirty_projected_inode(mdr->ls); + dout(10) << "dirty_dn_diri ctime/mtime " << mdr->now << " v " << diri->inode.version << " on " << *diri << dendl; } else { // dirlock scatterlock will propagate the update. 
- diri->inode.ctime = diri->inode.mtime = mtime; + diri->inode.ctime = diri->inode.mtime = mdr->now; diri->dirlock.set_updated(); - dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mtime << " on " << *diri << dendl; + dout(10) << "dirty_dn_diri (non-dirty) ctime/mtime " << mdr->now << " on " << *diri << dendl; } } @@ -1382,7 +1373,7 @@ public: assert(r == 0); // apply - in->pop_and_dirty_projected_inode(); + in->pop_and_dirty_projected_inode(mdr->ls); mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); @@ -1423,13 +1414,13 @@ void Server::handle_client_utime(MDRequest *mdr) pi->ctime = g_clock.real_now(); // log + wait + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "utime"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); + mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); } @@ -1463,13 +1454,13 @@ void Server::handle_client_chmod(MDRequest *mdr) pi->ctime = g_clock.real_now(); // log + wait + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "chmod"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_primary_dentry(cur->parent, true, 0, pi); - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_inode_update_finish(mds, mdr, cur)); + mdlog->submit_entry(le, new C_MDS_inode_update_finish(mds, mdr, cur)); } @@ -1502,6 +1493,7 @@ void Server::handle_client_chown(MDRequest *mdr) pi->ctime = g_clock.real_now(); // log + wait + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "chown"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); @@ -1564,8 +1556,9 @@ void Server::handle_client_readdir(MDRequest *mdr) } // build dir contents - list inls; 
- list dnls; + bufferlist dirbl; + + DirStat::_encode(dirbl, dir, mds->get_nodeid()); int numfiles = 0; for (CDir::map_t::iterator it = dir->begin(); @@ -1598,29 +1591,14 @@ void Server::handle_client_readdir(MDRequest *mdr) } assert(in); - InodeStat *st; - if (in) { - dout(12) << "including inode " << *in << dendl; - - // add this item - // note: InodeStat makes note of whether inode data is readable. - st = new InodeStat(in, mds->get_nodeid()); - } else { - assert(0); - /* - assert(dn->is_remote()); - dout(12) << "including inode-less (remote) dentry " << *dn << dendl; - st = new InodeStat; - st->mask = STAT_MASK_INO | STAT_MASK_TYPE; - memset(&st->inode, 0, sizeof(st->inode)); - st->inode.ino = dn->get_remote_ino(); - st->inode.mode = DT_TO_MODE(dn->get_remote_d_type()); - */ - } + + assert(in); - dnls.push_back( it->first ); - inls.push_back(st); - numfiles++; + dout(12) << "including inode " << *in << dendl; + + // add this dentry + inodeinfo + ::_encode(it->first, dirbl); + InodeStat::_encode(dirbl, in); // touch it mdcache->lru.lru_touch(dn); @@ -1628,7 +1606,7 @@ void Server::handle_client_readdir(MDRequest *mdr) // yay, reply MClientReply *reply = new MClientReply(req); - reply->take_dir_items(dnls, inls, numfiles); + reply->take_dir_items(dirbl); dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << dendl; reply->set_result(0); @@ -1652,10 +1630,11 @@ class C_MDS_mknod_finish : public Context { CDentry *dn; CInode *newi; version_t dirpv; + version_t newdirpv; public: - C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_) : + C_MDS_mknod_finish(MDS *m, MDRequest *r, CDentry *d, CInode *ni, version_t dirpv_, version_t newdirpv_=0) : mds(m), mdr(r), dn(d), newi(ni), - dirpv(dirpv_) {} + dirpv(dirpv_), newdirpv(newdirpv_) {} void finish(int r) { assert(r == 0); @@ -1663,10 +1642,17 @@ public: dn->get_dir()->link_primary_inode(dn, newi); // dirty inode, dn, dir - newi->mark_dirty(newi->inode.version + 1); + 
newi->mark_dirty(newi->inode.version + 1, mdr->ls); + + // mkdir? + if (newdirpv) { + CDir *dir = newi->get_dirfrag(frag_t()); + assert(dir); + dir->mark_dirty(newdirpv, mdr->ls); + } // dir inode's mtime - mds->server->dirty_dn_diri(dn, dirpv, newi->inode.ctime); + mds->server->dirty_dn_diri(mdr, dn, dirpv); // hit pop mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); @@ -1699,6 +1685,7 @@ void Server::handle_client_mknod(MDRequest *mdr) newi->inode.version = dn->pre_dirty() - 1; // prepare finisher + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "mknod"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); @@ -1707,8 +1694,7 @@ void Server::handle_client_mknod(MDRequest *mdr) le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); + mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); } @@ -1737,11 +1723,12 @@ void Server::handle_client_mkdir(MDRequest *mdr) // ...and that new dir is empty. CDir *newdir = newi->get_or_open_dirfrag(mds->mdcache, frag_t()); newdir->mark_complete(); - newdir->mark_dirty(newdir->pre_dirty()); + version_t newdirpv = newdir->pre_dirty(); //if (mds->logger) mds->logger->inc("mkdir"); // prepare finisher + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "mkdir"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); @@ -1751,8 +1738,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) le->metablob.add_dir(newdir, true, true); // dirty AND complete // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); + mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv, newdirpv)); /* old export heuristic. 
pbly need to reimplement this at some point. if ( @@ -1780,6 +1766,7 @@ void Server::handle_client_symlink(MDRequest *mdr) if (!dn) return; mdr->now = g_clock.real_now(); + CInode *newi = prepare_new_inode(mdr, dn->dir); assert(newi); @@ -1790,6 +1777,7 @@ void Server::handle_client_symlink(MDRequest *mdr) newi->inode.version = dn->pre_dirty() - 1; // prepare finisher + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "symlink"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); @@ -1798,8 +1786,7 @@ void Server::handle_client_symlink(MDRequest *mdr) le->metablob.add_primary_dentry(dn, true, newi, &newi->inode); // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); + mdlog->submit_entry(le, new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); } @@ -1927,6 +1914,8 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) { dout(10) << "_link_local " << *dn << " to " << *targeti << dendl; + mdr->ls = mdlog->get_current_segment(); + // predirty NEW dentry version_t dnpv = dn->pre_dirty(); version_t tipv = targeti->pre_dirty(); @@ -1946,8 +1935,7 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) le->metablob.add_dir_context(targeti->get_parent_dir()); le->metablob.add_primary_dentry(targeti->parent, true, targeti, pi); // update old primary - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv)); + mdlog->submit_entry(le, new C_MDS_link_local_finish(mds, mdr, dn, targeti, dnpv, tipv, dirpv)); } void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, @@ -1957,13 +1945,13 @@ void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, // link and unlock the NEW dentry dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); - dn->mark_dirty(dnpv); + 
dn->mark_dirty(dnpv, mdr->ls); // target inode - targeti->pop_and_dirty_projected_inode(); + targeti->pop_and_dirty_projected_inode(mdr->ls); // new dentry dir mtime - dirty_dn_diri(dn, dirpv, mdr->now); + dirty_dn_diri(mdr, dn, dirpv); // bump target popularity mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); @@ -2001,7 +1989,7 @@ void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) // 1. send LinkPrepare to dest (journal nlink++ prepare) int linkauth = targeti->authority().first; - if (mdr->witnessed.count(linkauth) == 0) { + if (mdr->more()->witnessed.count(linkauth) == 0) { dout(10) << " targeti auth must prepare nlink++" << dendl; MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREP); @@ -2009,8 +1997,8 @@ void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) req->now = mdr->now; mds->send_message_mds(req, linkauth, MDS_PORT_SERVER); - assert(mdr->waiting_on_slave.count(linkauth) == 0); - mdr->waiting_on_slave.insert(linkauth); + assert(mdr->more()->waiting_on_slave.count(linkauth) == 0); + mdr->more()->waiting_on_slave.insert(linkauth); return; } dout(10) << " targeti auth has prepared nlink++" << dendl; @@ -2020,6 +2008,7 @@ void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) dn->pre_dirty(); // add to event + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "link_remote"); le->metablob.add_client_req(mdr->reqid); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime @@ -2030,8 +2019,7 @@ void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) mdr->committing = true; // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv)); + mdlog->submit_entry(le, new C_MDS_link_remote_finish(mds, mdr, dn, targeti, dirpv)); } void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, @@ -2041,10 +2029,10 @@ void 
Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, // link the new dentry dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); - dn->mark_dirty(dpv); + dn->mark_dirty(dpv, mdr->ls); // dir inode's mtime - dirty_dn_diri(dn, dirpv, mdr->now); + dirty_dn_diri(mdr, dn, dirpv); // bump target popularity mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); @@ -2100,9 +2088,16 @@ void Server::handle_slave_link_prep(MDRequest *mdr) } } + // journal it + mdr->ls = mdlog->get_current_segment(); + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); inode_t *pi = dn->inode->project_inode(); + // rollback case + le->rollback.add_dir_context(targeti->get_parent_dir()); + le->rollback.add_primary_dentry(dn, true, targeti, pi); // update old primary + // update journaled target inode bool inc; if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) { @@ -2118,11 +2113,11 @@ void Server::handle_slave_link_prep(MDRequest *mdr) dout(10) << " projected inode " << pi << " v " << pi->version << dendl; - // journal it - ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - le->metablob.add_dir_context(targeti->get_parent_dir()); - le->metablob.add_primary_dentry(dn, true, targeti, pi); // update old primary - mds->mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc)); + // commit case + le->commit.add_dir_context(targeti->get_parent_dir()); + le->commit.add_primary_dentry(dn, true, targeti, pi); // update old primary + + mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc)); } class C_MDS_SlaveLinkCommit : public Context { @@ -2150,7 +2145,7 @@ void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_cti version_t old_version = targeti->inode.version; // update the target - 
targeti->pop_and_dirty_projected_inode(); + targeti->pop_and_dirty_projected_inode(mdr->ls); // hit pop mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); @@ -2160,7 +2155,7 @@ void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_cti mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); // set up commit waiter - mdr->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc); + mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti, old_ctime, old_version, inc); // done. delete mdr->slave_request; @@ -2195,7 +2190,7 @@ void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti, targeti->inode.nlink--; } - mds->mdlog->submit_entry(le); + mdlog->submit_entry(le); } @@ -2207,17 +2202,17 @@ void Server::handle_slave_link_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) int from = m->get_source().num(); // note slave - mdr->slaves.insert(from); + mdr->more()->slaves.insert(from); // witnessed! - assert(mdr->witnessed.count(from) == 0); - mdr->witnessed.insert(from); + assert(mdr->more()->witnessed.count(from) == 0); + mdr->more()->witnessed.insert(from); // remove from waiting list - assert(mdr->waiting_on_slave.count(from)); - mdr->waiting_on_slave.erase(from); + assert(mdr->more()->waiting_on_slave.count(from)); + mdr->more()->waiting_on_slave.erase(from); - assert(mdr->waiting_on_slave.empty()); + assert(mdr->more()->waiting_on_slave.empty()); dispatch_client_request(mdr); // go again! 
} @@ -2324,12 +2319,12 @@ void Server::handle_client_unlink(MDRequest *mdr) dout(10) << " straydn is " << *straydn << dendl; assert(straydn->is_null()); - if (!mdr->dst_reanchor_atid && + if (!mdr->more()->dst_reanchor_atid && dn->inode->is_anchored()) { dout(10) << "reanchoring to stray " << *dn->inode << dendl; vector trace; straydn->make_anchor_trace(trace, dn->inode); - mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->dst_reanchor_atid, + mds->anchorclient->prepare_update(dn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid, new C_MDS_RetryRequest(mdcache, mdr)); return; } @@ -2368,6 +2363,8 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) dout(10) << "_unlink_local " << *dn << dendl; // ok, let's do it. + mdr->ls = mdlog->get_current_segment(); + // prepare log entry EUpdate *le = new EUpdate(mdlog, "unlink_local"); le->metablob.add_client_req(mdr->reqid); @@ -2400,14 +2397,13 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) le->metablob.add_dir_context(dn->get_dir()); le->metablob.add_null_dentry(dn, true); - if (mdr->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->dst_reanchor_atid); + if (mdr->more()->dst_reanchor_atid) + le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid); // log + wait journal_opens(); // journal pending opens, just in case - mdlog->submit_entry(le); - mdlog->wait_for_sync(new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, - dirpv)); + mdlog->submit_entry(le, new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, + dirpv)); } void Server::_unlink_local_finish(MDRequest *mdr, @@ -2427,11 +2423,11 @@ void Server::_unlink_local_finish(MDRequest *mdr, } // nlink--, dirty old dentry - in->pop_and_dirty_projected_inode(); - dn->mark_dirty(dnpv); + in->pop_and_dirty_projected_inode(mdr->ls); + dn->mark_dirty(dnpv, mdr->ls); // dir inode's mtime - dirty_dn_diri(dn, dirpv, mdr->now); + dirty_dn_diri(mdr, dn, dirpv); // share unlink news with 
replicas for (map::iterator it = dn->replicas_begin(); @@ -2448,8 +2444,8 @@ void Server::_unlink_local_finish(MDRequest *mdr, } // commit anchor update? - if (mdr->dst_reanchor_atid) - mds->anchorclient->commit(mdr->dst_reanchor_atid); + if (mdr->more()->dst_reanchor_atid) + mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); // bump pop //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); @@ -2490,7 +2486,7 @@ void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) // 1. send LinkPrepare to dest (journal nlink-- prepare) int inauth = dn->inode->authority().first; - if (mdr->witnessed.count(inauth) == 0) { + if (mdr->more()->witnessed.count(inauth) == 0) { dout(10) << " inode auth must prepare nlink--" << dendl; MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_UNLINKPREP); @@ -2498,14 +2494,15 @@ void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) req->now = mdr->now; mds->send_message_mds(req, inauth, MDS_PORT_SERVER); - assert(mdr->waiting_on_slave.count(inauth) == 0); - mdr->waiting_on_slave.insert(inauth); + assert(mdr->more()->waiting_on_slave.count(inauth) == 0); + mdr->more()->waiting_on_slave.insert(inauth); return; } dout(10) << " inode auth has prepared nlink--" << dendl; // ok, let's do it. 
// prepare log entry + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "unlink_remote"); le->metablob.add_client_req(mdr->reqid); @@ -2515,8 +2512,8 @@ void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) le->metablob.add_dir_context(dn->get_dir()); le->metablob.add_null_dentry(dn, true); - if (mdr->dst_reanchor_atid) - le->metablob.add_anchor_transaction(mdr->dst_reanchor_atid); + if (mdr->more()->dst_reanchor_atid) + le->metablob.add_anchor_transaction(mdr->more()->dst_reanchor_atid); // finisher C_MDS_unlink_remote_finish *fin = new C_MDS_unlink_remote_finish(mds, mdr, dn, dirpv); @@ -2527,8 +2524,7 @@ void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) mdr->committing = true; // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->submit_entry(le, fin); } void Server::_unlink_remote_finish(MDRequest *mdr, @@ -2539,10 +2535,10 @@ void Server::_unlink_remote_finish(MDRequest *mdr, // unlink main dentry dn->dir->unlink_inode(dn); - dn->mark_dirty(dnpv); // dirty old dentry + dn->mark_dirty(dnpv, mdr->ls); // dirty old dentry // dir inode's mtime - dirty_dn_diri(dn, dirpv, mdr->now); + dirty_dn_diri(mdr, dn, dirpv); // share unlink news with replicas for (map::iterator it = dn->replicas_begin(); @@ -2554,8 +2550,8 @@ void Server::_unlink_remote_finish(MDRequest *mdr, } // commit anchor update? - if (mdr->dst_reanchor_atid) - mds->anchorclient->commit(mdr->dst_reanchor_atid); + if (mdr->more()->dst_reanchor_atid) + mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); @@ -2755,7 +2751,11 @@ void Server::handle_client_rename(MDRequest *mdr) rdlocks.insert(&srctrace[i]->lock); xlocks.insert(&srcdn->lock); wrlocks.insert(&srcdn->dir->inode->dirlock); - rdlocks.insert(&srcdn->dir->inode->dirfragtreelock); // rd lock on srci dirfragtree. + /* + * no, this causes problems if the dftlock is scattered... + * and what was i thinking anyway? 
+ * rdlocks.insert(&srcdn->dir->inode->dirfragtreelock); // rd lock on srci dirfragtree. + */ // rdlock destdir path, xlock dest dentry for (int i=0; i<(int)desttrace.size(); i++) @@ -2763,6 +2763,13 @@ void Server::handle_client_rename(MDRequest *mdr) xlocks.insert(&destdn->lock); wrlocks.insert(&destdn->dir->inode->dirlock); + // xlock versionlock on srci if remote? + // this ensures it gets safely remotely auth_pinned, avoiding deadlock; + // strictly speaking, having the slave node freeze the inode is + // otherwise sufficient for avoiding conflicts with inode locks, etc. + if (!srcdn->is_auth() && srcdn->is_primary()) + xlocks.insert(&srcdn->inode->versionlock); + // xlock oldin (for nlink--) if (oldin) xlocks.insert(&oldin->linklock); @@ -2788,7 +2795,7 @@ void Server::handle_client_rename(MDRequest *mdr) ++p) { CDir *dir = srci->get_dirfrag(*p); if (!dir) { - dout(10) << " opening " << *dir << dendl; + dout(10) << " opening " << *p << " under " << *srci << dendl; mdcache->open_remote_dirfrag(srci, *p, new C_MDS_RetryRequest(mdcache, mdr)); return; } @@ -2817,68 +2824,40 @@ void Server::handle_client_rename(MDRequest *mdr) * (currently, it can ignore rename effects, because the resolve * stage will sort them out.) */ - set witnesses = mdr->extra_witnesses; + set witnesses = mdr->more()->extra_witnesses; if (srcdn->is_auth()) srcdn->list_replicas(witnesses); else witnesses.insert(srcdn->authority().first); destdn->list_replicas(witnesses); + dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl; + // do srcdn auth last + int last = -1; + if (!srcdn->is_auth()) + last = srcdn->authority().first; + for (set::iterator p = witnesses.begin(); p != witnesses.end(); ++p) { - if (mdr->witnessed.count(*p)) { + if (*p == last) continue; // do it last! 
+ if (mdr->more()->witnessed.count(*p)) { dout(10) << " already witnessed by mds" << *p << dendl; + } else if (mdr->more()->waiting_on_slave.count(*p)) { + dout(10) << " already waiting on witness mds" << *p << dendl; } else { - dout(10) << " not yet witnessed by mds" << *p << ", sending prepare" << dendl; - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP); - srcdn->make_path(req->srcdnpath); - destdn->make_path(req->destdnpath); - req->now = mdr->now; - - if (straydn) { - CInodeDiscover *indis = straydn->dir->inode->replicate_to(*p); - CDirDiscover *dirdis = straydn->dir->replicate_to(*p); - CDentryDiscover *dndis = straydn->replicate_to(*p); - indis->_encode(req->stray); - dirdis->_encode(req->stray); - dndis->_encode(req->stray); - delete indis; - delete dirdis; - delete dndis; - } - - mds->send_message_mds(req, *p, MDS_PORT_SERVER); - - assert(mdr->waiting_on_slave.count(*p) == 0); - mdr->waiting_on_slave.insert(*p); + _rename_prepare_witness(mdr, *p, srcdn, destdn, straydn); } } - if (!mdr->waiting_on_slave.empty()) + if (!mdr->more()->waiting_on_slave.empty()) return; // we're waiting for a witness. - // -- inode migration? -- - if (!srcdn->is_auth() && - srcdn->is_primary()) { - if (mdr->inode_import.length() == 0) { - // get inode - int auth = srcdn->authority().first; - dout(10) << " requesting inode export from srcdn auth mds" << auth << dendl; - MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEGETINODE); - srcdn->make_path(req->srcdnpath); - mds->send_message_mds(req, auth, MDS_PORT_SERVER); - - assert(mdr->waiting_on_slave.count(auth) == 0); - mdr->waiting_on_slave.insert(auth); - return; - } else { - dout(10) << " already (just!) 
got inode export from srcdn auth" << dendl; - /*int off = 0; - mdcache->migrator->decode_import_inode(destdn, mdr->inode_import, off, - srcdn->authority().first); - srcdn->inode->force_auth.first = srcdn->authority().first; - */ - } + if (last >= 0 && + mdr->more()->witnessed.count(last) == 0 && + mdr->more()->waiting_on_slave.count(last) == 0) { + dout(10) << " preparing last witness (srcdn auth)" << dendl; + _rename_prepare_witness(mdr, last, srcdn, destdn, straydn); + return; } // -- prepare anchor updates -- @@ -2890,18 +2869,18 @@ void Server::handle_client_rename(MDRequest *mdr) if (srcdn->is_primary() && srcdn->inode->is_anchored() && srcdn->dir != destdn->dir && - !mdr->src_reanchor_atid) { + !mdr->more()->src_reanchor_atid) { dout(10) << "reanchoring src->dst " << *srcdn->inode << dendl; vector trace; destdn->make_anchor_trace(trace, srcdn->inode); anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &mdr->src_reanchor_atid, + mds->anchorclient->prepare_update(srcdn->inode->ino(), trace, &mdr->more()->src_reanchor_atid, anchorgather->new_sub()); } if (destdn->is_primary() && destdn->inode->is_anchored() && - !mdr->dst_reanchor_atid) { + !mdr->more()->dst_reanchor_atid) { dout(10) << "reanchoring dst->stray " << *destdn->inode << dendl; assert(straydn); @@ -2910,7 +2889,7 @@ void Server::handle_client_rename(MDRequest *mdr) if (!anchorgather) anchorgather = new C_Gather(new C_MDS_RetryRequest(mdcache, mdr)); - mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &mdr->dst_reanchor_atid, + mds->anchorclient->prepare_update(destdn->inode->ino(), trace, &mdr->more()->dst_reanchor_atid, anchorgather->new_sub()); } @@ -2919,6 +2898,7 @@ void Server::handle_client_rename(MDRequest *mdr) } // -- prepare journal entry -- + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "rename"); le->metablob.add_client_req(mdr->reqid); @@ -2933,8 +2913,7 @@ void 
Server::handle_client_rename(MDRequest *mdr) mdr->committing = true; // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->submit_entry(le, fin); } @@ -2946,8 +2925,10 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe _rename_apply(mdr, srcdn, destdn, straydn); // commit anchor updates? - if (mdr->src_reanchor_atid) mds->anchorclient->commit(mdr->src_reanchor_atid); - if (mdr->dst_reanchor_atid) mds->anchorclient->commit(mdr->dst_reanchor_atid); + if (mdr->more()->src_reanchor_atid) + mds->anchorclient->commit(mdr->more()->src_reanchor_atid, mdr->ls); + if (mdr->more()->dst_reanchor_atid) + mds->anchorclient->commit(mdr->more()->dst_reanchor_atid, mdr->ls); // bump popularity //if (srcdn->is_auth()) @@ -2970,6 +2951,36 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe // helpers +void Server::_rename_prepare_witness(MDRequest *mdr, int who, CDentry *srcdn, CDentry *destdn, CDentry *straydn) +{ + dout(10) << "_rename_prepare_witness mds" << who << dendl; + MMDSSlaveRequest *req = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREP); + srcdn->make_path(req->srcdnpath); + destdn->make_path(req->destdnpath); + req->now = mdr->now; + + if (straydn) { + CInodeDiscover *indis = straydn->dir->inode->replicate_to(who); + CDirDiscover *dirdis = straydn->dir->replicate_to(who); + CDentryDiscover *dndis = straydn->replicate_to(who); + indis->_encode(req->stray); + dirdis->_encode(req->stray); + dndis->_encode(req->stray); + delete indis; + delete dirdis; + delete dndis; + } + + // srcdn auth will verify our current witness list is sufficient + req->witnesses = mdr->more()->witnessed; + + mds->send_message_mds(req, who, MDS_PORT_SERVER); + + assert(mdr->more()->waiting_on_slave.count(who) == 0); + mdr->more()->waiting_on_slave.insert(who); +} + + void Server::_rename_prepare(MDRequest *mdr, EMetaBlob *metablob, CDentry *srcdn, CDentry *destdn, CDentry *straydn) @@ -2981,9 
+2992,9 @@ void Server::_rename_prepare(MDRequest *mdr, (srcdn->is_primary() || destdn->is_primary())); if (mdr->is_master()) { - mdr->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob); + mdr->more()->pvmap[destdn->dir->inode] = predirty_dn_diri(mdr, destdn, metablob); if (destdn->dir != srcdn->dir) - mdr->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); + mdr->more()->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); } inode_t *ji = 0; // journaled inode getting nlink-- @@ -2995,13 +3006,13 @@ void Server::_rename_prepare(MDRequest *mdr, // destdn -> primary metablob->add_dir_context(destdn->dir); if (destdn->is_auth()) - ipv = mdr->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version); + ipv = mdr->more()->pvmap[destdn] = destdn->pre_dirty(destdn->inode->inode.version); ji = metablob->add_primary_dentry(destdn, true, destdn->inode); // do src dentry metablob->add_dir_context(srcdn->dir); if (srcdn->is_auth()) - mdr->pvmap[srcdn] = srcdn->pre_dirty(); + mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); metablob->add_null_dentry(srcdn, true); } else { @@ -3013,7 +3024,7 @@ void Server::_rename_prepare(MDRequest *mdr, // link-- inode, move to stray dir. 
metablob->add_dir_context(straydn->dir); if (straydn->is_auth()) - ipv = mdr->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version); + ipv = mdr->more()->pvmap[straydn] = straydn->pre_dirty(destdn->inode->inode.version); ji = metablob->add_primary_dentry(straydn, true, destdn->inode); } else if (destdn->is_remote()) { @@ -3021,7 +3032,7 @@ void Server::_rename_prepare(MDRequest *mdr, // nlink-- targeti metablob->add_dir_context(destdn->inode->get_parent_dir()); if (destdn->inode->is_auth()) - ipv = mdr->pvmap[destdn->inode] = destdn->inode->pre_dirty(); + ipv = mdr->more()->pvmap[destdn->inode] = destdn->inode->pre_dirty(); ji = metablob->add_primary_dentry(destdn->inode->parent, true, destdn->inode); // update primary dout(10) << "remote targeti (nlink--) is " << *destdn->inode << dendl; } @@ -3038,8 +3049,8 @@ void Server::_rename_prepare(MDRequest *mdr, if (srcdn->is_auth()) siv = srcdn->inode->get_projected_version(); else - siv = mdr->inode_import_v; - mdr->pvmap[destdn] = destdn->pre_dirty(siv+1); + siv = mdr->more()->inode_import_v; + mdr->more()->pvmap[destdn] = destdn->pre_dirty(siv+1); } metablob->add_primary_dentry(destdn, true, srcdn->inode); @@ -3047,14 +3058,14 @@ void Server::_rename_prepare(MDRequest *mdr, assert(srcdn->is_remote()); dout(10) << "src is a remote dentry" << dendl; if (destdn->is_auth()) - mdr->pvmap[destdn] = destdn->pre_dirty(); + mdr->more()->pvmap[destdn] = destdn->pre_dirty(); metablob->add_remote_dentry(destdn, true, srcdn->get_remote_ino()); } // remove src dentry metablob->add_dir_context(srcdn->dir); if (srcdn->is_auth()) - mdr->pvmap[srcdn] = srcdn->pre_dirty(); + mdr->more()->pvmap[srcdn] = srcdn->pre_dirty(); metablob->add_null_dentry(srcdn, true); // new subtree? @@ -3078,17 +3089,17 @@ void Server::_rename_prepare(MDRequest *mdr, } // anchor updates? 
- if (mdr->src_reanchor_atid) - metablob->add_anchor_transaction(mdr->src_reanchor_atid); - if (mdr->dst_reanchor_atid) - metablob->add_anchor_transaction(mdr->dst_reanchor_atid); + if (mdr->more()->src_reanchor_atid) + metablob->add_anchor_transaction(mdr->more()->src_reanchor_atid); + if (mdr->more()->dst_reanchor_atid) + metablob->add_anchor_transaction(mdr->more()->dst_reanchor_atid); } void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn) { dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl; - dout(10) << " pvs " << mdr->pvmap << dendl; + dout(10) << " pvs " << mdr->more()->pvmap << dendl; CInode *oldin = destdn->inode; @@ -3098,9 +3109,9 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen // dir mtimes if (mdr->is_master()) { - dirty_dn_diri(destdn, mdr->pvmap[destdn->dir->inode], mdr->now); + dirty_dn_diri(mdr, destdn, mdr->more()->pvmap[destdn->dir->inode]); if (destdn->dir != srcdn->dir) - dirty_dn_diri(srcdn, mdr->pvmap[srcdn->dir->inode], mdr->now); + dirty_dn_diri(mdr, srcdn, mdr->more()->pvmap[srcdn->dir->inode]); } if (linkmerge) { @@ -3111,12 +3122,12 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen destdn->inode->inode.nlink--; destdn->inode->inode.ctime = mdr->now; if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->pvmap[destdn]); + destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); // unlink srcdn srcdn->dir->unlink_inode(srcdn); if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->pvmap[srcdn]); + srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); } else { dout(10) << "merging primary onto remote link" << dendl; assert(srcdn->is_primary()); @@ -3130,11 +3141,11 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen destdn->inode->inode.nlink--; destdn->inode->inode.ctime = mdr->now; if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->pvmap[destdn]); 
+ destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); // mark src dirty if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->pvmap[srcdn]); + srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); } } else { @@ -3154,14 +3165,14 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen oldin->inode.nlink--; oldin->inode.ctime = mdr->now; if (oldin->is_auth()) - oldin->pop_and_dirty_projected_inode(); + oldin->pop_and_dirty_projected_inode(mdr->ls); } else if (oldin) { // nlink-- remote. destdn was remote. oldin->inode.nlink--; oldin->inode.ctime = mdr->now; if (oldin->is_auth()) - oldin->pop_and_dirty_projected_inode(); + oldin->pop_and_dirty_projected_inode(mdr->ls); } CInode *in = srcdn->inode; @@ -3172,7 +3183,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode)); destdn->link_remote(in); if (destdn->is_auth()) - destdn->mark_dirty(mdr->pvmap[destdn]); + destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); } else { // srcdn was primary. srcdn->dir->unlink_inode(srcdn); @@ -3180,20 +3191,24 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen // srcdn inode import? 
if (!srcdn->is_auth() && destdn->is_auth()) { - assert(mdr->inode_import.length() > 0); - int off = 0; + assert(mdr->more()->inode_import.length() > 0); + bufferlist::iterator blp = mdr->more()->inode_import.begin(); map imported_client_map; - ::_decode(imported_client_map, mdr->inode_import, off); - mdcache->migrator->decode_import_inode(destdn, mdr->inode_import, off, + list updated_scatterlocks; // we clear_updated explicitly below + ::_decode_simple(imported_client_map, blp); + mdcache->migrator->decode_import_inode(destdn, blp, srcdn->authority().first, - imported_client_map); + imported_client_map, + mdr->ls, + updated_scatterlocks); + destdn->inode->dirlock.clear_updated(); } if (destdn->inode->is_auth()) - destdn->inode->mark_dirty(mdr->pvmap[destdn]); + destdn->inode->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls); } if (srcdn->is_auth()) - srcdn->mark_dirty(mdr->pvmap[srcdn]); + srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls); } // update subtree map? @@ -3285,17 +3300,82 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) mdr->now = mdr->slave_request->now; + // set up commit waiter (early, to clean up any freezing etc we do) + if (!mdr->more()->slave_commit) + mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn); + + // am i srcdn auth? + if (srcdn->is_auth()) { + if (srcdn->is_primary() && + !srcdn->inode->is_freezing_inode() && + !srcdn->inode->is_frozen_inode()) { + // srci auth. + // set ambiguous auth. + srcdn->inode->state_set(CInode::STATE_AMBIGUOUSAUTH); + + // freeze? 
+ // we need this to + // - avoid conflicting lock state changes + // - avoid concurrent updates to the inode + // (this could also be accomplished with the versionlock) + int allowance = 1; // for the versionlock and possible linklock xlock (both are tied to mdr) + dout(10) << " freezing srci " << *srcdn->inode << " with allowance " << allowance << dendl; + if (!srcdn->inode->freeze_inode(allowance)) { + srcdn->inode->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr)); + return; + } + } + + // is witness list sufficient? + set srcdnrep; + srcdn->list_replicas(srcdnrep); + for (set::iterator p = srcdnrep.begin(); + p != srcdnrep.end(); + ++p) { + if (*p == mdr->slave_to_mds || + mdr->slave_request->witnesses.count(*p)) continue; + dout(10) << " witness list insufficient; providing srcdn replica list" << dendl; + MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); + reply->witnesses.swap(srcdnrep); + mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); + delete mdr->slave_request; + mdr->slave_request = 0; + return; + } + dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl; + } + // journal it? if (srcdn->is_auth() || - destdn->inode->is_auth() || + (destdn->inode && destdn->inode->is_auth()) || srcdn->inode->is_any_caps()) { // journal. 
+ mdr->ls = mdlog->get_current_segment(); ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); - _rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn); - mds->mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn)); + + // rollback case + if (destdn->inode && destdn->inode->is_auth()) { + assert(destdn->is_remote()); + le->rollback.add_dir_context(destdn->dir); + le->rollback.add_dentry(destdn, true); + } + if (srcdn->is_auth() || + (srcdn->inode && srcdn->inode->is_auth())) { + le->rollback.add_dir_context(srcdn->dir); + le->rollback.add_dentry(srcdn, true); + } + + // commit case + _rename_prepare(mdr, &le->commit, srcdn, destdn, straydn); + + mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn)); } else { // don't journal. dout(10) << "not journaling, i'm not auth for anything, and srci isn't open" << dendl; + + // prepare anyway; this may twiddle dir_auth + EMetaBlob blah; + _rename_prepare(mdr, &blah, srcdn, destdn, straydn); _logged_slave_rename(mdr, srcdn, destdn, straydn); } } @@ -3305,25 +3385,38 @@ void Server::_logged_slave_rename(MDRequest *mdr, { dout(10) << "_logged_slave_rename " << *mdr << dendl; - // ack + // prepare ack MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEPREPACK); - if (srcdn->is_auth()) { - // share the replica list, so that they can all witness the rename. - srcdn->list_replicas(reply->srcdn_replicas); + + // export srci? + if (srcdn->is_auth() && srcdn->is_primary()) { + list finished; + map exported_client_map; + bufferlist inodebl; + mdcache->migrator->encode_export_inode(srcdn->inode, inodebl, + exported_client_map); + mdcache->migrator->finish_export_inode(srcdn->inode, mdr->now, finished); + mds->queue_waiters(finished); // this includes SINGLEAUTH waiters. 
+ ::_encode(exported_client_map, reply->inode_export); + reply->inode_export.claim_append(inodebl); + reply->inode_export_v = srcdn->inode->inode.version; + + // remove mdr auth pin + mdr->auth_unpin(srcdn->inode); + assert(!srcdn->inode->is_auth_pinned()); + + dout(10) << " exported srci " << *srcdn->inode << dendl; + } - // note srcdn, we'll get asked for inode momentarily - mdr->srcdn = srcdn; - } + // apply + _rename_apply(mdr, srcdn, destdn, straydn); mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); - // set up commit waiter - mdr->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn); - // bump popularity //if (srcdn->is_auth()) //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); - if (destdn->inode->is_auth()) + if (destdn->inode && destdn->inode->is_auth()) mds->balancer->hit_inode(mdr->now, destdn->inode, META_POP_IWR); // done. @@ -3336,110 +3429,134 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, { dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl; + // unfreeze+singleauth inode + // hmm, do i really need to delay this? 
+ if (srcdn->is_auth() && destdn->is_primary()) { + dout(10) << " unfreezing exported inode " << *destdn->inode << dendl; + list finished; + + // singleauth + assert(destdn->inode->state_test(CInode::STATE_AMBIGUOUSAUTH)); + destdn->inode->state_clear(CInode::STATE_AMBIGUOUSAUTH); + destdn->inode->take_waiting(CInode::WAIT_SINGLEAUTH, finished); + + // unfreeze + assert(destdn->inode->is_frozen_inode() || + destdn->inode->is_freezing_inode()); + destdn->inode->unfreeze_inode(finished); + + mds->queue_waiters(finished); + } + + ESlaveUpdate *le; if (r == 0) { - // commit - _rename_apply(mdr, srcdn, destdn, straydn); - // write a commit to the journal le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); + } else { // abort le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); - } - mds->mdlog->submit_entry(le); -} -void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m) -{ - dout(10) << "handle_slave_rename_prep_ack " << *mdr - << " witnessed by " << m->get_source() - << " " << *m << dendl; - int from = m->get_source().num(); + // -- rollback in memory -- - // note slave - mdr->slaves.insert(from); + if (mdr->more()->was_link_merge) { + // link merge + CInode *in = destdn->inode; + in->inode.nlink++; + if (mdr->more()->destdn_was_remote_inode) { + destdn->dir->unlink_inode(destdn); + srcdn->dir->link_primary_inode(srcdn, in); + destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode)); + } else { + srcdn->dir->link_remote_inode(srcdn, in->ino(), MODE_TO_DT(in->inode.mode)); + } + } else { + // normal - // witnessed! 
- assert(mdr->witnessed.count(from) == 0); - mdr->witnessed.insert(from); + // revert srcdn + if (destdn->is_remote()) { + srcdn->dir->link_remote_inode(srcdn, destdn->inode->ino(), MODE_TO_DT(destdn->inode->inode.mode)); + destdn->dir->unlink_inode(destdn); + } else { + // renamed a primary + CInode *in = destdn->inode; + destdn->dir->unlink_inode(destdn); + srcdn->dir->link_primary_inode(srcdn, in); + } + + // revert destdn + if (mdr->more()->destdn_was_remote_inode) { + destdn->dir->link_remote_inode(destdn, + mdr->more()->destdn_was_remote_inode->ino(), + MODE_TO_DT(mdr->more()->destdn_was_remote_inode->inode.mode)); + mdr->more()->destdn_was_remote_inode->inode.nlink++; + } else if (straydn && straydn->inode) { + CInode *in = straydn->inode; + straydn->dir->unlink_inode(straydn); + destdn->dir->link_primary_inode(destdn, in); + straydn->dir->remove_dentry(straydn); + } + } + // FIXME: reverse srci export? + + dout(-10) << " srcdn back to " << *srcdn << dendl; + dout(-10) << " srci back to " << *srcdn->inode << dendl; + dout(-10) << " destdn back to " << *destdn << dendl; + if (destdn->inode) dout(-10) << " desti back to " << *destdn->inode << dendl; + + // *** WRITE ME *** + assert(0); - // add extra witnesses? - if (!m->srcdn_replicas.empty()) { - dout(10) << " extra witnesses (srcdn replicas) are " << m->srcdn_replicas << dendl; - mdr->extra_witnesses = m->srcdn_replicas; - mdr->extra_witnesses.erase(mds->get_nodeid()); // not me! } - // remove from waiting list - assert(mdr->waiting_on_slave.count(from)); - mdr->waiting_on_slave.erase(from); + - if (mdr->waiting_on_slave.empty()) - dispatch_client_request(mdr); // go again! 
- else - dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << dendl; + mdlog->submit_entry(le); } - - -void Server::handle_slave_rename_get_inode(MDRequest *mdr) +void Server::handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *ack) { - dout(10) << "handle_slave_rename_get_inode " << *mdr << dendl; - - assert(mdr->srcdn); - assert(mdr->srcdn->is_auth()); - assert(mdr->srcdn->is_primary()); - - // reply - MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_RENAMEGETINODEACK); - dout(10) << " replying with inode export info " << *mdr->srcdn->inode << dendl; - - map exported_client_map; - bufferlist inodebl; - mdcache->migrator->encode_export_inode(mdr->srcdn->inode, inodebl, mdr->slave_to_mds, - exported_client_map, - mdr->now); - ::_encode(exported_client_map, reply->inode_export); - reply->inode_export.claim_append(inodebl); - - reply->inode_export_v = mdr->srcdn->inode->inode.version; + dout(10) << "handle_slave_rename_prep_ack " << *mdr + << " witnessed by " << ack->get_source() + << " " << *ack << dendl; + int from = ack->get_source().num(); - mdr->inode_import = reply->inode_export; // keep a copy locally, in case we have to rollback - - mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); + // note slave + mdr->more()->slaves.insert(from); - // clean up. - delete mdr->slave_request; - mdr->slave_request = 0; -} + // witnessed? or add extra witnesses? + assert(mdr->more()->witnessed.count(from) == 0); + if (ack->witnesses.empty()) { + mdr->more()->witnessed.insert(from); + } else { + dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl; + mdr->more()->extra_witnesses.swap(ack->witnesses); + mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me! 
+ } -void Server::handle_slave_rename_get_inode_ack(MDRequest *mdr, MMDSSlaveRequest *m) -{ - dout(10) << "handle_slave_rename_get_inode_ack " << *mdr - << " " << *m << dendl; - int from = m->get_source().num(); + // srci import? + if (ack->inode_export.length()) { + dout(10) << " got srci import" << dendl; + mdr->more()->inode_import.claim(ack->inode_export); + mdr->more()->inode_import_v = ack->inode_export_v; + } - assert(m->inode_export.length()); - dout(10) << " got inode export, saving in " << *mdr << dendl; - mdr->inode_import.claim(m->inode_export); - mdr->inode_import_v = m->inode_export_v; + // remove from waiting list + assert(mdr->more()->waiting_on_slave.count(from)); + mdr->more()->waiting_on_slave.erase(from); - assert(mdr->waiting_on_slave.count(from)); - mdr->waiting_on_slave.erase(from); - - if (mdr->waiting_on_slave.empty()) + if (mdr->more()->waiting_on_slave.empty()) dispatch_client_request(mdr); // go again! else - dout(10) << "still waiting on slaves " << mdr->waiting_on_slave << dendl; + dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl; } - // =================================== // TRUNCATE, FSYNC @@ -3462,7 +3579,7 @@ public: in->inode.size = size; in->inode.ctime = ctime; in->inode.mtime = ctime; - in->mark_dirty(pv); + in->mark_dirty(pv, mdr->ls); // reply mds->server->reply_request(mdr, 0); @@ -3485,8 +3602,8 @@ public: assert(r == 0); // purge - mds->mdcache->purge_inode(&in->inode, size); - mds->mdcache->wait_for_purge(in->inode.ino, size, + mds->mdcache->purge_inode(in, size, in->inode.size, mdr->ls); + mds->mdcache->wait_for_purge(in, size, new C_MDS_truncate_purged(mds, mdr, in, pv, size, ctime)); } }; @@ -3520,18 +3637,19 @@ void Server::handle_client_truncate(MDRequest *mdr) pdv, req->args.truncate.length, ctime); // log + wait + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "truncate"); le->metablob.add_client_req(mdr->reqid); 
le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->inode, req->args.truncate.length); + le->metablob.add_inode_truncate(cur->ino(), req->args.truncate.length, cur->inode.size); inode_t *pi = le->metablob.add_dentry(cur->parent, true); pi->mtime = ctime; pi->ctime = ctime; pi->version = pdv; pi->size = req->args.truncate.length; - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + + mdlog->submit_entry(le, fin); } @@ -3646,21 +3764,24 @@ void Server::journal_opens() EOpen *le = 0; // check queued inodes + LogSegment *ls = mdlog->get_current_segment(); for (set::iterator p = journal_open_queue.begin(); p != journal_open_queue.end(); ++p) { - (*p)->put(CInode::PIN_BATCHOPENJOURNAL); - if ((*p)->is_any_caps()) { + CInode *in = *p; + in->put(CInode::PIN_BATCHOPENJOURNAL); + if (in->is_any_caps()) { if (!le) le = new EOpen(mdlog); - le->add_inode(*p); - (*p)->last_open_journaled = mds->mdlog->get_write_pos(); + le->add_inode(in); + in->last_open_journaled = mds->mdlog->get_write_pos(); + ls->open_files.push_back(&in->xlist_open_file); } } journal_open_queue.clear(); if (le) { // journal - mds->mdlog->submit_entry(le); + mdlog->submit_entry(le); // add waiters to journal entry for (list::iterator p = journal_open_waiters.begin(); @@ -3695,7 +3816,7 @@ public: in->inode.size = 0; in->inode.ctime = ctime; in->inode.mtime = ctime; - in->mark_dirty(pv); + in->mark_dirty(pv, mdr->ls); // do the open mds->server->_do_open(mdr, in); @@ -3720,8 +3841,8 @@ public: mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); // purge also... 
- mds->mdcache->purge_inode(&in->inode, 0); - mds->mdcache->wait_for_purge(in->inode.ino, 0, + mds->mdcache->purge_inode(in, 0, in->inode.size, mdr->ls); + mds->mdcache->wait_for_purge(in, 0, new C_MDS_open_truncate_purged(mds, mdr, in, pv, ctime)); } }; @@ -3739,18 +3860,18 @@ void Server::handle_client_opent(MDRequest *mdr) pdv, ctime); // log + wait + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "open_truncate"); le->metablob.add_client_req(mdr->reqid); le->metablob.add_dir_context(cur->get_parent_dir()); - le->metablob.add_inode_truncate(cur->inode, 0); + le->metablob.add_inode_truncate(cur->ino(), 0, cur->inode.size); inode_t *pi = le->metablob.add_dentry(cur->parent, true); pi->mtime = ctime; pi->ctime = ctime; pi->version = pdv; pi->size = 0; - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->submit_entry(le, fin); } @@ -3772,7 +3893,7 @@ public: dn->get_dir()->link_primary_inode(dn, newi); // dirty inode, dn, dir - newi->mark_dirty(pv); + newi->mark_dirty(pv, mdr->ls); // downgrade xlock to rdlock //mds->locker->dentry_xlock_downgrade_to_rdlock(dn, mdr); @@ -3824,6 +3945,7 @@ void Server::handle_client_openc(MDRequest *mdr) // prepare finisher C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); + mdr->ls = mdlog->get_current_segment(); EUpdate *le = new EUpdate(mdlog, "openc"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version()); @@ -3831,8 +3953,7 @@ void Server::handle_client_openc(MDRequest *mdr) le->metablob.add_primary_dentry(dn, true, in, &in->inode); // log + wait - mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); + mdlog->submit_entry(le, fin); /* FIXME. 
this needs to be rewritten when the write capability stuff starts diff --git a/trunk/ceph/mds/Server.h b/trunk/ceph/mds/Server.h index 7cc910d6e266d..281fd13ca2593 100644 --- a/trunk/ceph/mds/Server.h +++ b/trunk/ceph/mds/Server.h @@ -92,7 +92,7 @@ public: CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequest *mdr); version_t predirty_dn_diri(MDRequest *mdr, CDentry *dn, class EMetaBlob *blob); - void dirty_dn_diri(CDentry *dn, version_t dirpv, utime_t mtime); + void dirty_dn_diri(MDRequest *mdr, CDentry *dn, version_t dirpv); // requests on existing inodes. @@ -163,6 +163,8 @@ public: CDentry *srcdn, CDentry *destdn, CDentry *straydn); // helpers + void _rename_prepare_witness(MDRequest *mdr, int who, + CDentry *srcdn, CDentry *destdn, CDentry *straydn); void _rename_prepare(MDRequest *mdr, EMetaBlob *metablob, CDentry *srcdn, CDentry *destdn, CDentry *straydn); @@ -173,8 +175,6 @@ public: void handle_slave_rename_prep_ack(MDRequest *mdr, MMDSSlaveRequest *m); void _logged_slave_rename(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); void _commit_slave_rename(MDRequest *mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn); - void handle_slave_rename_get_inode(MDRequest *mdr); - void handle_slave_rename_get_inode_ack(MDRequest *mdr, MMDSSlaveRequest *m); }; diff --git a/trunk/ceph/mds/SimpleLock.h b/trunk/ceph/mds/SimpleLock.h index 8e26bec25061f..e785e2c36d50c 100644 --- a/trunk/ceph/mds/SimpleLock.h +++ b/trunk/ceph/mds/SimpleLock.h @@ -202,12 +202,12 @@ public: // encode/decode void _encode(bufferlist& bl) { - ::_encode(state, bl); - ::_encode(gather_set, bl); + ::_encode_simple(state, bl); + ::_encode_simple(gather_set, bl); } - void _decode(bufferlist& bl, int& off) { - ::_decode(state, bl, off); - ::_decode(gather_set, bl, off); + void _decode(bufferlist::iterator& p) { + ::_decode_simple(state, p); + ::_decode_simple(gather_set, p); } diff --git a/trunk/ceph/mds/events/EAnchor.h b/trunk/ceph/mds/events/EAnchor.h index 
5980d40c17cd9..97a21a36734be 100644 --- a/trunk/ceph/mds/events/EAnchor.h +++ b/trunk/ceph/mds/events/EAnchor.h @@ -73,10 +73,8 @@ protected: if (reqmds >= 0) out << " by mds" << reqmds; } - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); - void replay(MDS *mds); - + void update_segment(); + void replay(MDS *mds); }; #endif diff --git a/trunk/ceph/mds/events/EAnchorClient.h b/trunk/ceph/mds/events/EAnchorClient.h index 7cd36453e17b9..21f78369cae72 100644 --- a/trunk/ceph/mds/events/EAnchorClient.h +++ b/trunk/ceph/mds/events/EAnchorClient.h @@ -49,8 +49,6 @@ protected: if (atid) out << " atid " << atid; } - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); void replay(MDS *mds); }; diff --git a/trunk/ceph/mds/events/EMetaBlob.h b/trunk/ceph/mds/events/EMetaBlob.h index 9111a8672c46d..767521523f9fe 100644 --- a/trunk/ceph/mds/events/EMetaBlob.h +++ b/trunk/ceph/mds/events/EMetaBlob.h @@ -23,8 +23,11 @@ using std::string; #include "../CDir.h" #include "../CDentry.h" +#include "include/triple.h" + class MDS; class MDLog; +class LogSegment; /* * a bunch of metadata in the journal @@ -62,7 +65,7 @@ class EMetaBlob { ::_encode(dn, bl); ::_encode(dnv, bl); ::_encode(inode, bl); - ::_encode(dirfragtree, bl); + dirfragtree._encode(bl); if (inode.is_symlink()) ::_encode(symlink, bl); ::_encode(dirty, bl); @@ -71,7 +74,7 @@ class EMetaBlob { ::_decode(dn, bl, off); ::_decode(dnv, bl, off); ::_decode(inode, bl, off); - ::_decode(dirfragtree, bl, off); + dirfragtree._decode(bl, off); if (inode.is_symlink()) ::_decode(symlink, bl, off); ::_decode(dirty, bl, off); @@ -161,7 +164,7 @@ public: list dnull; public: - dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } + dirlump() : dirv(0), state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } bool is_complete() { return state & STATE_COMPLETE; } void mark_complete() { state |= STATE_COMPLETE; } @@ -242,7 +245,7 @@ private: version_t alloc_tablev; // inodes i've 
destroyed. - list< pair > truncated_inodes; + list< triple > truncated_inodes; // idempotent op(s) list client_reqs; @@ -252,7 +255,10 @@ private: off_t last_subtree_map; off_t my_offset; - EMetaBlob() : last_subtree_map(0), my_offset(0) { } + // for replay, in certain cases + LogSegment *_segment; + + EMetaBlob() : last_subtree_map(0), my_offset(0), _segment(0) { } EMetaBlob(MDLog *mdl); // defined in journal.cc void print(ostream& out) { @@ -280,8 +286,8 @@ private: alloc_tablev = tablev; } - void add_inode_truncate(const inode_t& inode, off_t newsize) { - truncated_inodes.push_back(pair(inode, newsize)); + void add_inode_truncate(inodeno_t ino, off_t newsize, off_t oldsize) { + truncated_inodes.push_back(triple(ino, newsize, oldsize)); } void add_null_dentry(CDentry *dn, bool dirty) { @@ -483,7 +489,8 @@ private: bool has_expired(MDS *mds); void expire(MDS *mds, Context *c); - void replay(MDS *mds); + void update_segment(LogSegment *ls); + void replay(MDS *mds, LogSegment *ls=0); }; inline ostream& operator<<(ostream& out, const EMetaBlob& t) { diff --git a/trunk/ceph/mds/events/EOpen.h b/trunk/ceph/mds/events/EOpen.h index e13cbe19542be..fccc3650707a8 100644 --- a/trunk/ceph/mds/events/EOpen.h +++ b/trunk/ceph/mds/events/EOpen.h @@ -46,8 +46,7 @@ public: metablob._decode(bl, off); } - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); + void update_segment(); void replay(MDS *mds); }; diff --git a/trunk/ceph/mds/events/EPurgeFinish.h b/trunk/ceph/mds/events/EPurgeFinish.h index b0c727bff305b..dff0101b7699a 100644 --- a/trunk/ceph/mds/events/EPurgeFinish.h +++ b/trunk/ceph/mds/events/EPurgeFinish.h @@ -22,33 +22,33 @@ class EPurgeFinish : public LogEvent { protected: inodeno_t ino; - off_t newsize; + off_t newsize, oldsize; public: - EPurgeFinish(inodeno_t i, off_t s) : + EPurgeFinish(inodeno_t i, off_t ns, off_t os) : LogEvent(EVENT_PURGEFINISH), - ino(i), newsize(s) { } + ino(i), newsize(ns), oldsize(os) { } EPurgeFinish() : 
LogEvent(EVENT_PURGEFINISH) { } void print(ostream& out) { - out << "purgefinish " << ino << " to " << newsize; + out << "purgefinish " << ino << " " << oldsize << " ->" << newsize; } virtual void encode_payload(bufferlist& bl) { bl.append((char*)&ino, sizeof(ino)); bl.append((char*)&newsize, sizeof(newsize)); + bl.append((char*)&oldsize, sizeof(oldsize)); } void decode_payload(bufferlist& bl, int& off) { - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(newsize), (char*)&newsize); - off += sizeof(newsize); + ::_decode(ino, bl, off); + ::_decode(newsize, bl, off); + ::_decode(oldsize, bl, off); } bool has_expired(MDS *mds); void expire(MDS *mds, Context *c); + void update_segment(); void replay(MDS *mds); - }; #endif diff --git a/trunk/ceph/mds/events/ESession.h b/trunk/ceph/mds/events/ESession.h index 953eff2d0e01c..a8f9992486a18 100644 --- a/trunk/ceph/mds/events/ESession.h +++ b/trunk/ceph/mds/events/ESession.h @@ -57,8 +57,8 @@ class ESession : public LogEvent { bool has_expired(MDS *mds); void expire(MDS *mds, Context *c); - void replay(MDS *mds); - + void update_segment(); + void replay(MDS *mds); }; #endif diff --git a/trunk/ceph/mds/events/ESlaveUpdate.h b/trunk/ceph/mds/events/ESlaveUpdate.h index 23d280a7b831c..54eaef9c6a296 100644 --- a/trunk/ceph/mds/events/ESlaveUpdate.h +++ b/trunk/ceph/mds/events/ESlaveUpdate.h @@ -24,7 +24,14 @@ public: const static int OP_COMMIT = 2; const static int OP_ROLLBACK = 3; - EMetaBlob metablob; + /* + * we journal a rollback metablob that contains the unmodified metadata + * too, because we may be updating previously dirty metadata, which + * will allow old log segments to be trimmed. if we end up rolling back, + * those updates could be lost.. so we re-journal the unmodified metadata, + * and replay will apply _either_ commit or rollback. 
+ */ + EMetaBlob commit, rollback; string type; metareqid_t reqid; int master; @@ -32,7 +39,7 @@ public: ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { } ESlaveUpdate(MDLog *mdlog, const char *s, metareqid_t ri, int mastermds, int o) : - LogEvent(EVENT_SLAVEUPDATE), metablob(mdlog), + LogEvent(EVENT_SLAVEUPDATE), commit(mdlog), rollback(mdlog), type(s), reqid(ri), master(mastermds), @@ -44,7 +51,7 @@ public: out << " " << op; out << " " << reqid; out << " for mds" << master; - out << metablob; + out << commit << " " << rollback; } void encode_payload(bufferlist& bl) { @@ -52,14 +59,16 @@ public: ::_encode(reqid, bl); ::_encode(master, bl); ::_encode(op, bl); - metablob._encode(bl); + commit._encode(bl); + rollback._encode(bl); } void decode_payload(bufferlist& bl, int& off) { ::_decode(type, bl, off); ::_decode(reqid, bl, off); ::_decode(master, bl, off); ::_decode(op, bl, off); - metablob._decode(bl, off); + commit._decode(bl, off); + rollback._decode(bl, off); } bool has_expired(MDS *mds); diff --git a/trunk/ceph/mds/events/ESubtreeMap.h b/trunk/ceph/mds/events/ESubtreeMap.h index 3997a6b5686c1..cb6feb1d92ec6 100644 --- a/trunk/ceph/mds/events/ESubtreeMap.h +++ b/trunk/ceph/mds/events/ESubtreeMap.h @@ -39,8 +39,8 @@ public: ::_decode(subtrees, bl, off); } - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); + //bool has_expired(MDS *mds); + //void expire(MDS *mds, Context *c); void replay(MDS *mds); }; diff --git a/trunk/ceph/mds/events/EUpdate.h b/trunk/ceph/mds/events/EUpdate.h index afc0b708bd916..de965429f9bdd 100644 --- a/trunk/ceph/mds/events/EUpdate.h +++ b/trunk/ceph/mds/events/EUpdate.h @@ -43,8 +43,7 @@ public: metablob._decode(bl, off); } - bool has_expired(MDS *mds); - void expire(MDS *mds, Context *c); + void update_segment(); void replay(MDS *mds); }; diff --git a/trunk/ceph/mds/journal.cc b/trunk/ceph/mds/journal.cc index 062262610ea3c..1f27cf713a078 100644 --- a/trunk/ceph/mds/journal.cc +++ b/trunk/ceph/mds/journal.cc @@ -32,6 +32,8 
@@ #include "events/EAnchor.h" #include "events/EAnchorClient.h" +#include "LogSegment.h" + #include "MDS.h" #include "MDLog.h" #include "MDCache.h" @@ -40,9 +42,157 @@ #include "AnchorTable.h" #include "AnchorClient.h" #include "IdAllocator.h" +#include "Locker.h" + #include "config.h" +#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " +#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log || l <= g_conf.debug_mds_log_expire) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " + + +// ----------------------- +// LogSegment + +class C_MDL_RetryExpireSegment : public Context { +public: + MDS *mds; + LogSegment *ls; + C_MDL_RetryExpireSegment(MDS *m, LogSegment *l) : mds(m), ls(l) {} + void finish(int r) { + ls->try_to_expire(mds); + } +}; + +C_Gather *LogSegment::try_to_expire(MDS *mds) +{ + C_Gather *gather = 0; + + set commit; + + dout(6) << "LogSegment(" << offset << ").try_to_expire" << dendl; + + // commit dirs + for (xlist::iterator p = dirty_dirfrags.begin(); !p.end(); ++p) + commit.insert(*p); + for (xlist::iterator p = dirty_dentries.begin(); !p.end(); ++p) + commit.insert((*p)->get_dir()); + for (xlist::iterator p = dirty_inodes.begin(); !p.end(); ++p) + commit.insert((*p)->get_parent_dn()->get_dir()); + + if (!commit.empty()) { + if (!gather) gather = new C_Gather; + + for (set::iterator p = commit.begin(); + p != commit.end(); + ++p) { + CDir *dir = *p; + if (dir->can_auth_pin()) { + dout(15) << "try_to_expire committing " << *dir << dendl; + dir->commit(0, gather->new_sub()); + } else { + dout(15) << "try_to_expire waiting for unfreeze on " << *dir << dendl; + dir->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); + } + } + } + + // dirty non-auth mtimes + for (xlist::iterator p = dirty_inode_mtimes.begin(); !p.end(); ++p) { + CInode *in = *p; + dout(10) << "try_to_expire 
waiting for dirlock mtime flush on " << *in << dendl; + if (!gather) gather = new C_Gather; + + if (in->is_ambiguous_auth()) { + dout(10) << " waiting for single auth on " << *in << dendl; + in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, gather->new_sub()); + } else if (in->is_auth()) { + dout(10) << " i'm auth, unscattering dirlock on " << *in << dendl; + assert(in->is_replicated()); // hrm! + mds->locker->scatter_lock(&in->dirlock); + in->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); + } else { + dout(10) << " i'm a replica, requesting dirlock unscatter of " << *in << dendl; + mds->locker->scatter_try_unscatter(&in->dirlock, gather->new_sub()); + } + //(*p)->dirlock.add_waiter(SimpleLock::WAIT_STABLE, gather->new_sub()); + } + + // open files + if (!open_files.empty()) { + assert(!mds->mdlog->is_capped()); // hmm FIXME + for (xlist::iterator p = open_files.begin(); !p.end(); ++p) { + dout(20) << "try_to_expire requeueing open file " << **p << dendl; + mds->server->queue_journal_open(*p); + } + if (!gather) gather = new C_Gather; + mds->server->add_journal_open_waiter(gather->new_sub()); + mds->server->maybe_journal_opens(); + dout(10) << "try_to_expire waiting for open files to rejournal" << dendl; + } + + // slave updates + for (xlist::iterator p = slave_updates.begin(); !p.end(); ++p) { + MDSlaveUpdate *su = *p; + dout(10) << "try_to_expire waiting on slave update " << su << dendl; + assert(su->waiter == 0); + if (!gather) gather = new C_Gather; + su->waiter = gather->new_sub(); + } + + // idalloc + if (allocv > mds->idalloc->get_committed_version()) { + dout(10) << "try_to_expire saving idalloc table, need " << allocv + << ", committed is " << mds->idalloc->get_committed_version() + << " (" << mds->idalloc->get_committing_version() << ")" + << dendl; + if (!gather) gather = new C_Gather; + mds->idalloc->save(gather->new_sub(), allocv); + } + + // clientmap + if (clientmapv > mds->clientmap.get_committed()) { + dout(10) << "try_to_expire 
saving clientmap, need " << clientmapv + << ", committed is " << mds->clientmap.get_committed() + << " (" << mds->clientmap.get_committing() << ")" + << dendl; + if (!gather) gather = new C_Gather; + mds->clientmap.save(gather->new_sub(), clientmapv); + } + + // pending commit atids + for (hash_set::iterator p = pending_commit_atids.begin(); + p != pending_commit_atids.end(); + ++p) { + if (!gather) gather = new C_Gather; + assert(!mds->anchorclient->has_committed(*p)); + dout(10) << "try_to_expire anchor transaction " << *p + << " pending commit (not yet acked), waiting" << dendl; + mds->anchorclient->wait_for_ack(*p, gather->new_sub()); + } + + // anchortable + if (anchortablev > mds->anchortable->get_committed_version()) { + dout(10) << "try_to_expire waiting for anchor table to save, need " << anchortablev << dendl; + if (!gather) gather = new C_Gather; + mds->anchortable->save(gather->new_sub()); + } + + // FIXME client requests...? + // audit handling of anchor transactions? + + if (gather) { + dout(6) << "LogSegment(" << offset << ").try_to_expire waiting" << dendl; + } else { + dout(6) << "LogSegment(" << offset << ").try_to_expire success" << dendl; + } + return gather; +} + + + +#undef dout +#undef derr #define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " #define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) *_dout << dbeginl << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " @@ -69,7 +219,7 @@ void EString::replay(MDS *mds) // EMetaBlob EMetaBlob::EMetaBlob(MDLog *mdlog) : - last_subtree_map(mdlog->get_last_subtree_map_offset()), + last_subtree_map(mdlog->get_last_segment_offset()), my_offset(mdlog->get_write_pos()) { } @@ -91,6 +241,7 @@ EMetaBlob::EMetaBlob(MDLog *mdlog) : */ bool EMetaBlob::has_expired(MDS *mds) { +/* // examine dirv's for my lumps for (map::iterator lp = lump_map.begin(); lp != lump_map.end(); @@ -211,12 
+362,14 @@ bool EMetaBlob::has_expired(MDS *mds) } + */ return true; // all dirlumps expired, etc. } void EMetaBlob::expire(MDS *mds, Context *c) { +/* map commit; // dir -> version needed list waitfor_export; list waitfor_import; @@ -280,7 +433,7 @@ void EMetaBlob::expire(MDS *mds, Context *c) else // pbly about to export|split|merge. // just wait for it to unfreeze, then retry - p->first->add_waiter(CDir::WAIT_AUTHPINNABLE, gather->new_sub()); + p->first->add_waiter(CDir::WAIT_UNFREEZE, gather->new_sub()); } for (list::iterator p = waitfor_export.begin(); p != waitfor_export.end(); @@ -352,12 +505,39 @@ void EMetaBlob::expire(MDS *mds, Context *c) dout(10) << "my gather finsher is " << gather << " with " << gather->get_num() << dendl; +*/ +} + +void EMetaBlob::update_segment(LogSegment *ls) +{ + // atids? + //for (list::iterator p = atids.begin(); p != atids.end(); ++p) + // ls->pending_commit_atids[*p] = ls; + // -> handled directly by AnchorClient + + // dirty inode mtimes + // -> handled directly by Server.cc, replay() + + // alloc table update? + if (!allocated_inos.empty()) + ls->allocv = alloc_tablev; + + // truncated inodes + // -> handled directly by Server.cc + + // client requests + // note the newest request per client + //if (!client_reqs.empty()) + // ls->last_client_tid[client_reqs.rbegin()->client] = client_reqs.rbegin()->tid); } -void EMetaBlob::replay(MDS *mds) +void EMetaBlob::replay(MDS *mds, LogSegment *logseg) { dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << dendl; + if (!logseg) logseg = _segment; + assert(logseg); + // walk through my dirs (in order!) 
for (list::iterator lp = lump_order.begin(); lp != lump_order.end(); @@ -393,7 +573,7 @@ void EMetaBlob::replay(MDS *mds) } dir->set_version( lump.dirv ); if (lump.is_dirty()) - dir->_mark_dirty(); + dir->_mark_dirty(logseg); if (lump.is_complete()) dir->mark_complete(); @@ -408,11 +588,11 @@ void EMetaBlob::replay(MDS *mds) if (!dn) { dn = dir->add_null_dentry(p->dn); dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *dn << dendl; } else { dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay had " << *dn << dendl; } @@ -433,7 +613,7 @@ void EMetaBlob::replay(MDS *mds) //assert(0); // hrm! fallout from sloppy unlink? or? hmmm FIXME investigate further } dir->link_primary_inode(dn, in); - if (p->dirty) in->_mark_dirty(); + if (p->dirty) in->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *in << dendl; } else { if (dn->get_inode() != in && in->get_parent_dn()) { @@ -443,7 +623,7 @@ void EMetaBlob::replay(MDS *mds) in->inode = p->inode; in->dirfragtree = p->dirfragtree; if (in->inode.is_symlink()) in->symlink = p->symlink; - if (p->dirty) in->_mark_dirty(); + if (p->dirty) in->_mark_dirty(logseg); if (dn->get_inode() != in) { dir->link_primary_inode(dn, in); dout(10) << "EMetaBlob.replay linked " << *in << dendl; @@ -461,7 +641,7 @@ void EMetaBlob::replay(MDS *mds) if (!dn) { dn = dir->add_remote_dentry(p->dn, p->ino, p->d_type); dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *dn << dendl; } else { if (!dn->is_null()) { @@ -470,7 +650,7 @@ void EMetaBlob::replay(MDS *mds) } dn->set_remote(p->ino, p->d_type); dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay had " << *dn << dendl; } } @@ -483,7 +663,7 @@ void 
EMetaBlob::replay(MDS *mds) if (!dn) { dn = dir->add_null_dentry(p->dn); dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay added " << *dn << dendl; } else { if (!dn->is_null()) { @@ -491,7 +671,7 @@ void EMetaBlob::replay(MDS *mds) dir->unlink_inode(dn); } dn->set_version(p->dnv); - if (p->dirty) dn->_mark_dirty(); + if (p->dirty) dn->_mark_dirty(logseg); dout(10) << "EMetaBlob.replay had " << *dn << dendl; } } @@ -502,7 +682,7 @@ void EMetaBlob::replay(MDS *mds) p != atids.end(); ++p) { dout(10) << "EMetaBlob.replay noting anchor transaction " << *p << dendl; - mds->anchorclient->got_journaled_agree(*p); + mds->anchorclient->got_journaled_agree(*p, logseg); } // dirtied inode mtimes @@ -513,6 +693,7 @@ void EMetaBlob::replay(MDS *mds) CInode *in = mds->mdcache->get_inode(p->first); dout(10) << "EMetaBlob.replay setting dirlock updated flag on " << *in << dendl; in->dirlock.set_updated(); + logseg->dirty_inode_mtimes.push_back(&in->xlist_dirty_inode_mtime); } // allocated_inos @@ -530,19 +711,21 @@ void EMetaBlob::replay(MDS *mds) inodeno_t ino = mds->idalloc->alloc_id(); assert(ino == *p); // this should match. 
- - assert(alloc_tablev == mds->idalloc->get_version()); } + assert(alloc_tablev == mds->idalloc->get_version()); } } // truncated inodes - for (list< pair >::iterator p = truncated_inodes.begin(); + for (list< triple >::iterator p = truncated_inodes.begin(); p != truncated_inodes.end(); ++p) { - dout(10) << "EMetaBlob.replay will purge truncated inode " << p->first.ino - << " to " << p->second << dendl; - mds->mdcache->add_recovered_purge(p->first, p->second); + CInode *in = mds->mdcache->get_inode(p->first); + assert(in); + dout(10) << "EMetaBlob.replay will purge truncated " + << p->third << " -> " << p->second + << " on " << *in << dendl; + mds->mdcache->add_recovered_purge(in, p->second, p->third, logseg); } // client requests @@ -550,31 +733,18 @@ void EMetaBlob::replay(MDS *mds) p != client_reqs.end(); ++p) mds->clientmap.add_completed_request(*p); + + + // update segment + update_segment(logseg); } // ----------------------- // ESession -bool ESession::has_expired(MDS *mds) -{ - if (mds->clientmap.get_committed() >= cmapv) { - dout(10) << "ESession.has_expired newer clientmap " << mds->clientmap.get_committed() - << " >= " << cmapv << " has committed" << dendl; - return true; - } else if (mds->clientmap.get_committing() >= cmapv) { - dout(10) << "ESession.has_expired newer clientmap " << mds->clientmap.get_committing() - << " >= " << cmapv << " is still committing" << dendl; - return false; - } else { - dout(10) << "ESession.has_expired clientmap " << mds->clientmap.get_version() - << " > " << cmapv << ", need to save" << dendl; - return false; - } -} -void ESession::expire(MDS *mds, Context *c) -{ - dout(10) << "ESession.expire saving clientmap" << dendl; - mds->clientmap.save(c, cmapv); +void ESession::update_segment() +{ + _segment->clientmapv = cmapv; } void ESession::replay(MDS *mds) @@ -606,24 +776,9 @@ void ESession::replay(MDS *mds) // ----------------------- // EAnchor -bool EAnchor::has_expired(MDS *mds) +void EAnchor::update_segment() { - 
version_t cv = mds->anchortable->get_committed_version(); - if (cv < version) { - dout(10) << "EAnchor.has_expired v " << version << " > " << cv - << ", still dirty" << dendl; - return false; // still dirty - } else { - dout(10) << "EAnchor.has_expired v " << version << " <= " << cv - << ", already flushed" << dendl; - return true; // already flushed - } -} - -void EAnchor::expire(MDS *mds, Context *c) -{ - dout(10) << "EAnchor.expire saving anchor table" << dendl; - mds->anchortable->save(c); + _segment->anchortablev = version; } void EAnchor::replay(MDS *mds) @@ -662,16 +817,6 @@ void EAnchor::replay(MDS *mds) // EAnchorClient -bool EAnchorClient::has_expired(MDS *mds) -{ - return true; -} - -void EAnchorClient::expire(MDS *mds, Context *c) -{ - assert(0); -} - void EAnchorClient::replay(MDS *mds) { dout(10) << " EAnchorClient.replay op " << op << " atid " << atid << dendl; @@ -691,172 +836,71 @@ void EAnchorClient::replay(MDS *mds) // ----------------------- // EUpdate -bool EUpdate::has_expired(MDS *mds) -{ - return metablob.has_expired(mds); -} - -void EUpdate::expire(MDS *mds, Context *c) +void EUpdate::update_segment() { - metablob.expire(mds, c); + metablob.update_segment(_segment); } void EUpdate::replay(MDS *mds) { - metablob.replay(mds); + metablob.replay(mds, _segment); } // ------------------------ // EOpen -bool EOpen::has_expired(MDS *mds) -{ - for (list::iterator p = inos.begin(); p != inos.end(); ++p) { - CInode *in = mds->mdcache->get_inode(*p); - if (in && - in->is_any_caps() && - !(in->last_open_journaled > get_start_off() || - in->last_open_journaled == 0)) { - dout(10) << "EOpen.has_expired still refer to caps on " << *in << dendl; - return false; - } - } - return true; -} - -void EOpen::expire(MDS *mds, Context *c) +void EOpen::update_segment() { - dout(10) << "EOpen.expire " << dendl; - - if (mds->mdlog->is_capped()) { - dout(0) << "uh oh, log is capped, but i have unexpired opens." 
<< dendl; - assert(0); - } - - for (list::iterator p = inos.begin(); p != inos.end(); ++p) { - CInode *in = mds->mdcache->get_inode(*p); - if (!in) continue; - if (!in->is_any_caps()) continue; - - dout(10) << "EOpen.expire " << in->ino() - << " last_open_journaled " << in->last_open_journaled << dendl; - - mds->server->queue_journal_open(in); - } - mds->server->add_journal_open_waiter(c); - mds->server->maybe_journal_opens(); + // ?? } void EOpen::replay(MDS *mds) { dout(10) << "EOpen.replay " << dendl; - metablob.replay(mds); + metablob.replay(mds, _segment); } // ----------------------- // ESlaveUpdate -bool ESlaveUpdate::has_expired(MDS *mds) -{ - switch (op) { - case ESlaveUpdate::OP_PREPARE: - if (mds->mdcache->ambiguous_slave_updates.count(reqid) == 0) { - dout(10) << "ESlaveUpdate.has_expired prepare " << reqid << " for mds" << master - << ": haven't yet seen commit|rollback" << dendl; - return false; - } - else if (mds->mdcache->ambiguous_slave_updates[reqid]) { - dout(10) << "ESlaveUpdate.has_expired prepare " << reqid << " for mds" << master - << ": committed, checking metablob" << dendl; - bool exp = metablob.has_expired(mds); - if (exp) - mds->mdcache->ambiguous_slave_updates.erase(reqid); - return exp; - } - else { - dout(10) << "ESlaveUpdate.has_expired prepare " << reqid << " for mds" << master - << ": aborted" << dendl; - mds->mdcache->ambiguous_slave_updates.erase(reqid); - return true; - } - - case ESlaveUpdate::OP_COMMIT: - case ESlaveUpdate::OP_ROLLBACK: - if (mds->mdcache->waiting_for_slave_update_commit.count(reqid)) { - dout(10) << "ESlaveUpdate.has_expired " - << ((op == ESlaveUpdate::OP_COMMIT) ? 
"commit ":"rollback ") - << reqid << " for mds" << master - << ": noting commit, kicking prepare waiter" << dendl; - mds->mdcache->ambiguous_slave_updates[reqid] = (op == ESlaveUpdate::OP_COMMIT); - mds->mdcache->waiting_for_slave_update_commit[reqid]->finish(0); - delete mds->mdcache->waiting_for_slave_update_commit[reqid]; - mds->mdcache->waiting_for_slave_update_commit.erase(reqid); - } else { - dout(10) << "ESlaveUpdate.has_expired " - << ((op == ESlaveUpdate::OP_COMMIT) ? "commit ":"rollback ") - << reqid << " for mds" << master - << ": no prepare waiter, ignoring" << dendl; - } - return true; - - default: - assert(0); - return false; - } -} - -void ESlaveUpdate::expire(MDS *mds, Context *c) -{ - assert(op == ESlaveUpdate::OP_PREPARE); - - if (mds->mdcache->ambiguous_slave_updates.count(reqid) == 0) { - // wait - dout(10) << "ESlaveUpdate.expire prepare " << reqid << " for mds" << master - << ": waiting for commit|rollback" << dendl; - mds->mdcache->waiting_for_slave_update_commit[reqid] = c; - } else { - // we committed.. 
expire the metablob - assert(mds->mdcache->ambiguous_slave_updates[reqid] == true); - dout(10) << "ESlaveUpdate.expire prepare " << reqid << " for mds" << master - << ": waiting for metablob to expire" << dendl; - metablob.expire(mds, c); - } -} - void ESlaveUpdate::replay(MDS *mds) { switch (op) { case ESlaveUpdate::OP_PREPARE: // FIXME: horribly inefficient copy; EMetaBlob needs a swap() or something dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds" << master - << ": saving blob for later commit" << dendl; + << ": saving blobs for later commit" << dendl; assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0); - mds->mdcache->uncommitted_slave_updates[master][reqid] = metablob; + commit._segment = _segment; // may need this later + rollback._segment = _segment; // may need this later + mds->mdcache->uncommitted_slave_updates[master][reqid] = + MDSlaveUpdate(commit, rollback, _segment->slave_updates); break; case ESlaveUpdate::OP_COMMIT: if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": applying previously saved blob" << dendl; - mds->mdcache->uncommitted_slave_updates[master][reqid].replay(mds); + << ": applying commit blob" << dendl; + mds->mdcache->uncommitted_slave_updates[master][reqid].commit.replay(mds, _segment); mds->mdcache->uncommitted_slave_updates[master].erase(reqid); } else { dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds" << master - << ": ignoring, no previously saved blob" << dendl; + << ": ignoring, no previously saved blobs" << dendl; } break; case ESlaveUpdate::OP_ROLLBACK: if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) { dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": discarding previously saved blob" << dendl; + << ": applying rollback blob" << dendl; assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid)); + 
mds->mdcache->uncommitted_slave_updates[master][reqid].rollback.replay(mds, _segment); mds->mdcache->uncommitted_slave_updates[master].erase(reqid); } else { dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds" << master - << ": ignoring, no previously saved blob" << dendl; + << ": ignoring, no previously saved blobs" << dendl; } break; @@ -869,83 +913,28 @@ void ESlaveUpdate::replay(MDS *mds) // ----------------------- // ESubtreeMap -bool ESubtreeMap::has_expired(MDS *mds) -{ - assert(!mds->mdlog->subtree_maps.empty()); - set::iterator p = mds->mdlog->subtree_maps.begin(); - off_t first = *p; - if (get_start_off() != first) { - dout(10) << "ESubtreeMap.has_expired -- not the oldest segment" << dendl; - return false; - } - - // i am the oldest. - - // capped and last event? - if (mds->mdlog->is_capped() && - mds->mdlog->subtree_maps.size() == 1 && - (mds->mdlog->trimming.empty() || - (mds->mdlog->trimming.size() == 1 && - mds->mdlog->trimming.begin()->second == this))) { - dout(10) << "ESubtreeMap.has_expired -- capped and last one" << dendl; - return true; - } - - p++; - if (p == mds->mdlog->subtree_maps.end()) { - dout(10) << "ESubtreeMap.has_expired -- only segment" << dendl; - return false; - } - off_t next = *p; - - if (mds->mdlog->get_read_pos() < next) { - dout(10) << "ESubtreeMap.has_expired -- haven't read this segment, read pos " - << mds->mdlog->get_read_pos() << " < next map at " << next - << dendl; - return false; - } - - map::iterator trimp = mds->mdlog->trimming.begin(); - assert(trimp->first == get_start_off()); - trimp++; - if (trimp != mds->mdlog->trimming.end() && - trimp->first < next) { - dout(10) << "ESubtreeMap.has_expired -- segment still trimming at " << trimp->first << dendl; - return false; - } - - dout(10) << "ESubtreeMap.has_expired -- segment is empty" << dendl; - return true; -} - -void ESubtreeMap::expire(MDS *mds, Context *c) -{ - dout(10) << "ESubtreeMap.has_expire -- waiting for a newer map to be written (or for 
shutdown)" << dendl; - mds->mdlog->subtree_map_expire_waiters[get_start_off()].push_back(c); -} - void ESubtreeMap::replay(MDS *mds) { - // note location - mds->mdlog->subtree_maps.insert(get_start_off()); - + // suck up the subtree map? if (mds->mdcache->is_subtrees()) { dout(10) << "ESubtreeMap.replay -- ignoring, already have import map" << dendl; - } else { - dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl; - - // first, stick the spanning tree in my cache - //metablob.print(cout); - metablob.replay(mds); - - // restore import/export maps - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = mds->mdcache->get_dirfrag(p->first); - mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid()); - } + return; } + + dout(10) << "ESubtreeMap.replay -- reconstructing (auth) subtree spanning tree" << dendl; + + // first, stick the spanning tree in my cache + //metablob.print(cout); + metablob.replay(mds, _segment); + + // restore import/export maps + for (map >::iterator p = subtrees.begin(); + p != subtrees.end(); + ++p) { + CDir *dir = mds->mdcache->get_dirfrag(p->first); + mds->mdcache->adjust_bounded_subtree_auth(dir, p->second, mds->get_nodeid()); + } + mds->mdcache->show_subtrees(); } @@ -954,16 +943,6 @@ void ESubtreeMap::replay(MDS *mds) // ----------------------- // EFragment -bool EFragment::has_expired(MDS *mds) -{ - return metablob.has_expired(mds); -} - -void EFragment::expire(MDS *mds, Context *c) -{ - metablob.expire(mds, c); -} - void EFragment::replay(MDS *mds) { dout(10) << "EFragment.replay " << ino << " " << basefrag << " by " << bits << dendl; @@ -975,7 +954,7 @@ void EFragment::replay(MDS *mds) list waiters; mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters); - metablob.replay(mds); + metablob.replay(mds, _segment); } @@ -994,10 +973,17 @@ void EPurgeFinish::expire(MDS *mds, Context *c) assert(0); } +void 
EPurgeFinish::update_segment() +{ + // ** update purge lists? +} + void EPurgeFinish::replay(MDS *mds) { - dout(10) << "EPurgeFinish.replay " << ino << " to " << newsize << dendl; - mds->mdcache->remove_recovered_purge(ino, newsize); + dout(10) << "EPurgeFinish.replay " << ino << " " << oldsize << " -> " << newsize << dendl; + CInode *in = mds->mdcache->get_inode(ino); + assert(in); + mds->mdcache->remove_recovered_purge(in, newsize, oldsize); } @@ -1032,7 +1018,7 @@ void EExport::expire(MDS *mds, Context *c) void EExport::replay(MDS *mds) { dout(10) << "EExport.replay " << base << dendl; - metablob.replay(mds); + metablob.replay(mds, _segment); CDir *dir = mds->mdcache->get_dirfrag(base); assert(dir); @@ -1055,21 +1041,10 @@ void EExport::replay(MDS *mds) // ----------------------- // EImportStart -bool EImportStart::has_expired(MDS *mds) -{ - return metablob.has_expired(mds); -} - -void EImportStart::expire(MDS *mds, Context *c) -{ - dout(10) << "EImportStart.expire " << base << dendl; - metablob.expire(mds, c); -} - void EImportStart::replay(MDS *mds) { dout(10) << "EImportStart.replay " << base << dendl; - metablob.replay(mds); + metablob.replay(mds, _segment); // put in ambiguous import list mds->mdcache->add_ambiguous_import(base, bounds); diff --git a/trunk/ceph/mds/mdstypes.h b/trunk/ceph/mds/mdstypes.h index f48c4e2d34ca4..a2f779757255e 100644 --- a/trunk/ceph/mds/mdstypes.h +++ b/trunk/ceph/mds/mdstypes.h @@ -17,6 +17,7 @@ using namespace std; #include #include "include/frag.h" +#include "include/xlist.h" #define MDS_REF_SET // define me for improved debug output, sanity checking @@ -53,10 +54,11 @@ using namespace std; struct metareqid_t { + uint64_t tid; int32_t client; - tid_t tid; - metareqid_t() : client(-1), tid(0) {} - metareqid_t(int c, tid_t t) : client(c), tid(t) {} + int32_t _pad; + metareqid_t() : tid(0), client(-1), _pad(0) {} + metareqid_t(int c, tid_t t) : tid(t), client(c), _pad(0) {} }; inline ostream& operator<<(ostream& out, const 
metareqid_t& r) { @@ -111,9 +113,10 @@ struct inode_caps_reconnect_t { struct dirfrag_t { inodeno_t ino; frag_t frag; + uint32_t _pad; - dirfrag_t() { } - dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { } + dirfrag_t() : ino(0), _pad(0) { } + dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f), _pad(0) { } }; inline ostream& operator<<(ostream& out, const dirfrag_t df) { @@ -130,6 +133,17 @@ inline bool operator==(dirfrag_t l, dirfrag_t r) { return l.ino == r.ino && l.frag == r.frag; } +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(const dirfrag_t &df) const { + static rjhash H; + static rjhash I; + return H(df.ino) ^ I(df.frag); + } + }; +} + + // ================================================================ @@ -347,6 +361,8 @@ public: dirfrag_t dirfrag; string dname; + MDSCacheObjectInfo() : ino(0) {} + void _encode(bufferlist& bl) const { ::_encode(ino, bl); ::_encode(dirfrag, bl); @@ -357,6 +373,11 @@ public: ::_decode(dirfrag, bl, off); ::_decode(dname, bl, off); } + void _decode(bufferlist::iterator& p) { + ::_decode_simple(ino, p); + ::_decode_simple(dirfrag, p); + ::_decode_simple(dname, p); + } }; @@ -368,8 +389,10 @@ class MDSCacheObject { const static int PIN_LOCK = -1002; const static int PIN_REQUEST = -1003; const static int PIN_WAITER = 1004; - const static int PIN_DIRTYSCATTERED = 1005; + const static int PIN_DIRTYSCATTERED = 1005; // make this neg if we start using multiple scatterlocks? 
static const int PIN_AUTHPIN = 1006; + static const int PIN_PTRWAITER = -1007; + const static int PIN_TEMPEXPORTING = 1008; // temp pin between encode_ and finish_export const char *generic_pin_name(int p) { switch (p) { @@ -380,6 +403,8 @@ class MDSCacheObject { case PIN_WAITER: return "waiter"; case PIN_DIRTYSCATTERED: return "dirtyscattered"; case PIN_AUTHPIN: return "authpin"; + case PIN_PTRWAITER: return "ptrwaiter"; + case PIN_TEMPEXPORTING: return "tempexporting"; default: assert(0); return 0; } } @@ -391,15 +416,14 @@ class MDSCacheObject { // -- wait -- const static int WAIT_SINGLEAUTH = (1<<30); - const static int WAIT_AUTHPINNABLE = (1<<29); - const static int WAIT_UNFREEZE = WAIT_AUTHPINNABLE; + const static int WAIT_UNFREEZE = (1<<29); // pka AUTHPINNABLE // ============================================ // cons public: MDSCacheObject() : - state(0), + state(0), ref(0), replica_nonce(0) {} virtual ~MDSCacheObject() {} @@ -416,16 +440,16 @@ class MDSCacheObject { unsigned state; // state bits public: - unsigned get_state() { return state; } + unsigned get_state() const { return state; } + unsigned state_test(unsigned mask) const { return (state & mask); } void state_clear(unsigned mask) { state &= ~mask; } void state_set(unsigned mask) { state |= mask; } - unsigned state_test(unsigned mask) { return state & mask; } void state_reset(unsigned s) { state = s; } - bool is_auth() { return state_test(STATE_AUTH); } - bool is_dirty() { return state_test(STATE_DIRTY); } - bool is_clean() { return !is_dirty(); } - bool is_rejoining() { return state_test(STATE_REJOINING); } + bool is_auth() const { return state_test(STATE_AUTH); } + bool is_dirty() const { return state_test(STATE_DIRTY); } + bool is_clean() const { return !is_dirty(); } + bool is_rejoining() const { return state_test(STATE_REJOINING); } // -------------------------------------------- // authority @@ -632,6 +656,7 @@ protected: virtual void add_lock_waiter(int type, int mask, Context *c) { 
assert(0); } virtual bool is_lock_waiting(int type, int mask) { assert(0); return false; } + virtual void clear_dirty_scattered(int type) { assert(0); } // --------------------------------------------- // ordering diff --git a/trunk/ceph/messages/MClientMount.h b/trunk/ceph/messages/MClientMount.h index 78bff7b2154fa..a49b558c7f040 100644 --- a/trunk/ceph/messages/MClientMount.h +++ b/trunk/ceph/messages/MClientMount.h @@ -20,7 +20,7 @@ class MClientMount : public Message { public: entity_addr_t addr; - int instance; // on this node + int32_t instance; // on this node MClientMount() : Message(MSG_CLIENT_MOUNT) { } MClientMount(entity_addr_t a, int i = 0) : diff --git a/trunk/ceph/messages/MClientReply.h b/trunk/ceph/messages/MClientReply.h index 24b7eae8976b7..760dcc971ebad 100644 --- a/trunk/ceph/messages/MClientReply.h +++ b/trunk/ceph/messages/MClientReply.h @@ -17,7 +17,7 @@ #define __MCLIENTREPLY_H #include "include/types.h" - +#include "include/encodable.h" #include "MClientRequest.h" #include "msg/Message.h" @@ -50,93 +50,101 @@ class CInode; * */ -class InodeStat { - public: +struct DirStat { + // mds distribution hints + frag_t frag; + int auth; + set dist; + bool is_rep; + + DirStat() {} + DirStat(bufferlist::iterator& p) { + _decode(p); + } + + void _decode(bufferlist::iterator& p) { + ::_decode_simple(frag, p); + ::_decode_simple(auth, p); + ::_decode_simple(dist, p); + ::_decode_simple(is_rep, p); + } + + static void _encode(bufferlist& bl, CDir *dir, int whoami) { + frag_t frag = dir->get_frag(); + int auth; + set dist; + bool is_rep; + + auth = dir->get_dir_auth().first; + if (dir->is_auth()) + dir->get_dist_spec(dist, whoami); + is_rep = dir->is_rep(); + + ::_encode_simple(frag, bl); + ::_encode_simple(auth, bl); + ::_encode_simple(dist, bl); + ::_encode_simple(is_rep, bl); + } +}; + +struct InodeStat { inode_t inode; string symlink; // symlink content (if symlink) fragtree_t dirfragtree; uint32_t mask; - // mds distribution hints - map 
dirfrag_auth; - map > dirfrag_dist; - set dirfrag_rep; - public: InodeStat() {} - InodeStat(CInode *in, int whoami) : - inode(in->inode), - mask(STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE) - { + InodeStat(bufferlist::iterator& p) { + _decode(p); + } + + void _decode(bufferlist::iterator &p) { + ::_decode_simple(mask, p); + ::_decode_simple(inode, p); + ::_decode_simple(symlink, p); + dirfragtree._decode(p); + } + + static void _encode(bufferlist &bl, CInode *in) { + int mask = STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE; + // mask if (in->authlock.can_rdlock(0)) mask |= STAT_MASK_AUTH; if (in->linklock.can_rdlock(0)) mask |= STAT_MASK_LINK; if (in->filelock.can_rdlock(0)) mask |= STAT_MASK_FILE; - - // symlink content? - if (in->is_symlink()) - symlink = in->symlink; - - // dirfragtree - dirfragtree = in->dirfragtree; - - // dirfrag info - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); - p != ls.end(); - ++p) { - CDir *dir = *p; - dirfrag_auth[dir->dirfrag().frag] = dir->get_dir_auth().first; - if (dir->is_auth()) - dir->get_dist_spec(dirfrag_dist[dir->dirfrag().frag], whoami); - if (dir->is_rep()) - dirfrag_rep.insert(dir->dirfrag().frag); - } - } - - void _encode(bufferlist &bl) { - ::_encode(mask, bl); - ::_encode(inode, bl); - ::_encode(dirfrag_auth, bl); - ::_encode(dirfrag_dist, bl); - ::_encode(dirfrag_rep, bl); - ::_encode(symlink, bl); - dirfragtree._encode(bl); + + ::_encode_simple(mask, bl); + ::_encode_simple(in->inode, bl); + ::_encode_simple(in->symlink, bl); + in->dirfragtree._encode(bl); } - void _decode(bufferlist &bl, int& off) { - ::_decode(mask, bl, off); - ::_decode(inode, bl, off); - ::_decode(dirfrag_auth, bl, off); - ::_decode(dirfrag_dist, bl, off); - ::_decode(dirfrag_rep, bl, off); - ::_decode(symlink, bl, off); - dirfragtree._decode(bl, off); - } }; class MClientReply : public Message { // reply data - struct { + struct st_ { long tid; int op; int result; // error code unsigned char file_caps; // for open long 
file_caps_seq; uint64_t file_data_version; // for client buffercache consistency - - int _num_trace_in; - int _dir_size; } st; string path; + list trace_in; + list trace_dir; list trace_dn; + bufferlist trace_bl; - list dir_dn; + DirStat *dir_dir; list dir_in; + list dir_dn; + bufferlist dir_bl; public: long get_tid() { return st.tid; } @@ -148,12 +156,6 @@ class MClientReply : public Message { inodeno_t get_ino() { return trace_in.back()->inode.ino; } const inode_t& get_inode() { return trace_in.back()->inode; } - const list& get_trace_in() { return trace_in; } - const list& get_trace_dn() { return trace_dn; } - - const list& get_dir_in() { return dir_in; } - const list& get_dir_dn() { return dir_dn; } - unsigned char get_file_caps() { return st.file_caps; } long get_file_caps_seq() { return st.file_caps_seq; } uint64_t get_file_data_version() { return st.file_data_version; } @@ -163,18 +165,15 @@ class MClientReply : public Message { void set_file_caps_seq(long s) { st.file_caps_seq = s; } void set_file_data_version(uint64_t v) { st.file_data_version = v; } - MClientReply() {}; + MClientReply() : dir_dir(0) {}; MClientReply(MClientRequest *req, int result = 0) : - Message(MSG_CLIENT_REPLY) { + Message(MSG_CLIENT_REPLY), dir_dir(0) { memset(&st, 0, sizeof(st)); this->st.tid = req->get_tid(); this->st.op = req->get_op(); this->path = req->get_path(); this->st.result = result; - - st._dir_size = 0; - st._num_trace_in = 0; } virtual ~MClientReply() { list::iterator it; @@ -195,100 +194,91 @@ class MClientReply : public Message { // serialization virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(st), (char*)&st); - off += sizeof(st); - - _decode(path, payload, off); - - for (int i=0; i_decode(payload, off); - trace_in.push_back(ci); - } - - // dir contents - ::_decode(dir_dn, payload, off); - for (int i=0; i_decode(payload, off); - dir_in.push_back(ci); - } + bufferlist::iterator p = payload.begin(); + ::_decode_simple(st, p); + 
::_decode_simple(path, p); + ::_decode_simple(trace_bl, p); + ::_decode_simple(dir_bl, p); + assert(p.end()); } virtual void encode_payload() { - payload.append((char*)&st, sizeof(st)); - _encode(path, payload); - - // trace - list::iterator pdn = trace_dn.begin(); - list::iterator pin; - for (pin = trace_in.begin(); - pin != trace_in.end(); - ++pin) { - if (pin != trace_in.begin()) { - ::_encode(*pdn, payload); - ++pdn; - } - (*pin)->_encode(payload); - } - - // dir contents - ::_encode(dir_dn, payload); - for (pin = dir_in.begin(); - pin != dir_in.end(); - ++pin) - (*pin)->_encode(payload); + ::_encode_simple(st, payload); + ::_encode_simple(path, payload); + ::_encode_simple(trace_bl, payload); + ::_encode_simple(dir_bl, payload); } - // builders - /* - void add_dir_item(string& dn, InodeStat *in) { - dir_dn.push_back(dn); - dir_in.push_back(in); - ++st._dir_size; - }*/ - void take_dir_items(list& dnls, - list& inls, - int num) { - dir_dn.swap(dnls); - dir_in.swap(inls); - st._dir_size = num; + + // dir contents + void take_dir_items(bufferlist& bl) { + dir_bl.claim(bl); } - /* - void copy_dir_items(const list& inls, - const list& dnls) { - list::const_iterator pdn = dnls.begin(); - list::const_iterator pin = inls.begin(); - while (pin != inls.end()) { - // copy! 
- InodeStat *i = new InodeStat; - *i = **pin; - dir_in.push_back(i); - dir_dn.push_back(*pdn); - ++pin; - ++pdn; - ++st._dir_size; + void _decode_dir() { + bufferlist::iterator p = dir_bl.begin(); + dir_dir = new DirStat(p); + while (!p.end()) { + string dn; + ::_decode_simple(dn, p); + dir_dn.push_back(dn); + dir_in.push_back(new InodeStat(p)); } } - */ + const list& get_dir_in() { + if (dir_in.empty() && dir_bl.length()) _decode_dir(); + return dir_in; + } + const list& get_dir_dn() { + if (dir_dn.empty() && dir_bl.length()) _decode_dir(); + return dir_dn; + } + const DirStat* get_dir_dir() { + return dir_dir; + } + + + // trace void set_trace_dist(CInode *in, int whoami) { - st._num_trace_in = 0; + // inode, dentry, dir, ..., inode while (in) { - // add this inode to trace, along with referring dentry name - if (in->get_parent_dn()) - trace_dn.push_front(in->get_parent_dn()->get_name()); - trace_in.push_front(new InodeStat(in, whoami)); - ++st._num_trace_in; - - in = in->get_parent_inode(); + InodeStat::_encode(trace_bl, in); + CDentry *dn = in->get_parent_dn(); + if (!dn) break; + ::_encode_simple(in->get_parent_dn()->get_name(), trace_bl); + DirStat::_encode(trace_bl, dn->get_dir(), whoami); + in = dn->get_dir()->get_inode(); } } + void _decode_trace() { + bufferlist::iterator p = trace_bl.begin(); + while (!p.end()) { + // inode + trace_in.push_front(new InodeStat(p)); + if (!p.end()) { + // dentry + string ref_dn; + ::_decode_simple(ref_dn, p); + trace_dn.push_front(ref_dn); + + // dir + trace_dir.push_front(new DirStat(p)); + } + } + } + + const list& get_trace_in() { + if (trace_in.empty() && trace_bl.length()) _decode_trace(); + return trace_in; + } + const list& get_trace_dir() { + if (trace_in.empty() && trace_bl.length()) _decode_trace(); + return trace_dir; + } + const list& get_trace_dn() { + if (trace_in.empty() && trace_bl.length()) _decode_trace(); + return trace_dn; + } + }; diff --git a/trunk/ceph/messages/MClientRequestForward.h 
b/trunk/ceph/messages/MClientRequestForward.h index c81e3b3c06ce8..53fb5270d30a9 100644 --- a/trunk/ceph/messages/MClientRequestForward.h +++ b/trunk/ceph/messages/MClientRequestForward.h @@ -18,8 +18,8 @@ class MClientRequestForward : public Message { tid_t tid; - int dest_mds; - int num_fwd; + int32_t dest_mds; + int32_t num_fwd; public: MClientRequestForward() : Message(MSG_CLIENT_REQUEST_FORWARD) {} diff --git a/trunk/ceph/messages/MDiscover.h b/trunk/ceph/messages/MDiscover.h index 5917c719a8af4..7294bad22d796 100644 --- a/trunk/ceph/messages/MDiscover.h +++ b/trunk/ceph/messages/MDiscover.h @@ -40,13 +40,14 @@ class MDiscover : public Message { int get_asker() { return asker; } inodeno_t get_base_ino() { return base_ino; } frag_t get_base_dir_frag() { return base_dir_frag; } + filepath& get_want() { return want; } inodeno_t get_want_ino() { return want_ino; } - const string& get_dentry(int n) { return want[n]; } + const string& get_dentry(int n) { return want[n]; } bool wants_base_dir() { return want_base_dir; } bool wants_xlocked() { return want_xlocked; } - + void set_base_dir_frag(frag_t f) { base_dir_frag = f; } MDiscover() { } diff --git a/trunk/ceph/messages/MDiscoverReply.h b/trunk/ceph/messages/MDiscoverReply.h index cd3729e2914a0..67491049c0b8f 100644 --- a/trunk/ceph/messages/MDiscoverReply.h +++ b/trunk/ceph/messages/MDiscoverReply.h @@ -69,25 +69,36 @@ using namespace std; */ class MDiscoverReply : public Message { - inodeno_t base_ino; - bool no_base_dir; // no base dir (but IS dentry+inode) - bool no_base_dentry; // no base dentry (but IS inode) - bool flag_error_dn; + // info about original request + inodeno_t base_ino; + frag_t base_dir_frag; + bool wanted_base_dir; + bool wanted_xlocked; + inodeno_t wanted_ino; + + // and the response + bool flag_error_dn; bool flag_error_ino; - bool flag_error_dir; - string error_dentry; // dentry that was not found (to trigger waiters on asker) + bool flag_error_dir; + bool no_base_dir; // no base dir (but 
IS dentry+inode) + bool no_base_dentry; // no base dentry (but IS inode) + string error_dentry; // dentry that was not found (to trigger waiters on asker) + int dir_auth_hint; - bool wanted_xlocks_hint; - + vector dirs; // not inode-aligned if no_base_dir = true. vector dentries; // not inode-aligned if no_base_dentry = true vector inodes; - string path; public: // accessors inodeno_t get_base_ino() { return base_ino; } + frag_t get_base_dir_frag() { return base_dir_frag; } + bool get_wanted_base_dir() { return wanted_base_dir; } + bool get_wanted_xlocked() { return wanted_xlocked; } + inodeno_t get_wanted_ino() { return wanted_ino; } + int get_num_inodes() { return inodes.size(); } int get_num_dentries() { return dentries.size(); } int get_num_dirs() { return dirs.size(); } @@ -106,17 +117,13 @@ class MDiscoverReply : public Message { bool has_base_dentry() { return !no_base_dentry && dentries.size(); } bool has_base_inode() { return no_base_dir && no_base_dentry; } - const string& get_path() { return path; } - - // bool is_flag_forward() { return flag_forward; } bool is_flag_error_dn() { return flag_error_dn; } bool is_flag_error_ino() { return flag_error_ino; } bool is_flag_error_dir() { return flag_error_dir; } string& get_error_dentry() { return error_dentry; } + int get_dir_auth_hint() { return dir_auth_hint; } - bool get_wanted_xlocks_hint() { return wanted_xlocks_hint; } - void set_wanted_xlocks_hint(bool w) { wanted_xlocks_hint = w; } // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. 
CInodeDiscover& get_inode(int n) { return *(inodes[n]); } @@ -126,13 +133,31 @@ class MDiscoverReply : public Message { // cons MDiscoverReply() {} - MDiscoverReply(inodeno_t base_ino) : - Message(MSG_MDS_DISCOVERREPLY) { - this->base_ino = base_ino; - flag_error_dn = false; - flag_error_dir = false; - no_base_dir = no_base_dentry = false; - dir_auth_hint = CDIR_AUTH_UNKNOWN; + MDiscoverReply(MDiscover *dis) : + Message(MSG_MDS_DISCOVERREPLY), + base_ino(dis->get_base_ino()), + base_dir_frag(dis->get_base_dir_frag()), + wanted_base_dir(dis->wants_base_dir()), + wanted_xlocked(dis->wants_xlocked()), + wanted_ino(dis->get_want_ino()), + flag_error_dn(false), + flag_error_ino(false), + flag_error_dir(false), + no_base_dir(false), no_base_dentry(false), + dir_auth_hint(CDIR_AUTH_UNKNOWN) { + } + MDiscoverReply(dirfrag_t df) : + Message(MSG_MDS_DISCOVERREPLY), + base_ino(df.ino), + base_dir_frag(df.frag), + wanted_base_dir(false), + wanted_xlocked(false), + wanted_ino(inodeno_t()), + flag_error_dn(false), + flag_error_ino(false), + flag_error_dir(false), + no_base_dir(false), no_base_dentry(false), + dir_auth_hint(CDIR_AUTH_UNKNOWN) { } ~MDiscoverReply() { for (vector::iterator it = dirs.begin(); @@ -154,14 +179,13 @@ class MDiscoverReply : public Message { bool is_empty() { return dirs.empty() && dentries.empty() && inodes.empty() && !flag_error_dn && + !flag_error_ino && !flag_error_dir && dir_auth_hint == CDIR_AUTH_UNKNOWN; } void add_dentry(CDentryDiscover* ddis) { if (dentries.empty() && dirs.empty()) no_base_dir = true; dentries.push_back(ddis); - if (path.length()) path += "/"; - path += ddis->get_dname(); } void add_inode(CInodeDiscover* din) { @@ -173,6 +197,7 @@ class MDiscoverReply : public Message { dirs.push_back( dir ); } + // void set_flag_forward() { flag_forward = true; } void set_flag_error_dn(const string& dn) { flag_error_dn = true; @@ -196,14 +221,16 @@ class MDiscoverReply : public Message { virtual void decode_payload() { int off = 0; 
::_decode(base_ino, payload, off); - ::_decode(no_base_dir, payload, off); - ::_decode(no_base_dentry, payload, off); + ::_decode(base_dir_frag, payload, off); + ::_decode(wanted_base_dir, payload, off); + ::_decode(wanted_xlocked, payload, off); ::_decode(flag_error_dn, payload, off); ::_decode(flag_error_ino, payload, off); ::_decode(flag_error_dir, payload, off); + ::_decode(no_base_dir, payload, off); + ::_decode(no_base_dentry, payload, off); ::_decode(error_dentry, payload, off); ::_decode(dir_auth_hint, payload, off); - ::_decode(wanted_xlocks_hint, payload, off); // dirs int n; @@ -232,14 +259,16 @@ class MDiscoverReply : public Message { } void encode_payload() { ::_encode(base_ino, payload); - ::_encode(no_base_dir, payload); - ::_encode(no_base_dentry, payload); + ::_encode(base_dir_frag, payload); + ::_encode(wanted_base_dir, payload); + ::_encode(wanted_xlocked, payload); ::_encode(flag_error_dn, payload); ::_encode(flag_error_ino, payload); ::_encode(flag_error_dir, payload); + ::_encode(no_base_dir, payload); + ::_encode(no_base_dentry, payload); ::_encode(error_dentry, payload); ::_encode(dir_auth_hint, payload); - ::_encode(wanted_xlocks_hint, payload); // dirs int n = dirs.size(); diff --git a/trunk/ceph/messages/MExportDir.h b/trunk/ceph/messages/MExportDir.h index 8fafbe0312636..9964a7059c1d2 100644 --- a/trunk/ceph/messages/MExportDir.h +++ b/trunk/ceph/messages/MExportDir.h @@ -22,8 +22,8 @@ class MExportDir : public Message { dirfrag_t dirfrag; - list dirstate; // a bl for reach dir - list bounds; + bufferlist dirstate; + list bounds; public: MExportDir() {} @@ -37,14 +37,11 @@ class MExportDir : public Message { } dirfrag_t get_dirfrag() { return dirfrag; } - list& get_dirstate() { return dirstate; } + bufferlist& get_dirstate() { return dirstate; } list& get_bounds() { return bounds; } - void add_dir(bufferlist& dir) { - dirstate.push_back(dir); - } - void set_dirstate(const list& ls) { - dirstate = ls; + void take_dirstate(bufferlist& bl) 
{ + dirstate.claim(bl); } void add_export(dirfrag_t df) { bounds.push_back(df); diff --git a/trunk/ceph/messages/MExportDirDiscover.h b/trunk/ceph/messages/MExportDirDiscover.h index 7375fad6c5057..c311d1e87e940 100644 --- a/trunk/ceph/messages/MExportDirDiscover.h +++ b/trunk/ceph/messages/MExportDirDiscover.h @@ -45,15 +45,14 @@ class MExportDirDiscover : public Message { } virtual void decode_payload() { - int off = 0; - payload.copy(off, sizeof(dirfrag), (char*)&dirfrag); - off += sizeof(dirfrag); - ::_decode(path, payload, off); + bufferlist::iterator p = payload.begin(); + ::_decode_simple(dirfrag, p); + ::_decode_simple(path, p); } virtual void encode_payload() { - payload.append((char*)&dirfrag, sizeof(dirfrag)); - ::_encode(path, payload); + ::_encode_simple(dirfrag, payload); + ::_encode_simple(path, payload); } }; diff --git a/trunk/ceph/messages/MLock.h b/trunk/ceph/messages/MLock.h index 208b5b6e8a003..95c3e5f325212 100644 --- a/trunk/ceph/messages/MLock.h +++ b/trunk/ceph/messages/MLock.h @@ -32,6 +32,7 @@ #define LOCK_AC_LOCKACK 3 #define LOCK_AC_REQSCATTER 7 +#define LOCK_AC_REQUNSCATTER 8 #define LOCK_AC_FOR_REPLICA(a) ((a) < 0) #define LOCK_AC_FOR_AUTH(a) ((a) > 0) @@ -47,14 +48,15 @@ static const char *get_lock_action_name(int a) { case LOCK_AC_MIXEDACK: return "mixedack"; case LOCK_AC_LOCKACK: return "lockack"; case LOCK_AC_REQSCATTER: return "reqscatter"; + case LOCK_AC_REQUNSCATTER: return "requnscatter"; default: assert(0); return 0; } } class MLock : public Message { - int asker; // who is initiating this request - int action; // action type + int32_t action; // action type + int32_t asker; // who is initiating this request metareqid_t reqid; // for remote lock requests char lock_type; // lock object type @@ -72,24 +74,20 @@ class MLock : public Message { MDSCacheObjectInfo &get_object_info() { return object_info; } MLock() {} - MLock(int action, int asker) : - Message(MSG_MDS_LOCK) { - this->action = action; - this->asker = asker; - } - 
MLock(SimpleLock *lock, int action, int asker) : - Message(MSG_MDS_LOCK) { - this->lock_type = lock->get_type(); + MLock(int ac, int as) : + Message(MSG_MDS_LOCK), + action(ac), asker(as), + lock_type(0) { } + MLock(SimpleLock *lock, int ac, int as) : + Message(MSG_MDS_LOCK), + action(ac), asker(as), + lock_type(lock->get_type()) { lock->get_parent()->set_object_info(object_info); - this->action = action; - this->asker = asker; } - MLock(SimpleLock *lock, int action, int asker, bufferlist& bl) : - Message(MSG_MDS_LOCK) { - this->lock_type = lock->get_type(); + MLock(SimpleLock *lock, int ac, int as, bufferlist& bl) : + Message(MSG_MDS_LOCK), + action(ac), asker(as), lock_type(lock->get_type()) { lock->get_parent()->set_object_info(object_info); - this->action = action; - this->asker = asker; data.claim(bl); } virtual char *get_type_name() { return "ILock"; } diff --git a/trunk/ceph/messages/MMDSCacheRejoin.h b/trunk/ceph/messages/MMDSCacheRejoin.h index c2d26ed23f45a..844ece02000ae 100644 --- a/trunk/ceph/messages/MMDSCacheRejoin.h +++ b/trunk/ceph/messages/MMDSCacheRejoin.h @@ -64,15 +64,15 @@ class MMDSCacheRejoin : public Message { inode_full(const inode_t& i, const string& s, const fragtree_t& f) : inode(i), symlink(s), dirfragtree(f) {} - void _decode(bufferlist& bl, int& off) { - ::_decode(inode, bl, off); - ::_decode(symlink, bl, off); - ::_decode(dirfragtree, bl, off); + void _decode(bufferlist::iterator& p) { + ::_decode_simple(inode, p); + ::_decode_simple(symlink, p); + dirfragtree._decode(p); } void _encode(bufferlist& bl) const { ::_encode(inode, bl); ::_encode(symlink, bl); - ::_encode(dirfragtree, bl); + dirfragtree._encode(bl); } }; @@ -205,24 +205,24 @@ class MMDSCacheRejoin : public Message { ::_encode(xlocked_dentries, payload); } void decode_payload() { - int off = 0; - ::_decode(op, payload, off); - ::_decode(strong_inodes, payload, off); - ::_decode_complex(full_inodes, payload, off); - ::_decode(authpinned_inodes, payload, off); - 
::_decode(xlocked_inodes, payload, off); - ::_decode(cap_export_bl, payload, off); + bufferlist::iterator p = payload.begin(); + ::_decode_simple(op, p); + ::_decode_simple(strong_inodes, p); + ::_decode_complex(full_inodes, p); + ::_decode_simple(authpinned_inodes, p); + ::_decode_simple(xlocked_inodes, p); + ::_decode_simple(cap_export_bl, p); if (cap_export_bl.length()) { - int off = 0; - ::_decode(cap_exports, cap_export_bl, off); - ::_decode(cap_export_paths, cap_export_bl, off); + bufferlist::iterator q = cap_export_bl.begin(); + ::_decode_simple(cap_exports, q); + ::_decode_simple(cap_export_paths, q); } - ::_decode(strong_dirfrags, payload, off); - ::_decode(weak, payload, off); - ::_decode(weak_inodes, payload, off); - ::_decode(strong_dentries, payload, off); - ::_decode(authpinned_dentries, payload, off); - ::_decode(xlocked_dentries, payload, off); + ::_decode_simple(strong_dirfrags, p); + ::_decode_simple(weak, p); + ::_decode_simple(weak_inodes, p); + ::_decode_simple(strong_dentries, p); + ::_decode_simple(authpinned_dentries, p); + ::_decode_simple(xlocked_dentries, p); } }; diff --git a/trunk/ceph/messages/MMDSSlaveRequest.h b/trunk/ceph/messages/MMDSSlaveRequest.h index e2dbbd8f7298a..5ef65223ec1c9 100644 --- a/trunk/ceph/messages/MMDSSlaveRequest.h +++ b/trunk/ceph/messages/MMDSSlaveRequest.h @@ -35,9 +35,6 @@ class MMDSSlaveRequest : public Message { static const int OP_RENAMEPREP = 7; static const int OP_RENAMEPREPACK = -7; - static const int OP_RENAMEGETINODE = 8; - static const int OP_RENAMEGETINODEACK = -8; - static const int OP_FINISH = 17; static const int OP_ABORT = 20; // used for recovery only @@ -58,8 +55,6 @@ class MMDSSlaveRequest : public Message { case OP_RENAMEPREP: return "rename_prep"; case OP_RENAMEPREPACK: return "rename_prep_ack"; - case OP_RENAMEGETINODE: return "rename_get_inode"; - case OP_RENAMEGETINODEACK: return "rename_get_inode_ack"; case OP_FINISH: return "finish"; // commit case OP_ABORT: return "abort"; @@ -84,9 
+79,10 @@ class MMDSSlaveRequest : public Message { // for rename prep string srcdnpath; string destdnpath; - set srcdn_replicas; + set witnesses; bufferlist inode_export; version_t inode_export_v; + bufferlist srci_replica; utime_t now; bufferlist stray; // stray dir + dentry @@ -116,26 +112,28 @@ public: ::_encode_complex(authpins, payload); ::_encode(srcdnpath, payload); ::_encode(destdnpath, payload); - ::_encode(srcdn_replicas, payload); + ::_encode(witnesses, payload); ::_encode(now, payload); ::_encode(inode_export, payload); ::_encode(inode_export_v, payload); + ::_encode(srci_replica, payload); ::_encode(stray, payload); } void decode_payload() { - int off = 0; - ::_decode(reqid, payload, off); - ::_decode(op, payload, off); - ::_decode(lock_type, payload, off); - object_info._decode(payload, off); - ::_decode_complex(authpins, payload, off); - ::_decode(srcdnpath, payload, off); - ::_decode(destdnpath, payload, off); - ::_decode(srcdn_replicas, payload, off); - ::_decode(now, payload, off); - ::_decode(inode_export, payload, off); - ::_decode(inode_export_v, payload, off); - ::_decode(stray, payload, off); + bufferlist::iterator p = payload.begin(); + ::_decode_simple(reqid, p); + ::_decode_simple(op, p); + ::_decode_simple(lock_type, p); + object_info._decode(p); + ::_decode_complex(authpins, p); + ::_decode_simple(srcdnpath, p); + ::_decode_simple(destdnpath, p); + ::_decode_simple(witnesses, p); + ::_decode_simple(now, p); + ::_decode_simple(inode_export, p); + ::_decode_simple(inode_export_v, p); + ::_decode_simple(srci_replica, p); + ::_decode_simple(stray, p); } char *get_type_name() { return "slave_request"; } diff --git a/trunk/ceph/mon/ClientMonitor.cc b/trunk/ceph/mon/ClientMonitor.cc index ebc204507d034..b7ac275b0afca 100644 --- a/trunk/ceph/mon/ClientMonitor.cc +++ b/trunk/ceph/mon/ClientMonitor.cc @@ -26,8 +26,8 @@ #include "config.h" -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << 
mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client " +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".client v" << client_map.version << " " @@ -64,8 +64,7 @@ bool ClientMonitor::update_from_paxos() inc._decode(bl, off); client_map.apply_incremental(inc); - dout(1) << "v" << client_map.version << ": " - << client_map.client_addr.size() << " clients (+" + dout(1) << client_map.client_addr.size() << " clients (+" << inc.mount.size() << " -" << inc.unmount.size() << ")" << dendl; diff --git a/trunk/ceph/mon/MonMap.h b/trunk/ceph/mon/MonMap.h index eb18579cd7e99..dbe9c9b5ac5e9 100644 --- a/trunk/ceph/mon/MonMap.h +++ b/trunk/ceph/mon/MonMap.h @@ -24,8 +24,8 @@ class MonMap { public: - epoch_t epoch; // what epoch/version of the monmap - int num_mon; + epoch_t epoch; // what epoch/version of the monmap + int32_t num_mon; vector mon_inst; int last_mon; // last mon i talked to @@ -52,20 +52,16 @@ class MonMap { } void encode(bufferlist& blist) { - blist.append((char*)&epoch, sizeof(epoch)); - blist.append((char*)&num_mon, sizeof(num_mon)); - - _encode(mon_inst, blist); + ::_encode(epoch, blist); + ::_encode(num_mon, blist); + ::_encode(mon_inst, blist); } void decode(bufferlist& blist) { int off = 0; - blist.copy(off, sizeof(epoch), (char*)&epoch); - off += sizeof(epoch); - blist.copy(off, sizeof(num_mon), (char*)&num_mon); - off += sizeof(num_mon); - - _decode(mon_inst, blist, off); + ::_decode(epoch, blist, off); + ::_decode(num_mon, blist, off); + ::_decode(mon_inst, blist, off); } // read from/write to a file diff --git a/trunk/ceph/mon/MonitorStore.cc b/trunk/ceph/mon/MonitorStore.cc index 30eb4796a5383..86df22bcd6590 100644 --- a/trunk/ceph/mon/MonitorStore.cc +++ b/trunk/ceph/mon/MonitorStore.cc @@ -109,9 +109,8 @@ void MonitorStore::put_int(version_t val, const char *a, const char *b) char tfn[200]; sprintf(tfn, "%s.new", fn); - int fd = ::open(tfn, O_WRONLY|O_CREAT); + int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644); assert(fd > 0); - ::fchmod(fd, 0644); ::write(fd, vs, strlen(vs)); ::close(fd); ::rename(tfn, fn); 
@@ -201,12 +200,9 @@ int MonitorStore::put_bl_ss(bufferlist& bl, const char *a, const char *b) char tfn[200]; sprintf(tfn, "%s.new", fn); - int fd = ::open(tfn, O_WRONLY|O_CREAT); + int fd = ::open(tfn, O_WRONLY|O_CREAT, 0644); assert(fd); - // chmod - ::fchmod(fd, 0644); - // write data for (list::const_iterator it = bl.buffers().begin(); it != bl.buffers().end(); diff --git a/trunk/ceph/mon/OSDMonitor.cc b/trunk/ceph/mon/OSDMonitor.cc index ff912ff815423..200187510f698 100644 --- a/trunk/ceph/mon/OSDMonitor.cc +++ b/trunk/ceph/mon/OSDMonitor.cc @@ -37,8 +37,8 @@ #include "config.h" -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(e" << osdmap.get_epoch() << ") " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(e" << osdmap.get_epoch() << ") " +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_dout << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) *_derr << dbeginl << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".osd e" << osdmap.get_epoch() << " " // FAKING diff --git a/trunk/ceph/mon/PGMap.h b/trunk/ceph/mon/PGMap.h index af6f8c206079e..b915c28cbd755 100644 --- a/trunk/ceph/mon/PGMap.h +++ b/trunk/ceph/mon/PGMap.h @@ -40,6 +40,8 @@ public: ::_decode(pg_stat_updates, bl, off); ::_decode(osd_stat_updates, bl, off); } + + Incremental() : version(0) {} }; void apply_incremental(Incremental& inc) { diff --git a/trunk/ceph/msg/SimpleMessenger.cc b/trunk/ceph/msg/SimpleMessenger.cc index 846d8a6381665..7e29f033d83b5 100644 --- a/trunk/ceph/msg/SimpleMessenger.cc +++ b/trunk/ceph/msg/SimpleMessenger.cc @@ -86,6 +86,7 @@ int Rank::Accepter::start() dout(10) << "accepter.start" << dendl; char hostname[100]; + memset(hostname, 0, 100); gethostname(hostname, 100); dout(2) << "accepter.start my hostname is " << hostname << dendl; @@ -168,8 +169,10 @@ int Rank::Accepter::start() // set a harmless handle for SIGUSR1 (we'll use it to stop the accepter) struct sigaction sa; + memset(&sa, 0, sizeof(sa)); sa.sa_handler = noop_signal_handler; sa.sa_flags = 0; + sigemptyset(&sa.sa_mask); sigaction(SIGUSR1, &sa, NULL); // start thread @@ -1216,27 +1219,26 @@ void Rank::wait() * EntityMessenger */ -Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : - Messenger(myaddr), - stop(false), - dispatch_thread(this) -{ -} -Rank::EntityMessenger::~EntityMessenger() -{ - // join dispatch thread - if (dispatch_thread.is_started()) - dispatch_thread.join(); -} - void Rank::EntityMessenger::dispatch_entry() { lock.Lock(); while (!stop) { - if (!dispatch_queue.empty()) { + if (!dispatch_queue.empty() || !prio_dispatch_queue.empty()) { list ls; - ls.swap(dispatch_queue); - qlen = 0; + if (!prio_dispatch_queue.empty()) { + ls.swap(prio_dispatch_queue); + pqlen = 0; + } else { + if (0) { + ls.swap(dispatch_queue); + qlen = 0; + } else { + // limit how much low-prio stuff we grab, to avoid starving high-prio messages! 
+ ls.push_back(dispatch_queue.front()); + dispatch_queue.pop_front(); + qlen--; + } + } lock.Unlock(); { diff --git a/trunk/ceph/msg/SimpleMessenger.h b/trunk/ceph/msg/SimpleMessenger.h index 87aa9793144b3..6bd417adc8e10 100644 --- a/trunk/ceph/msg/SimpleMessenger.h +++ b/trunk/ceph/msg/SimpleMessenger.h @@ -164,8 +164,9 @@ private: Mutex lock; Cond cond; list dispatch_queue; + list prio_dispatch_queue; bool stop; - int qlen; + int qlen, pqlen; class DispatchThread : public Thread { EntityMessenger *m; @@ -184,22 +185,28 @@ private: m->set_recv_stamp(g_clock.now()); lock.Lock(); - dispatch_queue.push_back(m); - qlen++; - cond.Signal(); - lock.Unlock(); - } - void queue_messages(list ls) { - lock.Lock(); - qlen += ls.size(); - dispatch_queue.splice(dispatch_queue.end(), ls); + if (m->get_source().is_mon()) { + prio_dispatch_queue.push_back(m); + pqlen++; + } else { + qlen++; + dispatch_queue.push_back(m); + } cond.Signal(); lock.Unlock(); } public: - EntityMessenger(entity_name_t myaddr); - ~EntityMessenger(); + EntityMessenger(entity_name_t myaddr) : + Messenger(myaddr), + stop(false), + qlen(0), pqlen(0), + dispatch_thread(this) { } + ~EntityMessenger() { + // join dispatch thread + if (dispatch_thread.is_started()) + dispatch_thread.join(); + } void ready(); bool is_stopped() { return stop; } @@ -210,7 +217,7 @@ private: const entity_addr_t &get_myaddr(); - int get_dispatch_queue_len() { return qlen; } + int get_dispatch_queue_len() { return qlen + pqlen; } void reset_myname(entity_name_t m); diff --git a/trunk/ceph/msg/msg_types.h b/trunk/ceph/msg/msg_types.h index 3c425f2b1ca0c..652525729cdfc 100644 --- a/trunk/ceph/msg/msg_types.h +++ b/trunk/ceph/msg/msg_types.h @@ -111,8 +111,9 @@ namespace __gnu_cxx { */ struct entity_addr_t { struct ceph_entity_addr v; + uint32_t _pad; - entity_addr_t() { + entity_addr_t() : _pad(0) { v.port = v.nonce = 0; v.ipq[0] = v.ipq[1] = v.ipq[2] = v.ipq[3] = 0; } diff --git a/trunk/ceph/newsyn.cc b/trunk/ceph/newsyn.cc index 
0ad3064560056..e580e49a9b7e9 100644 --- a/trunk/ceph/newsyn.cc +++ b/trunk/ceph/newsyn.cc @@ -14,6 +14,8 @@ #define intabs(x) ((x) >= 0 ? (x):(-(x))) +#include + #include #include #include @@ -46,7 +48,6 @@ extern std::map g_fake_kill_after; /* * start up NewMessenger via MPI. */ -#include pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) { diff --git a/trunk/ceph/osd/OSD.cc b/trunk/ceph/osd/OSD.cc index 30c320c623aac..ab57f0c603302 100644 --- a/trunk/ceph/osd/OSD.cc +++ b/trunk/ceph/osd/OSD.cc @@ -838,6 +838,8 @@ void OSD::send_pg_stats() bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch) { bool shared = false; + dout(20) << "_share_map_incoming " << inst << " " << epoch << dendl; + assert(osd_lock.is_locked()); // does client have old map? if (inst.name.is_client()) { @@ -851,8 +853,10 @@ bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch) // does peer have old map? if (inst.name.is_osd()) { // remember - if (peer_map_epoch[inst.name] < epoch) + if (peer_map_epoch[inst.name] < epoch) { + dout(20) << "peer " << inst.name << " has " << epoch << dendl; peer_map_epoch[inst.name] = epoch; + } // older? 
if (peer_map_epoch[inst.name] < osdmap->get_epoch()) { diff --git a/trunk/ceph/osd/osd_types.h b/trunk/ceph/osd/osd_types.h index 6c36d0f2ffd77..08292252934ec 100644 --- a/trunk/ceph/osd/osd_types.h +++ b/trunk/ceph/osd/osd_types.h @@ -245,7 +245,7 @@ struct pg_stat_t { int64_t num_blocks; // in 4k blocks int64_t num_objects; - pg_stat_t() : state(0), size(0), num_blocks(0), num_objects(0) {} + pg_stat_t() : reported(0), state(0), size(0), num_blocks(0), num_objects(0) {} }; diff --git a/trunk/ceph/osdc/Objecter.cc b/trunk/ceph/osdc/Objecter.cc index 9d31d023ed56e..e6efee1aa4a33 100644 --- a/trunk/ceph/osdc/Objecter.cc +++ b/trunk/ceph/osdc/Objecter.cc @@ -130,15 +130,18 @@ void Objecter::handle_osd_map(MOSDMap *m) void Objecter::maybe_request_map() { utime_t now; - - if (last_epoch_requested <= osdmap->get_epoch() || - (now = g_clock.now()) - last_epoch_requested_stamp > g_conf.objecter_map_request_interval) { - dout(10) << "maybe_request_map requesting next osd map" << dendl; - last_epoch_requested_stamp = now; - last_epoch_requested = osdmap->get_epoch()+1; - messenger->send_message(new MOSDGetMap(osdmap->get_epoch(), last_epoch_requested), - monmap->get_inst(monmap->pick_mon())); - } + if (!osdmap) goto yes; + if (last_epoch_requested <= osdmap->get_epoch()) goto yes; + now = g_clock.now(); + if (now - last_epoch_requested_stamp > g_conf.objecter_map_request_interval) goto yes; + return; + + yes: + dout(10) << "maybe_request_map requesting next osd map" << dendl; + last_epoch_requested_stamp = now; + last_epoch_requested = osdmap->get_epoch()+1; + messenger->send_message(new MOSDGetMap(osdmap->get_epoch(), last_epoch_requested), + monmap->get_inst(monmap->pick_mon())); } diff --git a/trunk/ceph/osdc/Objecter.h b/trunk/ceph/osdc/Objecter.h index 30b1c3840a0c4..ed5c44745604e 100644 --- a/trunk/ceph/osdc/Objecter.h +++ b/trunk/ceph/osdc/Objecter.h @@ -172,6 +172,7 @@ class Objecter { messenger(m), monmap(mm), osdmap(om), last_tid(0), client_inc(-1), 
num_unacked(0), num_uncommitted(0), + last_epoch_requested(0), client_lock(l), timer(l) { } ~Objecter() { } diff --git a/trunk/ceph/valgrind.supp b/trunk/ceph/valgrind.supp index a6154be057544..356df039050c4 100644 --- a/trunk/ceph/valgrind.supp +++ b/trunk/ceph/valgrind.supp @@ -23,3 +23,40 @@ obj:* obj:* } + +# gethostbyname +{ + gethostbyname on issdm + Memcheck:Param + socketcall.sendto(msg) + fun:send + fun:get_mapping + fun:__nscd_get_map_ref + fun:nscd_gethst_r + fun:__nscd_gethostbyname_r + fun:gethostbyname_r@@GLIBC_2.2.5 + fun:gethostbyname + fun:_ZN4Rank8Accepter5startEv + fun:_ZN4Rank10start_rankEv + fun:main +} + +# gethostbyname + +{ + gethostbyname on foil + Memcheck:Addr8 + obj:/lib/ld-2.6.1.so + obj:/lib/ld-2.6.1.so + obj:/lib/ld-2.6.1.so + obj:/lib/ld-2.6.1.so + obj:/lib/ld-2.6.1.so + obj:/lib/ld-2.6.1.so + obj:/lib/ld-2.6.1.so + obj:/lib/libc-2.6.1.so + obj:/lib/ld-2.6.1.so + fun:__libc_dlopen_mode + fun:__nss_lookup_function + obj:/lib/libc-2.6.1.so +} + -- 2.39.5