From 2aec5d111f178d12ab3832e35817fa3a30de252b Mon Sep 17 00:00:00 2001 From: sageweil Date: Mon, 13 Aug 2007 19:14:00 +0000 Subject: [PATCH] merged branches/sage/mds r1627 back to trunk git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1628 29311d96-e01e-0410-9327-a35deaab8ce9 --- trunk/ceph/Makefile | 105 +- trunk/ceph/TODO | 106 +- trunk/ceph/cfuse.cc | 6 +- trunk/ceph/client/Client.cc | 1739 +++++++++++++++------- trunk/ceph/client/Client.h | 212 ++- trunk/ceph/client/SyntheticClient.cc | 481 +++++- trunk/ceph/client/SyntheticClient.h | 15 +- trunk/ceph/client/Trace.h | 25 +- trunk/ceph/client/fuse.cc | 113 +- trunk/ceph/client/fuse_ll.cc | 385 +++++ trunk/ceph/client/fuse_ll.h | 15 + trunk/ceph/common/DecayCounter.h | 119 +- trunk/ceph/common/Logger.cc | 4 +- trunk/ceph/config.cc | 71 +- trunk/ceph/config.h | 24 +- trunk/ceph/ebofs/Allocator.cc | 2 +- trunk/ceph/ebofs/BlockDevice.cc | 22 +- trunk/ceph/ebofs/BufferCache.cc | 6 +- trunk/ceph/ebofs/Ebofs.cc | 172 ++- trunk/ceph/ebofs/FileJournal.cc | 26 +- trunk/ceph/ebofs/FileJournal.h | 4 +- trunk/ceph/ebofs/Journal.h | 1 + trunk/ceph/fakefuse.cc | 37 +- trunk/ceph/fakesyn.cc | 3 +- trunk/ceph/include/buffer.h | 96 +- trunk/ceph/include/frag.h | 149 +- trunk/ceph/include/lru.h | 13 +- trunk/ceph/include/types.h | 74 +- trunk/ceph/include/utime.h | 6 +- trunk/ceph/mds/Anchor.h | 2 +- trunk/ceph/mds/CDentry.cc | 34 +- trunk/ceph/mds/CDentry.h | 82 +- trunk/ceph/mds/CDir.cc | 368 +++-- trunk/ceph/mds/CDir.h | 147 +- trunk/ceph/mds/CInode.cc | 79 +- trunk/ceph/mds/CInode.h | 63 +- trunk/ceph/mds/Capability.h | 1 - trunk/ceph/mds/FileLock.h | 2 +- trunk/ceph/mds/Hasher.cc | 1582 -------------------- trunk/ceph/mds/IdAllocator.cc | 6 + trunk/ceph/mds/Locker.cc | 38 +- trunk/ceph/mds/LogEvent.h | 5 +- trunk/ceph/mds/MDBalancer.cc | 476 +++--- trunk/ceph/mds/MDBalancer.h | 18 +- trunk/ceph/mds/MDCache.cc | 579 ++++++- trunk/ceph/mds/MDCache.h | 27 + trunk/ceph/mds/MDLog.cc | 55 +- trunk/ceph/mds/MDS.cc | 122 +- 
trunk/ceph/mds/MDS.h | 2 +- trunk/ceph/mds/Migrator.cc | 229 +-- trunk/ceph/mds/Migrator.h | 16 +- trunk/ceph/mds/Renamer.cc | 905 ----------- trunk/ceph/mds/Renamer.h | 99 -- trunk/ceph/mds/ScatterLock.h | 3 +- trunk/ceph/mds/Server.cc | 243 +-- trunk/ceph/mds/Server.h | 14 +- trunk/ceph/mds/SimpleLock.h | 6 +- trunk/ceph/mds/events/EExport.h | 7 +- trunk/ceph/mds/events/EFragment.h | 20 +- trunk/ceph/mds/events/EMetaBlob.h | 125 +- trunk/ceph/mds/events/EOpen.h | 6 +- trunk/ceph/mds/events/ESlaveUpdate.h | 6 +- trunk/ceph/mds/events/EUpdate.h | 5 +- trunk/ceph/mds/journal.cc | 47 +- trunk/ceph/mds/mdstypes.h | 129 +- trunk/ceph/messages/MClientReply.h | 39 +- trunk/ceph/messages/MClientRequest.h | 2 + trunk/ceph/messages/MClientSession.h | 2 +- trunk/ceph/messages/MExportDirPrep.h | 30 +- trunk/ceph/messages/MLock.h | 2 +- trunk/ceph/messages/MMDSBeacon.h | 8 +- trunk/ceph/messages/MMDSCacheRejoin.h | 29 +- trunk/ceph/messages/MMDSFragmentNotify.h | 60 + trunk/ceph/messages/MOSDOp.h | 12 +- trunk/ceph/messages/MOSDOpReply.h | 21 +- trunk/ceph/messages/MOSDPGLog.h | 3 + trunk/ceph/messages/MOSDPGSummary.h | 3 + trunk/ceph/messages/MOSDPGUpdate.h | 6 + trunk/ceph/messages/MPGStats.h | 41 + trunk/ceph/messages/MStatfs.h | 41 + trunk/ceph/mon/ClientMonitor.cc | 1 - trunk/ceph/mon/MDSMonitor.cc | 66 +- trunk/ceph/mon/MonitorStore.cc | 2 +- trunk/ceph/mon/PGMap.h | 69 +- trunk/ceph/mon/PGMonitor.cc | 134 +- trunk/ceph/mon/PGMonitor.h | 8 +- trunk/ceph/msg/FakeMessenger.cc | 14 +- trunk/ceph/msg/Message.cc | 16 + trunk/ceph/msg/Message.h | 11 +- trunk/ceph/msg/SimpleMessenger.cc | 6 +- trunk/ceph/msg/msg_types.h | 4 +- trunk/ceph/msg/tcp.cc | 6 +- trunk/ceph/msg/tcp.h | 34 +- trunk/ceph/newsyn.cc | 11 +- trunk/ceph/osd/FakeStore.cc | 12 +- trunk/ceph/osd/OSD.cc | 6 +- trunk/ceph/osd/OSDMap.h | 42 +- trunk/ceph/osd/ObjectStore.h | 217 ++- trunk/ceph/osd/PG.cc | 9 +- trunk/ceph/osd/osd_types.h | 29 +- trunk/ceph/osdc/Journaler.cc | 27 +- trunk/ceph/osdc/ObjectCacher.cc | 
60 +- trunk/ceph/osdc/ObjectCacher.h | 5 +- trunk/ceph/osdc/Objecter.cc | 18 +- trunk/ceph/test/fg.cc | 19 + trunk/ceph/test/testcounter.cc | 70 + 106 files changed, 5753 insertions(+), 5021 deletions(-) create mode 100644 trunk/ceph/client/fuse_ll.cc create mode 100644 trunk/ceph/client/fuse_ll.h delete mode 100644 trunk/ceph/mds/Hasher.cc delete mode 100644 trunk/ceph/mds/Renamer.cc delete mode 100644 trunk/ceph/mds/Renamer.h create mode 100644 trunk/ceph/messages/MMDSFragmentNotify.h create mode 100644 trunk/ceph/messages/MPGStats.h create mode 100644 trunk/ceph/messages/MStatfs.h create mode 100644 trunk/ceph/test/fg.cc create mode 100644 trunk/ceph/test/testcounter.cc diff --git a/trunk/ceph/Makefile b/trunk/ceph/Makefile index 93b2c0a635766..acf52719b68be 100644 --- a/trunk/ceph/Makefile +++ b/trunk/ceph/Makefile @@ -1,29 +1,39 @@ +# +# until autoconf is set up, here are the options i understand: +# +# target=darwin -- build on darwin +# fuse=no -- don't build anything requiring FUSE +# mpi=no -- don't build newsyn (requires MPI) +# use_ccpp=yes -- use Common C++ for buffer.h reference counting +# want_bdb=yes -- build berkeleydb objectstore +# # mpicxx must be on your path to build newsyn. # on googoo, this means that /usr/local/mpich2-1.0.2/bin must be on your path. # on issdm, it's /usr/local/mpich2/bin. # Hook for extra -I options, etc. -EXTRA_CFLAGS = +EXTRA_CFLAGS = -I${HOME}/include -L${HOME}/lib + +# base +CFLAGS = -pg -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE ${EXTRA_CFLAGS} +LDINC = ld -i -o +CC = g++ +LIBS = -pthread +# darwin? ifeq ($(target),darwin) -# For Darwin -CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -DDARWIN -D__FreeBSD__=10 ${EXTRA_CFLAGS} +CFLAGS += -DDARWIN -D__FreeBSD__=10 LDINC = ar -rc -else -# For linux -CFLAGS = -pg -g -Wall -I. 
-D_FILE_OFFSET_BITS=64 -D_REENTRANT -D_THREAD_SAFE -LDINC = ld -i -o endif -CC = g++ -LIBS = -lpthread - -ifeq ($(want_bdb),yes) -CFLAGS += -DUSE_OSBDB -OSBDB_LIBS = -ldb_cxx +# use Common C++ (for buffer.h)? +ifeq ($(use_ccpp),yes) +CFLAGS += -D_GNU_SOURCE -DBUFFER_USE_CCPP +LIBS += -lccgnu2 -ldl endif + #for normal mpich2 machines MPICC = mpicxx MPICFLAGS = -DMPICH_IGNORE_CXX_SEEK ${CFLAGS} @@ -98,22 +108,29 @@ CLIENT_OBJS= \ client/Trace.o +# bdbstore? ifeq ($(want_bdb),yes) +CFLAGS += -DUSE_OSBDB +LIBS += -ldb_cxx +OSD_OBJS += osbdb/OSBDB.o OSBDB_OBJS = \ osbdb/OSBDB.o - -OSBDB_OBJ = osbdb.o endif -TARGETS = cmon cosd cmds csyn newsyn fakesyn mkmonmap cmonctl cfuse fakefuse -NO_FUSE = cmon cosd cmds csyn newsyn fakesyn mkmonmap - +# targets +TARGETS = cmon cosd cmds csyn mkmonmap cmonctl fakesyn SRCS=*.cc */*.cc *.h */*.h */*/*.h -all: depend ${TARGETS} +ifneq ($(fuse),no) +TARGETS += cfuse fakefuse +endif -nofuse: depend ${NO_FUSE} +ifneq ($(mpi),no) +TARGETS += newsyn +endif + +all: depend ${TARGETS} test: depend ${TEST_TARGETS} @@ -128,8 +145,8 @@ cmon: cmon.cc mon.o msg/SimpleMessenger.o common.o cmonctl: cmonctl.cc msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cosd: cosd.cc osd.o ebofs.o ${OSBDB_OBJ} msg/SimpleMessenger.o common.o - ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ +cosd: cosd.cc osd.o ebofs.o msg/SimpleMessenger.o common.o + ${CC} ${CFLAGS} ${LIBS} $^ -o $@ cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ @@ -137,7 +154,7 @@ cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o csyn: csyn.cc client.o osdc.o msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cfuse: cfuse.cc client.o osdc.o client/fuse.o msg/SimpleMessenger.o common.o +cfuse: cfuse.cc client.o osdc.o client/fuse.o client/fuse_ll.o msg/SimpleMessenger.o common.o ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ @@ -162,25 +179,17 @@ libtrivialtask.so: active/trivial_task.cc client.o osdc.o msg/SimpleMessenger.o 
-# misc -gprof-helper.so: test/gprof-helper.c - gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl - - # fake* -fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o ${OSBDB_OBJ} client/fuse.o msg/FakeMessenger.o common.o - ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -lfuse $^ -o $@ +fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o client/fuse_ll.o msg/FakeMessenger.o common.o + ${CC} -pg ${CFLAGS} ${LIBS} -lfuse $^ -o $@ -fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/FakeMessenger.o common.o - ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ +fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o + ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ # mpi startup -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o - ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ - -newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ +newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o + ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} $^ -o $@ # ebofs @@ -199,6 +208,7 @@ libhadoopcephfs.so: client/hadoop/CephFSInterface.cc client.o osdc.o msg/SimpleM libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} ${LDINC} $^ -o $@ +# some benchmarking tools bench/mdtest/mdtest.o: bench/mdtest/mdtest.c mpicc -c $^ -o $@ @@ -208,19 +218,11 @@ mdtest: bench/mdtest/mdtest.o mdtest.ceph: bench/mdtest/mdtest.o libceph.o ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ -# OSD test - testos: test/testos.o ebofs.o osbdb.o common.o ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -o $@ $^ -# - -%.so: %.cc - ${CC} -shared -fPIC ${CFLAGS} $< -o $@ - -clean: - rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} +# bits common.o: ${COMMON_OBJS} ${LDINC} $@ $^ @@ 
-245,12 +247,22 @@ mon.o: ${MON_OBJS} osbdb.o: ${OSBDB_OBJS} ${LDINC} $@ $^ + +# generic rules +%.so: %.cc + ${CC} -shared -fPIC ${CFLAGS} $< -o $@ + %.o: %.cc ${CC} -fPIC ${CFLAGS} -c $< -o $@ %.po: %.cc ${CC} -fPIC ${CFLAGS} -c $< -o $@ + +# handy +clean: + rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} + count: cat ${SRCS} | wc -l cat ${SRCS} | grep -c \; @@ -265,5 +277,6 @@ depend: $(RM) .depend makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null + # now add a line to include the dependency list. include .depend diff --git a/trunk/ceph/TODO b/trunk/ceph/TODO index 7b7f681cb9c3c..e054faa939406 100644 --- a/trunk/ceph/TODO +++ b/trunk/ceph/TODO @@ -26,6 +26,7 @@ some smallish projects: + code cleanup - endian portability - word size @@ -56,38 +57,6 @@ sage mds - the split/merge plan: -/ - fragset_t to describe bounds; we need to tolerate concurrent merge/splits - -/ - fragtree_t -/ - get_leaves(fg, ls) needs to be smarter -/ - force_to_leaf() -/ - simplified/normalized form. - -/ - CDir is never request pinned -/ - add a CInode sticky_dir flag to somehow pin all cdirs on the fly. -/ - STICKY dir state and pin? make sure it's kept across import/export/fragment -/ - pull _bound maps out of Migrator; they are redundant (trust the subtree map!) - - - handle_resolve needs to infer splits/merges - - - auth journals and applies update in the request update pipeline - - dirfragtree is lazily consistent. no lock. bcast by primary when it updates. - - - bcast to dir replicas - - inode auth will journal inode update separately/lazily - - also on handle_resolve(), if there is a mismatch. - - do i need a fragtrace_t something to tell me where the splits for a given frag occurred? - - or something like a fragtree_t simplify()? - - is there any reason to freeze the dir? - - CDentry objects will be moved to the new frag(s) - - Server etc. must take care not to carry CDir pointers around; they're unstable! 
- - -- journal epoch, or something similar - - reduce size of EMetaBlob by skipping context when inode was already journaled since the last - SubtreeMap - - - hmm, should we move ESubtreeMap out of the journal? that would avoid all the icky weirdness in shutdown, with periodic logging, etc. @@ -100,20 +69,13 @@ sage mds - need to export stray crap to another mds.. - verify stray is empty on shutdown -- dirfrag split/merge - - client readdir for dirfrags - consistency points/snapshots - dentry versions vs dirfrags... -- statfs? - more testing of failures + thrashing. - is export prep dir open deadlock properly fixed by forge_replica_dir()? - failures during recovery stages (resolve, rejoin)... make sure rejoin still works! -- dirfrag split - - make sure we are freezing _before_ we fetch to complete the dirfrag, else - we break commit()'s preconditions when it fetches an incomplete dir. - - detect and deal with client failure - failure during reconnect vs clientmap. although probalby the whole thing needs a larger overhaul... @@ -162,18 +124,6 @@ crush - crush tools -rados+ebofs -- purge replicated writes from cache. (with exception of partial tail blocks.) - -rados paper todo? -- better experiments - - berkeleydb objectstore? -- flush log only in response to subsequent read or write? -- better behaving recovery -- justify use of splay. - - dynamic replication -- snapshots - rados snapshots - integrate revisions into ObjectCacher - clean up oid.rev vs op.rev in osd+osdc @@ -207,14 +157,18 @@ osd/rados - rollback - rollback logging (to fix slow prepare vs rollback race) - read+floor_lockout for clean STOGITH-like/fencing semantics after failover. -- efficiently replicate clone() objects -- flag missing log entries on crash recovery --> WRNOOP? or WRLOST? + - consider implications of nvram writeahead logs +- clean shutdown? +- pgmonitor should supplement failure detection + +- flag missing log entries on crash recovery --> WRNOOP? or WRLOST? 
+ +- efficiently replicate clone() objects - fix heartbeat wrt new replication - mark residual pgs obsolete ??? - rdlocks - optimize remove wrt recovery pushes -- pg_num changes - report crashed pgs? messenger @@ -227,6 +181,7 @@ simplemessenger - exponential backoff on monitor resend attempts (actually, this should go outside the messenger!) objectcacher +- merge clean bh's - ocacher caps transitions vs locks - test read locks @@ -235,16 +190,47 @@ reliability - osdmonitor, filter ebofs +- allow holes + - verify proper behavior of conflicting/overlapping reads of clones -- test(fix) sync() - combine inodes and/or cnodes into same blocks - allow btree sets instead of maps - eliminate nodepools - nonblocking write on missing onodes? - fix bug in node rotation on insert (and reenable) - fix NEAR_LAST_FWD (?) -- journaling? in NVRAM? -- metadata in nvram? flash? + +- awareness of underlying software/hardware raid in allocator so that we + write full stripes _only_. + - hmm, that's basically just a large block size. + +- rewrite the btree code! + - multithreaded + - eliminate nodepools + - allow btree sets + - allow arbitrary embedded data? + - allow arbitrary btrees + - allow root node(s?) to be embedded in onode, or wherever. + - keys and values can be uniform (fixed-size) or non-uniform. + - fixed size (if any) is a value in the btree struct. + - negative indicates bytes of length value? (1 -> 255bytes, 2 -> 65535 bytes, etc.?) + - non-uniform records preceded by length. + - keys sorted via a comparator defined in btree root. + - lexicographically, by default. + +- goal + - object btree key->value payload, not just a data blob payload. + - better threading behavior. + - with transactional goodness! + +- onode + - object attributes.. as a btree? + - blob stream + - map stream. + - allow blob values. + + - + remaining hard problems @@ -258,12 +244,6 @@ crush mds - distributed client management -- anchormgr - - 2pc - - independent journal? - - distributed? 
-- link count management - - also 2pc - chdir (directory opens!) - rewrite logstream - clean up diff --git a/trunk/ceph/cfuse.cc b/trunk/ceph/cfuse.cc index 3540e1b2a14e8..d08898d720f63 100644 --- a/trunk/ceph/cfuse.cc +++ b/trunk/ceph/cfuse.cc @@ -21,6 +21,7 @@ using namespace std; #include "client/Client.h" #include "client/fuse.h" +#include "client/fuse_ll.h" #include "msg/SimpleMessenger.h" @@ -67,7 +68,10 @@ int main(int argc, char **argv, char *envp[]) { client->mount(); cerr << "starting fuse on pid " << getpid() << endl; - ceph_fuse_main(client, argc, argv); + if (g_conf.fuse_ll) + ceph_fuse_ll_main(client, argc, argv); + else + ceph_fuse_main(client, argc, argv); cerr << "fuse finished on pid " << getpid() << endl; client->unmount(); diff --git a/trunk/ceph/client/Client.cc b/trunk/ceph/client/Client.cc index 7b3db74d09f3a..8a633e5a40013 100644 --- a/trunk/ceph/client/Client.cc +++ b/trunk/ceph/client/Client.cc @@ -55,11 +55,12 @@ using namespace std; #include "common/Logger.h" + #include "config.h" #undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami << "." << pthread_self() << " " +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami /*<< "." 
<< pthread_self() */ << " " -#define tout if (g_conf.client_trace) cout << "trace: " +#define tout if (g_conf.client_trace) traceout // static logger @@ -113,10 +114,10 @@ Client::Client(Messenger *m, MonMap *mm) : timer(client_lock) // root = 0; - set_cache_size(g_conf.client_cache_size); + lru.lru_set_max(g_conf.client_cache_size); // file handles - free_fh_set.insert(10, 1<<30); + free_fd_set.insert(10, 1<<30); // set up messengers messenger = m; @@ -154,16 +155,16 @@ Client::~Client() void Client::tear_down_cache() { - // fh's - for (hash_map::iterator it = fh_map.begin(); - it != fh_map.end(); + // fd's + for (hash_map::iterator it = fd_map.begin(); + it != fd_map.end(); it++) { Fh *fh = it->second; dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << endl; put_inode(fh->inode); delete fh; } - fh_map.clear(); + fd_map.clear(); // caps! // *** FIXME *** @@ -226,7 +227,9 @@ void Client::dump_cache() } -void Client::init() { +void Client::init() +{ + } @@ -253,12 +256,15 @@ void Client::trim_cache() Dentry *dn = (Dentry*)lru.lru_expire(); if (!dn) break; // done - //dout(10) << "trim_cache unlinking dn " << dn->name << " in dir " << hex << dn->dir->inode->inode.ino << endl; + dout(15) << "trim_cache unlinking dn " << dn->name + << " in dir " << hex << dn->dir->parent_inode->inode.ino + << endl; unlink(dn); } // hose root? 
- if (lru.lru_get_size() == 0 && root && inode_map.size() == 1) { + if (lru.lru_get_size() == 0 && root && root->ref == 0 && inode_map.size() == 1) { + dout(15) << "trim_cache trimmed root " << root << endl; delete root; root = 0; inode_map.clear(); @@ -278,6 +284,8 @@ Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) dout(12) << "insert_inode " << dname << " ino " << st->inode.ino << " size " << st->inode.size << " mtime " << st->inode.mtime + << " mask " << st->mask + << " in dir " << dir->parent_inode->inode.ino << endl; if (dn) { @@ -322,20 +330,28 @@ Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) dout(12) << " new dentry+node with ino " << st->inode.ino << endl; } else { // actually update info - dout(12) << " stat inode mask is " << st->inode.mask << endl; - dn->inode->inode = st->inode; + dout(12) << " stat inode mask is " << st->mask << endl; + if (st->mask & STAT_MASK_BASE) { + dn->inode->inode = st->inode; + dn->inode->dirfragtree = st->dirfragtree; // FIXME look at the mask! + } // ...but don't clobber our mtime, size! - if ((dn->inode->inode.mask & INODE_MASK_SIZE) == 0 && + /* isn't this handled below? + if ((dn->inode->mask & STAT_MASK_SIZE) == 0 && dn->inode->file_wr_size > dn->inode->inode.size) dn->inode->inode.size = dn->inode->file_wr_size; - if ((dn->inode->inode.mask & INODE_MASK_MTIME) == 0 && + if ((dn->inode->mask & STAT_MASK_MTIME) == 0 && dn->inode->file_wr_mtime > dn->inode->inode.mtime) dn->inode->inode.mtime = dn->inode->file_wr_mtime; + */ } // OK, we found it! assert(dn && dn->inode); + + // save the mask + dn->inode->mask = st->mask; // or do we have newer size/mtime from writing? if (dn->inode->file_caps() & CAP_FILE_WR) { @@ -346,7 +362,7 @@ Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) } // symlink? 
- if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) { + if (dn->inode->inode.is_symlink()) { if (!dn->inode->symlink) dn->inode->symlink = new string; *(dn->inode->symlink) = st->symlink; @@ -409,13 +425,14 @@ Inode* Client::insert_trace(MClientReply *reply) if (!root) { // create cur = root = new Inode((*pin)->inode, objectcacher); + dout(10) << "insert_trace new root is " << root << endl; inode_map[root->inode.ino] = root; } } else { // not root. - dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << endl; Dir *dir = cur->open_dir(); cur = this->insert_inode(dir, *pin, *pdn); + dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << " -> " << cur << endl; ++pdn; // move to top of lru! @@ -608,6 +625,7 @@ MClientReply *Client::make_request(MClientRequest *req, dout(10) << "chose target mds" << mds << " based on hierarchy" << endl; } else { mds = mdsmap->get_random_in_mds(); + if (mds < 0) mds = 0; // hrm. dout(10) << "chose random target mds" << mds << " for lack of anything better" << endl; } } @@ -1347,6 +1365,23 @@ int Client::mount() << " and mdsmap " << mdsmap->get_epoch() << endl; + /* + // hack: get+pin root inode + Inode *root; + _do_lstat("/", STAT_MASK_ALL, &root); + _ll_get(root); + */ + + // trace? 
+ if (g_conf.client_trace) { + traceout.open(g_conf.client_trace); + if (traceout.is_open()) { + dout(1) << "opened trace file '" << g_conf.client_trace << "'" << endl; + } else { + dout(1) << "FAILED to open trace file '" << g_conf.client_trace << "'" << endl; + } + } + client_lock.Unlock(); /* @@ -1356,8 +1391,8 @@ int Client::mount() dout(3) << "op: int readlinkbuf_len = 1000;" << endl; dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << endl; dout(3) << "op: map dir_contents;" << endl; - dout(3) << "op: map open_files;" << endl; - dout(3) << "op: fh_t fh;" << endl; + dout(3) << "op: map open_files;" << endl; + dout(3) << "op: int fd;" << endl; */ return 0; } @@ -1376,8 +1411,14 @@ int Client::unmount() unmounting = true; // NOTE: i'm assuming all caches are already flushing (because all files are closed). - assert(fh_map.empty()); + assert(fd_map.empty()); + + dout(10) << "a" << endl; + + _ll_drop_pins(); + dout(10) << "b" << endl; + // empty lru cache lru.lru_set_max(0); trim_cache(); @@ -1405,14 +1446,14 @@ int Client::unmount() !inode_map.empty()) { dout(2) << "cache still has " << lru.lru_get_size() << "+" << inode_map.size() << " items" - << ", waiting (presumably for safe or for caps to be released?)" + << ", waiting (for caps to release?)" << endl; dump_cache(); mount_cond.Wait(client_lock); } assert(lru.lru_get_size() == 0); assert(inode_map.empty()); - + // unsafe writes if (!g_conf.client_oc) { while (unsafe_sync_write > 0) { @@ -1421,6 +1462,13 @@ int Client::unmount() mount_cond.Wait(client_lock); } } + + // stop tracing + if (g_conf.client_trace) { + dout(1) << "closing trace file '" << g_conf.client_trace << "'" << endl; + traceout.close(); + } + // send session closes! 
for (map::iterator p = mds_sessions.begin(); @@ -1462,22 +1510,26 @@ void Client::handle_unmount(Message* m) } +// =============================================================== +// high level (POSIXy) interface + // namespace ops int Client::link(const char *existing, const char *newname) { - client_lock.Lock(); - dout(3) << "op: client->link(\"" << existing << "\", \"" << newname << "\");" << endl; + Mutex::Locker lock(client_lock); tout << "link" << endl; tout << existing << endl; tout << newname << endl; + return _link(existing, newname); +} - +int Client::_link(const char *existing, const char *newname) +{ // main path arg is new link name // sarg is target (existing file) - MClientRequest *req = new MClientRequest(MDS_OP_LINK, messenger->get_myinst()); req->set_path(newname); req->set_sarg(existing); @@ -1494,23 +1546,24 @@ int Client::link(const char *existing, const char *newname) dout(10) << "link result is " << res << endl; trim_cache(); - client_lock.Unlock(); + dout(3) << "link(\"" << existing << "\", \"" << newname << "\") = " << res << endl; return res; } int Client::unlink(const char *relpath) { - client_lock.Lock(); + Mutex::Locker lock(client_lock); + tout << "unlink" << endl; + tout << relpath << endl; string abspath; mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->unlink\(\"" << path << "\");" << endl; - tout << "unlink" << endl; - tout << path << endl; + return _unlink(abspath.c_str()); +} +int Client::_unlink(const char *path) +{ MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, messenger->get_myinst()); req->set_path(path); @@ -1537,27 +1590,25 @@ int Client::unlink(const char *relpath) dout(10) << "unlink result is " << res << endl; trim_cache(); - client_lock.Unlock(); + dout(3) << "unlink(\"" << path << "\") = " << res << endl; return res; } int Client::rename(const char *relfrom, const char *relto) { - client_lock.Lock(); + Mutex::Locker lock(client_lock); + tout << "rename" << endl; + 
tout << relfrom << endl; + tout << relto << endl; - string absfrom; + string absfrom, absto; mkabspath(relfrom, absfrom); - const char *from = absfrom.c_str(); - string absto; mkabspath(relto, absto); - const char *to = absto.c_str(); - - dout(3) << "op: client->rename(\"" << from << "\", \"" << to << "\");" << endl; - tout << "rename" << endl; - tout << from << endl; - tout << to << endl; - + return _rename(absfrom.c_str(), absto.c_str()); +} +int Client::_rename(const char *from, const char *to) +{ MClientRequest *req = new MClientRequest(MDS_OP_RENAME, messenger->get_myinst()); req->set_path(from); req->set_sarg(to); @@ -1574,8 +1625,10 @@ int Client::rename(const char *relfrom, const char *relto) delete reply; dout(10) << "rename result is " << res << endl; + // renamed item from our cache + trim_cache(); - client_lock.Unlock(); + dout(3) << "rename(\"" << from << "\", \"" << to << "\") = " << res << endl; return res; } @@ -1583,18 +1636,18 @@ int Client::rename(const char *relfrom, const char *relto) int Client::mkdir(const char *relpath, mode_t mode) { - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->mkdir(\"" << path << "\", " << mode << ");" << endl; + Mutex::Locker lock(client_lock); tout << "mkdir" << endl; - tout << path << endl; + tout << relpath << endl; tout << mode << endl; + string abspath; + mkabspath(relpath, abspath); + return _mkdir(abspath.c_str(), mode); +} +int Client::_mkdir(const char *path, mode_t mode) +{ MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, messenger->get_myinst()); req->set_path(path); req->args.mkdir.mode = mode; @@ -1612,23 +1665,24 @@ int Client::mkdir(const char *relpath, mode_t mode) dout(10) << "mkdir result is " << res << endl; trim_cache(); - client_lock.Unlock(); + + dout(3) << "mkdir(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << endl; return res; } int Client::rmdir(const char *relpath) { - 
client_lock.Lock(); + Mutex::Locker lock(client_lock); + tout << "rmdir" << endl; + tout << relpath << endl; string abspath; mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->rmdir(\"" << path << "\");" << endl; - tout << "rmdir" << endl; - tout << path << endl; - + return _rmdir(abspath.c_str()); +} +int Client::_rmdir(const char *path) +{ MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, messenger->get_myinst()); req->set_path(path); @@ -1652,10 +1706,9 @@ int Client::rmdir(const char *relpath) } insert_trace(reply); delete reply; - dout(10) << "rmdir result is " << res << endl; trim_cache(); - client_lock.Unlock(); + dout(3) << "rmdir(\"" << path << "\") = " << res << endl; return res; } @@ -1663,21 +1716,19 @@ int Client::rmdir(const char *relpath) int Client::symlink(const char *reltarget, const char *rellink) { - client_lock.Lock(); - - string abstarget; - mkabspath(reltarget, abstarget); - const char *target = abstarget.c_str(); - string abslink; - mkabspath(rellink, abslink); - const char *link = abslink.c_str(); - - dout(3) << "op: client->symlink(\"" << target << "\", \"" << link << "\");" << endl; + Mutex::Locker lock(client_lock); tout << "symlink" << endl; - tout << target << endl; - tout << link << endl; + tout << reltarget << endl; + tout << rellink << endl; + string target, link; + mkabspath(reltarget, target); + mkabspath(rellink, link); + return _symlink(target.c_str(), link.c_str()); +} +int Client::_symlink(const char *target, const char *link) +{ MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, messenger->get_myinst()); req->set_path(link); req->set_sarg(target); @@ -1692,52 +1743,47 @@ int Client::symlink(const char *reltarget, const char *rellink) int res = reply->get_result(); insert_trace(reply); //FIXME assuming trace of link, not of target delete reply; - dout(10) << "symlink result is " << res << endl; trim_cache(); - client_lock.Unlock(); + dout(3) << "symlink(\"" << target << 
"\", \"" << link << "\") = " << res << endl; return res; } -int Client::readlink(const char *relpath, char *buf, off_t size) -{ - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl; +int Client::readlink(const char *path, char *buf, off_t size) +{ + Mutex::Locker lock(client_lock); tout << "readlink" << endl; tout << path << endl; - client_lock.Unlock(); - // stat first (FIXME, PERF access cache directly) **** - struct stat stbuf; - int r = this->lstat(path, &stbuf); - if (r != 0) return r; - - client_lock.Lock(); - - // pull symlink content from cache - Inode *in = inode_map[stbuf.st_ino]; - assert(in); // i just did a stat - - // copy into buf (at most size bytes) - unsigned res = in->symlink->length(); - if (res > size) res = size; - memcpy(buf, in->symlink->c_str(), res); + string abspath; + mkabspath(path, abspath); + return _readlink(abspath.c_str(), buf, size); +} +int Client::_readlink(const char *path, char *buf, off_t size) +{ + Inode *in; + int r = _do_lstat(path, STAT_MASK_BASE, &in); + if (r == 0 && !in->inode.is_symlink()) r = -EINVAL; + if (r == 0) { + // copy into buf (at most size bytes) + r = in->symlink->length(); + if (r > size) r = size; + memcpy(buf, in->symlink->c_str(), r); + } else { + buf[0] = 0; + } trim_cache(); - client_lock.Unlock(); - return res; // return length in bytes (to mimic the system call) + + dout(3) << "readlink(\"" << path << "\", \"" << buf << "\", " << size << ") = " << r << endl; + return r; } // inode stuff -int Client::_lstat(const char *path, int mask, Inode **in) +int Client::_do_lstat(const char *path, int mask, Inode **in) { MClientRequest *req = 0; filepath fpath(path); @@ -1748,11 +1794,16 @@ int Client::_lstat(const char *path, int mask, Inode **in) Dentry *dn = lookup(fpath); inode_t inode; utime_t now = g_clock.real_now(); + if (dn && - now <= 
dn->inode->valid_until && - ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) { + now <= dn->inode->valid_until) + dout(10) << "_lstat has inode " << path << " with mask " << dn->inode->mask << ", want " << mask << endl; + + if (dn && dn->inode && + ((mask & ~STAT_MASK_BASE) || now <= dn->inode->valid_until) && + ((dn->inode->mask & mask) == mask)) { inode = dn->inode->inode; - dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl; + dout(10) << "lstat cache hit w/ sufficient mask, valid until " << dn->inode->valid_until << endl; if (g_conf.client_cache_stat_ttl == 0) dn->inode->valid_until = utime_t(); // only one stat allowed after each readdir @@ -1789,40 +1840,26 @@ int Client::_lstat(const char *path, int mask, Inode **in) } -void Client::fill_stat(inode_t& inode, struct stat *st) +int Client::fill_stat(Inode *in, struct stat *st) { + dout(10) << "fill_stat on " << in->inode.ino << " mode 0" << oct << in->inode.mode << dec + << " mtime " << in->inode.mtime << " ctime " << in->inode.ctime << endl; memset(st, 0, sizeof(struct stat)); - st->st_ino = inode.ino; - st->st_mode = inode.mode; - st->st_nlink = inode.nlink; - st->st_uid = inode.uid; - st->st_gid = inode.gid; - st->st_ctime = MAX(inode.ctime, inode.mtime); - st->st_atime = inode.atime; - st->st_mtime = inode.mtime; - st->st_size = inode.size; - st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0; + st->st_ino = in->inode.ino; + st->st_mode = in->inode.mode; + st->st_rdev = in->inode.rdev; + st->st_nlink = in->inode.nlink; + st->st_uid = in->inode.uid; + st->st_gid = in->inode.gid; + st->st_ctime = MAX(in->inode.ctime, in->inode.mtime); + st->st_atime = in->inode.atime; + st->st_mtime = in->inode.mtime; + st->st_size = in->inode.size; + st->st_blocks = in->inode.size ? 
((in->inode.size - 1) / 4096 + 1):0; st->st_blksize = 4096; + return in->mask; } -void Client::fill_statlite(inode_t& inode, struct statlite *st) -{ - memset(st, 0, sizeof(struct stat)); - st->st_ino = inode.ino; - st->st_mode = inode.mode; - st->st_nlink = inode.nlink; - st->st_uid = inode.uid; - st->st_gid = inode.gid; -#ifndef DARWIN - // FIXME what's going on here with darwin? - st->st_ctime = MAX(inode.ctime, inode.mtime); - st->st_atime = inode.atime; - st->st_mtime = inode.mtime; -#endif - st->st_size = inode.size; - st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0; - st->st_blksize = 4096; - /* S_REQUIREBLKSIZE(st->st_litemask); if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask); @@ -1833,36 +1870,36 @@ void Client::fill_statlite(inode_t& inode, struct statlite *st) if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask); if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask); */ -} int Client::lstat(const char *relpath, struct stat *stbuf) { - client_lock.Lock(); + Mutex::Locker lock(client_lock); + tout << "lstat" << endl; + tout << relpath << endl; string abspath; mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->lstat(\"" << path << "\", &st);" << endl; - tout << "lstat" << endl; - tout << path << endl; + return _lstat(abspath.c_str(), stbuf); +} +int Client::_lstat(const char *path, struct stat *stbuf) +{ Inode *in = 0; - - int res = _lstat(path, INODE_MASK_ALL_STAT, &in); + int res = _do_lstat(path, STAT_MASK_ALL, &in); if (res == 0) { assert(in); - fill_stat(in->inode,stbuf); - dout(10) << "stat sez size = " << in->inode.size << " mode = " << oct << stbuf->st_mode << dec << " ino = " << stbuf->st_ino << endl; + fill_stat(in, stbuf); + dout(10) << "stat sez size = " << in->inode.size << " mode = 0" << oct << stbuf->st_mode << dec << " ino = " << stbuf->st_ino << endl; } trim_cache(); - client_lock.Unlock(); + dout(3) << "lstat(\"" << path << "\", " << stbuf 
<< ") = " << res << endl; return res; } +/* int Client::lstatlite(const char *relpath, struct statlite *stl) { client_lock.Lock(); @@ -1895,23 +1932,24 @@ int Client::lstatlite(const char *relpath, struct statlite *stl) client_lock.Unlock(); return res; } - +*/ int Client::chmod(const char *relpath, mode_t mode) { - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->chmod(\"" << path << "\", " << mode << ");" << endl; + Mutex::Locker lock(client_lock); tout << "chmod" << endl; - tout << path << endl; + tout << relpath << endl; tout << mode << endl; + string abspath; + mkabspath(relpath, abspath); + return _chmod(abspath.c_str(), mode); +} +int Client::_chmod(const char *path, mode_t mode) +{ + dout(3) << "_chmod(" << path << ", 0" << oct << mode << dec << ")" << endl; MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, messenger->get_myinst()); req->set_path(path); req->args.chmod.mode = mode; @@ -1924,28 +1962,28 @@ int Client::chmod(const char *relpath, mode_t mode) int res = reply->get_result(); insert_trace(reply); delete reply; - dout(10) << "chmod result is " << res << endl; trim_cache(); - client_lock.Unlock(); + dout(3) << "_chmod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << endl; return res; } int Client::chown(const char *relpath, uid_t uid, gid_t gid) { - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->chown(\"" << path << "\", " << uid << ", " << gid << ");" << endl; + Mutex::Locker lock(client_lock); tout << "chown" << endl; - tout << path << endl; + tout << relpath << endl; tout << uid << endl; tout << gid << endl; + string abspath; + mkabspath(relpath, abspath); + return _chown(abspath.c_str(), uid, gid); +} +int Client::_chown(const char *path, uid_t uid, gid_t gid) +{ + dout(3) << "_chown(" << path << ", " << uid << ", " << gid << ")" << endl; 
MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, messenger->get_myinst()); req->set_path(path); req->args.chown.uid = uid; @@ -1964,69 +2002,68 @@ int Client::chown(const char *relpath, uid_t uid, gid_t gid) dout(10) << "chown result is " << res << endl; trim_cache(); - client_lock.Unlock(); + dout(3) << "chown(\"" << path << "\", " << uid << ", " << gid << ") = " << res << endl; return res; } int Client::utime(const char *relpath, struct utimbuf *buf) { - client_lock.Lock(); - - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: utim.actime = " << buf->actime << "; utim.modtime = " << buf->modtime << ";" << endl; - dout(3) << "op: client->utime(\"" << path << "\", &utim);" << endl; + Mutex::Locker lock(client_lock); tout << "utime" << endl; - tout << path << endl; - tout << buf->actime << endl; + tout << relpath << endl; tout << buf->modtime << endl; + tout << buf->actime << endl; + string abspath; + mkabspath(relpath, abspath); + return _utimes(abspath.c_str(), utime_t(buf->modtime,0), utime_t(buf->actime,0)); +} +int Client::_utimes(const char *path, utime_t mtime, utime_t atime) +{ + dout(3) << "_utimes(" << path << ", " << mtime << ", " << atime << ")" << endl; MClientRequest *req = new MClientRequest(MDS_OP_UTIME, messenger->get_myinst()); req->set_path(path); - req->args.utime.mtime.tv_sec = buf->modtime; - req->args.utime.mtime.tv_usec = 0; - req->args.utime.atime.tv_sec = buf->actime; - req->args.utime.atime.tv_usec = 0; + req->args.utime.mtime = mtime.tv_ref(); + req->args.utime.atime = atime.tv_ref(); // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); req->set_caller_gid(getgid()); - //FIXME enforce caller uid rights? 
- MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); delete reply; - dout(10) << "utime result is " << res << endl; + dout(3) << "utimes(\"" << path << "\", " << mtime << ", " << atime << ") = " << res << endl; trim_cache(); - client_lock.Unlock(); return res; } -int Client::mknod(const char *relpath, mode_t mode) +int Client::mknod(const char *relpath, mode_t mode, dev_t rdev) { - client_lock.Lock(); + Mutex::Locker lock(client_lock); + tout << "mknod" << endl; + tout << relpath << endl; + tout << mode << endl; + tout << rdev << endl; string abspath; mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - - dout(3) << "op: client->mknod(\"" << path << "\", " << mode << ");" << endl; - tout << "mknod" << endl; - tout << path << endl; - tout << mode << endl; + return _mknod(abspath.c_str(), mode, rdev); +} +int Client::_mknod(const char *path, mode_t mode, dev_t rdev) +{ + dout(3) << "_mknod(" << path << ", 0" << oct << mode << dec << ", " << rdev << ")" << endl; MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, messenger->get_myinst()); req->set_path(path); req->args.mknod.mode = mode; + req->args.mknod.rdev = rdev; // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); @@ -2038,131 +2075,278 @@ int Client::mknod(const char *relpath, mode_t mode) int res = reply->get_result(); insert_trace(reply); - dout(10) << "mknod result is " << res << endl; - delete reply; trim_cache(); - client_lock.Unlock(); + + dout(3) << "mknod(\"" << path << "\", 0" << oct << mode << dec << ") = " << res << endl; return res; } -//readdir usually include inode info for each entry except of locked entries +int Client::getdir(const char *relpath, list& contents) +{ + dout(3) << "getdir(" << relpath << ")" << endl; + { + Mutex::Locker lock(client_lock); + tout << "getdir" << endl; + tout << relpath << endl; + } -// -// getdir + DIR *d; + int r = opendir(relpath, &d); + if (r < 0) return r; -// fyi: typedef 
int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino); + struct dirent de; + int n = 0; + while (readdir_r(d, &de) == 0) { + contents.push_back(de.d_name); + n++; + } + closedir(d); + + return n; +} -int Client::getdir(const char *relpath, map& contents) +int Client::opendir(const char *name, DIR **dirpp) { - client_lock.Lock(); + Mutex::Locker lock(client_lock); + tout << "opendir" << endl; + tout << name << endl; - string abspath; - mkabspath(relpath, abspath); - const char *path = abspath.c_str(); + int r = _opendir(name, (DirResult**)dirpp); + tout << (unsigned long)*dirpp; + return r; +} - dout(3) << "op: client->getdir(\"" << path << "\", dir_contents);" << endl; - tout << "getdir" << endl; - tout << path << endl; +int Client::_opendir(const char *name, DirResult **dirpp) +{ + *dirpp = new DirResult(name); + + // do we have the inode in our cache? + // if so, should be we ask for a different dirfrag? + filepath path(name); + Dentry *dn = lookup(path); + if (dn && dn->inode) { + (*dirpp)->inode = dn->inode; + (*dirpp)->inode->get(); + dout(10) << "had inode " << dn->inode << " " << dn->inode->inode.ino << " ref now " << dn->inode->ref << endl; + (*dirpp)->set_frag(dn->inode->dirfragtree[0]); + dout(10) << "_opendir " << name << ", our cache says the first dirfrag is " << (*dirpp)->frag() << endl; + } + // get the first frag + int r = _readdir_get_frag(*dirpp); + if (r < 0) { + _closedir(*dirpp); + *dirpp = 0; + } + dout(3) << "_opendir(" << name << ") = " << r << " (" << *dirpp << ")" << endl; - MClientRequest *req = new MClientRequest(MDS_OP_READDIR, messenger->get_myinst()); - req->set_path(path); + return r; +} + +void Client::_readdir_add_dirent(DirResult *dirp, const string& name, Inode *in) +{ + struct stat st; + int stmask = fill_stat(in, &st); + frag_t fg = dirp->frag(); + dirp->buffer[fg].push_back(DirEntry(name, st, stmask)); + dout(10) << "_readdir_add_dirent added " << name << ", size now " << dirp->buffer[fg].size() << 
endl; +} + +void Client::_readdir_add_dirent(DirResult *dirp, const string& name, unsigned char d_type) +{ + struct stat st; + memset(&st, 0, sizeof(st)); + st.st_mode = DT_TO_MODE(d_type); + int stmask = STAT_MASK_TYPE; + frag_t fg = dirp->frag(); + dirp->buffer[fg].push_back(DirEntry(name, st, stmask)); + dout(10) << "_readdir_add_dirent added " << name << ", size now " << dirp->buffer[fg].size() << endl; +} + +void Client::_readdir_next_frag(DirResult *dirp) +{ + frag_t fg = dirp->frag(); + + // hose old data + assert(dirp->buffer.count(fg)); + dirp->buffer.erase(fg); + + // advance + dirp->next_frag(); + if (dirp->at_end()) { + dout(10) << "_readdir_next_frag advance from " << fg << " to END" << endl; + } else { + dout(10) << "_readdir_next_frag advance from " << fg << " to " << dirp->frag() << endl; + _readdir_rechoose_frag(dirp); + } +} + +void Client::_readdir_rechoose_frag(DirResult *dirp) +{ + assert(dirp->inode); + frag_t cur = dirp->frag(); + frag_t f = dirp->inode->dirfragtree[cur.value()]; + if (f != cur) { + dout(10) << "_readdir_rechoose_frag frag " << cur << " maps to " << f << endl; + dirp->set_frag(f); + } +} + +int Client::_readdir_get_frag(DirResult *dirp) +{ + // get the current frag. + frag_t fg = dirp->frag(); + assert(dirp->buffer.count(fg) == 0); + + dout(10) << "_readdir_get_frag " << dirp << " on " << dirp->path << " fg " << fg << endl; + MClientRequest *req = new MClientRequest(MDS_OP_READDIR, messenger->get_myinst()); + req->set_path(dirp->path); + req->args.readdir.frag = fg; + // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); req->set_caller_gid(getgid()); - - //FIXME enforce caller uid rights? - + MClientReply *reply = make_request(req); int res = reply->get_result(); insert_trace(reply); + inodeno_t ino = reply->get_ino(); + + // did i get directory inode? 
+ Inode *diri = 0; + if ((res == -EAGAIN || res == 0) && + inode_map.count(ino)) { + diri = inode_map[ino]; + dout(10) << "_readdir_get_frag got diri " << diri << " " << diri->inode.ino << endl; + assert(diri); + assert(diri->inode.mode & INODE_MODE_DIR); + } + + if (!dirp->inode && diri) { + dout(10) << "_readdir_get_frag attaching inode" << endl; + dirp->inode = inode_map[ino]; + diri->get(); + } - if (res == 0) { + if (res == -EAGAIN) { + dout(10) << "_readdir_get_frag got EAGAIN, retrying" << endl; + _readdir_rechoose_frag(dirp); + return _readdir_get_frag(dirp); + } - // dir contents to cache! - inodeno_t ino = reply->get_ino(); - Inode *diri = inode_map[ ino ]; + if (res == 0) { + // stuff dir contents to cache, DirResult assert(diri); - assert(diri->inode.mode & INODE_MODE_DIR); - // add . and ..? - string dot("."); - contents[dot] = diri->inode; - if (diri != root) { + // create empty result vector + dirp->buffer[fg].clear(); + + if (fg.is_leftmost()) { + // add . and ..? + string dot("."); + _readdir_add_dirent(dirp, dot, diri); string dotdot(".."); - contents[dotdot] = diri->dn->dir->parent_inode->inode; + if (diri->dn) + _readdir_add_dirent(dirp, dotdot, diri->dn->dir->parent_inode); + //else + //_readdir_add_dirent(dirp, dotdot, DT_DIR); } - + // the rest? - if (!reply->get_dir_in().empty()) { + if (!reply->get_dir_dn().empty()) { // only open dir if we're actually adding stuff to it! Dir *dir = diri->open_dir(); assert(dir); utime_t now = g_clock.real_now(); - list::const_iterator pdn = reply->get_dir_dn().begin(); - for (list::const_iterator pin = reply->get_dir_in().begin(); - pin != reply->get_dir_in().end(); - ++pin, ++pdn) { - // ignore . 
- if (*pdn == ".") - continue; - + list::const_iterator pin = reply->get_dir_in().begin(); + for (list::const_iterator pdn = reply->get_dir_dn().begin(); + pdn != reply->get_dir_dn().end(); + ++pdn, ++pin) { // count entries res++; - - // put in cache - Inode *in = this->insert_inode(dir, *pin, *pdn); - - if (g_conf.client_cache_stat_ttl) { - in->valid_until = now; + + // put in cache + Inode *in = this->insert_inode(dir, *pin, *pdn); + + if (g_conf.client_cache_stat_ttl) { + in->valid_until = now; in->valid_until += g_conf.client_cache_stat_ttl; } - else if (g_conf.client_cache_readdir_ttl) { - in->valid_until = now; + else if (g_conf.client_cache_readdir_ttl) { + in->valid_until = now; in->valid_until += g_conf.client_cache_readdir_ttl; } - - // contents to caller too! - dout(15) << "getdir including " << *pdn << " to " << in->inode.ino << endl; - contents[*pdn] = in->inode; + + // contents to caller too! + dout(15) << "_readdir_get_frag got " << *pdn << " to " << in->inode.ino << endl; + _readdir_add_dirent(dirp, *pdn, in); } + if (dir->is_empty()) close_dir(dir); } // FIXME: remove items in cache that weren't in my readdir? 
// *** + } else { + dout(10) << "_readdir_get_frag got error " << res << ", setting end flag" << endl; + dirp->set_end(); } - delete reply; //fix thing above first + delete reply; - client_lock.Unlock(); return res; } +int Client::readdir_r(DIR *d, struct dirent *de) +{ + return readdirplus_r(d, de, 0, 0); +} + +int Client::readdirplus_r(DIR *d, struct dirent *de, struct stat *st, int *stmask) +{ + DirResult *dirp = (DirResult*)d; + + while (1) { + if (dirp->at_end()) return -1; + + if (dirp->buffer.count(dirp->frag()) == 0) { + Mutex::Locker lock(client_lock); + _readdir_get_frag(dirp); + if (dirp->at_end()) return -1; + } -/** POSIX stubs **/ + frag_t fg = dirp->frag(); + uint32_t pos = dirp->fragpos(); + assert(dirp->buffer.count(fg)); + vector &ent = dirp->buffer[fg]; -DIR *Client::opendir(const char *name) -{ - DirResult *d = new DirResult; - d->size = getdir(name, d->contents); - d->p = d->contents.begin(); - d->off = 0; - return (DIR*)d; -} + if (ent.empty()) { + dout(10) << "empty frag " << fg << ", moving on to next" << endl; + _readdir_next_frag(dirp); + continue; + } + + assert(pos < ent.size()); + _readdir_fill_dirent(de, &ent[pos], dirp->offset); + if (st) *st = ent[pos].st; + if (stmask) *stmask = ent[pos].stmask; + pos++; + dirp->offset++; + + if (pos == ent.size()) + _readdir_next_frag(dirp); + + break; + } -int Client::closedir(DIR *dir) -{ - DirResult *d = (DirResult*)dir; - delete d; return 0; } @@ -2173,163 +2357,64 @@ int Client::closedir(DIR *dir) // unsigned char d_type; /* type of file */ // char d_name[256]; /* filename */ //}; - -struct dirent *Client::readdir(DIR *dirp) +void Client::_readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t off) { - DirResult *d = (DirResult*)dirp; - - // end of dir? 
- if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; -#ifndef __CYGWIN__ -#ifndef DARWIN - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; + if (entry->stmask) + de->d_ino = entry->st.st_ino; else - d->dp.d_dirent.d_type = DT_UNKNOWN; - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) -#endif // DARWIN -#endif + de->d_ino = 0; + de->d_off = off + 1; + de->d_reclen = 1; + de->d_type = MODE_TO_DT(entry->st.st_mode); + strncpy(de->d_name, entry->d_name.c_str(), 256); + dout(10) << "_readdir_fill_dirent " << de->d_name << " " << de->d_ino + << " type " << (int)de->d_type << " at off " << off << endl; +} - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); +int Client::closedir(DIR *dir) +{ + Mutex::Locker lock(client_lock); + tout << "closedir" << endl; + tout << (unsigned long)dir << endl; - // move up - ++d->off; - ++d->p; + dout(3) << "closedir(" << dir << ") = 0" << endl; + _closedir((DirResult*)dir); + return 0; +} - return &d->dp.d_dirent; +void Client::_closedir(DirResult *dirp) +{ + dout(10) << "_closedir(" << dirp << ")" << endl; + if (dirp->inode) { + dout(10) << "_closedir detaching inode " << dirp->inode << endl; + put_inode(dirp->inode); + dirp->inode = 0; + } + delete dirp; } - + void Client::rewinddir(DIR *dirp) { + dout(3) << "rewinddir(" << dirp << ")" << endl; DirResult *d = (DirResult*)dirp; - d->p = d->contents.begin(); - d->off = 0; + d->offset = 0; + d->buffer.clear(); } off_t Client::telldir(DIR *dirp) { DirResult *d = (DirResult*)dirp; - return d->off; + dout(3) << "telldir(" << dirp << ") = " << d->offset << endl; + return d->offset; } void Client::seekdir(DIR *dirp, off_t offset) { + dout(3) << "seekdir(" << dirp << ", " << offset << ")" << endl; 
DirResult *d = (DirResult*)dirp; - - d->p = d->contents.begin(); - d->off = 0; - - if (offset >= d->size) offset = d->size-1; - while (offset > 0) { - ++d->p; - ++d->off; - --offset; - } -} - -struct dirent_plus *Client::readdirplus(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? - if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; -#ifndef __CYGWIN__ -#ifndef DARWIN - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) -#endif // DARWIN -#endif - - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - // plus - if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { - // have it - fill_stat(d->p->second, &d->dp.d_stat); - d->dp.d_stat_err = 0; - } else { - // don't have it, stat it - string path = d->path; - path += "/"; - path += d->p->first; - d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat); - } - - // move up - ++d->off; - ++d->p; - - return &d->dp; + d->offset = offset; } -/* -struct dirent_lite *Client::readdirlite(DIR *dirp) -{ - DirResult *d = (DirResult*)dirp; - - // end of dir? - if (d->p == d->contents.end()) - return 0; - - // fill the dirent - d->dp.d_dirent.d_ino = d->p->second.ino; - if (d->p->second.is_symlink()) - d->dp.d_dirent.d_type = DT_LNK; - else if (d->p->second.is_dir()) - d->dp.d_dirent.d_type = DT_DIR; - else if (d->p->second.is_file()) - d->dp.d_dirent.d_type = DT_REG; - else - d->dp.d_dirent.d_type = DT_UNKNOWN; - strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); - - d->dp.d_dirent.d_off = d->off; - d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) 
- - // plus - if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { - // have it - fill_statlite(d->p->second,d->dp.d_stat); - d->dp.d_stat_err = 0; - } else { - // don't have it, stat it - string path = p->path; - path += "/"; - path += p->first; - d->dp.d_statlite - d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite); - } - - // move up - ++d->off; - ++d->p; - - return &d->dp; -} -*/ @@ -2340,17 +2425,31 @@ struct dirent_lite *Client::readdirlite(DIR *dirp) int Client::open(const char *relpath, int flags, mode_t mode) { - client_lock.Lock(); + Mutex::Locker lock(client_lock); + tout << "open" << endl; + tout << relpath << endl; + tout << flags << endl; string abspath; mkabspath(relpath, abspath); - const char *path = abspath.c_str(); - dout(3) << "op: fh = client->open(\"" << path << "\", " << flags << ");" << endl; - tout << "open" << endl; - tout << path << endl; - tout << flags << endl; + Fh *fh; + int r = _open(abspath.c_str(), flags, mode, &fh); + if (r >= 0) { + // allocate a integer file descriptor + assert(fh); + r = get_fd(); + assert(fd_map.count(r) == 0); + fd_map[r] = fh; + } + + tout << r << endl; + dout(3) << "open(" << relpath << ", " << flags << ") = " << r << endl; + return r; +} +int Client::_open(const char *path, int flags, mode_t mode, Fh **fhp) +{ // go MClientRequest *req = new MClientRequest(MDS_OP_OPEN, messenger->get_myinst()); req->set_path(path); @@ -2362,19 +2461,26 @@ int Client::open(const char *relpath, int flags, mode_t mode) // FIXME where does FUSE maintain user information req->set_caller_uid(getuid()); req->set_caller_gid(getgid()); + + // do i have the inode? + Dentry *dn = lookup(req->get_filepath()); + Inode *in = 0; + if (dn) { + in = dn->inode; + in->add_open(cmode); // make note of pending open, since it effects _wanted_ caps. + } MClientReply *reply = make_request(req); - assert(reply); insert_trace(reply); int result = reply->get_result(); // success? 
- fh_t fh = 0; if (result >= 0) { // yay Fh *f = new Fh; + if (fhp) *fhp = f; f->mode = cmode; // inode @@ -2382,63 +2488,56 @@ int Client::open(const char *relpath, int flags, mode_t mode) assert(f->inode); f->inode->get(); - if (cmode & FILE_MODE_R) f->inode->num_open_rd++; - if (cmode & FILE_MODE_W) f->inode->num_open_wr++; - if (cmode & FILE_MODE_LAZY) f->inode->num_open_lazy++; + if (!in) { + in = f->inode; + in->add_open(f->mode); + } // caps included? int mds = reply->get_source().num(); - if (f->inode->caps.empty()) {// first caps? - dout(7) << " first caps on " << f->inode->inode.ino << endl; - f->inode->get(); + if (in->caps.empty()) {// first caps? + dout(7) << " first caps on " << in->inode.ino << endl; + in->get(); } int new_caps = reply->get_file_caps(); - assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq); - if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) { - int old_caps = f->inode->caps[mds].caps; + assert(reply->get_file_caps_seq() >= in->caps[mds].seq); + if (reply->get_file_caps_seq() > in->caps[mds].seq) { + int old_caps = in->caps[mds].caps; dout(7) << "open got caps " << cap_string(new_caps) << " (had " << cap_string(old_caps) << ")" - << " for " << f->inode->ino() + << " for " << in->ino() << " seq " << reply->get_file_caps_seq() << " from mds" << mds << endl; - f->inode->caps[mds].caps = new_caps; - f->inode->caps[mds].seq = reply->get_file_caps_seq(); + in->caps[mds].caps = new_caps; + in->caps[mds].seq = reply->get_file_caps_seq(); // we shouldn't ever lose caps at this point. // actually, we might...? 
- assert((old_caps & ~f->inode->caps[mds].caps) == 0); + assert((old_caps & ~in->caps[mds].caps) == 0); if (g_conf.client_oc) - f->inode->fc.set_caps(new_caps); + in->fc.set_caps(new_caps); } else { dout(7) << "open got SAME caps " << cap_string(new_caps) - << " for " << f->inode->ino() + << " for " << in->ino() << " seq " << reply->get_file_caps_seq() << " from mds" << mds << endl; } - // put in map - result = fh = get_fh(); - assert(fh_map.count(fh) == 0); - fh_map[fh] = f; - - dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(f->inode->file_caps()) << endl; - } else { - dout(0) << "open failure result " << result << endl; + dout(5) << "open success, fh is " << f << " combined caps " << cap_string(in->file_caps()) << endl; } delete reply; trim_cache(); - client_lock.Unlock(); return result; } @@ -2471,35 +2570,37 @@ void Client::close_safe(Inode *in) mount_cond.Signal(); } -int Client::close(fh_t fh) + +int Client::close(int fd) { - client_lock.Lock(); - dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl; - dout(3) << "op: open_files.erase( " << fh << " );" << endl; + Mutex::Locker lock(client_lock); tout << "close" << endl; - tout << fh << endl; + tout << fd << endl; - // get Fh, Inode - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; + dout(3) << "close(" << fd << ")" << endl; + assert(fd_map.count(fd)); + Fh *fh = fd_map[fd]; + _release(fh); + fd_map.erase(fd); + return 0; +} + +int Client::_release(Fh *f) +{ + //dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl; + //dout(3) << "op: open_files.erase( " << fh << " );" << endl; + dout(5) << "_release " << f << endl; Inode *in = f->inode; // update inode rd/wr counts int before = in->file_caps_wanted(); - if (f->mode & FILE_MODE_R) - in->num_open_rd--; - if (f->mode & FILE_MODE_W) - in->num_open_wr--; + in->sub_open(f->mode); int after = in->file_caps_wanted(); // does this change what caps we want? 
if (before != after && after) update_caps_wanted(in); - // hose fh - fh_map.erase(fh); - delete f; - // release caps right away? dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << endl; @@ -2531,10 +2632,7 @@ int Client::close(fh_t fh) } put_inode( in ); - int result = 0; - - client_lock.Unlock(); - return result; + return 0; } @@ -2543,13 +2641,16 @@ int Client::close(fh_t fh) // read, write -off_t Client::lseek(fh_t fh, off_t offset, int whence) +off_t Client::lseek(int fd, off_t offset, int whence) { - client_lock.Lock(); - dout(3) << "op: client->lseek(" << fh << ", " << offset << ", " << whence << ");" << endl; + Mutex::Locker lock(client_lock); + tout << "lseek" << endl; + tout << fd << endl; + tout << offset << endl; + tout << whence << endl; - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; + assert(fd_map.count(fd)); + Fh *f = fd_map[fd]; Inode *in = f->inode; switch (whence) { @@ -2570,8 +2671,8 @@ off_t Client::lseek(fh_t fh, off_t offset, int whence) } off_t pos = f->pos; - client_lock.Unlock(); + dout(3) << "lseek(" << fd << ", " << offset << ", " << whence << ") = " << pos << endl; return pos; } @@ -2602,20 +2703,34 @@ void Client::unlock_fh_pos(Fh *f) } + +//char *hackbuf = 0; + + // blocking osd interface -int Client::read(fh_t fh, char *buf, off_t size, off_t offset) +int Client::read(int fd, char *buf, off_t size, off_t offset) { - client_lock.Lock(); - - dout(3) << "op: client->read(" << fh << ", buf, " << size << ", " << offset << "); // that's " << offset << "~" << size << endl; + Mutex::Locker lock(client_lock); tout << "read" << endl; - tout << fh << endl; + tout << fd << endl; tout << size << endl; tout << offset << endl; - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; + assert(fd_map.count(fd)); + Fh *f = fd_map[fd]; + bufferlist bl; + int r = _read(f, offset, size, &bl); + dout(3) << "read(" << fd << ", " << buf << ", " << size << ", " << offset << ") = " << r << endl; + if (r >= 0) { + bl.copy(0, 
bl.length(), buf); + r = bl.length(); + } + return r; +} + +int Client::_read(Fh *f, off_t offset, off_t size, bufferlist *bl) +{ Inode *in = f->inode; bool movepos = false; @@ -2635,7 +2750,6 @@ int Client::read(fh_t fh, char *buf, off_t size, off_t offset) dout(10) << "file size: " << in->inode.size << endl; if (offset > 0 && offset >= in->inode.size) { if (movepos) unlock_fh_pos(f); - client_lock.Unlock(); return 0; } if (offset + size > (off_t)in->inode.size) @@ -2644,7 +2758,6 @@ int Client::read(fh_t fh, char *buf, off_t size, off_t offset) if (size == 0) { dout(10) << "read is size=0, returning 0" << endl; if (movepos) unlock_fh_pos(f); - client_lock.Unlock(); return 0; } } else { @@ -2653,13 +2766,25 @@ int Client::read(fh_t fh, char *buf, off_t size, off_t offset) // defer to OSDs for file bounds. } - bufferlist blist; // data will go here int r = 0; int rvalue = 0; if (g_conf.client_oc) { // object cache ON - rvalue = r = in->fc.read(offset, size, blist, client_lock); // may block. + rvalue = r = in->fc.read(offset, size, *bl, client_lock); // may block. + + /* + if (in->inode.ino == 0x10000000075 && hackbuf) { + int s = MIN(size, bl->length()); + char *v = bl->c_str(); + for (int a=0; aprepare_read(in->inode, offset, size, &blist); + Objecter::OSDRead *rd = filer->prepare_read(in->inode, offset, size, bl); if (in->hack_balance_reads || g_conf.client_hack_balance_reads) rd->balance_reads = true; @@ -2697,18 +2822,11 @@ int Client::read(fh_t fh, char *buf, off_t size, off_t offset) if (movepos) { // adjust fd pos - f->pos = offset+blist.length(); + f->pos = offset+bl->length(); unlock_fh_pos(f); } - // copy data into caller's char* buf - blist.copy(0, blist.length(), buf); - - //dout(10) << "i read '" << blist.c_str() << "'" << endl; - dout(10) << "read rvalue " << rvalue << ", r " << r << endl; - // done! 
- client_lock.Unlock(); return rvalue; } @@ -2740,19 +2858,25 @@ void Client::hack_sync_write_safe() client_lock.Unlock(); } -int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) +int Client::write(int fd, const char *buf, off_t size, off_t offset) { - client_lock.Lock(); - - //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl; - dout(3) << "op: client->write(" << fh << ", buf, " << size << ", " << offset << ");" << endl; + Mutex::Locker lock(client_lock); tout << "write" << endl; - tout << fh << endl; + tout << fd << endl; tout << size << endl; tout << offset << endl; - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; + assert(fd_map.count(fd)); + Fh *fh = fd_map[fd]; + int r = _write(fh, offset, size, buf); + dout(3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << endl; + return r; +} + + +int Client::_write(Fh *f, off_t offset, off_t size, const char *buf) +{ + //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl; Inode *in = f->inode; // use/adjust fd pos? @@ -2778,9 +2902,23 @@ int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) if (g_conf.client_oc) { // buffer cache ON? assert(objectcacher); + /* + if (f->inode->inode.ino == 0x10000000075) { + if (!hackbuf) { + dout(7) << "alloc and zero new hackbuf" << endl; + hackbuf = new char[16384]; + memset(hackbuf, 0, 16384); + } + dout(7) << "hackbuf copying " << offset << "~" << size << " first is " << (int)buf[0] << endl; + memcpy(hackbuf+offset, buf, size); + for (int a=0; afc.write(offset, size, blist, client_lock); - + } else { // legacy, inconsistent synchronous write. dout(7) << "synchronous write" << endl; @@ -2855,20 +2993,24 @@ int Client::write(fh_t fh, const char *buf, off_t size, off_t offset) in->file_wr_mtime = in->inode.mtime = g_clock.real_now(); // ok! 
- client_lock.Unlock(); return totalwritten; } -int Client::truncate(const char *file, off_t length) +int Client::truncate(const char *relpath, off_t length) { - client_lock.Lock(); - dout(3) << "op: client->truncate(\"" << file << "\", " << length << ");" << endl; + Mutex::Locker lock(client_lock); tout << "truncate" << endl; - tout << file << endl; + tout << relpath << endl; tout << length << endl; + string path; + mkabspath(relpath, path); + return _truncate(path.c_str(), length); +} +int Client::_truncate(const char *file, off_t length) +{ MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); req->set_path(file); req->args.truncate.length = length; @@ -2882,29 +3024,62 @@ int Client::truncate(const char *file, off_t length) insert_trace(reply); delete reply; - dout(10) << " truncate result is " << res << endl; + dout(3) << "truncate(\"" << file << "\", " << length << ") = " << res << endl; + return res; +} - client_lock.Unlock(); +int Client::ftruncate(int fd, off_t length) +{ + Mutex::Locker lock(client_lock); + tout << "ftruncate" << endl; + tout << fd << endl; + tout << length << endl; + + assert(fd_map.count(fd)); + Fh *f = fd_map[fd]; + return _ftruncate(f, length); +} + +int Client::_ftruncate(Fh *fh, off_t length) +{ + MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, messenger->get_myinst()); + req->args.truncate.ino = fh->inode->inode.ino; + req->args.truncate.length = length; + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + MClientReply *reply = make_request(req); + int res = reply->get_result(); + insert_trace(reply); + delete reply; + + dout(3) << "ftruncate(\"" << fh << "\", " << length << ") = " << res << endl; return res; } -int Client::fsync(fh_t fh, bool syncdataonly) +int Client::fsync(int fd, bool syncdataonly) { - client_lock.Lock(); - dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl; + 
Mutex::Locker lock(client_lock); tout << "fsync" << endl; - tout << fh << endl; + tout << fd << endl; tout << syncdataonly << endl; + assert(fd_map.count(fd)); + Fh *f = fd_map[fd]; + int r = _fsync(f, syncdataonly); + dout(3) << "fsync(" << fd << ", " << syncdataonly << ") = " << r << endl; + return r; +} + +int Client::_fsync(Fh *f, bool syncdataonly) +{ int r = 0; - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; Inode *in = f->inode; - dout(3) << "fsync fh " << fh << " ino " << in->inode.ino << " syncdataonly " << syncdataonly << endl; - // metadata? if (!syncdataonly) { dout(0) << "fsync - not syncing metadata yet.. implement me" << endl; @@ -2918,8 +3093,6 @@ int Client::fsync(fh_t fh, bool syncdataonly) // wait for callback while (!done) cond.Wait(client_lock); } - - client_lock.Unlock(); return r; } @@ -2928,6 +3101,10 @@ int Client::fsync(fh_t fh, bool syncdataonly) int Client::chdir(const char *path) { + Mutex::Locker lock(client_lock); + tout << "chdir" << endl; + tout << path << endl; + // fake it for now! 
string abs; mkabspath(path, abs); @@ -2938,6 +3115,9 @@ int Client::chdir(const char *path) int Client::statfs(const char *path, struct statvfs *stbuf) { + Mutex::Locker lock(client_lock); + tout << "statfs" << endl; + bzero (stbuf, sizeof (struct statvfs)); // FIXME stbuf->f_bsize = 1024; @@ -2960,8 +3140,8 @@ int Client::lazyio_propogate(int fd, off_t offset, size_t count) dout(3) << "op: client->lazyio_propogate(" << fd << ", " << offset << ", " << count << ")" << endl; - assert(fh_map.count(fd)); - Fh *f = fh_map[fd]; + assert(fd_map.count(fd)); + Fh *f = fd_map[fd]; Inode *in = f->inode; if (f->mode & FILE_MODE_LAZY) { @@ -2996,8 +3176,8 @@ int Client::lazyio_synchronize(int fd, off_t offset, size_t count) dout(3) << "op: client->lazyio_synchronize(" << fd << ", " << offset << ", " << count << ")" << endl; - assert(fh_map.count(fd)); - Fh *f = fh_map[fd]; + assert(fd_map.count(fd)); + Fh *f = fd_map[fd]; Inode *in = f->inode; if (f->mode & FILE_MODE_LAZY) { @@ -3022,22 +3202,544 @@ int Client::lazyio_synchronize(int fd, off_t offset, size_t count) } + + +// ========================================= +// low level + +// ugly hack for ll +#define FUSE_SET_ATTR_MODE (1 << 0) +#define FUSE_SET_ATTR_UID (1 << 1) +#define FUSE_SET_ATTR_GID (1 << 2) +#define FUSE_SET_ATTR_SIZE (1 << 3) +#define FUSE_SET_ATTR_ATIME (1 << 4) +#define FUSE_SET_ATTR_MTIME (1 << 5) + +int Client::ll_lookup(inodeno_t parent, const char *name, struct stat *attr) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_lookup " << parent << " " << name << endl; + tout << "ll_lookup" << endl; + tout << parent.val << endl; + tout << name << endl; + + string dname = name; + Inode *diri = 0; + int r = 0; + + if (inode_map.count(parent) == 0) { + tout << 0 << endl; + dout(1) << "ll_lookup " << parent << " " << name << " -> ENOENT (parent DNE... 
WTF)" << endl; + r = -ENOENT; + goto out; + } + diri = inode_map[parent]; + if (!diri->inode.is_dir()) { + tout << 0 << endl; + dout(1) << "ll_lookup " << parent << " " << name << " -> ENOTDIR (parent not a dir... WTF)" << endl; + r = -ENOTDIR; + goto out; + } + + + // refresh the dir? + // FIXME: this is the hackish way. + if (!diri->dir || + diri->dir->dentries.count(dname) == 0) { + string path; + diri->make_path(path); + DirResult *dirp = new DirResult(path, diri); + + while (1) { + hash H; + dirp->set_frag(diri->dirfragtree[H(dname)]); + + dout(10) << "ll_lookup fetching frag " << dirp->frag() << " for " << name << endl; + int r = _readdir_get_frag(dirp); + if (r < 0) return r; + + if (dirp->buffer.count(diri->dirfragtree[H(dname)])) break; + dirp->buffer.clear(); + } + + _closedir(dirp); + } + + // do we have it? + if (diri->dir && + diri->dir->dentries.count(dname)) { + Inode *in = diri->dir->dentries[dname]->inode; + fill_stat(in, attr); + _ll_get(in); + assert(inode_map[in->inode.ino] == in); + } else { + r = -ENOENT; + } + + out: + dout(3) << "ll_lookup " << parent << " " << name + << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << endl; + tout << attr->st_ino << endl; + return r; +} + +void Client::_ll_get(Inode *in) +{ + if (in->ll_ref == 0) + in->get(); + in->ll_get(); + dout(20) << "_ll_get " << in << " " << in->inode.ino << " -> " << in->ll_ref << endl; +} + +int Client::_ll_put(Inode *in, int num) +{ + in->ll_put(num); + dout(20) << "_ll_put " << in << " " << in->inode.ino << " " << num << " -> " << in->ll_ref << endl; + if (in->ll_ref == 0) { + put_inode(in); + return 0; + } else { + return in->ll_ref; + } +} + +void Client::_ll_drop_pins() +{ + dout(10) << "_ll_drop_pins" << endl; + hash_map::iterator next; + for (hash_map::iterator it = inode_map.begin(); + it != inode_map.end(); + it = next) { + Inode *in = it->second; + next = it; + next++; + if (in->ll_ref) + _ll_put(in, in->ll_ref); + } +} + +bool Client::ll_forget(inodeno_t ino, 
int num) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_forget " << ino << " " << num << endl; + tout << "ll_forget" << endl; + tout << ino.val << endl; + tout << num << endl; + + if (ino == 1) return true; // ignore forget on root. + + bool last = false; + if (inode_map.count(ino) == 0) { + dout(1) << "WARNING: ll_forget on " << ino << " " << num + << ", which I don't have" << endl; + } else { + Inode *in = inode_map[ino]; + assert(in); + if (_ll_put(in, num) == 0) + last = true; + } + return last; +} + +Inode *Client::_ll_get_inode(inodeno_t ino) +{ + if (inode_map.count(ino) == 0) { + assert(ino == 1); // must be the root inode. + Inode *in; + int r = _do_lstat("/", 0, &in); + assert(r >= 0); + return in; + } else { + return inode_map[ino]; + } +} + + +int Client::ll_getattr(inodeno_t ino, struct stat *attr) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_getattr " << ino << endl; + tout << "ll_getattr" << endl; + tout << ino.val << endl; + + Inode *in = _ll_get_inode(ino); + fill_stat(in, attr); + return 0; +} + +int Client::ll_setattr(inodeno_t ino, struct stat *attr, int mask) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_setattr " << ino << " mask " << hex << mask << dec << endl; + tout << "ll_setattr" << endl; + tout << ino.val << endl; + tout << attr->st_mode << endl; + tout << attr->st_uid << endl; + tout << attr->st_gid << endl; + tout << attr->st_size << endl; + tout << attr->st_mtime << endl; + tout << attr->st_atime << endl; + tout << mask << endl; + + Inode *in = _ll_get_inode(ino); + + string path; + in->make_path(path); + + int r; + if ((mask & FUSE_SET_ATTR_MODE) && + ((r = _chmod(path.c_str(), attr->st_mode)) < 0)) return r; + + if ((mask & FUSE_SET_ATTR_UID) && (mask & FUSE_SET_ATTR_GID) && + ((r = _chown(path.c_str(), attr->st_uid, attr->st_gid)) < 0)) return r; + //if ((mask & FUSE_SET_ATTR_GID) && + //(r = client->_chgrp(path.c_str(), attr->st_gid) < 0)) return r; + + if ((mask & FUSE_SET_ATTR_SIZE) && + ((r = 
_truncate(path.c_str(), attr->st_size)) < 0)) return r; + + if ((mask & FUSE_SET_ATTR_MTIME) && (mask & FUSE_SET_ATTR_ATIME)) { + if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t(attr->st_atime,0))) < 0) return r; + } else if (mask & FUSE_SET_ATTR_MTIME) { + if ((r = _utimes(path.c_str(), utime_t(attr->st_mtime,0), utime_t())) < 0) return r; + } else if (mask & FUSE_SET_ATTR_ATIME) { + if ((r = _utimes(path.c_str(), utime_t(), utime_t(attr->st_atime,0))) < 0) return r; + } + + assert(r == 0); + fill_stat(in, attr); + + dout(3) << "ll_setattr " << ino << " = " << r << endl; + return 0; +} + +int Client::ll_readlink(inodeno_t ino, const char **value) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_readlink " << ino << endl; + tout << "ll_readlink" << endl; + tout << ino.val << endl; + + Inode *in = _ll_get_inode(ino); + int r = 0; + if (in->inode.is_symlink()) { + *value = in->symlink->c_str(); + } else { + *value = ""; + r = -EINVAL; + } + dout(3) << "ll_readlink " << ino << " = " << r << " (" << *value << ")" << endl; + return r; +} + +int Client::ll_mknod(inodeno_t parent, const char *name, mode_t mode, dev_t rdev, struct stat *attr) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_mknod " << parent << " " << name << endl; + tout << "ll_mknod" << endl; + tout << parent.val << endl; + tout << name << endl; + tout << mode << endl; + tout << rdev << endl; + + Inode *diri = _ll_get_inode(parent); + + string path; + diri->make_path(path); + path += "/"; + path += name; + int r = _mknod(path.c_str(), mode, rdev); + if (r == 0) { + string dname(name); + Inode *in = diri->dir->dentries[dname]->inode; + fill_stat(in, attr); + _ll_get(in); + } + tout << attr->st_ino << endl; + dout(3) << "ll_mknod " << parent << " " << name + << " = " << r << " (" << hex << attr->st_ino << dec << ")" << endl; + return r; +} + +int Client::ll_mkdir(inodeno_t parent, const char *name, mode_t mode, struct stat *attr) +{ + Mutex::Locker lock(client_lock); + dout(3) 
<< "ll_mkdir " << parent << " " << name << endl; + tout << "ll_mkdir" << endl; + tout << parent.val << endl; + tout << name << endl; + tout << mode << endl; + + Inode *diri = _ll_get_inode(parent); + + string path; + diri->make_path(path); + path += "/"; + path += name; + int r = _mkdir(path.c_str(), mode); + if (r == 0) { + string dname(name); + Inode *in = diri->dir->dentries[dname]->inode; + fill_stat(in, attr); + _ll_get(in); + } + tout << attr->st_ino << endl; + dout(3) << "ll_mkdir " << parent << " " << name + << " = " << r << " (" << hex << attr->st_ino << dec << ")" << endl; + return r; +} + +int Client::ll_symlink(inodeno_t parent, const char *name, const char *value, struct stat *attr) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_symlink " << parent << " " << name << " -> " << value << endl; + tout << "ll_symlink" << endl; + tout << parent.val << endl; + tout << name << endl; + tout << value << endl; + + Inode *diri = _ll_get_inode(parent); + + string path; + diri->make_path(path); + path += "/"; + path += name; + int r = _symlink(value, path.c_str()); + if (r == 0) { + string dname(name); + Inode *in = diri->dir->dentries[dname]->inode; + fill_stat(in, attr); + _ll_get(in); + } + tout << attr->st_ino << endl; + dout(3) << "ll_symlink " << parent << " " << name + << " = " << r << " (" << hex << attr->st_ino << dec << ")" << endl; + return r; +} + +int Client::ll_unlink(inodeno_t ino, const char *name) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_unlink " << ino << " " << name << endl; + tout << "ll_unlink" << endl; + tout << ino.val << endl; + tout << name << endl; + + Inode *diri = _ll_get_inode(ino); + + string path; + diri->make_path(path); + path += "/"; + path += name; + return _unlink(path.c_str()); +} + +int Client::ll_rmdir(inodeno_t ino, const char *name) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_rmdir " << ino << " " << name << endl; + tout << "ll_rmdir" << endl; + tout << ino.val << endl; + tout << name << endl; 
+ + Inode *diri = _ll_get_inode(ino); + + string path; + diri->make_path(path); + path += "/"; + path += name; + return _rmdir(path.c_str()); +} + +int Client::ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_rename " << parent << " " << name << " to " + << newparent << " " << newname << endl; + tout << "ll_rename" << endl; + tout << parent.val << endl; + tout << name << endl; + tout << newparent.val << endl; + tout << newname << endl; + + Inode *diri = _ll_get_inode(parent); + string path; + diri->make_path(path); + path += "/"; + path += name; + + Inode *newdiri = _ll_get_inode(newparent); + string newpath; + newdiri->make_path(newpath); + newpath += "/"; + newpath += newname; + + return _rename(path.c_str(), newpath.c_str()); +} + +int Client::ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_link " << ino << " to " << newparent << " " << newname << endl; + tout << "ll_link" << endl; + tout << ino.val << endl; + tout << newparent << endl; + tout << newname << endl; + + Inode *old = _ll_get_inode(ino); + Inode *diri = _ll_get_inode(newparent); + + string path; + old->make_path(path); + + string newpath; + diri->make_path(newpath); + newpath += "/"; + newpath += newname; + + int r = _link(path.c_str(), newpath.c_str()); + if (r == 0) { + string dname(newname); + Inode *in = diri->dir->dentries[dname]->inode; + fill_stat(in, attr); + _ll_get(in); + } + return r; +} + +int Client::ll_opendir(inodeno_t ino, void **dirpp) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_opendir " << ino << endl; + tout << "ll_opendir" << endl; + tout << ino.val << endl; + + Inode *diri = inode_map[ino]; + assert(diri); + string path; + diri->make_path(path); + + int r = _opendir(path.c_str(), (DirResult**)dirpp); + + tout << (unsigned long)*dirpp << endl; + + dout(3) << "ll_opendir " << ino << " = " 
<< r << " (" << *dirpp << ")" << endl; + return r; +} + +void Client::ll_releasedir(void *dirp) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_releasedir " << dirp << endl; + tout << "ll_releasedir" << endl; + tout << (unsigned long)dirp << endl; + _closedir((DirResult*)dirp); +} + +int Client::ll_open(inodeno_t ino, int flags, Fh **fhp) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_open " << ino << " " << flags << endl; + tout << "ll_open" << endl; + tout << ino.val << endl; + tout << flags << endl; + + Inode *in = _ll_get_inode(ino); + string path; + in->make_path(path); + + int r = _open(path.c_str(), flags, 0, fhp); + + tout << (unsigned long)*fhp << endl; + dout(3) << "ll_open " << ino << " " << flags << " = " << r << " (" << *fhp << ")" << endl; + return r; +} + +int Client::ll_create(inodeno_t parent, const char *name, mode_t mode, int flags, + struct stat *attr, Fh **fhp) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags << endl; + tout << "ll_create" << endl; + tout << parent.val << endl; + tout << name << endl; + tout << mode << endl; + tout << flags << endl; + + Inode *pin = _ll_get_inode(parent); + string path; + pin->make_path(path); + path += "/"; + path += name; + + int r = _open(path.c_str(), flags|O_CREAT, mode, fhp); + if (r >= 0) { + Inode *in = (*fhp)->inode; + fill_stat(in, attr); + //_ll_get(in); + } + tout << (unsigned long)*fhp << endl; + tout << attr->st_ino << endl; + dout(3) << "ll_create " << parent << " " << name << " 0" << oct << mode << dec << " " << flags + << " = " << r << " (" << *fhp << " " << hex << attr->st_ino << dec << ")" << endl; + return 0; +} + +int Client::ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_read " << fh << " " << off << "~" << len << endl; + tout << "ll_read" << endl; + tout << (unsigned long)fh << endl; + tout << off << endl; + tout << len << endl; 
+ + return _read(fh, off, len, bl); +} + +int Client::ll_write(Fh *fh, off_t off, off_t len, const char *data) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_write " << fh << " " << off << "~" << len << endl; + tout << "ll_write" << endl; + tout << (unsigned long)fh << endl; + tout << off << endl; + tout << len << endl; + + return _write(fh, off, len, data); +} + +int Client::ll_release(Fh *fh) +{ + Mutex::Locker lock(client_lock); + dout(3) << "ll_release " << fh << endl; + tout << "ll_release" << endl; + tout << (unsigned long)fh << endl; + + _release(fh); + return 0; +} + + + + + + // ========================================= // layout -int Client::describe_layout(int fh, FileLayout *lp) +int Client::describe_layout(int fd, FileLayout *lp) { - client_lock.Lock(); - dout(3) << "op: client->describe_layout(" << fh << ");" << endl; + Mutex::Locker lock(client_lock); - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; + assert(fd_map.count(fd)); + Fh *f = fd_map[fd]; Inode *in = f->inode; *lp = in->inode.layout; - client_lock.Unlock(); + dout(3) << "describe_layout(" << fd << ") = 0" << endl; return 0; } @@ -3062,20 +3764,19 @@ int Client::get_stripe_period(int fd) return layout.period(); } -int Client::enumerate_layout(int fh, list& result, +int Client::enumerate_layout(int fd, list& result, off_t length, off_t offset) { - client_lock.Lock(); - dout(3) << "op: client->enumerate_layout(" << fh << ", " << length << ", " << offset << ");" << endl; + Mutex::Locker lock(client_lock); - assert(fh_map.count(fh)); - Fh *f = fh_map[fh]; + assert(fd_map.count(fd)); + Fh *f = fd_map[fd]; Inode *in = f->inode; // map to a list of extents filer->file_to_extents(in->inode, offset, length, result); - client_lock.Unlock(); + dout(3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << endl; return 0; } diff --git a/trunk/ceph/client/Client.h b/trunk/ceph/client/Client.h index 71bd58df1d53b..5f62b5c9292ac 100644 --- a/trunk/ceph/client/Client.h +++ 
b/trunk/ceph/client/Client.h @@ -41,6 +41,7 @@ // stl #include #include +#include using namespace std; #include @@ -72,8 +73,6 @@ extern class Logger *client_logger; */ -typedef int fh_t; - class Dir; class Inode; @@ -124,6 +123,7 @@ class Inode { public: inode_t inode; // the actual inode utime_t valid_until; + int mask; // about the dir (if this is one!) int dir_auth; @@ -139,9 +139,11 @@ class Inode { int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers int ref; // ref count. 1 for each dentry, fh that links to me. + int ll_ref; // separate ref count for ll client Dir *dir; // if i'm a dir. Dentry *dn; // if i'm linked to a dentry. string *symlink; // symlink content, if it's a symlink + fragtree_t dirfragtree; // for caching i/o mode FileCache fc; @@ -170,11 +172,19 @@ class Inode { void get() { ref++; - //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl; + //cout << "inode.get on " << this << " " << hex << inode.ino << dec << " now " << ref << endl; + } + void put(int n=1) { + ref -= n; assert(ref >= 0); + //cout << "inode.put on " << this << " " << hex << inode.ino << dec << " now " << ref << endl; + } + + void ll_get() { + ll_ref++; } - void put() { - ref--; assert(ref >= 0); - //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl; + void ll_put(int n=1) { + assert(ll_ref >= n); + ll_ref -= n; } Inode(inode_t _inode, ObjectCacher *_oc) : @@ -183,7 +193,8 @@ class Inode { dir_auth(-1), dir_hashed(false), dir_replicated(false), file_wr_mtime(0, 0), file_wr_size(0), num_open_rd(0), num_open_wr(0), num_open_lazy(0), - ref(0), dir(0), dn(0), symlink(0), + ref(0), ll_ref(0), + dir(0), dn(0), symlink(0), fc(_oc, _inode), sync_reads(0), sync_writes(0), hack_balance_reads(false) @@ -221,6 +232,17 @@ class Inode { return w; } + void add_open(int cmode) { + if (cmode & FILE_MODE_R) num_open_rd++; + if (cmode & FILE_MODE_W) num_open_wr++; + if (cmode & FILE_MODE_LAZY) num_open_lazy++; + } + void 
sub_open(int cmode) { + if (cmode & FILE_MODE_R) num_open_rd--; + if (cmode & FILE_MODE_W) num_open_wr--; + if (cmode & FILE_MODE_LAZY) num_open_lazy--; + } + int authority(MDSMap *mdsmap) { //cout << "authority on " << inode.ino << " .. dir_auth is " << dir_auth<< endl; // parent? @@ -319,15 +341,47 @@ class Client : public Dispatcher { public: /* getdir result */ + struct DirEntry { + string d_name; + struct stat st; + int stmask; + DirEntry(const string &s) : d_name(s), stmask(0) {} + DirEntry(const string &n, struct stat& s, int stm) : d_name(n), st(s), stmask(stm) {} + }; + struct DirResult { + static const int SHIFT = 28; + static const int64_t MASK = (1 << SHIFT) - 1; + static const off_t END = 1ULL << (SHIFT + 32); + string path; - map contents; - map::iterator p; - int off; - int size; - struct dirent_plus dp; - struct dirent_lite dl; - DirResult() : p(contents.end()), off(-1), size(0) {} + Inode *inode; + int64_t offset; // high bits: frag_t, low bits: an offset + map > buffer; + + DirResult(const char *p, Inode *in=0) : path(p), inode(in), offset(0) { + if (inode) inode->get(); + } + DirResult(const string &p, Inode *in=0) : path(p), inode(in), offset(0) { + if (inode) inode->get(); + } + + frag_t frag() { return frag_t(offset >> SHIFT); } + unsigned fragpos() { return offset & MASK; } + + void next_frag() { + frag_t fg = offset >> SHIFT; + if (fg.is_rightmost()) + set_end(); + else + set_frag(fg.next()); + } + void set_frag(frag_t f) { + offset = (uint64_t)f << SHIFT; + assert(sizeof(offset) == 8); + } + void set_end() { offset = END; } + bool at_end() { return (offset == END); } }; @@ -410,16 +464,16 @@ protected: // file handles, etc. 
string cwd; - interval_set free_fh_set; // unused fh's - hash_map fh_map; + interval_set free_fd_set; // unused fds + hash_map fd_map; - fh_t get_fh() { - fh_t fh = free_fh_set.start(); - free_fh_set.erase(fh, 1); - return fh; + int get_fd() { + int fd = free_fd_set.start(); + free_fd_set.erase(fd, 1); + return fd; } - void put_fh(fh_t fh) { - free_fh_set.insert(fh, 1); + void put_fd(int fd) { + free_fd_set.insert(fd, 1); } void mkabspath(const char *rel, string& abs) { @@ -441,9 +495,11 @@ protected: // -- metadata cache stuff // decrease inode ref. delete if dangling. - void put_inode(Inode *in) { - in->put(); + void put_inode(Inode *in, int n=1) { + //cout << "put_inode on " << in << " " << in->inode.ino << endl; + in->put(n); if (in->ref == 0) { + //cout << "put_inode deleting " << in->inode.ino << endl; inode_map.erase(in->inode.ino); if (in == root) root = 0; delete in; @@ -461,8 +517,8 @@ protected: put_inode(in); // unpin inode } - int get_cache_size() { return lru.lru_get_size(); } - void set_cache_size(int m) { lru.lru_set_max(m); } + //int get_cache_size() { return lru.lru_get_size(); } + //void set_cache_size(int m) { lru.lru_set_max(m); } Dentry* link(Dir *dir, const string& name, Inode *in) { Dentry *dn = new Dentry; @@ -548,8 +604,11 @@ protected: // find dentry based on filepath Dentry *lookup(filepath& path); - void fill_stat(inode_t& inode, struct stat *st); - void fill_statlite(inode_t& inode, struct statlite *st); + int fill_stat(Inode *in, struct stat *st); + + + // trace generation + ofstream traceout; // friends @@ -604,6 +663,44 @@ private: } }; + // some helpers + int _do_lstat(const char *path, int mask, Inode **in); + int _opendir(const char *name, DirResult **dirpp); + void _readdir_add_dirent(DirResult *dirp, const string& name, Inode *in); + void _readdir_add_dirent(DirResult *dirp, const string& name, unsigned char d_type); + bool _readdir_have_frag(DirResult *dirp); + void _readdir_next_frag(DirResult *dirp); + void 
_readdir_rechoose_frag(DirResult *dirp); + int _readdir_get_frag(DirResult *dirp); + void _readdir_fill_dirent(struct dirent *de, DirEntry *entry, off_t); + void _closedir(DirResult *dirp); + void _ll_get(Inode *in); + int _ll_put(Inode *in, int num); + void _ll_drop_pins(); + + // internal interface + // call these with client_lock held! + int _link(const char *existing, const char *newname); + int _unlink(const char *path); + int _rename(const char *from, const char *to); + int _mkdir(const char *path, mode_t mode); + int _rmdir(const char *path); + int _readlink(const char *path, char *buf, off_t size); + int _symlink(const char *existing, const char *newname); + int _lstat(const char *path, struct stat *stbuf); + int _chmod(const char *relpath, mode_t mode); + int _chown(const char *relpath, uid_t uid, gid_t gid); + int _utimes(const char *relpath, utime_t mtime, utime_t atime); + int _mknod(const char *path, mode_t mode, dev_t rdev); + int _open(const char *path, int flags, mode_t mode, Fh **fhp); + int _release(Fh *fh); + int _read(Fh *fh, off_t offset, off_t size, bufferlist *bl); + int _write(Fh *fh, off_t offset, off_t size, const char *buf); + int _truncate(const char *file, off_t length); + int _ftruncate(Fh *fh, off_t length); + int _fsync(Fh *fh, bool syncdataonly); + + public: int mount(); int unmount(); @@ -616,22 +713,21 @@ public: const string getcwd() { return cwd; } // namespace ops - int getdir(const char *path, list& contents); - int getdir(const char *path, map& contents); + int getdir(const char *relpath, list& names); // get the whole dir at once. 
- DIR *opendir(const char *name); - int closedir(DIR *dir); - struct dirent *readdir(DIR *dir); - void rewinddir(DIR *dir); - off_t telldir(DIR *dir); - void seekdir(DIR *dir, off_t offset); + int opendir(const char *name, DIR **dirpp); + int closedir(DIR *dirp); + int readdir_r(DIR *dirp, struct dirent *de); + int readdirplus_r(DIR *dirp, struct dirent *de, struct stat *st, int *stmask); + void rewinddir(DIR *dirp); + off_t telldir(DIR *dirp); + void seekdir(DIR *dirp, off_t offset); struct dirent_plus *readdirplus(DIR *dirp); int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); struct dirent_lite *readdirlite(DIR *dirp); int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); - int link(const char *existing, const char *newname); int unlink(const char *path); int rename(const char *from, const char *to); @@ -645,25 +741,24 @@ public: int symlink(const char *existing, const char *newname); // inode stuff - int _lstat(const char *path, int mask, Inode **in); int lstat(const char *path, struct stat *stbuf); int lstatlite(const char *path, struct statlite *buf); int chmod(const char *path, mode_t mode); int chown(const char *path, uid_t uid, gid_t gid); int utime(const char *path, struct utimbuf *buf); - + // file ops - int mknod(const char *path, mode_t mode); + int mknod(const char *path, mode_t mode, dev_t rdev=0); int open(const char *path, int flags, mode_t mode=0); - int close(fh_t fh); - off_t lseek(fh_t fh, off_t offset, int whence); - int read(fh_t fh, char *buf, off_t size, off_t offset=-1); - int write(fh_t fh, const char *buf, off_t size, off_t offset=-1); + int close(int fd); + off_t lseek(int fd, off_t offset, int whence); + int read(int fd, char *buf, off_t size, off_t offset=-1); + int write(int fd, const char *buf, off_t size, off_t offset=-1); + int fake_write_size(int fd, off_t size); int truncate(const char *file, off_t size); - //int truncate(fh_t fh, long long size); - int fsync(fh_t 
fh, bool syncdataonly); - + int ftruncate(int fd, off_t size); + int fsync(int fd, bool syncdataonly); // hpc lazyio int lazyio_propogate(int fd, off_t offset, size_t count); @@ -677,6 +772,29 @@ public: int enumerate_layout(int fd, list& result, off_t length, off_t offset); + // low-level interface + int ll_lookup(inodeno_t parent, const char *name, struct stat *attr); + bool ll_forget(inodeno_t ino, int count); + Inode *_ll_get_inode(inodeno_t ino); + int ll_getattr(inodeno_t ino, struct stat *st); + int ll_setattr(inodeno_t ino, struct stat *st, int mask); + int ll_opendir(inodeno_t ino, void **dirpp); + void ll_releasedir(void *dirp); + int ll_readlink(inodeno_t ino, const char **value); + int ll_mknod(inodeno_t ino, const char *name, mode_t mode, dev_t rdev, struct stat *attr); + int ll_mkdir(inodeno_t ino, const char *name, mode_t mode, struct stat *attr); + int ll_symlink(inodeno_t ino, const char *name, const char *value, struct stat *attr); + int ll_unlink(inodeno_t ino, const char *name); + int ll_rmdir(inodeno_t ino, const char *name); + int ll_rename(inodeno_t parent, const char *name, inodeno_t newparent, const char *newname); + int ll_link(inodeno_t ino, inodeno_t newparent, const char *newname, struct stat *attr); + int ll_open(inodeno_t ino, int flags, Fh **fh); + int ll_create(inodeno_t parent, const char *name, mode_t mode, int flags, struct stat *attr, Fh **fh); + int ll_read(Fh *fh, off_t off, off_t len, bufferlist *bl); + int ll_write(Fh *fh, off_t off, off_t len, const char *data); + int ll_release(Fh *fh); + + // failure void ms_handle_failure(Message*, const entity_inst_t& inst); }; diff --git a/trunk/ceph/client/SyntheticClient.cc b/trunk/ceph/client/SyntheticClient.cc index a496480d7328e..52dba42418c14 100644 --- a/trunk/ceph/client/SyntheticClient.cc +++ b/trunk/ceph/client/SyntheticClient.cc @@ -30,6 +30,7 @@ using namespace std; #include #include #include +#include #include "config.h" #undef dout @@ -173,6 +174,11 @@ void 
parse_syn_options(vector& args) syn_modes.push_back( SYNCLIENT_MODE_TRUNCATE ); syn_sargs.push_back(args[++i]); syn_iargs.push_back(atoi(args[++i])); + } else if (strcmp(args[i],"importfind") == 0) { + syn_modes.push_back(SYNCLIENT_MODE_IMPORTFIND); + syn_sargs.push_back(args[++i]); + syn_sargs.push_back(args[++i]); + syn_iargs.push_back(atoi(args[++i])); } else { cerr << "unknown syn arg " << args[i] << endl; assert(0); @@ -549,7 +555,7 @@ int SyntheticClient::run() utime_t start = g_clock.now(); if (time_to_stop()) break; - play_trace(t, prefix); + play_trace(t, prefix, false); if (time_to_stop()) break; clean_dir(prefix); @@ -586,7 +592,7 @@ int SyntheticClient::run() { int count = iargs.front(); iargs.pop_front(); if (run_me()) { - client->mknod("test",0777); + client->mknod("test", 0777); struct stat st; for (int i=0; ilstat("test", &st); @@ -605,6 +611,17 @@ int SyntheticClient::run() client->truncate(file.c_str(), iarg1); } break; + + + case SYNCLIENT_MODE_IMPORTFIND: + { + string base = get_sarg(0); + string find = get_sarg(0); + int data = get_iarg(); + if (run_me()) + import_find(base.c_str(), find.c_str(), data); + } + break; default: assert(0); @@ -676,65 +693,76 @@ void SyntheticClient::up() } -int SyntheticClient::play_trace(Trace& t, string& prefix) +int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only) { dout(4) << "play trace" << endl; t.start(); + char buf[1024]; + utime_t start = g_clock.now(); const char *p = prefix.c_str(); - map open_files; + hash_map open_files; + hash_map open_dirs; + + hash_map ll_files; + hash_map ll_dirs; + hash_map ll_inos; + + ll_inos[1] = 1; // root inode is known. 
while (!t.end()) { if (time_to_stop()) break; // op - const char *op = t.get_string(); - dout(4) << "trace op " << op << endl; + const char *op = t.get_string(buf, 0); + dout(4) << (t.get_line()-1) << ": trace op " << op << endl; + + // high level ops --------------------- if (strcmp(op, "link") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); + const char *a = t.get_string(buf, p); + const char *b = t.get_string(buf, p); client->link(a,b); } else if (strcmp(op, "unlink") == 0) { - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); client->unlink(a); } else if (strcmp(op, "rename") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); + const char *a = t.get_string(buf, p); + const char *b = t.get_string(buf, p); client->rename(a,b); } else if (strcmp(op, "mkdir") == 0) { - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); int64_t b = t.get_int(); client->mkdir(a, b); } else if (strcmp(op, "rmdir") == 0) { - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); client->rmdir(a); } else if (strcmp(op, "symlink") == 0) { - const char *a = t.get_string(p); - const char *b = t.get_string(p); + const char *a = t.get_string(buf, p); + const char *b = t.get_string(buf, p); client->symlink(a,b); } else if (strcmp(op, "readlink") == 0) { - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); char buf[100]; client->readlink(a, buf, 100); } else if (strcmp(op, "lstat") == 0) { struct stat st; - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); client->lstat(a, &st); } else if (strcmp(op, "chmod") == 0) { - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); int64_t b = t.get_int(); client->chmod(a, b); } else if (strcmp(op, "chown") == 0) { - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); int64_t b = t.get_int(); int64_t c = t.get_int(); client->chown(a, b, c); } else if 
(strcmp(op, "utime") == 0) { - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); int64_t b = t.get_int(); int64_t c = t.get_int(); struct utimbuf u; @@ -742,85 +770,279 @@ int SyntheticClient::play_trace(Trace& t, string& prefix) u.modtime = c; client->utime(a, &u); } else if (strcmp(op, "mknod") == 0) { - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); int64_t b = t.get_int(); - client->mknod(a, b); + int64_t c = t.get_int(); + client->mknod(a, b, c); + } else if (strcmp(op, "getdir") == 0) { + const char *a = t.get_string(buf, p); + list contents; + client->getdir(a, contents); } else if (strcmp(op, "getdir") == 0) { - const char *a = t.get_string(p); - map contents; + const char *a = t.get_string(buf, p); + list contents; client->getdir(a, contents); + } else if (strcmp(op, "opendir") == 0) { + const char *a = t.get_string(buf, p); + int64_t b = t.get_int(); + DIR *dirp; + client->opendir(a, &dirp); + if (dirp) open_dirs[b] = dirp; + } else if (strcmp(op, "closedir") == 0) { + int64_t a = t.get_int(); + client->closedir(open_dirs[a]); + open_dirs.erase(a); } else if (strcmp(op, "open") == 0) { - const char *a = t.get_string(p); + const char *a = t.get_string(buf, p); int64_t b = t.get_int(); - int64_t id = t.get_int(); - int64_t fh = client->open(a, b); - open_files[id] = fh; + int64_t c = t.get_int(); + int64_t d = t.get_int(); + int64_t fd = client->open(a, b, c); + if (fd > 0) open_files[d] = fd; } else if (strcmp(op, "close") == 0) { int64_t id = t.get_int(); int64_t fh = open_files[id]; if (fh > 0) client->close(fh); open_files.erase(id); - } else if (strcmp(op, "truncate") == 0) { - const char *a = t.get_string(p); - int64_t b = t.get_int(); - client->truncate(a,b); - } else if (strcmp(op, "read") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; - int size = t.get_int(); - int off = t.get_int(); - char *buf = new char[size]; - client->read(fh, buf, size, off); - delete[] buf; } else if 
(strcmp(op, "lseek") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; - int off = t.get_int(); - int whence = t.get_int(); - client->lseek(fh, off, whence); + int64_t f = t.get_int(); + int fd = open_files[f]; + int64_t off = t.get_int(); + int64_t whence = t.get_int(); + client->lseek(fd, off, whence); + } else if (strcmp(op, "read") == 0) { + int64_t f = t.get_int(); + int64_t size = t.get_int(); + int64_t off = t.get_int(); + int64_t fd = open_files[f]; + if (!metadata_only) { + char *b = new char[size]; + client->read(fd, b, size, off); + delete[] b; + } } else if (strcmp(op, "write") == 0) { - int64_t id = t.get_int(); - int64_t fh = open_files[id]; - int size = t.get_int(); - int off = t.get_int(); - char *buf = new char[size]; - memset(buf, 1, size); // let's write 1's! - client->write(fh, buf, size, off); - delete[] buf; + int64_t f = t.get_int(); + int64_t fd = open_files[f]; + int64_t size = t.get_int(); + int64_t off = t.get_int(); + if (!metadata_only) { + char *b = new char[size]; + memset(b, 1, size); // let's write 1's! 
+ client->write(fd, b, size, off); + delete[] b; + } + } else if (strcmp(op, "truncate") == 0) { + const char *a = t.get_string(buf, p); + int64_t l = t.get_int(); + client->truncate(a, l); + } else if (strcmp(op, "ftruncate") == 0) { + int64_t f = t.get_int(); + int fd = open_files[f]; + int64_t l = t.get_int(); + client->ftruncate(fd, l); } else if (strcmp(op, "fsync") == 0) { + int64_t f = t.get_int(); + int64_t b = t.get_int(); + int fd = open_files[f]; + client->fsync(fd, b); + } else if (strcmp(op, "chdir") == 0) { + const char *a = t.get_string(buf, p); + client->chdir(a); + } else if (strcmp(op, "statfs") == 0) { + struct statvfs stbuf; + client->statfs("/", &stbuf); + } + + // low level ops --------------------- + else if (strcmp(op, "ll_lookup") == 0) { + int64_t i = t.get_int(); + const char *name = t.get_string(buf, p); + int64_t r = t.get_int(); + struct stat attr; + if (client->ll_lookup(ll_inos[i], name, &attr) == 0) + ll_inos[r] = attr.st_ino; + } else if (strcmp(op, "ll_forget") == 0) { + int64_t i = t.get_int(); + int64_t n = t.get_int(); + if (client->ll_forget(ll_inos[i], n)) + ll_inos.erase(i); + } else if (strcmp(op, "ll_getattr") == 0) { + int64_t i = t.get_int(); + struct stat attr; + client->ll_getattr(ll_inos[i], &attr); + } else if (strcmp(op, "ll_setattr") == 0) { + int64_t i = t.get_int(); + struct stat attr; + memset(&attr, 0, sizeof(attr)); + attr.st_mode = t.get_int(); + attr.st_uid = t.get_int(); + attr.st_gid = t.get_int(); + attr.st_size = t.get_int(); + attr.st_mtime = t.get_int(); + attr.st_atime = t.get_int(); + int mask = t.get_int(); + client->ll_setattr(ll_inos[i], &attr, mask); + } else if (strcmp(op, "ll_readlink") == 0) { + int64_t i = t.get_int(); + const char *value; + client->ll_readlink(ll_inos[i], &value); + } else if (strcmp(op, "ll_mknod") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + int m = t.get_int(); + int r = t.get_int(); + int64_t ri = t.get_int(); + struct stat attr; + if 
(client->ll_mknod(ll_inos[i], n, m, r, &attr) == 0) + ll_inos[ri] = attr.st_ino; + } else if (strcmp(op, "ll_mkdir") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + int m = t.get_int(); + int64_t ri = t.get_int(); + struct stat attr; + if (client->ll_mkdir(ll_inos[i], n, m, &attr) == 0) + ll_inos[ri] = attr.st_ino; + } else if (strcmp(op, "ll_symlink") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + const char *v = t.get_string(buf, p); + int64_t ri = t.get_int(); + struct stat attr; + if (client->ll_symlink(i, n, v, &attr) == 0) + ll_inos[ri] = attr.st_ino; + } else if (strcmp(op, "ll_unlink") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + client->ll_unlink(ll_inos[i], n); + } else if (strcmp(op, "ll_rmdir") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + client->ll_rmdir(ll_inos[i], n); + } else if (strcmp(op, "ll_rename") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + int64_t ni = t.get_int(); + const char *nn = t.get_string(buf, p); + client->ll_rename(ll_inos[i], n, ll_inos[ni], nn); + } else if (strcmp(op, "ll_link") == 0) { + int64_t i = t.get_int(); + int64_t ni = t.get_int(); + const char *nn = t.get_string(buf, p); + struct stat attr; + client->ll_link(ll_inos[i], ni, nn, &attr); + } else if (strcmp(op, "ll_opendir") == 0) { + int64_t i = t.get_int(); + int64_t r = t.get_int(); + void *dirp; + client->ll_opendir(ll_inos[i], &dirp); + ll_dirs[r] = dirp; + } else if (strcmp(op, "ll_releasedir") == 0) { + int64_t f = t.get_int(); + void *dirp = ll_dirs[f]; + client->ll_releasedir(dirp); + ll_dirs.erase(f); + } else if (strcmp(op, "ll_open") == 0) { + int64_t i = t.get_int(); + int64_t f = t.get_int(); + int64_t r = t.get_int(); + Fh *fhp; + client->ll_open(ll_inos[i], f, &fhp); + ll_files[r] = fhp; + } else if (strcmp(op, "ll_create") == 0) { + int64_t i = t.get_int(); + const char *n = t.get_string(buf, p); + 
int64_t m = t.get_int(); + int64_t f = t.get_int(); + int64_t r = t.get_int(); + int64_t ri = t.get_int(); + Fh *fhp; + struct stat attr; + if (client->ll_create(ll_inos[i], n, m, f, &attr, &fhp) == 0) { + ll_inos[ri] = attr.st_ino; + ll_files[r] = fhp; + } + } else if (strcmp(op, "ll_read") == 0) { + int64_t f = t.get_int(); + int64_t off = t.get_int(); + int64_t size = t.get_int(); + Fh *fh = ll_files[f]; + if (!metadata_only) { + bufferlist bl; + client->ll_read(fh, off, size, &bl); + } + } else if (strcmp(op, "ll_write") == 0) { + int64_t f = t.get_int(); + int64_t off = t.get_int(); + int64_t size = t.get_int(); + Fh *fh = ll_files[f]; + if (!metadata_only) { + bufferlist bl; + bufferptr bp(size); + bl.push_back(bp); + bp.zero(); + client->ll_write(fh, off, size, bl.c_str()); + } + } else if (strcmp(op, "ll_release") == 0) { + int64_t f = t.get_int(); + Fh *fh = ll_files[f]; + client->ll_release(fh); + ll_files.erase(f); + } + + else { + cout << (t.get_line()-1) << ": *** trace hit unrecognized symbol '" << op << "' " << endl; assert(0); - } else - assert(0); + } } // close open files - for (map::iterator fi = open_files.begin(); + for (hash_map::iterator fi = open_files.begin(); fi != open_files.end(); fi++) { dout(1) << "leftover close " << fi->second << endl; if (fi->second > 0) client->close(fi->second); } + for (hash_map::iterator fi = open_dirs.begin(); + fi != open_dirs.end(); + fi++) { + dout(1) << "leftover closedir " << fi->second << endl; + if (fi->second != 0) client->closedir(fi->second); + } + for (hash_map::iterator fi = ll_files.begin(); + fi != ll_files.end(); + fi++) { + dout(1) << "leftover ll_release " << fi->second << endl; + if (fi->second > 0) client->ll_release(fi->second); + } + for (hash_map::iterator fi = ll_dirs.begin(); + fi != ll_dirs.end(); + fi++) { + dout(1) << "leftover ll_releasedir " << fi->second << endl; + if (fi->second > 0) client->ll_releasedir(fi->second); + } return 0; } + int SyntheticClient::clean_dir(string& 
basedir) { // read dir - map contents; + list contents; int r = client->getdir(basedir.c_str(), contents); if (r < 0) { dout(1) << "readdir on " << basedir << " returns " << r << endl; return r; } - for (map::iterator it = contents.begin(); + for (list::iterator it = contents.begin(); it != contents.end(); it++) { - if (it->first == ".") continue; - if (it->first == "..") continue; - string file = basedir + "/" + it->first; + if (*it == ".") continue; + if (*it == "..") continue; + string file = basedir + "/" + *it; if (time_to_stop()) break; @@ -856,19 +1078,20 @@ int SyntheticClient::full_walk(string& basedir) dirq.pop_front(); // read dir - map contents; + list contents; int r = client->getdir(dir.c_str(), contents); if (r < 0) { dout(1) << "readdir on " << dir << " returns " << r << endl; continue; } - for (map::iterator it = contents.begin(); + for (list::iterator it = contents.begin(); it != contents.end(); it++) { - if (it->first == ".") continue; - if (it->first == "..") continue; - string file = dir + "/" + it->first; + if (*it == "." 
|| + *it == "..") + continue; + string file = dir + "/" + *it; struct stat st; int r = client->lstat(file.c_str(), &st); @@ -975,7 +1198,7 @@ int SyntheticClient::read_dirs(const char *basedir, int dirs, int files, int dep char d[500]; dout(3) << "read_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; - map contents; + list contents; utime_t s = g_clock.now(); int r = client->getdir(basedir, contents); utime_t e = g_clock.now(); @@ -1629,16 +1852,19 @@ int SyntheticClient::random_walk(int num_req) if (op == MDS_OP_READDIR) { clear_dir(); - map c; + list c; r = client->getdir( cwd.c_str(), c ); - for (map::iterator it = c.begin(); + for (list::iterator it = c.begin(); it != c.end(); it++) { - //dout(DBL) << " got " << it->first << endl; - contents[it->first] = it->second; - if (it->second.is_dir()) - subdirs.insert(it->first); + //dout(DBL) << " got " << *it << endl; + assert(0); + /*contents[*it] = it->second; + if (it->second && + S_ISDIR(it->second->st_mode)) + subdirs.insert(*it); + */ } did_readdir = true; @@ -1723,7 +1949,8 @@ void SyntheticClient::foo() int c = rand() % s; char src[80]; sprintf(src, "syn.0.0/dir.%d/dir.%d/file.%d", a, b, c); - int fd = client->open(src, O_RDONLY); + //int fd = + client->open(src, O_RDONLY); } return; @@ -1965,3 +2192,109 @@ int SyntheticClient::thrash_links(const char *basedir, int dirs, int files, int } + + +void SyntheticClient::import_find(const char *base, const char *find, bool data) +{ + dout(1) << "import_find " << base << " from " << find << " data=" << data << endl; + + /* use this to gather the static trace: + * + * find . -exec ls -dilsn --time-style=+%s \{\} \; + * or if it's wafl, + * find . 
-path ./.snapshot -prune -o -exec ls -dilsn --time-style=+%s \{\} \; + * + */ + + client->mkdir(base, 0755); + + ifstream f(find); + assert(f.is_open()); + + while (!f.eof()) { + uint64_t ino; + int dunno, nlink; + string modestring; + int uid, gid; + off_t size; + time_t mtime; + string filename; + f >> ino; + if (f.eof()) break; + f >> dunno; + f >> modestring; + f >> nlink; + f >> uid; + f >> gid; + f >> size; + f >> mtime; + f.seekg(1, ios::cur); + getline(f, filename); + + // remove leading ./ + if (filename[0] == '.' && filename[1] == '/') + filename = filename.substr(2); + + // parse the mode + assert(modestring.length() == 10); + mode_t mode = 0; + switch (modestring[0]) { + case 'd': mode |= INODE_MODE_DIR; break; + case 'l': mode |= INODE_MODE_SYMLINK; break; + default: + case '-': mode |= INODE_MODE_FILE; break; + } + if (modestring[1] == 'r') mode |= 0400; + if (modestring[2] == 'w') mode |= 0200; + if (modestring[3] == 'x') mode |= 0100; + if (modestring[4] == 'r') mode |= 040; + if (modestring[5] == 'w') mode |= 020; + if (modestring[6] == 'x') mode |= 010; + if (modestring[7] == 'r') mode |= 04; + if (modestring[8] == 'w') mode |= 02; + if (modestring[9] == 'x') mode |= 01; + + dout(20) << " mode " << modestring << " to " << oct << mode << dec << endl; + + if (S_ISLNK(mode)) { + // target vs destination + int pos = filename.find(" -> "); + assert(pos > 0); + string link = base; + link += "/"; + link += filename.substr(0, pos); + string target; + if (filename[pos+4] == '/') { + target = base; + target += filename.substr(pos + 4); + } else { + target = filename.substr(pos + 4); + } + dout(10) << "symlink from '" << link << "' -> '" << target << "'" << endl; + client->symlink(target.c_str(), link.c_str()); + } else { + string f = base; + f += "/"; + f += filename; + if (S_ISDIR(mode)) { + client->mkdir(f.c_str(), mode); + } else { + int fd = client->open(f.c_str(), O_WRONLY|O_CREAT); + assert(fd > 0); + client->write(fd, " ", 1, size-1); + 
client->close(fd); + + client->chmod(f.c_str(), mode & 0777); + client->chown(f.c_str(), uid, gid); + + struct utimbuf ut; + ut.modtime = mtime; + ut.actime = mtime; + client->utime(f.c_str(), &ut); + } + } + } + + +} + diff --git a/trunk/ceph/client/SyntheticClient.h b/trunk/ceph/client/SyntheticClient.h index dc1cf58121d26..b8367315edbb2 100644 --- a/trunk/ceph/client/SyntheticClient.h +++ b/trunk/ceph/client/SyntheticClient.h @@ -64,6 +64,8 @@ #define SYNCLIENT_MODE_FOO 100 #define SYNCLIENT_MODE_THRASHLINKS 101 +#define SYNCLIENT_MODE_IMPORTFIND 300 + void parse_syn_options(vector& args); @@ -80,7 +82,7 @@ class SyntheticClient { filepath cwd; - map contents; + map contents; set subdirs; bool did_readdir; set open_files; @@ -120,7 +122,7 @@ class SyntheticClient { r += cwd.last_dentry().c_str()[0]; // slightly permuted r %= contents.size(); - map::iterator it = contents.begin(); + map::iterator it = contents.begin(); while (r--) it++; n2 = cwd; @@ -183,6 +185,11 @@ class SyntheticClient { int exclude; string get_sarg(int seq); + int get_iarg() { + int i = iargs.front(); + iargs.pop_front(); + return i; + } bool time_to_stop() { utime_t now = g_clock.now(); @@ -219,13 +226,15 @@ class SyntheticClient { int clean_dir(string& basedir); - int play_trace(Trace& t, string& prefix); + int play_trace(Trace& t, string& prefix, bool metadata_only=false); void make_dir_mess(const char *basedir, int n); void foo(); int thrash_links(const char *basedir, int dirs, int files, int depth, int n); + void import_find(const char *basedir, const char *find, bool writedata); + }; #endif diff --git a/trunk/ceph/client/Trace.h b/trunk/ceph/client/Trace.h index bde9f2830cf5f..5243f28d4a56c 100644 --- a/trunk/ceph/client/Trace.h +++ b/trunk/ceph/client/Trace.h @@ -30,11 +30,13 @@ using namespace std; class Trace { class TokenList *tl; - + int _line; + public: Trace(const char* filename); ~Trace(); - + + int get_line() { return _line; } list& get_list(); list::iterator _cur; @@ -43,30 
+45,27 @@ class Trace { void start() { _cur = get_list().begin(); _end = get_list().end(); - ns = 0; + _line = 1; } - char strings[10][200]; - int ns; - const char *get_string(const char *prefix = 0) { + const char *get_string(char *buf, const char *prefix) { assert(_cur != _end); const char *s = *_cur; - _cur++; + _cur++; _line++; if (prefix) { if (strstr(s, "/prefix") == s || strstr(s, "/prefix") == s+1) { - strcpy(strings[ns], prefix); - strcpy(strings[ns] + strlen(prefix), + strcpy(buf, prefix); + strcpy(buf + strlen(prefix), s + strlen("/prefix")); - s = (const char*)strings[ns]; - ns++; - if (ns == 10) ns = 0; + s = (const char*)buf; } } return s; } __int64_t get_int() { - return atoll(get_string()); + char buf[20]; + return atoll(get_string(buf, 0)); } bool end() { return _cur == _end; diff --git a/trunk/ceph/client/fuse.cc b/trunk/ceph/client/fuse.cc index 855a3eb4a6766..431025a8d8f13 100644 --- a/trunk/ceph/client/fuse.cc +++ b/trunk/ceph/client/fuse.cc @@ -28,7 +28,7 @@ #define _XOPEN_SOURCE 500 #endif -#define FUSE_USE_VERSION 25 +#define FUSE_USE_VERSION 26 #include #include @@ -47,13 +47,8 @@ #include "config.h" -// stl -#include -using namespace std; - - // globals -Client *client; // the ceph client +static Client *client; // the ceph client @@ -76,28 +71,6 @@ static int ceph_readlink(const char *path, char *buf, size_t size) return 0; } - -static int ceph_getdir(const char *path, fuse_dirh_t h, fuse_dirfil_t filler) -{ - map contents; - - int res = client->getdir(path, contents); - if (res < 0) return res; - - // return contents to fuse via callback - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - // (immutable) inode contents too. - res = filler(h, // fuse's handle - it->first.c_str(), // dentry as char* - it->second.mode & INODE_TYPE_MASK, // mask type bits from mode - it->second.ino); // ino.. 64->32 bit issue here? 
FIXME - if (res != 0) break; // fuse has had enough - } - return res; -} - static int ceph_mknod(const char *path, mode_t mode, dev_t rdev) { return client->mknod(path, mode); @@ -154,6 +127,9 @@ static int ceph_utime(const char *path, struct utimbuf *buf) } +// ------------------ +// file i/o + static int ceph_open(const char *path, struct fuse_file_info *fi) { int res; @@ -167,51 +143,93 @@ static int ceph_open(const char *path, struct fuse_file_info *fi) static int ceph_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { - fh_t fh = fi->fh; - return client->read(fh, buf, size, offset); + int fd = fi->fh; + return client->read(fd, buf, size, offset); } static int ceph_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { - fh_t fh = fi->fh; - return client->write(fh, buf, size, offset); + int fd = fi->fh; + return client->write(fd, buf, size, offset); } static int ceph_flush(const char *path, struct fuse_file_info *fi) { -//fh_t fh = fi->fh; + //int fh = fi->fh; //return client->flush(fh); return 0; } - static int ceph_statfs(const char *path, struct statvfs *stbuf) { return client->statfs(path, stbuf); } - - static int ceph_release(const char *path, struct fuse_file_info *fi) { - fh_t fh = fi->fh; - int r = client->close(fh); // close the file + int fd = fi->fh; + int r = client->close(fd); // close the file return r; } static int ceph_fsync(const char *path, int isdatasync, struct fuse_file_info *fi) { - fh_t fh = fi->fh; - return client->fsync(fh, isdatasync ? true:false); + int fd = fi->fh; + return client->fsync(fd, isdatasync ? 
true:false); } +// --------------------- +// directory i/o + +static int ceph_opendir(const char *path, struct fuse_file_info *fi) +{ + DIR *dirp; + int r = client->opendir(path, &dirp); + if (r < 0) return r; + fi->fh = (uint64_t)(void*)dirp; + return 0; +} + +static int ceph_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t off, fuse_file_info *fi) +{ + DIR *dirp = (DIR*)fi->fh; + + client->seekdir(dirp, off); + + int res = 0; + struct dirent de; + struct stat st; + int stmask = 0; + while (res == 0) { + int r = client->readdirplus_r(dirp, &de, &st, &stmask); + if (r != 0) break; + int stneed = STAT_MASK_INO | STAT_MASK_TYPE; + res = filler(buf, + de.d_name, + ((stmask & stneed) == stneed) ? &st:0, + client->telldir(dirp)); + } + return 0; +} + +static int ceph_releasedir(const char *path, struct fuse_file_info *fi) +{ + DIR *dirp = (DIR*)fi->fh; + int r = client->closedir(dirp); // close the file + return r; +} + + + + + static struct fuse_operations ceph_oper = { getattr: ceph_getattr, readlink: ceph_readlink, - getdir: ceph_getdir, + getdir: 0, mknod: ceph_mknod, mkdir: ceph_mkdir, unlink: ceph_unlink, @@ -229,7 +247,14 @@ static struct fuse_operations ceph_oper = { statfs: ceph_statfs, flush: ceph_flush, release: ceph_release, - fsync: ceph_fsync + fsync: ceph_fsync, + setxattr: 0, + getxattr: 0, + listxattr: 0, + removexattr: 0, + opendir: ceph_opendir, + readdir: ceph_readdir, + releasedir: ceph_releasedir }; @@ -276,6 +301,6 @@ int ceph_fuse_main(Client *c, int argc, char *argv[]) // go fuse go cout << "ok, calling fuse_main" << endl; - int r = fuse_main(newargc, newargv, &ceph_oper); + int r = fuse_main(newargc, newargv, &ceph_oper, 0); return r; } diff --git a/trunk/ceph/client/fuse_ll.cc b/trunk/ceph/client/fuse_ll.cc new file mode 100644 index 0000000000000..b44279a42bf03 --- /dev/null +++ b/trunk/ceph/client/fuse_ll.cc @@ -0,0 +1,385 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab 
+/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#define FUSE_USE_VERSION 26 + +#include +#include +#include +#include +#include +#include +#include +#include + +// ceph +#include "include/types.h" +#include "Client.h" +#include "config.h" + +static Client *client; + + +static void ceph_ll_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + struct fuse_entry_param fe; + int stmask; + + memset(&fe, 0, sizeof(fe)); + stmask = client->ll_lookup(parent, name, &fe.attr); + if (stmask >= 0) { + fe.ino = fe.attr.st_ino; + fuse_reply_entry(req, &fe); + } else { + fuse_reply_err(req, ENOENT); + } +} + +static void ceph_ll_forget(fuse_req_t req, fuse_ino_t ino, long unsigned nlookup) +{ + client->ll_forget(ino, nlookup); + fuse_reply_none(req); +} + +static void ceph_ll_getattr(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + struct stat stbuf; + + (void) fi; + + if (client->ll_getattr(ino, &stbuf) == 0) + fuse_reply_attr(req, &stbuf, 0); + else + fuse_reply_err(req, ENOENT); +} + +static void ceph_ll_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + int to_set, struct fuse_file_info *fi) +{ + int r = client->ll_setattr(ino, attr, to_set); + if (r == 0) + fuse_reply_attr(req, attr, 0); + else + fuse_reply_err(req, -r); +} + +static void ceph_ll_opendir(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) +{ + void *dirp; + int r = client->ll_opendir(ino, &dirp); + if (r >= 0) { + fi->fh = (long)dirp; + fuse_reply_open(req, fi); + } else { + fuse_reply_err(req, -r); + } +} + +static void ceph_ll_readlink(fuse_req_t req, fuse_ino_t ino) +{ + const char *value; + int r = client->ll_readlink(ino, &value); + if (r == 0) + fuse_reply_readlink(req, value); + else + 
fuse_reply_err(req, -r); +} + +static void ceph_ll_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, dev_t rdev) +{ + struct fuse_entry_param fe; + memset(&fe, 0, sizeof(fe)); + + int r = client->ll_mknod(parent, name, mode, rdev, &fe.attr); + if (r == 0) { + fe.ino = fe.attr.st_ino; + fuse_reply_entry(req, &fe); + } else { + fuse_reply_err(req, -r); + } +} + +static void ceph_ll_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode) +{ + struct fuse_entry_param fe; + memset(&fe, 0, sizeof(fe)); + + int r = client->ll_mkdir(parent, name, mode, &fe.attr); + if (r == 0) { + fe.ino = fe.attr.st_ino; + fuse_reply_entry(req, &fe); + } else { + fuse_reply_err(req, -r); + } +} + +static void ceph_ll_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + int r = client->ll_unlink(parent, name); + fuse_reply_err(req, -r); +} + +static void ceph_ll_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + int r = client->ll_rmdir(parent, name); + fuse_reply_err(req, -r); +} + +static void ceph_ll_symlink(fuse_req_t req, const char *existing, fuse_ino_t parent, const char *name) +{ + struct fuse_entry_param fe; + memset(&fe, 0, sizeof(fe)); + + int r = client->ll_symlink(parent, name, existing, &fe.attr); + if (r == 0) { + fe.ino = fe.attr.st_ino; + fuse_reply_entry(req, &fe); + } else { + fuse_reply_err(req, -r); + } +} + +static void ceph_ll_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + fuse_ino_t newparent, const char *newname) +{ + int r = client->ll_rename(parent, name, newparent, newname); + fuse_reply_err(req, -r); +} + +static void ceph_ll_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, + const char *newname) +{ + struct fuse_entry_param fe; + memset(&fe, 0, sizeof(fe)); + + int r = client->ll_link(ino, newparent, newname, &fe.attr); + if (r == 0) { + fe.ino = fe.attr.st_ino; + fuse_reply_entry(req, &fe); + } else { + fuse_reply_err(req, -r); + } +} + +static void 
ceph_ll_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) +{ + Fh *fh; + int r = client->ll_open(ino, fi->flags, &fh); + if (r == 0) { + fi->fh = (long)fh; + fuse_reply_open(req, fi); + } else { + fuse_reply_err(req, -r); + } +} + +static void ceph_ll_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, + struct fuse_file_info *fi) +{ + Fh *fh = (Fh*)fi->fh; + bufferlist bl; + int r = client->ll_read(fh, off, size, &bl); + if (r >= 0) + fuse_reply_buf(req, bl.c_str(), bl.length()); + else + fuse_reply_err(req, -r); +} + +static void ceph_ll_write(fuse_req_t req, fuse_ino_t ino, const char *buf, + size_t size, off_t off, struct fuse_file_info *fi) +{ + Fh *fh = (Fh*)fi->fh; + int r = client->ll_write(fh, off, size, buf); + if (r >= 0) + fuse_reply_write(req, r); + else + fuse_reply_err(req, -r); +} + +static void ceph_ll_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) +{ + // NOOP + fuse_reply_err(req, 0); +} + +static void ceph_ll_release(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) +{ + Fh *fh = (Fh*)fi->fh; + int r = client->ll_release(fh); + fuse_reply_err(req, -r); +} + +static void ceph_ll_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) +{ + +} + +static void ceph_ll_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t off, struct fuse_file_info *fi) +{ + (void) fi; + + // buffer + char *buf; + size_t pos = 0; + + buf = new char[size]; + if (!buf) { + fuse_reply_err(req, ENOMEM); + return; + } + + DIR *dirp = (DIR*)fi->fh; + client->seekdir(dirp, off); + + struct dirent de; + struct stat st; + memset(&st, 0, sizeof(st)); + + while (1) { + int r = client->readdir_r(dirp, &de); + if (r < 0) break; + st.st_ino = de.d_ino; + st.st_mode = DT_TO_MODE(de.d_type); + + off_t off = client->telldir(dirp); + size_t entrysize = fuse_add_direntry(req, buf + pos, size - pos, + de.d_name, &st, off); + + cout << "ceph_ll_readdir added " << de.d_name << " at " << pos << " len " << 
entrysize + << " (buffer size is " << size << ")" + << " .. off = " << off + << endl; + + if (entrysize > size - pos) + break; // didn't fit, done for now. + pos += entrysize; + } + + fuse_reply_buf(req, buf, pos); + delete[] buf; +} + +static void ceph_ll_releasedir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + DIR *dirp = (DIR*)fi->fh; + client->ll_releasedir(dirp); + fuse_reply_err(req, 0); +} + +static void ceph_ll_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) +{ + struct fuse_entry_param fe; + memset(&fe, 0, sizeof(fe)); + Fh *fh; + int r = client->ll_create(parent, name, mode, fi->flags, &fe.attr, &fh); + if (r == 0) { + fi->fh = (long)fh; + fe.ino = fe.attr.st_ino; + fuse_reply_create(req, &fe, fi); + } else { + fuse_reply_err(req, -r); + } +} + +static struct fuse_lowlevel_ops ceph_ll_oper = { + init: 0, + destroy: 0, + lookup: ceph_ll_lookup, + forget: ceph_ll_forget, + getattr: ceph_ll_getattr, + setattr: ceph_ll_setattr, + readlink: ceph_ll_readlink, + mknod: ceph_ll_mknod, + mkdir: ceph_ll_mkdir, + unlink: ceph_ll_unlink, + rmdir: ceph_ll_rmdir, + symlink: ceph_ll_symlink, + rename: ceph_ll_rename, + link: ceph_ll_link, + open: ceph_ll_open, + read: ceph_ll_read, + write: ceph_ll_write, + flush: ceph_ll_flush, + release: ceph_ll_release, + fsync: ceph_ll_fsync, + opendir: ceph_ll_opendir, + readdir: ceph_ll_readdir, + releasedir: ceph_ll_releasedir, + fsyncdir: 0, + statfs: 0, + setxattr: 0, + getxattr: 0, + listxattr: 0, + removexattr: 0, + access: 0, + create: 0, //ceph_ll_create, + getlk: 0, + setlk: 0, + bmap: 0 +}; + +int ceph_fuse_ll_main(Client *c, int argc, char *argv[]) +{ + cout << "ceph_fuse_ll_main starting fuse" << endl; + + client = c; + + // set up fuse argc/argv + int newargc = 0; + char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); + newargv[newargc++] = argv[0]; + newargv[newargc++] = "-f"; // stay in foreground + + newargv[newargc++] = "-o"; + 
newargv[newargc++] = "allow_other"; + + for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; + + // go go gadget fuse + struct fuse_args args = FUSE_ARGS_INIT(newargc, newargv); + struct fuse_chan *ch; + char *mountpoint; + int err = -1; + + if (fuse_parse_cmdline(&args, &mountpoint, NULL, NULL) != -1 && + (ch = fuse_mount(mountpoint, &args)) != NULL) { + struct fuse_session *se; + + // init fuse + se = fuse_lowlevel_new(&args, &ceph_ll_oper, sizeof(ceph_ll_oper), + NULL); + if (se != NULL) { + if (fuse_set_signal_handlers(se) != -1) { + fuse_session_add_chan(se, ch); + err = fuse_session_loop(se); + fuse_remove_signal_handlers(se); + fuse_session_remove_chan(ch); + } + fuse_session_destroy(se); + } + fuse_unmount(mountpoint, ch); + } + fuse_opt_free_args(&args); + + cout << "ceph_fuse_ll_main done, err=" << err << endl; + return err ? 1 : 0; +} + diff --git a/trunk/ceph/client/fuse_ll.h b/trunk/ceph/client/fuse_ll.h new file mode 100644 index 0000000000000..068969c4f7487 --- /dev/null +++ b/trunk/ceph/client/fuse_ll.h @@ -0,0 +1,15 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +int ceph_fuse_ll_main(Client *c, int argc, char *argv[]); diff --git a/trunk/ceph/common/DecayCounter.h b/trunk/ceph/common/DecayCounter.h index 37ce7bad07fa5..1eed4a2e697e3 100644 --- a/trunk/ceph/common/DecayCounter.h +++ b/trunk/ceph/common/DecayCounter.h @@ -12,8 +12,6 @@ * */ - - #ifndef __DECAYCOUNTER_H #define __DECAYCOUNTER_H @@ -22,73 +20,112 @@ #include "config.h" +/** + * + * TODO: normalize value based on some fucntion of half_life, + * so that it can be interpreted as an approximation of a + * moving average of N seconds. currently, changing half-life + * skews the scale of the value, even at steady state. + * + */ + class DecayCounter { protected: - double val; // value - - double half_life; // in seconds - double k; // k = ln(.5)/half_life - +public: + double half_life; + double k; // k = ln(.5)/half_life + double val; // value + double delta; // delta since last decay + double vel; // recent velocity utime_t last_decay; // time of last decay public: - DecayCounter() : val(0) { + DecayCounter() : val(0), delta(0), vel(0) { set_halflife( g_conf.mds_decay_halflife ); reset(); } - /* - DecayCounter(double hl) : val(0) { - set_halflife(hl); + DecayCounter(double hl) : val(0), delta(0), vel(0) { + set_halflife( hl ); reset(); } - */ + + /** + * reading + */ + + double get() { + return get(g_clock.now()); + } + + double get(utime_t now) { + decay(now); + return val; + } + + double get_last() { + return val; + } + double get_last_vel() { + return vel; + } + + utime_t get_last_decay() { + return last_decay; + } + + /** + * adjusting + */ + + void hit(utime_t now, double v = 1.0) { + decay(now); + delta += v; + } + void adjust(double a) { - decay(); val += a; } - void adjust_down(const DecayCounter& other) { - // assume other has same time stamp as us... - val -= other.val; + void adjust(utime_t now, double a) { + decay(now); + val += a; } + /** + * decay etc. 
+ */ + void set_halflife(double hl) { half_life = hl; k = log(.5) / hl; } - void take(DecayCounter& other) { - *this = other; - other.reset(); - } - void reset() { - last_decay.sec_ref() = 0; - last_decay.usec_ref() = 0; - val = 0; + reset(g_clock.now()); + } + void reset(utime_t now) { + last_decay = g_clock.now(); + val = delta = 0; } - void decay() { - utime_t el = g_clock.recent_now(); + void decay(utime_t now) { + utime_t el = now; el -= last_decay; + if (el.sec() >= 1) { - val = val * exp((double)el * k); - if (val < .01) val = 0; - last_decay = g_clock.recent_now(); + // calculate new value + double newval = (val+delta) * exp((double)el * k); + if (newval < .01) newval = 0.0; + + // calculate velocity approx + vel += (newval - val) * (double)el; + vel *= exp((double)el * k); + + val = newval; + delta = 0; + last_decay = now; } } - - double get() { - decay(); - return val; - } - - double hit(double v = 1.0) { - decay(); - val += v; - return val; - } - }; diff --git a/trunk/ceph/common/Logger.cc b/trunk/ceph/common/Logger.cc index a5417ab18629d..9ffab65074f1a 100644 --- a/trunk/ceph/common/Logger.cc +++ b/trunk/ceph/common/Logger.cc @@ -39,7 +39,7 @@ Logger::Logger(string fn, LogType *type) if (g_conf.use_abspaths) { char *cwd = get_current_dir_name(); filename = cwd; - delete cwd; + free(cwd); filename += "/"; } @@ -156,7 +156,7 @@ void Logger::flush(bool force) //cout << "opening log file " << filename << endl; } - utime_t fromstart = g_clock.now(); + utime_t fromstart = g_clock.recent_now(); if (fromstart < start) { cerr << "logger time jumped backwards from " << start << " to " << fromstart << endl; assert(0); diff --git a/trunk/ceph/config.cc b/trunk/ceph/config.cc index e2ecbaf8469da..d8a789ea001bc 100644 --- a/trunk/ceph/config.cc +++ b/trunk/ceph/config.cc @@ -85,9 +85,11 @@ md_config_t g_conf = { debug_mds: 1, debug_mds_balancer: 1, debug_mds_log: 1, + debug_mds_migrator: 1, debug_buffer: 0, debug_filer: 0, debug_objecter: 0, + debug_journaler: 0, 
debug_objectcacher: 0, debug_client: 0, debug_osd: 0, @@ -136,9 +138,10 @@ md_config_t g_conf = { mon_accept_timeout: 10.0, // on leader, if paxos update isn't accepted mon_stop_on_last_unmount: false, mon_stop_with_last_mds: false, + mon_allow_mds_bully: true, // allow a booting mds to (forcibly) claim an mds # // --- client --- - client_cache_size: 300, + client_cache_size: 1000, client_cache_mid: .5, client_cache_stat_ttl: 0, // seconds until cached stat results become invalid client_cache_readdir_ttl: 1, // 1 second only @@ -157,23 +160,25 @@ md_config_t g_conf = { client_trace: 0, fuse_direct_io: 0, + fuse_ll: true, // --- objecter --- - objecter_buffer_uncommitted: true, + objecter_buffer_uncommitted: true, // this must be true for proper failure handling // --- journaler --- journaler_allow_split_entries: true, journaler_safe: false, // wait for COMMIT on journal writes journaler_write_head_interval: 15, + journaler_cache: false, // cache writes for later readback // --- mds --- mds_cache_size: MDS_CACHE_SIZE, mds_cache_mid: .7, - mds_decay_halflife: 30, + mds_decay_halflife: 10, mds_beacon_interval: 5, //30.0, - mds_beacon_grace: 15, //60*60.0, + mds_beacon_grace: 30, //60*60.0, mds_log: true, mds_log_max_len: MDS_CACHE_SIZE / 3, @@ -184,19 +189,23 @@ md_config_t g_conf = { mds_log_subtree_map_interval: 128*1024, // frequency (in bytes) of EImportMap in log mds_log_eopen_size: 100, // # open inodes per log entry + mds_bal_sample_interval: 5.0, // every 5 seconds mds_bal_replicate_threshold: 2000, mds_bal_unreplicate_threshold: 0,//500, - mds_bal_hash_rd: 10000, - mds_bal_unhash_rd: 1000, - mds_bal_hash_wr: 10000, - mds_bal_unhash_wr: 1000, + mds_bal_split_size: 1000, + mds_bal_split_rd: 10000, + mds_bal_split_wr: 10000, + mds_bal_merge_size: 50, + mds_bal_merge_rd: 1000, + mds_bal_merge_wr: 1000, mds_bal_interval: 30, // seconds - mds_bal_hash_interval: 5, // seconds + mds_bal_fragment_interval: 5, // seconds mds_bal_idle_threshold: .1, mds_bal_max: -1, 
mds_bal_max_until: -1, mds_bal_mode: 0, + mds_bal_min_rebalance: .2, // must be this much above average before we export anything mds_bal_min_start: .2, // if we need less than this, we don't do anything mds_bal_need_min: .8, // take within this range of what we need mds_bal_need_max: 1.2, @@ -212,6 +221,7 @@ md_config_t g_conf = { mds_local_osd: false, mds_thrash_exports: 0, + mds_thrash_fragments: 0, mds_dump_cache_on_map: false, mds_dump_cache_after_rejoin: true, @@ -237,13 +247,13 @@ md_config_t g_conf = { osd_mkfs: false, osd_age: .8, osd_age_time: 0, - osd_heartbeat_interval: 5, // shut up while i'm debugging + osd_heartbeat_interval: 15, // shut up while i'm debugging osd_replay_window: 5, osd_max_pull: 2, osd_pad_pg_log: false, // --- fakestore --- - fakestore_fake_sync: 2, // 2 seconds + fakestore_fake_sync: .5, // seconds fakestore_fsync: false,//true, fakestore_writesync: false, fakestore_syncthreads: 4, @@ -518,6 +528,11 @@ void parse_config_options(std::vector& args) g_conf.debug_mds_log = atoi(args[++i]); else g_debug_after_conf.debug_mds_log = atoi(args[++i]); + else if (strcmp(args[i], "--debug_mds_migrator") == 0) + if (!g_conf.debug_after) + g_conf.debug_mds_migrator = atoi(args[++i]); + else + g_debug_after_conf.debug_mds_migrator = atoi(args[++i]); else if (strcmp(args[i], "--debug_buffer") == 0) if (!g_conf.debug_after) g_conf.debug_buffer = atoi(args[++i]); @@ -533,6 +548,11 @@ void parse_config_options(std::vector& args) g_conf.debug_objecter = atoi(args[++i]); else g_debug_after_conf.debug_objecter = atoi(args[++i]); + else if (strcmp(args[i], "--debug_journaler") == 0) + if (!g_conf.debug_after) + g_conf.debug_journaler = atoi(args[++i]); + else + g_debug_after_conf.debug_journaler = atoi(args[++i]); else if (strcmp(args[i], "--debug_objectcacher") == 0) if (!g_conf.debug_after) g_conf.debug_objectcacher = atoi(args[++i]); @@ -593,6 +613,8 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], "--journaler_safe") == 0) 
g_conf.journaler_safe = atoi(args[++i]); + else if (strcmp(args[i], "--journaler_cache") == 0) + g_conf.journaler_cache = atoi(args[++i]); else if (strcmp(args[i], "--mds_cache_size") == 0) g_conf.mds_cache_size = atoi(args[++i]); @@ -632,14 +654,18 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], "--mds_bal_max_until") == 0) g_conf.mds_bal_max_until = atoi(args[++i]); - else if (strcmp(args[i], "--mds_bal_hash_rd") == 0) - g_conf.mds_bal_hash_rd = atoi(args[++i]); - else if (strcmp(args[i], "--mds_bal_hash_wr") == 0) - g_conf.mds_bal_hash_wr = atoi(args[++i]); - else if (strcmp(args[i], "--mds_bal_unhash_rd") == 0) - g_conf.mds_bal_unhash_rd = atoi(args[++i]); - else if (strcmp(args[i], "--mds_bal_unhash_wr") == 0) - g_conf.mds_bal_unhash_wr = atoi(args[++i]); + else if (strcmp(args[i], "--mds_bal_split_size") == 0) + g_conf.mds_bal_split_size = atoi(args[++i]); + else if (strcmp(args[i], "--mds_bal_split_rd") == 0) + g_conf.mds_bal_split_rd = atoi(args[++i]); + else if (strcmp(args[i], "--mds_bal_split_wr") == 0) + g_conf.mds_bal_split_wr = atoi(args[++i]); + else if (strcmp(args[i], "--mds_bal_merge_size") == 0) + g_conf.mds_bal_merge_size = atoi(args[++i]); + else if (strcmp(args[i], "--mds_bal_merge_rd") == 0) + g_conf.mds_bal_merge_rd = atoi(args[++i]); + else if (strcmp(args[i], "--mds_bal_merge_wr") == 0) + g_conf.mds_bal_merge_wr = atoi(args[++i]); else if (strcmp(args[i], "--mds_bal_mode") == 0) g_conf.mds_bal_mode = atoi(args[++i]); @@ -658,6 +684,8 @@ void parse_config_options(std::vector& args) g_conf.mds_local_osd = atoi(args[++i]); else if (strcmp(args[i], "--mds_thrash_exports") == 0) g_conf.mds_thrash_exports = atoi(args[++i]); + else if (strcmp(args[i], "--mds_thrash_fragments") == 0) + g_conf.mds_thrash_fragments = atoi(args[++i]); else if (strcmp(args[i], "--mds_dump_cache_on_map") == 0) g_conf.mds_dump_cache_on_map = true; @@ -670,9 +698,12 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], 
"--client_cache_readdir_ttl") == 0) g_conf.client_cache_readdir_ttl = atoi(args[++i]); else if (strcmp(args[i], "--client_trace") == 0) - g_conf.client_trace = atoi(args[++i]); + g_conf.client_trace = args[++i]; + else if (strcmp(args[i], "--fuse_direct_io") == 0) g_conf.fuse_direct_io = atoi(args[++i]); + else if (strcmp(args[i], "--fuse_ll") == 0) + g_conf.fuse_ll = atoi(args[++i]); else if (strcmp(args[i], "--mon_osd_down_out_interval") == 0) g_conf.mon_osd_down_out_interval = atoi(args[++i]); diff --git a/trunk/ceph/config.h b/trunk/ceph/config.h index 13f03436763b4..9076092aba739 100644 --- a/trunk/ceph/config.h +++ b/trunk/ceph/config.h @@ -71,9 +71,11 @@ struct md_config_t { int debug_mds; int debug_mds_balancer; int debug_mds_log; + int debug_mds_migrator; int debug_buffer; int debug_filer; int debug_objecter; + int debug_journaler; int debug_objectcacher; int debug_client; int debug_osd; @@ -122,6 +124,7 @@ struct md_config_t { float mon_accept_timeout; bool mon_stop_on_last_unmount; bool mon_stop_with_last_mds; + bool mon_allow_mds_bully; // client int client_cache_size; @@ -154,8 +157,9 @@ struct md_config_t { size_t client_bcache_align; */ - int client_trace; + char *client_trace; int fuse_direct_io; + bool fuse_ll; // objecter bool objecter_buffer_uncommitted; @@ -164,6 +168,7 @@ struct md_config_t { bool journaler_allow_split_entries; bool journaler_safe; int journaler_write_head_interval; + bool journaler_cache; // mds int mds_cache_size; @@ -183,19 +188,23 @@ struct md_config_t { off_t mds_log_subtree_map_interval; int mds_log_eopen_size; + float mds_bal_sample_interval; float mds_bal_replicate_threshold; float mds_bal_unreplicate_threshold; - float mds_bal_hash_rd; - float mds_bal_unhash_rd; - float mds_bal_hash_wr; - float mds_bal_unhash_wr; + int mds_bal_split_size; + float mds_bal_split_rd; + float mds_bal_split_wr; + int mds_bal_merge_size; + float mds_bal_merge_rd; + float mds_bal_merge_wr; int mds_bal_interval; - int mds_bal_hash_interval; + 
int mds_bal_fragment_interval; float mds_bal_idle_threshold; int mds_bal_max; int mds_bal_max_until; int mds_bal_mode; + float mds_bal_min_rebalance; float mds_bal_min_start; float mds_bal_need_min; float mds_bal_need_max; @@ -211,6 +220,7 @@ struct md_config_t { bool mds_local_osd; int mds_thrash_exports; + int mds_thrash_fragments; bool mds_dump_cache_on_map; bool mds_dump_cache_after_rejoin; @@ -240,7 +250,7 @@ struct md_config_t { int osd_max_pull; bool osd_pad_pg_log; - int fakestore_fake_sync; + double fakestore_fake_sync; bool fakestore_fsync; bool fakestore_writesync; int fakestore_syncthreads; // such crap diff --git a/trunk/ceph/ebofs/Allocator.cc b/trunk/ceph/ebofs/Allocator.cc index 70b641cfee14f..a9c79f01a315a 100644 --- a/trunk/ceph/ebofs/Allocator.cc +++ b/trunk/ceph/ebofs/Allocator.cc @@ -19,7 +19,7 @@ #undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << fs->dev.get_device_name() << ").allocator." +#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs(" << fs->dev.get_device_name() << ").allocator." void Allocator::dump_freelist() diff --git a/trunk/ceph/ebofs/BlockDevice.cc b/trunk/ceph/ebofs/BlockDevice.cc index 8fcb6bf549a8e..83de7b69efd0e 100644 --- a/trunk/ceph/ebofs/BlockDevice.cc +++ b/trunk/ceph/ebofs/BlockDevice.cc @@ -62,8 +62,8 @@ inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio) */ #undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").elevatorq." -#define derr(x) if (x <= g_conf.debug_bdev) cerr << "bdev(" << dev << ").elevatorq." +#define dout(x) if (x <= g_conf.debug_bdev) cout << g_clock.now() << " bdev(" << dev << ").elevatorq." +#define derr(x) if (x <= g_conf.debug_bdev) cerr << g_clock.now() << " bdev(" << dev << ").elevatorq." 
int BlockDevice::ElevatorQueue::dequeue_io(list& biols, @@ -211,7 +211,7 @@ int BlockDevice::ElevatorQueue::dequeue_io(list& biols, * BarrierQueue */ #undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").barrierq." +#define dout(x) if (x <= g_conf.debug_bdev) cout << g_clock.now() << " bdev(" << dev << ").barrierq." void BlockDevice::BarrierQueue::barrier() { @@ -259,7 +259,7 @@ int BlockDevice::BarrierQueue::dequeue_io(list& biols, */ #undef dout -#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ")." +#define dout(x) if (x <= g_conf.debug_bdev) cout << g_clock.now() << " bdev(" << dev << ")." @@ -275,10 +275,10 @@ block_t BlockDevice::get_num_blocks() r = ioctl(fd, BLKGETSIZE64, &bytes); num_blocks = bytes / (uint64_t)EBOFS_BLOCK_SIZE; if (r == 0) { - dout(1) << "get_num_blocks ioctl BLKGETSIZE64 reports " - << num_blocks << " 4k blocks, " - << bytes << " bytes" - << endl; + dout(10) << "get_num_blocks ioctl BLKGETSIZE64 reports " + << num_blocks << " 4k blocks, " + << bytes << " bytes" + << endl; #else // hrm, try the 32 bit ioctl? unsigned long sectors = 0; @@ -286,8 +286,8 @@ block_t BlockDevice::get_num_blocks() num_blocks = sectors/8ULL; bytes = sectors*512ULL; if (r == 0) { - dout(1) << "get_num_blocks ioctl BLKGETSIZE reports " << sectors << " sectors, " - << num_blocks << " 4k blocks, " << bytes << " bytes" << endl; + dout(10) << "get_num_blocks ioctl BLKGETSIZE reports " << sectors << " sectors, " + << num_blocks << " 4k blocks, " << bytes << " bytes" << endl; #endif } else { // hmm, try stat! 
@@ -296,7 +296,7 @@ block_t BlockDevice::get_num_blocks() fstat(fd, &st); uint64_t bytes = st.st_size; num_blocks = bytes / EBOFS_BLOCK_SIZE; - dout(1) << "get_num_blocks stat reports " << num_blocks << " 4k blocks, " << bytes << " bytes" << endl; + dout(10) << "get_num_blocks stat reports " << num_blocks << " 4k blocks, " << bytes << " bytes" << endl; } if (g_conf.bdev_fake_mb) { diff --git a/trunk/ceph/ebofs/BufferCache.cc b/trunk/ceph/ebofs/BufferCache.cc index a83ce5cb480fd..d110d066e6993 100644 --- a/trunk/ceph/ebofs/BufferCache.cc +++ b/trunk/ceph/ebofs/BufferCache.cc @@ -22,7 +22,7 @@ #undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bh." +#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs.bh." @@ -33,7 +33,7 @@ #undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.oc." +#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs.oc." @@ -737,7 +737,7 @@ void ObjectCache::clone_to(Onode *other) /************** BufferCache ***************/ #undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bc." +#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs.bc." diff --git a/trunk/ceph/ebofs/Ebofs.cc b/trunk/ceph/ebofs/Ebofs.cc index e5b8dda2585de..1f9ca12b46ee3 100644 --- a/trunk/ceph/ebofs/Ebofs.cc +++ b/trunk/ceph/ebofs/Ebofs.cc @@ -30,8 +30,8 @@ // ******************* #undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << dev.get_device_name() << ")." -#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << dev.get_device_name() << ")." +#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs(" << dev.get_device_name() << ")." +#define derr(x) if (x <= g_conf.debug_ebofs) cerr << g_clock.now() << " ebofs(" << dev.get_device_name() << ")." 
char *nice_blocks(block_t b) @@ -124,36 +124,39 @@ int Ebofs::mount() if (journalfn) { journal = new FileJournal(this, journalfn); if (journal->open() < 0) { - dout(-3) << "mount journal " << journalfn << " open failed" << endl; + dout(3) << "mount journal " << journalfn << " open failed" << endl; delete journal; journal = 0; } else { - dout(-3) << "mount journal " << journalfn << " opened, replaying" << endl; + dout(3) << "mount journal " << journalfn << " opened, replaying" << endl; while (1) { bufferlist bl; epoch_t e; if (!journal->read_entry(bl, e)) { - dout(-3) << "mount replay: end of journal, done." << endl; + dout(3) << "mount replay: end of journal, done." << endl; break; } if (e < super_epoch) { - dout(-3) << "mount replay: skipping old entry in epoch " << e << " < " << super_epoch << endl; + dout(3) << "mount replay: skipping old entry in epoch " << e << " < " << super_epoch << endl; continue; } if (e == super_epoch+1) { super_epoch++; - dout(-3) << "mount replay: jumped to next epoch " << super_epoch << endl; + dout(3) << "mount replay: jumped to next epoch " << super_epoch << endl; } assert(e == super_epoch); - dout(-3) << "mount replay: applying transaction in epoch " << e << endl; + dout(3) << "mount replay: applying transaction in epoch " << e << endl; Transaction t; int off = 0; t._decode(bl, off); _apply_transaction(t); } + + // done reading, make writeable. + journal->make_writeable(); } } @@ -161,7 +164,9 @@ int Ebofs::mount() commit_thread.create(); finisher_thread.create(); - dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; + dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) + << (journal ? 
", with journal":", no journal") + << endl; mounted = true; @@ -184,8 +189,11 @@ int Ebofs::mkfs() block_t num_blocks = dev.get_num_blocks(); // make a super-random fsid + srand48(time(0) ^ getpid()); + super_fsid = ((uint64_t)lrand48() << 32) ^ mrand48(); srand(time(0) ^ getpid()); - super_fsid = (lrand48() << 32) ^ mrand48(); + super_fsid ^= rand(); + super_fsid ^= (uint64_t)rand() << 32; free_blocks = 0; limbo_blocks = 0; @@ -261,13 +269,13 @@ int Ebofs::mkfs() // create journal? if (journalfn) { - journal = new FileJournal(this, journalfn); + Journal *journal = new FileJournal(this, journalfn); if (journal->create() < 0) { dout(3) << "mount journal " << journalfn << " created failed" << endl; - delete journal; } else { dout(3) << "mount journal " << journalfn << " created" << endl; } + delete journal; } dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; @@ -1550,9 +1558,12 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) if (bl.length() == 0) { zleft += len; left = 0; + } else { + assert(bl.length() == len); } if (zleft) - dout(10) << "apply_write zeroing first " << zleft << " bytes of " << *on << endl; + dout(10) << "apply_write zeroing " << zleft << " bytes before " << off << "~" << len + << " in " << *on << endl; block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; block_t blen = blast-bstart+1; @@ -1640,7 +1651,7 @@ void Ebofs::apply_write(Onode *on, off_t off, size_t len, const bufferlist& bl) bufferlist zb; zb.push_back(zp); bh->add_partial(off_in_bh, zb); - zleft -= z; + zleft -= z; opos += z; } @@ -2078,7 +2089,7 @@ bool Ebofs::write_will_block() unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe) { ebofs_lock.Lock(); - dout(7) << "apply_transaction start (" << t.ops.size() << " ops)" << endl; + dout(7) << "apply_transaction start (" << t.get_num_ops() << " ops)" << endl; unsigned r = _apply_transaction(t); @@ -2106,16 
+2117,18 @@ unsigned Ebofs::_apply_transaction(Transaction& t) // do ops unsigned r = 0; // bit fields indicate which ops failed. int bit = 1; - for (list::iterator p = t.ops.begin(); - p != t.ops.end(); - p++) { - switch (*p) { + while (t.have_op()) { + int op = t.get_op(); + switch (op) { case Transaction::OP_READ: { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); + object_t oid; + t.get_oid(oid); + off_t offset, len; + t.get_length(offset); + t.get_length(len); + bufferlist *pbl; + t.get_pbl(pbl); if (_read(oid, offset, len, *pbl) < 0) { dout(7) << "apply_transaction fail on _read" << endl; r &= bit; @@ -2125,8 +2138,10 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_STAT: { - object_t oid = t.oids.front(); t.oids.pop_front(); - struct stat *st = t.psts.front(); t.psts.pop_front(); + object_t oid; + t.get_oid(oid); + struct stat *st; + t.get_pstat(st); if (_stat(oid, st) < 0) { dout(7) << "apply_transaction fail on _stat" << endl; r &= bit; @@ -2136,9 +2151,12 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_GETATTR: { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); + object_t oid; + t.get_oid(oid); + const char *attrname; + t.get_attrname(attrname); + pair pattrval; + t.get_pattrval(pattrval); if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) { dout(7) << "apply_transaction fail on _getattr" << endl; r &= bit; @@ -2148,8 +2166,10 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_GETATTRS: { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); + object_t oid; + t.get_oid(oid); + map 
*pset; + t.get_pattrset(pset); if (_getattrs(oid, *pset) < 0) { dout(7) << "apply_transaction fail on _getattrs" << endl; r &= bit; @@ -2160,10 +2180,13 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_WRITE: { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist bl = t.bls.front(); t.bls.pop_front(); + object_t oid; + t.get_oid(oid); + off_t offset, len; + t.get_length(offset); + t.get_length(len); + bufferlist bl; + t.get_bl(bl); if (_write(oid, offset, len, bl) < 0) { dout(7) << "apply_transaction fail on _write" << endl; r &= bit; @@ -2173,17 +2196,21 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_TRIMCACHE: { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); + object_t oid; + t.get_oid(oid); + off_t offset, len; + t.get_length(offset); + t.get_length(len); _trim_from_cache(oid, offset, len); } break; case Transaction::OP_TRUNCATE: { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t len = t.offsets.front(); t.offsets.pop_front(); + object_t oid; + t.get_oid(oid); + off_t len; + t.get_length(len); if (_truncate(oid, len) < 0) { dout(7) << "apply_transaction fail on _truncate" << endl; r &= bit; @@ -2193,7 +2220,8 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_REMOVE: { - object_t oid = t.oids.front(); t.oids.pop_front(); + object_t oid; + t.get_oid(oid); if (_remove(oid) < 0) { dout(7) << "apply_transaction fail on _remove" << endl; r &= bit; @@ -2203,12 +2231,12 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_SETATTR: { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); + 
object_t oid; + t.get_oid(oid); + const char *attrname; + t.get_attrname(attrname); bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); + t.get_bl(bl); if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) { dout(7) << "apply_transaction fail on _setattr" << endl; r &= bit; @@ -2218,8 +2246,10 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_SETATTRS: { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); + object_t oid; + t.get_oid(oid); + map *pattrset; + t.get_pattrset(pattrset); if (_setattrs(oid, *pattrset) < 0) { dout(7) << "apply_transaction fail on _setattrs" << endl; r &= bit; @@ -2229,8 +2259,10 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_RMATTR: { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); + object_t oid; + t.get_oid(oid); + const char *attrname; + t.get_attrname(attrname); if (_rmattr(oid, attrname) < 0) { dout(7) << "apply_transaction fail on _rmattr" << endl; r &= bit; @@ -2240,8 +2272,10 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_CLONE: { - object_t oid = t.oids.front(); t.oids.pop_front(); - object_t noid = t.oids.front(); t.oids.pop_front(); + object_t oid; + t.get_oid(oid); + object_t noid; + t.get_oid(noid); if (_clone(oid, noid) < 0) { dout(7) << "apply_transaction fail on _clone" << endl; r &= bit; @@ -2251,7 +2285,8 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_MKCOLL: { - coll_t cid = t.cids.front(); t.cids.pop_front(); + coll_t cid; + t.get_cid(cid); if (_create_collection(cid) < 0) { dout(7) << "apply_transaction fail on _create_collection" << endl; r &= bit; @@ -2261,7 +2296,8 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_RMCOLL: { - coll_t cid = t.cids.front(); t.cids.pop_front(); + coll_t cid; + t.get_cid(cid); if 
(_destroy_collection(cid) < 0) { dout(7) << "apply_transaction fail on _destroy_collection" << endl; r &= bit; @@ -2271,8 +2307,10 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_COLL_ADD: { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); + coll_t cid; + t.get_cid(cid); + object_t oid; + t.get_oid(oid); if (_collection_add(cid, oid) < 0) { //dout(7) << "apply_transaction fail on _collection_add" << endl; //r &= bit; @@ -2282,8 +2320,10 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_COLL_REMOVE: { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); + coll_t cid; + t.get_cid(cid); + object_t oid; + t.get_oid(oid); if (_collection_remove(cid, oid) < 0) { dout(7) << "apply_transaction fail on _collection_remove" << endl; r &= bit; @@ -2293,12 +2333,12 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_COLL_SETATTR: { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); + coll_t cid; + t.get_cid(cid); + const char *attrname; + t.get_attrname(attrname); bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); + t.get_bl(bl); if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) { //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) { dout(7) << "apply_transaction fail on _collection_setattr" << endl; @@ -2309,8 +2349,10 @@ unsigned Ebofs::_apply_transaction(Transaction& t) case Transaction::OP_COLL_RMATTR: { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); + coll_t cid; + t.get_cid(cid); + const char *attrname; + t.get_attrname(attrname); if (_collection_rmattr(cid, attrname) < 0) { dout(7) << "apply_transaction fail on _collection_rmattr" << endl; r 
&= bit; @@ -2319,7 +2361,7 @@ unsigned Ebofs::_apply_transaction(Transaction& t) break; default: - cerr << "bad op " << *p << endl; + cerr << "bad op " << op << endl; assert(0); } diff --git a/trunk/ceph/ebofs/FileJournal.cc b/trunk/ceph/ebofs/FileJournal.cc index 40a73a442182d..b49ee82bb42b4 100644 --- a/trunk/ceph/ebofs/FileJournal.cc +++ b/trunk/ceph/ebofs/FileJournal.cc @@ -23,18 +23,18 @@ #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << ebofs->dev.get_device_name() << ").journal " -#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << ebofs->dev.get_device_name() << ").journal " +#define dout(x) if (x <= g_conf.debug_ebofs) cout << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal " +#define derr(x) if (x <= g_conf.debug_ebofs) cerr << g_clock.now() << " ebofs(" << ebofs->dev.get_device_name() << ").journal " int FileJournal::create() { - dout(1) << "create " << fn << endl; + dout(2) << "create " << fn << endl; // open/create fd = ::open(fn.c_str(), O_RDWR|O_SYNC); if (fd < 0) { - dout(1) << "create failed " << errno << " " << strerror(errno) << endl; + dout(2) << "create failed " << errno << " " << strerror(errno) << endl; return -errno; } assert(fd > 0); @@ -45,7 +45,7 @@ int FileJournal::create() // get size struct stat st; ::fstat(fd, &st); - dout(1) << "open " << fn << " " << st.st_size << " bytes" << endl; + dout(2) << "create " << fn << " " << st.st_size << " bytes" << endl; // write empty header memset(&header, 0, sizeof(header)); @@ -71,7 +71,7 @@ int FileJournal::open() assert(fd == 0); fd = ::open(fn.c_str(), O_RDWR|O_SYNC); if (fd < 0) { - dout(1) << "open failed " << errno << " " << strerror(errno) << endl; + dout(2) << "open failed " << errno << " " << strerror(errno) << endl; return -errno; } assert(fd > 0); @@ -82,7 +82,10 @@ int FileJournal::open() // read header? 
read_header(); - if (header.num > 0 && header.fsid == ebofs->get_fsid()) { + if (header.fsid != ebofs->get_fsid()) { + dout(2) << "open journal fsid doesn't match, invalid (someone else's?) journal" << endl; + } + else if (header.num > 0) { // valid header, pick an offset for (int i=0; iget_super_epoch()) { @@ -397,8 +400,7 @@ void FileJournal::make_writeable() bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch) { if (!read_pos) { - dout(1) << "read_entry -- not readable" << endl; - make_writeable(); + dout(2) << "read_entry -- not readable" << endl; return false; } @@ -420,8 +422,7 @@ bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch) ::lseek(fd, read_pos, SEEK_SET); ::read(fd, &h, sizeof(h)); if (!h.check_magic(read_pos, header.fsid)) { - dout(1) << "read_entry " << read_pos << " : bad header magic, end of journal" << endl; - make_writeable(); + dout(2) << "read_entry " << read_pos << " : bad header magic, end of journal" << endl; return false; } @@ -435,8 +436,7 @@ bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch) if (!f.check_magic(read_pos, header.fsid) || h.epoch != f.epoch || h.len != f.len) { - dout(1) << "read_entry " << read_pos << " : bad footer magic, partially entry, end of journal" << endl; - make_writeable(); + dout(2) << "read_entry " << read_pos << " : bad footer magic, partially entry, end of journal" << endl; return false; } diff --git a/trunk/ceph/ebofs/FileJournal.h b/trunk/ceph/ebofs/FileJournal.h index a26f75ec97ff6..7c9a67ccbd25f 100644 --- a/trunk/ceph/ebofs/FileJournal.h +++ b/trunk/ceph/ebofs/FileJournal.h @@ -106,8 +106,6 @@ private: void stop_writer(); void write_thread_entry(); - void make_writeable(); - class Writer : public Thread { FileJournal *journal; public: @@ -131,6 +129,8 @@ private: int open(); void close(); + void make_writeable(); + // writes bool submit_entry(bufferlist& e, Context *oncommit); // submit an item void commit_epoch_start(); // mark epoch boundary diff --git 
a/trunk/ceph/ebofs/Journal.h b/trunk/ceph/ebofs/Journal.h index fb1983c22eafc..9bab0b7f3c109 100644 --- a/trunk/ceph/ebofs/Journal.h +++ b/trunk/ceph/ebofs/Journal.h @@ -34,6 +34,7 @@ public: virtual void close() = 0; // writes + virtual void make_writeable() = 0; virtual bool submit_entry(bufferlist& e, Context *oncommit) = 0;// submit an item virtual void commit_epoch_start() = 0; // mark epoch boundary virtual void commit_epoch_finish() = 0; // mark prior epoch as committed (we can expire) diff --git a/trunk/ceph/fakefuse.cc b/trunk/ceph/fakefuse.cc index 66e5d550c1543..663a16b84d5c6 100644 --- a/trunk/ceph/fakefuse.cc +++ b/trunk/ceph/fakefuse.cc @@ -27,6 +27,7 @@ using namespace std; #include "osd/OSD.h" #include "client/Client.h" #include "client/fuse.h" +#include "client/fuse_ll.h" #include "common/Timer.h" @@ -59,6 +60,10 @@ public: int main(int argc, char **argv) { cerr << "fakefuse starting" << endl; + // stop on our own (by default) + g_conf.mon_stop_on_last_unmount = true; + g_conf.mon_stop_with_last_mds = true; + vector args; argv_to_vec(argc, argv, args); parse_config_options(args); @@ -82,7 +87,13 @@ int main(int argc, char **argv) { if (g_conf.clock_tare) g_clock.tare(); MonMap *monmap = new MonMap(g_conf.num_mon); - + entity_addr_t a; + a.nonce = getpid(); + for (int i=0; imon_inst[i] = entity_inst_t(MSG_ADDR_MON(i), a); // hack ; see FakeMessenger.cc + } + Monitor *mon[g_conf.num_mon]; for (int i=0; iinit(); - } - for (int i=0; iinit(); - } - - for (int i=0; iinit(); + for (int i=0; iinit(); - } // create client @@ -122,15 +129,19 @@ int main(int argc, char **argv) { // start up fuse // use my argc, argv (make sure you pass a mount point!) 
- cout << "starting fuse on pid " << getpid() << endl; client[i]->mount(); char *oldcwd = get_current_dir_name(); // note previous wd - ceph_fuse_main(client[i], argc, argv); + cout << "starting fuse on pid " << getpid() << endl; + if (g_conf.fuse_ll) + ceph_fuse_ll_main(client[i], argc, argv); + else + ceph_fuse_main(client[i], argc, argv); + cout << "fuse finished on pid " << getpid() << endl; ::chdir(oldcwd); // return to previous wd + free(oldcwd); client[i]->unmount(); - cout << "fuse finished on pid " << getpid() << endl; client[i]->shutdown(); } diff --git a/trunk/ceph/fakesyn.cc b/trunk/ceph/fakesyn.cc index e4256db646abb..84f06dde83e23 100644 --- a/trunk/ceph/fakesyn.cc +++ b/trunk/ceph/fakesyn.cc @@ -86,6 +86,7 @@ int main(int argc, char **argv) MonMap *monmap = new MonMap(g_conf.num_mon); entity_addr_t a; + a.nonce = getpid(); for (int i=0; imon_inst[i] = entity_inst_t(MSG_ADDR_MON(i), a); // hack ; see FakeMessenger.cc @@ -105,7 +106,7 @@ int main(int argc, char **argv) OSD *mdsosd[g_conf.num_mds]; for (int i=0; i #include @@ -37,6 +44,9 @@ extern Mutex bufferlock; extern long buffer_total_alloc; // + + + class buffer { private: @@ -59,11 +69,23 @@ private: public: char *data; unsigned len; +#ifdef BUFFER_USE_CCPP + mutable ost::AtomicCounter nref; // mutable for const-ness of operator<< +#else int nref; Mutex lock; // we'll make it non-recursive. +#endif - raw(unsigned l) : len(l), nref(0), lock(false) {} - raw(char *c, unsigned l) : data(c), len(l), nref(0), lock(false) {} + raw(unsigned l) : len(l), nref(0) +#ifndef BUFFER_USE_CCPP + , lock(false) +#endif + { } + raw(char *c, unsigned l) : data(c), len(l), nref(0) +#ifndef BUFFER_USE_CCPP + , lock(false) +#endif + { } virtual ~raw() {}; // no copying. 
@@ -187,7 +209,8 @@ public: static raw* create_page_aligned(unsigned len) { #ifndef __CYGWIN__ - return new raw_mmap_pages(len); + //return new raw_mmap_pages(len); + return new raw_posix_aligned(len); #else return new raw_hack_aligned(len); #endif @@ -216,24 +239,36 @@ public: } ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) { if (_raw) { - _raw->lock.Lock(); +#ifdef BUFFER_USE_CCPP ++_raw->nref; - _raw->lock.Unlock(); +#else + _raw->lock.Lock(); + ++_raw->nref; + _raw->lock.Unlock(); +#endif } } ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) { assert(o+l <= p._len); assert(_raw); +#ifdef BUFFER_USE_CCPP + ++_raw->nref; +#else _raw->lock.Lock(); ++_raw->nref; _raw->lock.Unlock(); +#endif } ptr& operator= (const ptr& p) { // be careful -- we need to properly handle self-assignment. if (p._raw) { +#ifdef BUFFER_USE_CCPP + ++p._raw->nref; // inc new +#else p._raw->lock.Lock(); ++p._raw->nref; // inc new p._raw->lock.Unlock(); +#endif } release(); // dec (+ dealloc) old (if any) _raw = p._raw; // change my ref @@ -259,13 +294,20 @@ public: void release() { if (_raw) { +#ifndef BUFFER_USE_CCPP _raw->lock.Lock(); - if (--_raw->nref == 0) { - //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl; - _raw->lock.Unlock(); - delete _raw; // dealloc old (if any) - } else - _raw->lock.Unlock(); +#endif + if (--_raw->nref == 0) { + //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl; +#ifndef BUFFER_USE_CCPP + _raw->lock.Unlock(); +#endif + delete _raw; // dealloc old (if any) + } else { +#ifndef BUFFER_USE_CCPP + _raw->lock.Unlock(); +#endif + } _raw = 0; } } @@ -380,7 +422,7 @@ public: } unsigned length() const { -#if 1 +#if 0 // DEBUG: verify _len unsigned len = 0; for (std::list::const_iterator it = _buffers.begin(); @@ -538,8 +580,8 @@ public: void append(const char *data, unsigned len) { while (len > 0) { // put what we can into the existing append_buffer. 
- if (append_buffer.unused_tail_length() > 0) { - unsigned gap = append_buffer.unused_tail_length(); + unsigned gap = append_buffer.unused_tail_length(); + if (gap > 0) { if (gap > len) gap = len; append_buffer.append(data, gap); append(append_buffer, append_buffer.end() - gap, gap); // add segment to the list @@ -796,10 +838,22 @@ inline void _decoderaw(T& t, bufferlist& bl, int& off) template inline void _encode(const std::list& ls, bufferlist& bl) { - uint32_t n = ls.size(); - _encoderaw(n, bl); - for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) - _encode(*p, bl); + // should i pre- or post- count? + if (!ls.empty()) { + unsigned pos = bl.length(); + uint32_t n = 0; + _encoderaw(n, bl); + for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) { + n++; + _encode(*p, bl); + } + bl.copy_in(pos, sizeof(n), (char*)&n); + } else { + uint32_t n = ls.size(); // FIXME: this is slow on a list. + _encoderaw(n, bl); + for (typename std::list::const_iterator p = ls.begin(); p != ls.end(); ++p) + _encode(*p, bl); + } } template inline void _decode(std::list& ls, bufferlist& bl, int& off) @@ -977,6 +1031,12 @@ inline void _encode(const bufferlist& s, bufferlist& bl) _encoderaw(len, bl); bl.append(s); } +inline void _encode_destructively(bufferlist& s, bufferlist& bl) +{ + uint32_t len = s.length(); + _encoderaw(len, bl); + bl.claim_append(s); +} inline void _decode(bufferlist& s, bufferlist& bl, int& off) { uint32_t len; diff --git a/trunk/ceph/include/frag.h b/trunk/ceph/include/frag.h index ede296737ca37..db8fec1ebfa8c 100644 --- a/trunk/ceph/include/frag.h +++ b/trunk/ceph/include/frag.h @@ -18,6 +18,7 @@ #include #include #include +#include #include "buffer.h" /* @@ -60,6 +61,9 @@ * TODO: * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) * iteration efficient (see, e.g., try_assimilate_children() + * - rework frag_t so that we mask the left-most (most significant) bits instead of + * the 
right-most (least significant) bits. just because it's more intutive, and + * matches the network/netmask concept. */ typedef uint32_t _frag_t; @@ -74,7 +78,7 @@ class frag_t { public: frag_t() : _enc(0) { } frag_t(unsigned v, unsigned b) : _enc((b << 24) + - (v & (0xffffffff >> b))) { } + (v & (0xffffffffULL >> (32-b)))) { } frag_t(_frag_t e) : _enc(e) { } // constructors @@ -83,7 +87,8 @@ class frag_t { // accessors unsigned value() const { return _enc & 0xffffff; } unsigned bits() const { return _enc >> 24; } - unsigned mask() const { return 0xffffffff >> (32-bits()); } + unsigned mask() const { return 0xffffffffULL >> (32-bits()); } + operator _frag_t() const { return _enc; } // tests @@ -103,16 +108,17 @@ class frag_t { } // splitting - void split(int nb, list& fragments) const { + void split(int nb, std::list& fragments) const { assert(nb > 0); - unsigned nway = 1 << (nb-1); + unsigned nway = 1 << nb; for (unsigned i=0; i 0); + assert(!is_root()); return frag_t(_enc ^ (1 << (bits()-1))); } bool is_left() const { @@ -131,11 +137,23 @@ class frag_t { frag_t right_child() const { return frag_t(value() | (1< ls; - get_leaves_under_split(x, ls); + std::list ls; + get_leaves_under(x, ls); + //cout << "is_leaf(" << x << ") -> " << ls << endl; if (!ls.empty() && - ls.front() == x) + ls.front() == x && + ls.size() == 1) return true; return false; } @@ -181,15 +201,15 @@ class fragtree_t { /** * get_leaves -- list all leaves */ - void get_leaves(list& ls) const { + void get_leaves(std::list& ls) const { return get_leaves_under_split(frag_t(), ls); } /** * get_leaves_under_split -- list all leaves under a known split point (or root) */ - void get_leaves_under_split(frag_t under, list& ls) const { - list q; + void get_leaves_under_split(frag_t under, std::list& ls) const { + std::list q; q.push_back(under); while (!q.empty()) { frag_t t = q.front(); @@ -238,8 +258,8 @@ class fragtree_t { frag_t branch = get_branch(x); int nb = get_split(branch); if (nb > 0 && // if 
branch is a split, and - branch.bits() + nb <= x.bits()) // one of the children is or contains x - return frag_t(branch.bits()+nb, x.value()); // then return that child (it's a leaf) + branch.bits() + nb <= x.bits()) // one of the children is or contains x + return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) else return branch; } @@ -247,8 +267,8 @@ class fragtree_t { /** * get_leaves_under(x, ls) -- search for any leaves fully contained by x */ - void get_leaves_under(frag_t x, list& ls) const { - list q; + void get_leaves_under(frag_t x, std::list& ls) const { + std::list q; q.push_back(get_branch(x)); while (!q.empty()) { frag_t t = q.front(); @@ -268,7 +288,7 @@ class fragtree_t { * contains(fg) -- does fragtree contain the specific frag @x */ bool contains(frag_t x) const { - list q; + std::list q; q.push_back(get_branch(x)); while (!q.empty()) { frag_t t = q.front(); @@ -300,10 +320,11 @@ class fragtree_t { if (nb == 0) return t; // done. // pick appropriate child fragment. - unsigned nway = 1 << (nb-1); + unsigned nway = 1 << nb; unsigned i; for (i=0; i children; + std::list children; x.split(nb, children); int childbits = 0; - for (list::iterator p = children.begin(); + for (std::list::iterator p = children.begin(); p != children.end(); ++p) { int cb = get_split(*p); @@ -351,27 +372,34 @@ class fragtree_t { childbits = cb; } // all children are split with childbits! - for (list::iterator p = children.begin(); + for (std::list::iterator p = children.begin(); p != children.end(); ++p) _splits.erase(*p); _splits[x] += childbits; } - void force_to_leaf(frag_t x) { - assert(!is_leaf(x)); + bool force_to_leaf(frag_t x) { + if (is_leaf(x)) + return false; + + cout << "force_to_leaf " << x << " on " << _splits << endl; frag_t parent = get_branch_or_leaf(x); assert(parent.bits() <= x.bits()); + cout << "parent is " << parent << endl; // do we need to split from parent to x? 
if (parent.bits() < x.bits()) { int spread = x.bits() - parent.bits(); int nb = get_split(parent); + cout << "spread " << spread << ", parent splits by " << nb << endl; if (nb == 0) { // easy: split parent (a leaf) by the difference + cout << "splitting parent " << parent << " by spread " << spread << endl; split(parent, spread); - return; + assert(is_leaf(x)); + return true; } assert(nb > spread); @@ -379,27 +407,34 @@ class fragtree_t { merge(parent, nb); split(parent, spread); - list subs; + std::list subs; parent.split(spread, subs); - for (list::iterator p = subs.begin(); + for (std::list::iterator p = subs.begin(); p != subs.end(); - ++p) + ++p) { + cout << "splitting intermediate " << *p << " by " << (nb-spread) << endl; split(*p, nb - spread); + } } // x is now a leaf or split. // hoover up any children. - list q; + std::list q; q.push_back(x); while (!q.empty()) { frag_t t = q.front(); q.pop_front(); int nb = get_split(t); if (nb) { + cout << "merging child " << t << " by " << nb << endl; merge(t, nb); // merge this point, and t.split(nb, q); // queue up children } - } + } + + cout << "force_to_leaf done" << endl; + assert(is_leaf(x)); + return true; } // verify that we describe a legal partition of the namespace. 
@@ -428,9 +463,9 @@ class fragtree_t { ::_decode(_splits, bl, off); } - void print(ostream& out) { + void print(std::ostream& out) { out << "fragtree_t("; - list q; + std::list q; q.push_back(frag_t()); while (!q.empty()) { frag_t t = q.front(); @@ -452,22 +487,29 @@ class fragtree_t { } }; -inline ostream& operator<<(ostream& out, fragtree_t& ft) +inline std::ostream& operator<<(std::ostream& out, fragtree_t& ft) { out << "fragtree_t("; - list q; - q.push_back(frag_t()); - while (!q.empty()) { - frag_t t = q.front(); - q.pop_front(); - int nb = ft.get_split(t); - if (nb) { - if (t.bits()) out << ' '; - out << t << '%' << nb; - t.split(nb, q); // queue up children + if (0) { + std::list q; + q.push_back(frag_t()); + while (!q.empty()) { + frag_t t = q.front(); + q.pop_front(); + int nb = ft.get_split(t); + if (nb) { + if (t.bits()) out << ' '; + out << t << '%' << nb; + t.split(nb, q); // queue up children + } } } + if (1) { + std::list leaves; + ft.get_leaves(leaves); + out << leaves; + } return out << ")"; } @@ -476,12 +518,12 @@ inline ostream& operator<<(ostream& out, fragtree_t& ft) * fragset_t -- a set of fragments */ class fragset_t { - set _set; + std::set _set; public: - set &get() { return _set; } - set::iterator begin() { return _set.begin(); } - set::iterator end() { return _set.end(); } + std::set &get() { return _set; } + std::set::iterator begin() { return _set.begin(); } + std::set::iterator end() { return _set.end(); } bool empty() const { return _set.empty(); } @@ -501,9 +543,10 @@ public: void simplify() { while (1) { bool clean = true; - set::iterator p = _set.begin(); + std::set::iterator p = _set.begin(); while (p != _set.end()) { - if (_set.count(p->get_sibling())) { + if (!p->is_root() && + _set.count(p->get_sibling())) { _set.erase(p->get_sibling()); _set.insert(p->parent()); _set.erase(p++); @@ -518,9 +561,9 @@ public: } }; -inline ostream& operator<<(ostream& out, fragset_t& fs) +inline std::ostream& operator<<(std::ostream& out, 
fragset_t& fs) { - return out << "fragset_t(" << fs.get() << ")" << endl; + return out << "fragset_t(" << fs.get() << ")"; } #endif diff --git a/trunk/ceph/include/lru.h b/trunk/ceph/include/lru.h index 225204f151a0a..cb8d08c58ae9c 100644 --- a/trunk/ceph/include/lru.h +++ b/trunk/ceph/include/lru.h @@ -71,6 +71,12 @@ class LRUList { return tail; } + void clear() { + while (len > 0) { + remove(get_head()); + } + } + void insert_head(LRUObject *o) { o->lru_next = head; o->lru_prev = NULL; @@ -129,7 +135,7 @@ class LRU { LRU(int max = 0) { lru_num = 0; lru_num_pinned = 0; - lru_midpoint = .9; + lru_midpoint = .6; lru_max = max; } @@ -143,6 +149,11 @@ class LRU { void lru_set_max(uint32_t m) { lru_max = m; } void lru_set_midpoint(float f) { lru_midpoint = f; } + void lru_clear() { + lru_top.clear(); + lru_bot.clear(); + lru_pintail.clear(); + } // insert at top of lru void lru_insert_top(LRUObject *o) { diff --git a/trunk/ceph/include/types.h b/trunk/ceph/include/types.h index d13937b39da3d..42a0416211c97 100644 --- a/trunk/ceph/include/types.h +++ b/trunk/ceph/include/types.h @@ -140,9 +140,9 @@ typedef uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 struct FileLayout { // -- file -> object mapping -- - int stripe_unit; // stripe unit, in bytes - int stripe_count; // over this many objects - int object_size; // until objects are this big, then move to new objects + int32_t stripe_unit; // stripe unit, in bytes + int32_t stripe_count; // over this many objects + int32_t object_size; // until objects are this big, then move to new objects int stripe_width() { return stripe_unit * stripe_count; } @@ -152,10 +152,10 @@ struct FileLayout { // -- object -> pg layout -- char pg_type; // pg type (replicated, raid, etc.) (see pg_t::TYPE_*) char pg_size; // pg size (num replicas, or raid4 stripe width) - int preferred; // preferred primary osd? + int32_t preferred; // preferred primary osd? 
// -- pg -> disk layout -- - int object_stripe_unit; // for per-object raid + int32_t object_stripe_unit; // for per-object raid FileLayout() { } FileLayout(int su, int sc, int os, int pgt, int pgs, int o=-1) : @@ -210,23 +210,39 @@ namespace __gnu_cxx { #define FILE_MODE_RW (1|2) #define FILE_MODE_LAZY 4 -#define INODE_MASK_BASE 1 // ino, layout, symlink value -#define INODE_MASK_AUTH 2 // uid, gid, mode -#define INODE_MASK_LINK 4 // nlink, anchored -#define INODE_MASK_FILE 8 // mtime, size. -// atime? - -#define INODE_MASK_ALL_STAT (INODE_MASK_BASE|INODE_MASK_AUTH|INODE_MASK_LINK|INODE_MASK_FILE) - -#define INODE_MASK_SIZE INODE_MASK_FILE // size, blksize, blocks -#define INODE_MASK_MTIME INODE_MASK_FILE // mtime -#define INODE_MASK_ATIME INODE_MASK_FILE // atime -#define INODE_MASK_CTIME (INODE_MASK_FILE|INODE_MASK_AUTH|INODE_MASK_LINK) // ctime +/** stat masks + */ +#define STAT_MASK_INO 1 // inode nmber +#define STAT_MASK_TYPE 2 // file type bits of the mode +#define STAT_MASK_BASE 4 // layout, symlink value +#define STAT_MASK_AUTH 8 // uid, gid, mode +#define STAT_MASK_LINK 16 // nlink, anchored +#define STAT_MASK_FILE 32 // mtime, size. + +#define STAT_MASK_ALL 63 + +#define STAT_MASK_SIZE STAT_MASK_FILE // size, blksize, blocks +#define STAT_MASK_MTIME STAT_MASK_FILE // mtime +#define STAT_MASK_ATIME STAT_MASK_FILE // atime +#define STAT_MASK_CTIME (STAT_MASK_FILE|STAT_MASK_AUTH|STAT_MASK_LINK) // ctime + +inline int DT_TO_MODE(int dt) { + return dt << 12; + /* + switch (dt) { + case DT_REG: return INODE_MODE_FILE; + case DT_DIR: return INODE_MODE_DIR; + case DT_LNK: return INODE_MODE_SYMLINK; + default: assert(0); return 0; + } + */ +} struct inode_t { // base (immutable) inodeno_t ino; FileLayout layout; // ?immutable? + dev_t rdev; // if special file // affected by any inode change... utime_t ctime; // inode change time @@ -237,7 +253,7 @@ struct inode_t { gid_t gid; // nlink - int nlink; + int32_t nlink; bool anchored; // auth only? 
// file (data access) @@ -246,15 +262,31 @@ struct inode_t { utime_t atime; // file data access time. // special stuff - int mask; // used for client stat. hack. - version_t version; // auth only - version_t file_data_version; // auth only + version_t version; // auth only + version_t file_data_version; // auth only + // file type bool is_symlink() { return (mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK; } bool is_dir() { return (mode & INODE_TYPE_MASK) == INODE_MODE_DIR; } bool is_file() { return (mode & INODE_TYPE_MASK) == INODE_MODE_FILE; } + + // corresponding d_types + static const unsigned char DT_REG = 8; + static const unsigned char DT_DIR = 4; + static const unsigned char DT_LNK = 10; }; +inline unsigned char MODE_TO_DT(int mode) { + return mode >> 12; + /* + if (S_ISREG(mode)) return inode_t::DT_REG; + if (S_ISLNK(mode)) return inode_t::DT_LNK; + if (S_ISDIR(mode)) return inode_t::DT_DIR; + assert(0); + return 0; + */ +} + diff --git a/trunk/ceph/include/utime.h b/trunk/ceph/include/utime.h index 03fed6744f4e2..7fef5a7f930d2 100644 --- a/trunk/ceph/include/utime.h +++ b/trunk/ceph/include/utime.h @@ -63,6 +63,8 @@ class utime_t { // is just casting it to long& OK? long& usec_ref() { return (long&) tv.tv_usec; } + struct timeval& tv_ref() { return tv; } + // cast to double operator double() { return (double)sec() + ((double)usec() / 1000000.0L); @@ -82,7 +84,7 @@ inline utime_t& operator+=(utime_t& l, const utime_t& r) { } inline utime_t& operator+=(utime_t& l, double f) { double fs = trunc(f); - double us = (f - fs) / (double)1000000.0; + double us = (f - fs) * (double)1000000.0; l.sec_ref() += (long)fs; l.usec_ref() += (long)us; l.normalize(); @@ -131,7 +133,7 @@ inline std::ostream& operator<<(std::ostream& out, const utime_t& t) time_t tt = t.sec(); localtime_r(&tt, &bdt); out << std::setw(2) << (bdt.tm_year-100) // 2007 -> '07' - << std::setw(2) << bdt.tm_mon + << std::setw(2) << (bdt.tm_mon+1) << std::setw(2) << bdt.tm_mday << "." 
<< std::setw(2) << bdt.tm_hour diff --git a/trunk/ceph/mds/Anchor.h b/trunk/ceph/mds/Anchor.h index 9ead7bb599c7f..748091306a44d 100644 --- a/trunk/ceph/mds/Anchor.h +++ b/trunk/ceph/mds/Anchor.h @@ -57,7 +57,7 @@ inline const char* get_anchor_opname(int o) { case ANCHOR_OP_COMMIT: return "commit"; case ANCHOR_OP_ACK: return "ack"; case ANCHOR_OP_ROLLBACK: return "rollback"; - default: assert(0); + default: assert(0); return 0; } } diff --git a/trunk/ceph/mds/CDentry.cc b/trunk/ceph/mds/CDentry.cc index 2db36a7a187d8..4e499c484e06f 100644 --- a/trunk/ceph/mds/CDentry.cc +++ b/trunk/ceph/mds/CDentry.cc @@ -27,7 +27,7 @@ #include #undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") " +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->dirfrag() << " " << name << ") " ostream& CDentry::print_db_line_prefix(ostream& out) { @@ -43,6 +43,7 @@ ostream& operator<<(ostream& out, CDentry& dn) dn.make_path(path); out << "[dentry " << path; + if (dn.is_auth()) { out << " auth"; if (dn.is_replicated()) @@ -54,7 +55,16 @@ ostream& operator<<(ostream& out, CDentry& dn) } if (dn.is_null()) out << " NULL"; - if (dn.is_remote()) out << " REMOTE"; + if (dn.is_remote()) { + out << " REMOTE("; + switch (dn.get_remote_d_type()) { + case inode_t::DT_REG: out << "reg"; break; + case inode_t::DT_DIR: out << "dir"; break; + case inode_t::DT_LNK: out << "lnk"; break; + default: assert(0); + } + out << ")"; + } out << " " << dn.lock; @@ -139,7 +149,7 @@ void CDentry::mark_dirty(version_t pv) dout(10) << " mark_dirty " << *this << endl; // i now live in this new dir version - assert(pv == projected_version); + assert(pv <= projected_version); version = pv; _mark_dirty(); @@ -248,14 +258,24 @@ bool CDentry::can_auth_pin() void CDentry::auth_pin() { - 
assert(dir); - dir->auth_pin(); + if (auth_pins == 0) + get(PIN_AUTHPIN); + auth_pins++; + dir->adjust_nested_auth_pins(1); } void CDentry::auth_unpin() { - assert(dir); - dir->auth_unpin(); + auth_pins--; + if (auth_pins == 0) + put(PIN_AUTHPIN); + dir->adjust_nested_auth_pins(-1); +} + +void CDentry::adjust_nested_auth_pins(int by) +{ + nested_auth_pins += by; + dir->adjust_nested_auth_pins(by); } diff --git a/trunk/ceph/mds/CDentry.h b/trunk/ceph/mds/CDentry.h index 29e4a3314411e..d120a1a07ec9f 100644 --- a/trunk/ceph/mds/CDentry.h +++ b/trunk/ceph/mds/CDentry.h @@ -47,12 +47,15 @@ class CDentry : public MDSCacheObject, public LRUObject { public: // -- state -- static const int STATE_NEW = 1; + static const int STATE_FRAGMENTING = 2; // -- pins -- - static const int PIN_INODEPIN = 1; // linked inode is pinned + static const int PIN_INODEPIN = 1; // linked inode is pinned + static const int PIN_FRAGMENTING = -2; // containing dir is refragmenting const char *pin_name(int p) { switch (p) { case PIN_INODEPIN: return "inodepin"; + case PIN_FRAGMENTING: return "fragmenting"; default: return generic_pin_name(p); } }; @@ -69,15 +72,20 @@ class CDentry : public MDSCacheObject, public LRUObject { } protected: - string name; - CInode *inode; - CDir *dir; + string name; - inodeno_t remote_ino; // if remote dentry + inodeno_t remote_ino; // if remote dentry + unsigned char remote_d_type; - version_t version; // dir version when last touched. - version_t projected_version; // what it will be when i unlock/commit. + CInode *inode; // linked inode (if any) + CDir *dir; // containing dirfrag + version_t version; // dir version when last touched. + version_t projected_version; // what it will be when i unlock/commit. 
+ + off_t dir_offset; + + int auth_pins, nested_auth_pins; friend class Migrator; friend class Locker; @@ -98,37 +106,44 @@ public: public: // cons CDentry() : - inode(0), - dir(0), - remote_ino(0), - version(0), - projected_version(0), + remote_ino(0), remote_d_type(0), + inode(0), dir(0), + version(0), projected_version(0), + dir_offset(0), + auth_pins(0), nested_auth_pins(0), lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, inodeno_t ino, CInode *in=0) : + CDentry(const string& n, CInode *in) : name(n), - inode(in), - dir(0), - remote_ino(ino), - version(0), - projected_version(0), + remote_ino(0), remote_d_type(0), + inode(in), dir(0), + version(0), projected_version(0), + dir_offset(0), + auth_pins(0), nested_auth_pins(0), lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } - CDentry(const string& n, CInode *in) : + CDentry(const string& n, inodeno_t ino, unsigned char dt, CInode *in=0) : name(n), - inode(in), - dir(0), - remote_ino(0), - version(0), - projected_version(0), + remote_ino(ino), remote_d_type(dt), + inode(in), dir(0), + version(0), projected_version(0), + dir_offset(0), + auth_pins(0), nested_auth_pins(0), lock(this, LOCK_OTYPE_DN, WAIT_LOCK_OFFSET) { } CInode *get_inode() const { return inode; } CDir *get_dir() const { return dir; } const string& get_name() const { return name; } inodeno_t get_ino(); - inodeno_t get_remote_ino() { return remote_ino; } - void set_remote_ino(inodeno_t ino) { remote_ino = ino; } + off_t get_dir_offset() { return dir_offset; } + void set_dir_offset(off_t o) { dir_offset = o; } + void clear_dir_offset() { dir_offset = 0; } + inodeno_t get_remote_ino() { return remote_ino; } + unsigned char get_remote_d_type() { return remote_d_type; } + void set_remote(inodeno_t ino, unsigned char d_type) { + remote_ino = ino; + remote_d_type = d_type; + } // ref counts: pin ourselves in the LRU when we're pinned. 
void first_get() { @@ -142,6 +157,7 @@ public: bool can_auth_pin(); void auth_pin(); void auth_unpin(); + void adjust_nested_auth_pins(int by); // dentry type is primary || remote || null @@ -248,23 +264,27 @@ class CDentryDiscover { string dname; int replica_nonce; int lockstate; - + off_t dir_offset; inodeno_t remote_ino; + unsigned char remote_d_type; public: CDentryDiscover() {} CDentryDiscover(CDentry *dn, int nonce) : dname(dn->get_name()), replica_nonce(nonce), lockstate(dn->lock.get_replica_state()), - remote_ino(dn->get_remote_ino()) { } + dir_offset(dn->get_dir_offset()), + remote_ino(dn->get_remote_ino()), remote_d_type(dn->get_remote_d_type()) { } string& get_dname() { return dname; } int get_nonce() { return replica_nonce; } bool is_remote() { return remote_ino ? true:false; } inodeno_t get_remote_ino() { return remote_ino; } + unsigned char get_remote_d_type() { return remote_d_type; } void update_dentry(CDentry *dn) { - dn->set_replica_nonce( replica_nonce ); + dn->set_dir_offset(dir_offset); + dn->set_replica_nonce(replica_nonce); } void init_dentry_lock(CDentry *dn) { dn->lock.set_state( lockstate ); @@ -272,14 +292,18 @@ public: void _encode(bufferlist& bl) { ::_encode(dname, bl); + ::_encode(dir_offset, bl); ::_encode(remote_ino, bl); + ::_encode(remote_d_type, bl); ::_encode(replica_nonce, bl); ::_encode(lockstate, bl); } void _decode(bufferlist& bl, int& off) { ::_decode(dname, bl, off); + ::_decode(dir_offset, bl, off); ::_decode(remote_ino, bl, off); + ::_decode(remote_d_type, bl, off); ::_decode(replica_nonce, bl, off); ::_decode(lockstate, bl, off); } diff --git a/trunk/ceph/mds/CDir.cc b/trunk/ceph/mds/CDir.cc index 72ab9d7bcf74f..e91e1b48b84db 100644 --- a/trunk/ceph/mds/CDir.cc +++ b/trunk/ceph/mds/CDir.cc @@ -13,6 +13,7 @@ */ +#include "include/types.h" #include "CDir.h" #include "CDentry.h" @@ -39,9 +40,7 @@ ostream& operator<<(ostream& out, CDir& dir) { string path; dir.get_inode()->make_path(path); - out << "[dir " << dir.ino(); - if 
(!dir.frag.is_root()) out << "%" << dir.frag; - out << " " << path << "/"; + out << "[dir " << dir.dirfrag() << " " << path << "/"; if (dir.is_auth()) { out << " auth"; if (dir.is_replicated()) @@ -101,13 +100,13 @@ void CDir::print(ostream& out) #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << get_inode()->inode.ino << ") " +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") " //#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache." << *this << " " ostream& CDir::print_db_line_prefix(ostream& out) { - return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << get_inode()->inode.ino << ") "; + return out << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << this->dirfrag() << ") "; } @@ -129,7 +128,7 @@ CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) projected_version = version = 0; committing_version = 0; - committed_version = 0; + committed_version_equivalent = committed_version = 0; // dir_auth dir_auth = CDIR_AUTH_DEFAULT; @@ -154,13 +153,13 @@ CDir::CDir(CInode *in, frag_t fg, MDCache *mdcache, bool auth) * linking fun */ -CDentry* CDir::add_dentry( const string& dname, inodeno_t ino) +CDentry* CDir::add_null_dentry(const string& dname) { // foreign assert(lookup(dname) == 0); // create dentry - CDentry* dn = new CDentry(dname, ino); + CDentry* dn = new CDentry(dname, 0); if (is_auth()) dn->state_set(CDentry::STATE_AUTH); cache->lru.lru_insert_mid(dn); @@ -173,9 +172,9 @@ CDentry* CDir::add_dentry( const string& dname, inodeno_t ino) //assert(null_items.count(dn->name) == 0); items[dn->name] = dn; - nitems++; + nnull++; - dout(12) << "add_dentry " << *dn << endl; + dout(12) << 
"add_null_dentry " << *dn << endl; // pin? if (nnull + nitems == 1) get(PIN_CHILD); @@ -186,7 +185,7 @@ CDentry* CDir::add_dentry( const string& dname, inodeno_t ino) } -CDentry* CDir::add_dentry( const string& dname, CInode *in) +CDentry* CDir::add_primary_dentry(const string& dname, CInode *in) { // primary assert(lookup(dname) == 0); @@ -205,16 +204,40 @@ CDentry* CDir::add_dentry( const string& dname, CInode *in) //assert(null_items.count(dn->name) == 0); items[dn->name] = dn; + link_inode_work( dn, in ); - if (in) { - link_inode_work( dn, in ); - } else { - assert(dn->inode == 0); - //null_items[dn->name] = dn; - nnull++; - } + dout(12) << "add_primary_dentry " << *dn << endl; + + // pin? + if (nnull + nitems == 1) get(PIN_CHILD); + + assert(nnull + nitems == items.size()); + //assert(nnull == null_items.size()); + return dn; +} + +CDentry* CDir::add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type) +{ + // foreign + assert(lookup(dname) == 0); + + // create dentry + CDentry* dn = new CDentry(dname, ino, d_type); + if (is_auth()) + dn->state_set(CDentry::STATE_AUTH); + cache->lru.lru_insert_mid(dn); + + dn->dir = this; + dn->version = projected_version; + + // add to dir + assert(items.count(dn->name) == 0); + //assert(null_items.count(dn->name) == 0); + + items[dn->name] = dn; + nitems++; - dout(12) << "add_dentry " << *dn << endl; + dout(12) << "add_remote_dentry " << *dn << endl; // pin? 
if (nnull + nitems == 1) get(PIN_CHILD); @@ -258,13 +281,14 @@ void CDir::remove_dentry(CDentry *dn) //assert(nnull == null_items.size()); } -void CDir::link_inode( CDentry *dn, inodeno_t ino) +void CDir::link_remote_inode(CDentry *dn, inodeno_t ino, unsigned char d_type) { dout(12) << "link_inode " << *dn << " remote " << ino << endl; assert(dn->is_null()); - dn->set_remote_ino(ino); + dn->set_remote(ino, d_type); nitems++; + dn->clear_dir_offset(); //assert(null_items.count(dn->name) == 1); //null_items.erase(dn->name); @@ -272,12 +296,13 @@ void CDir::link_inode( CDentry *dn, inodeno_t ino) assert(nnull + nitems == items.size()); } -void CDir::link_inode( CDentry *dn, CInode *in ) +void CDir::link_primary_inode(CDentry *dn, CInode *in) { - dout(12) << "link_inode " << *dn << " " << *in << endl; + dout(12) << "link_primary_inode " << *dn << " " << *in << endl; assert(!dn->is_remote()); link_inode_work(dn,in); + dn->clear_dir_offset(); // remove from null list //assert(null_items.count(dn->name) == 1); @@ -288,7 +313,7 @@ void CDir::link_inode( CDentry *dn, CInode *in ) //assert(nnull == null_items.size()); } -void CDir::link_inode_work( CDentry *dn, CInode *in ) +void CDir::link_inode_work( CDentry *dn, CInode *in) { dn->inode = in; in->set_primary_parent(dn); @@ -304,7 +329,7 @@ void CDir::link_inode_work( CDentry *dn, CInode *in ) // adjust auth pin count if (in->auth_pins + in->nested_auth_pins) - adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins ); + dn->adjust_nested_auth_pins(in->auth_pins + in->nested_auth_pins); } void CDir::unlink_inode( CDentry *dn ) @@ -315,6 +340,7 @@ void CDir::unlink_inode( CDentry *dn ) dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl; } + dn->clear_dir_offset(); unlink_inode_work(dn); // add to null list @@ -329,9 +355,15 @@ void CDir::unlink_inode( CDentry *dn ) void CDir::try_remove_unlinked_dn(CDentry *dn) { assert(dn->dir == this); + assert(dn->is_null()); + assert(dn->is_dirty()); - if (dn->is_new() 
&& dn->is_dirty() && - dn->get_num_ref() == 1) { + // no pins (besides dirty)? + if (dn->get_num_ref() != 1) + return; + + // was the dn new? or is the dir complete (i.e. we don't need negatives)? + if (dn->is_new() || is_complete()) { dout(10) << "try_remove_unlinked_dn " << *dn << " in " << *this << endl; dn->mark_clean(); remove_dentry(dn); @@ -352,13 +384,13 @@ void CDir::try_remove_unlinked_dn(CDentry *dn) void CDir::unlink_inode_work( CDentry *dn ) { CInode *in = dn->inode; - + if (dn->is_remote()) { // remote if (in) dn->unlink_remote(); - dn->set_remote_ino(0); + dn->set_remote(0, 0); } else { // primary assert(dn->is_primary()); @@ -369,7 +401,7 @@ void CDir::unlink_inode_work( CDentry *dn ) // unlink auth_pin count if (in->auth_pins + in->nested_auth_pins) - adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) ); + dn->adjust_nested_auth_pins(0 - (in->auth_pins + in->nested_auth_pins)); // detach inode in->remove_primary_parent(dn); @@ -402,19 +434,29 @@ void CDir::remove_null_dentries() { } +/** + * steal_dentry -- semi-violently move a dentry from one CDir to another + * (*) violently, in that nitems, most pins, etc. are not correctly maintained + * on the old CDir corpse; must call purge_stolen() when finished. + */ void CDir::steal_dentry(CDentry *dn) { dout(15) << "steal_dentry " << *dn << endl; items[dn->name] = dn; - if (nitems == 0) + dn->dir->items.erase(dn->name); + if (dn->dir->items.empty()) + dn->dir->put(PIN_CHILD); + + if (nnull + nitems == 0) get(PIN_CHILD); - nitems++; if (dn->is_null()) nnull++; - if (dn->is_primary()) - nested_auth_pins += dn->inode->auth_pins + dn->inode->nested_auth_pins; + else + nitems++; + + nested_auth_pins += dn->auth_pins + dn->nested_auth_pins; if (dn->is_dirty()) num_dirty++; @@ -423,40 +465,44 @@ void CDir::steal_dentry(CDentry *dn) void CDir::purge_stolen(list& waiters) { - if (!items.empty()) { - put(PIN_CHILD); - items.clear(); + // take waiters _before_ unfreeze... 
+ take_waiting(WAIT_ANY, waiters); + + if (is_auth()) { + assert(is_frozen_dir()); + unfreeze_dir(); } - if (is_dirty()) mark_clean(); + nnull = nitems = 0; + if (is_auth()) + clear_replica_map(); + if (is_dirty()) mark_clean(); if (state_test(STATE_EXPORT)) put(PIN_EXPORT); if (state_test(STATE_IMPORTBOUND)) put(PIN_IMPORTBOUND); if (state_test(STATE_EXPORTBOUND)) put(PIN_EXPORTBOUND); - if (state_test(STATE_FROZENDIR)) put(PIN_FROZEN); if (auth_pins > 0) put(PIN_AUTHPIN); - take_waiting(WAIT_ANY, waiters); - - assert(get_num_ref() == 0); + assert(get_num_ref() == (state_test(STATE_STICKY) ? 1:0)); } void CDir::init_fragment_pins() { + if (!replica_map.empty()) get(PIN_REPLICATED); if (state_test(STATE_DIRTY)) get(PIN_DIRTY); - if (state_test(STATE_FROZENDIR)) get(PIN_FROZEN); if (state_test(STATE_EXPORT)) get(PIN_EXPORT); if (state_test(STATE_EXPORTBOUND)) get(PIN_EXPORTBOUND); if (state_test(STATE_IMPORTBOUND)) get(PIN_IMPORTBOUND); - if (state_test(STATE_STICKY)) get(PIN_STICKY); } void CDir::split(int bits, list& subs, list& waiters) { - dout(10) << "split by " << bits << " bits" << endl; - - assert(is_complete()); + dout(10) << "split by " << bits << " bits on " << *this << endl; + + if (cache->mds->logger) cache->mds->logger->inc("dir_sp"); + + assert(is_complete() || !is_auth()); list frags; frag.split(bits, frags); @@ -464,17 +510,20 @@ void CDir::split(int bits, list& subs, list& waiters) vector subfrags(1 << bits); // create subfrag dirs + int n = 0; for (list::iterator p = frags.begin(); p != frags.end(); ++p) { - CDir *f = new CDir(inode, *p, cache, true); + CDir *f = new CDir(inode, *p, cache, is_auth()); f->state_set(state & MASK_STATE_FRAGMENT_KEPT); - f->init_fragment_pins(); - f->set_version(get_version()); f->replica_map = replica_map; + f->dir_auth = dir_auth; + f->init_fragment_pins(); + f->version = version; + f->projected_version = projected_version; dout(10) << " subfrag " << *p << " " << *f << endl; - subfrags.push_back(f); + subfrags[n++] 
= f; + subs.push_back(f); inode->add_dirfrag(f); } - assert(subfrags.size() == frags.size()); // repartition dentries while (!items.empty()) { @@ -516,6 +565,7 @@ void CDir::merge(int bits, list& waiters) // merge state state_set(dir->get_state() & MASK_STATE_FRAGMENT_KEPT); + dir_auth = dir->dir_auth; dir->purge_stolen(waiters); inode->close_dirfrag(dir->get_frag()); @@ -668,12 +718,12 @@ void CDir::mark_clean() void CDir::first_get() { - inode->get(CInode::PIN_DIR); + inode->get(CInode::PIN_DIRFRAG); } void CDir::last_put() { - inode->put(CInode::PIN_DIR); + inode->put(CInode::PIN_DIRFRAG); } @@ -697,13 +747,19 @@ class C_Dir_Fetch : public Context { } }; -void CDir::fetch(Context *c) +void CDir::fetch(Context *c, bool ignore_authpinnability) { dout(10) << "fetch on " << *this << endl; assert(is_auth()); assert(!is_complete()); + if (!can_auth_pin() && !ignore_authpinnability) { + dout(7) << "fetch waiting for authpinnable" << endl; + add_waiter(WAIT_AUTHPINNABLE, c); + return; + } + if (c) add_waiter(WAIT_COMPLETE, c); // already fetching? @@ -712,9 +768,10 @@ void CDir::fetch(Context *c) return; } + auth_pin(); state_set(CDir::STATE_FETCHING); - if (cache->mds->logger) cache->mds->logger->inc("fdir"); + if (cache->mds->logger) cache->mds->logger->inc("dir_f"); // start by reading the first hunk of it C_Dir_Fetch *fin = new C_Dir_Fetch(this); @@ -728,35 +785,30 @@ void CDir::fetch(Context *c) void CDir::_fetched(bufferlist &bl) { - dout(10) << "_fetched " << 0 << "~" << bl.length() - << " on " << *this + dout(10) << "_fetched " << bl.length() + << " bytes for " << *this << endl; - // give up? - if (!is_auth() || is_frozen()) { - dout(10) << "_fetched canceling (!auth or frozen)" << endl; - //ondisk_bl.clear(); - //ondisk_size = 0; - - // kick waiters? - state_clear(CDir::STATE_FETCHING); - finish_waiting(WAIT_COMPLETE, -1); - return; - } + assert(is_auth()); + assert(!is_frozen()); // decode. 
int len = bl.length(); int off = 0; - version_t got_version; + version_t got_version; - bl.copy(off, sizeof(got_version), (char*)&got_version); - off += sizeof(got_version); + ::_decode(got_version, bl, off); dout(10) << "_fetched version " << got_version << ", " << len << " bytes" << endl; - while (off < len) { + int32_t n; + ::_decode(n, bl, off); + + for (int i=0; iget_inode() == 0) { dout(12) << "_fetched had NEG dentry " << *dn << endl; @@ -782,7 +835,7 @@ void CDir::_fetched(bufferlist &bl) } } else { // (remote) link - CDentry *dn = add_dentry( dname, ino ); + dn = add_remote_dentry(dname, ino, d_type); // link to inode? CInode *in = cache->get_inode(ino); // we may or may not have it. @@ -839,8 +892,8 @@ void CDir::_fetched(bufferlist &bl) cache->add_inode( in ); // link - add_dentry( dname, in ); - dout(12) << "_fetched got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; + dn = add_primary_dentry(dname, in); + dout(12) << "_fetched got " << *dn << " " << *in << endl; } } } else { @@ -848,6 +901,9 @@ void CDir::_fetched(bufferlist &bl) << " at pos " << off << endl; assert(0); } + + // make note of dentry position in the directory + dn->dir_offset = dn_offset; /** clean underwater item? * Underwater item is something that is dirty in our cache from @@ -906,8 +962,8 @@ void CDir::_fetched(bufferlist &bl) /** * commit * - * @param want min version i want committed - * @param c callback for completion + * @param want - min version i want committed + * @param c - callback for completion */ void CDir::commit(version_t want, Context *c) { @@ -916,7 +972,7 @@ void CDir::commit(version_t want, Context *c) // preconditions assert(want <= version || version == 0); // can't commit the future - assert(committed_version < want); // the caller is stupid + assert(want > committed_version); // the caller is stupid assert(is_auth()); assert(can_auth_pin()); @@ -982,6 +1038,7 @@ void CDir::_commit(version_t want) // complete? 
if (!is_complete()) { dout(7) << "commit not complete, fetching first" << endl; + if (cache->mds->logger) cache->mds->logger->inc("dir_ffc"); fetch(new C_Dir_RetryCommit(this, want)); return; } @@ -995,12 +1052,15 @@ void CDir::_commit(version_t want) state_set(STATE_COMMITTING); } - if (cache->mds->logger) cache->mds->logger->inc("cdir"); + if (cache->mds->logger) cache->mds->logger->inc("dir_c"); - // encode dentries + // encode bufferlist bl; - bl.append((char*)&version, sizeof(version)); - + + ::_encode(version, bl); + int32_t n = nitems; + ::_encode(n, bl); + for (CDir_map_t::iterator it = items.begin(); it != items.end(); it++) { @@ -1009,6 +1069,8 @@ void CDir::_commit(version_t want) if (dn->is_null()) continue; // skip negative entries + n--; + // primary or remote? if (dn->is_remote()) { inodeno_t ino = dn->get_remote_ino(); @@ -1039,6 +1101,7 @@ void CDir::_commit(version_t want) in->dirfragtree._encode(bl); } } + assert(n == 0); // write it. cache->mds->objecter->write( get_ondisk_object(), @@ -1174,14 +1237,8 @@ bool CDir::is_subtree_root() /** set_dir_auth - * - * always list ourselves first. - * - * accept 'iamauth' param so that i can intelligently adjust freeze auth_pins - * even when the auth bit isn't correct. - * as when calling MDCache::import_subtree(...). */ -void CDir::set_dir_auth(pair a, bool iamauth) +void CDir::set_dir_auth(pair a) { dout(10) << "setting dir_auth=" << a << " from " << dir_auth @@ -1252,9 +1309,7 @@ void CDir::auth_pin() if (is_subtree_root()) return; // no. //assert(!is_import()); - inode->nested_auth_pins++; - if (inode->parent) - inode->parent->dir->adjust_nested_auth_pins( 1 ); + inode->adjust_nested_auth_pins(1); } void CDir::auth_unpin() @@ -1267,39 +1322,38 @@ void CDir::auth_unpin() assert(auth_pins >= 0); // pending freeze? 
- if (auth_pins + nested_auth_pins == 0) - on_freezeable(); + if (state_test(STATE_FREEZINGTREE|STATE_FREEZINGDIR) && + auth_pins == 1 && + nested_auth_pins == 0) + finish_waiting(WAIT_FREEZEABLE); // nest? if (is_subtree_root()) return; // no. //assert(!is_import()); - inode->nested_auth_pins--; - if (inode->parent) - inode->parent->dir->adjust_nested_auth_pins( -1 ); + inode->adjust_nested_auth_pins(-1); } void CDir::adjust_nested_auth_pins(int inc) { - CDir *dir = this; - - // dir - dir->nested_auth_pins += inc; + nested_auth_pins += inc; - dout(10) << "adjust_nested_auth_pins " << inc << " on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl; - assert(dir->nested_auth_pins >= 0); + dout(10) << "adjust_nested_auth_pins " << inc << " on " << *this + << " count now " << auth_pins << " + " << nested_auth_pins << endl; + assert(nested_auth_pins >= 0); // pending freeze? - if (is_freezeable()) - dir->on_freezeable(); - // on freezeable_dir too? FIXME + if (state_test(STATE_FREEZINGTREE|STATE_FREEZINGDIR) && + auth_pins == 1 && + nested_auth_pins == 0) + finish_waiting(WAIT_FREEZEABLE); // adjust my inode? - if (dir->is_subtree_root()) + if (is_subtree_root()) return; // no, stop. // yes. - dir->inode->adjust_nested_auth_pins(inc); + inode->adjust_nested_auth_pins(inc); } @@ -1308,21 +1362,6 @@ void CDir::adjust_nested_auth_pins(int inc) * FREEZING */ -void CDir::on_freezeable() -{ - // check for anything pending freezeable - - /* NOTE: this will be called on deeper dirs first, walking up toward - the root, meaning that deeper freeze attempts will succeed first. - */ - /* NOTE: the first of these will likely freeze the dir, and unmark - FREEZING. additional ones will re-flag FREEZING. this isn't - particularly graceful, and might cause problems if the first one - needs to know about other waiters.... FIXME? 
*/ - - finish_waiting(WAIT_FREEZEABLE); -} - // FREEZE TREE class C_MDS_FreezeTree : public Context { @@ -1342,25 +1381,29 @@ void CDir::freeze_tree(Context *c) { assert(!is_frozen()); assert(!is_freezing()); + + auth_pin(); if (is_freezeable()) { - dout(10) << "freeze_tree " << *this << endl; - _freeze_tree(c); + _freeze_tree(); + auth_unpin(); + if (c) { + c->finish(0); + delete c; + } } else { state_set(STATE_FREEZINGTREE); - dout(10) << "freeze_tree + wait " << *this << endl; - - // need to wait for auth pins to expire + dout(10) << "freeze_tree waiting " << *this << endl; add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); } } -void CDir::_freeze_tree(Context *c) +void CDir::_freeze_tree() { dout(10) << "_freeze_tree " << *this << endl; - // there shouldn't be any conflicting auth_pins. - assert(is_freezeable_dir()); + // there shouldn't be any conflicting auth_pins (except the 'freezing' one) + assert(is_freezeable(true)); // twiddle state state_clear(STATE_FREEZINGTREE); // actually, this may get set again by next context? @@ -1369,13 +1412,7 @@ void CDir::_freeze_tree(Context *c) // auth_pin inode for duration of freeze, if we are not a subtree root. if (is_auth() && !is_subtree_root()) - inode->auth_pin(); - - // continue to frozen land - if (c) { - c->finish(0); - delete c; - } + inode->auth_pin(); } void CDir::freeze_tree_finish(Context *c) @@ -1389,16 +1426,20 @@ void CDir::freeze_tree_finish(Context *c) } // freezeable now? - if (!is_freezeable()) { - // wait again! + if (!is_freezeable(true)) { dout(10) << "freeze_tree_finish still waiting " << *this << endl; - state_set(STATE_FREEZINGTREE); + assert(state_test(STATE_FREEZINGTREE)); add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); return; } dout(10) << "freeze_tree_finish " << *this << endl; - _freeze_tree(c); + _freeze_tree(); + auth_unpin(); + if (c) { + c->finish(0); + delete c; + } } void CDir::unfreeze_tree() @@ -1420,6 +1461,7 @@ void CDir::unfreeze_tree() // freezing. 
stop it. assert(state_test(STATE_FREEZINGTREE)); state_clear(STATE_FREEZINGTREE); + auth_unpin(); // cancel freeze waiters finish_waiting(WAIT_UNFREEZE); @@ -1489,23 +1531,26 @@ void CDir::freeze_dir(Context *c) assert(!is_frozen()); assert(!is_freezing()); + auth_pin(); if (is_freezeable_dir()) { - dout(10) << "freeze_dir " << *this << endl; - _freeze_dir(c); + _freeze_dir(); + auth_unpin(); + if (c) { + c->finish(0); + delete c; + } } else { state_set(STATE_FREEZINGDIR); dout(10) << "freeze_dir + wait " << *this << endl; - - // need to wait for auth pins to expire add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); } } -void CDir::_freeze_dir(Context *c) +void CDir::_freeze_dir() { dout(10) << "_freeze_dir " << *this << endl; - assert(is_freezeable_dir()); + assert(is_freezeable_dir(true)); state_clear(STATE_FREEZINGDIR); state_set(STATE_FROZENDIR); @@ -1513,11 +1558,6 @@ void CDir::_freeze_dir(Context *c) if (is_auth() && !is_subtree_root()) inode->auth_pin(); // auth_pin for duration of freeze - - if (c) { - c->finish(0); - delete c; - } } void CDir::freeze_dir_finish(Context *c) @@ -1531,8 +1571,7 @@ void CDir::freeze_dir_finish(Context *c) } // freezeable now? - if (!is_freezeable_dir()) { - // wait again! + if (!is_freezeable_dir(true)) { dout(10) << "freeze_dir_finish still waiting " << *this << endl; state_set(STATE_FREEZINGDIR); add_waiter(WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); @@ -1540,7 +1579,13 @@ void CDir::freeze_dir_finish(Context *c) } // freeze now - _freeze_dir(c); + dout(10) << "freeze_dir_finish " << *this << endl; + _freeze_dir(); + auth_unpin(); + if (c) { + c->finish(0); + delete c; + } } void CDir::unfreeze_dir() @@ -1561,6 +1606,7 @@ void CDir::unfreeze_dir() // still freezing. stop. 
assert(state_test(STATE_FREEZINGDIR)); state_clear(STATE_FREEZINGDIR); + auth_unpin(); // cancel freeze waiters finish_waiting(WAIT_UNFREEZE); diff --git a/trunk/ceph/mds/CDir.h b/trunk/ceph/mds/CDir.h index df3412040508f..e4ac2f2a5c82f 100644 --- a/trunk/ceph/mds/CDir.h +++ b/trunk/ceph/mds/CDir.h @@ -60,9 +60,7 @@ class CDir : public MDSCacheObject { static const int PIN_DNWAITER = 1; static const int PIN_CHILD = 2; static const int PIN_FROZEN = 3; - static const int PIN_FRAGMENTING = 4; static const int PIN_EXPORT = 5; - static const int PIN_AUTHPIN = 6; static const int PIN_IMPORTING = 7; static const int PIN_EXPORTING = 8; static const int PIN_IMPORTBOUND = 9; @@ -73,34 +71,33 @@ class CDir : public MDSCacheObject { case PIN_DNWAITER: return "dnwaiter"; case PIN_CHILD: return "child"; case PIN_FROZEN: return "frozen"; - case PIN_FRAGMENTING: return "fragmenting"; case PIN_EXPORT: return "export"; case PIN_EXPORTING: return "exporting"; case PIN_IMPORTING: return "importing"; case PIN_IMPORTBOUND: return "importbound"; case PIN_EXPORTBOUND: return "exportbound"; - case PIN_AUTHPIN: return "authpin"; case PIN_STICKY: return "sticky"; default: return generic_pin_name(p); } } // -- state -- - static const unsigned STATE_COMPLETE = (1<< 2); // the complete contents are in cache - static const unsigned STATE_FROZENTREE = (1<< 4); // root of tree (bounded by exports) - static const unsigned STATE_FREEZINGTREE = (1<< 5); // in process of freezing - static const unsigned STATE_FROZENDIR = (1<< 6); - static const unsigned STATE_FREEZINGDIR = (1<< 7); - static const unsigned STATE_COMMITTING = (1<< 8); // mid-commit - static const unsigned STATE_FETCHING = (1<< 9); // currenting fetching - static const unsigned STATE_DELETED = (1<<10); - static const unsigned STATE_EXPORT = (1<<12); - static const unsigned STATE_IMPORTBOUND = (1<<13); - static const unsigned STATE_EXPORTBOUND = (1<<14); - static const unsigned STATE_EXPORTING = (1<<15); - static const unsigned 
STATE_IMPORTING = (1<<16); - static const unsigned STATE_FRAGMENTING = (1<<17); - static const unsigned STATE_STICKY = (1<<18); // sticky pin due to inode stickydirs + static const unsigned STATE_COMPLETE = (1<< 1); // the complete contents are in cache + static const unsigned STATE_FROZENTREE = (1<< 2); // root of tree (bounded by exports) + static const unsigned STATE_FREEZINGTREE = (1<< 3); // in process of freezing + static const unsigned STATE_FROZENDIR = (1<< 4); + static const unsigned STATE_FREEZINGDIR = (1<< 5); + static const unsigned STATE_COMMITTING = (1<< 6); // mid-commit + static const unsigned STATE_FETCHING = (1<< 7); // currenting fetching + static const unsigned STATE_DELETED = (1<< 8); + static const unsigned STATE_EXPORT = (1<< 9); + static const unsigned STATE_IMPORTBOUND = (1<<10); + static const unsigned STATE_EXPORTBOUND = (1<<11); + static const unsigned STATE_EXPORTING = (1<<12); + static const unsigned STATE_IMPORTING = (1<<13); + static const unsigned STATE_FRAGMENTING = (1<<14); + static const unsigned STATE_STICKY = (1<<15); // sticky pin due to inode stickydirs + static const unsigned STATE_DNPINNEDFRAG = (1<<16); // dir is refragmenting // common states static const unsigned STATE_CLEAN = 0; @@ -126,11 +123,9 @@ class CDir : public MDSCacheObject { static const unsigned MASK_STATE_FRAGMENT_KEPT = (STATE_DIRTY | STATE_COMPLETE | - STATE_FROZENDIR | STATE_EXPORT | STATE_EXPORTBOUND | - STATE_IMPORTBOUND | - STATE_STICKY); + STATE_IMPORTBOUND); // -- rep spec -- static const int REP_NONE = 0; @@ -170,31 +165,36 @@ class CDir : public MDSCacheObject { protected: // contents - CDir_map_t items; // non-null AND null - size_t nitems; // # non-null - size_t nnull; // # null + CDir_map_t items; // non-null AND null + unsigned nitems; // # non-null + unsigned nnull; // # null int num_dirty; // state - version_t version; - version_t committing_version; - version_t committed_version; - version_t committed_version_equivalent; // in case of, 
e.g., temporary file - version_t projected_version; + version_t version; + version_t committing_version; + version_t committed_version; + version_t committed_version_equivalent; // in case of, e.g., temporary file + version_t projected_version; // lock nesting, freeze - int auth_pins; - int nested_auth_pins; - int request_pins; + int auth_pins; + int nested_auth_pins; + int request_pins; // cache control (defined for authority; hints for replicas) - int dir_rep; - set dir_rep_by; // if dir_rep == REP_LIST + int dir_rep; + set dir_rep_by; // if dir_rep == REP_LIST // popularity - meta_load_t popularity[MDS_NPOP]; + dirfrag_load_vec_t pop_me; + dirfrag_load_vec_t pop_nested; + dirfrag_load_vec_t pop_auth_subtree; + dirfrag_load_vec_t pop_auth_subtree_nested; + + utime_t last_popularity_sample; // friends friend class Migrator; @@ -221,11 +221,11 @@ protected: CDir_map_t::iterator begin() { return items.begin(); } CDir_map_t::iterator end() { return items.end(); } - size_t get_size() { + unsigned get_size() { return nitems; } - size_t get_nitems() { return nitems; } - size_t get_nnull() { return nnull; } + unsigned get_nitems() { return nitems; } + unsigned get_nnull() { return nnull; } void inc_num_dirty() { num_dirty++; } void dec_num_dirty() { @@ -247,11 +247,12 @@ protected: return iter->second; } - CDentry* add_dentry( const string& dname, CInode *in=0 ); - CDentry* add_dentry( const string& dname, inodeno_t ino ); + CDentry* add_null_dentry(const string& dname); + CDentry* add_primary_dentry(const string& dname, CInode *in); + CDentry* add_remote_dentry(const string& dname, inodeno_t ino, unsigned char d_type); void remove_dentry( CDentry *dn ); // delete dentry - void link_inode( CDentry *dn, inodeno_t ino ); - void link_inode( CDentry *dn, CInode *in ); + void link_remote_inode( CDentry *dn, inodeno_t ino, unsigned char d_type); + void link_primary_inode( CDentry *dn, CInode *in ); void unlink_inode( CDentry *dn ); void try_remove_unlinked_dn(CDentry *dn); 
private: @@ -280,10 +281,8 @@ private: public: pair authority(); pair get_dir_auth() { return dir_auth; } - void set_dir_auth(pair a, bool iamauth=false); - void set_dir_auth(int a) { - set_dir_auth(pair(a, CDIR_AUTH_UNKNOWN), false); - } + void set_dir_auth(pair a); + void set_dir_auth(int a) { set_dir_auth(pair(a, CDIR_AUTH_UNKNOWN)); } bool is_ambiguous_dir_auth() { return dir_auth.second != CDIR_AUTH_UNKNOWN; } @@ -300,7 +299,7 @@ private: // for giving to clients void get_dist_spec(set& ls, int auth) { - if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > + if (( pop_auth_subtree.get(META_POP_IRD).get() > g_conf.mds_bal_replicate_threshold)) { //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; for (map::iterator p = replicas_begin(); @@ -327,7 +326,7 @@ private: // -- fetch -- object_t get_ondisk_object() { return object_t(ino(), frag); } - void fetch(Context *c); + void fetch(Context *c, bool ignore_authpinnability=false); void _fetched(bufferlist &bl); // -- commit -- @@ -394,17 +393,16 @@ public: void auth_pin(); void auth_unpin(); void adjust_nested_auth_pins(int inc); - void on_freezeable(); // -- freezing -- void freeze_tree(Context *c); void freeze_tree_finish(Context *c); void unfreeze_tree(); - void _freeze_tree(Context *c=0); + void _freeze_tree(); void freeze_dir(Context *c); void freeze_dir_finish(Context *c); - void _freeze_dir(Context *c=0); + void _freeze_dir(); void unfreeze_dir(); bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } @@ -417,9 +415,9 @@ public: bool is_frozen_tree_root() { return state & STATE_FROZENTREE; } bool is_frozen_dir() { return state & STATE_FROZENDIR; } - bool is_freezeable() { + bool is_freezeable(bool freezing=false) { // no nested auth pins. - if (auth_pins > 0 || nested_auth_pins > 0) + if ((auth_pins-freezing) > 0 || nested_auth_pins > 0) return false; // inode must not be frozen. 
@@ -428,12 +426,12 @@ public: return true; } - bool is_freezeable_dir() { - if (auth_pins > 0) + bool is_freezeable_dir(bool freezing=false) { + if ((auth_pins-freezing) > 0) return false; - // if not subtree root, inode must not be frozen. - if (!is_subtree_root() && inode->is_frozen()) + // if not subtree root, inode must not be frozen (tree--frozen_dir is okay). + if (!is_subtree_root() && inode->is_frozen() && !inode->is_frozen_dir()) return false; return true; @@ -508,9 +506,10 @@ class CDirExport { uint32_t nden; // num dentries (including null ones) version_t version; version_t committed_version; + version_t committed_version_equivalent; uint32_t state; - meta_load_t popularity_justme; - meta_load_t popularity_curdom; + dirfrag_load_vec_t pop_me; + dirfrag_load_vec_t pop_auth_subtree; int32_t dir_rep; } st; map replicas; @@ -518,7 +517,7 @@ class CDirExport { public: CDirExport() {} - CDirExport(CDir *dir) { + CDirExport(CDir *dir, utime_t now) { memset(&st, 0, sizeof(st)); assert(dir->get_version() == dir->get_projected_version()); @@ -527,13 +526,15 @@ class CDirExport { st.nden = dir->items.size(); st.version = dir->version; st.committed_version = dir->committed_version; + st.committed_version_equivalent = dir->committed_version_equivalent; st.state = dir->state; st.dir_rep = dir->dir_rep; - - st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] ); - st.popularity_curdom.take( dir->popularity[MDS_POP_CURDOM] ); - dir->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; - dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom; + + st.pop_me = dir->pop_me; + st.pop_auth_subtree = dir->pop_auth_subtree; + dir->pop_auth_subtree_nested -= dir->pop_auth_subtree; + dir->pop_me.zero(now); + dir->pop_auth_subtree.zero(now); rep_by = dir->dir_rep_by; replicas = dir->replica_map; @@ -546,18 +547,20 @@ class CDirExport { assert(dir->dirfrag() == st.dirfrag); // set committed_version at old version - dir->committing_version = dir->committed_version = 
st.committed_version; - dir->projected_version = dir->version = st.version; + dir->committing_version = + dir->committed_version = st.committed_version; + dir->committed_version_equivalent = st.committed_version_equivalent; + dir->projected_version = + dir->version = st.version; // twiddle state dir->state = (dir->state & CDir::MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. (st.state & CDir::MASK_STATE_EXPORTED); dir->dir_rep = st.dir_rep; - dir->popularity[MDS_POP_JUSTME] += st.popularity_justme; - dir->popularity[MDS_POP_CURDOM] += st.popularity_curdom; - dir->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; - dir->popularity[MDS_POP_NESTED] += st.popularity_curdom; + dir->pop_me = st.pop_me; + dir->pop_auth_subtree = st.pop_auth_subtree; + dir->pop_auth_subtree_nested += dir->pop_auth_subtree; dir->replica_nonce = 0; // no longer defined diff --git a/trunk/ceph/mds/CInode.cc b/trunk/ceph/mds/CInode.cc index e72c8770ee7ac..45d3f6ecbf4ca 100644 --- a/trunk/ceph/mds/CInode.cc +++ b/trunk/ceph/mds/CInode.cc @@ -58,7 +58,8 @@ ostream& operator<<(ostream& out, CInode& in) } if (in.is_symlink()) out << " symlink"; - + if (in.is_dir() && !in.dirfragtree.empty()) out << " " << in.dirfragtree; + out << " v" << in.get_version(); // locks @@ -144,6 +145,25 @@ void CInode::get_dirfrags_under(frag_t fg, list& ls) ls.push_back(dirfrags[*p]); } +CDir *CInode::get_approx_dirfrag(frag_t fg) +{ + CDir *dir = get_dirfrag(fg); + if (dir) return dir; + + // find a child? + list ls; + get_dirfrags_under(fg, ls); + if (!ls.empty()) + return ls.front(); + + // try parents? 
+ while (1) { + fg = fg.parent(); + dir = get_dirfrag(fg); + if (dir) return dir; + } +} + void CInode::get_dirfrags(list& ls) { // all dirfrags @@ -276,20 +296,6 @@ void CInode::put_stickydirs() -void CInode::fragment_dir(frag_t basefrag, int bits, list& subs, list& waiters) -{ - dout(10) << "fragment_dir " << bits << endl; - - CDir *base = get_or_open_dirfrag(mdcache, basefrag); - - dirfragtree.split(basefrag, bits); - if (bits > 0) { - base->split(bits, subs, waiters); - } else { - base->merge(bits, waiters); - } -} - // pins @@ -372,7 +378,11 @@ void CInode::make_anchor_trace(vector& trace) void CInode::name_stray_dentry(string& dname) { char s[20]; +#ifdef __LP64__ sprintf(s, "%ld", inode.ino.val); +#else + sprintf(s, "%lld", inode.ino.val); +#endif dname = s; } @@ -453,7 +463,19 @@ void CInode::encode_lock_state(int type, bufferlist& bl) break; case LOCK_OTYPE_IDIRFRAGTREE: - dirfragtree._encode(bl); + { + // encode the raw tree + dirfragtree._encode(bl); + + // also specify which frags are mine + set myfrags; + list dfls; + get_dirfrags(dfls); + for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) + if ((*p)->is_auth()) + myfrags.insert((*p)->get_frag()); + _encode(myfrags, bl); + } break; case LOCK_OTYPE_IFILE: @@ -502,7 +524,20 @@ void CInode::decode_lock_state(int type, bufferlist& bl) break; case LOCK_OTYPE_IDIRFRAGTREE: - dirfragtree._decode(bl, off); + { + fragtree_t temp; + temp._decode(bl, off); + set authfrags; + _decode(authfrags, bl, off); + if (is_auth()) { + // auth. believe replica's auth frags only. + for (set::iterator p = authfrags.begin(); p != authfrags.end(); ++p) + dirfragtree.force_to_leaf(*p); + } else { + // replica. just take the tree. 
+ dirfragtree.swap(temp); + } + } break; case LOCK_OTYPE_IFILE: @@ -570,7 +605,7 @@ void CInode::add_waiter(int tag, Context *c) // auth_pins bool CInode::can_auth_pin() { if (parent) - return parent->dir->can_auth_pin(); + return parent->can_auth_pin(); return true; } @@ -583,7 +618,7 @@ void CInode::auth_pin() dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; if (parent) - parent->dir->adjust_nested_auth_pins( 1 ); + parent->adjust_nested_auth_pins( 1 ); } void CInode::auth_unpin() @@ -597,14 +632,14 @@ void CInode::auth_unpin() assert(auth_pins >= 0); if (parent) - parent->dir->adjust_nested_auth_pins( -1 ); + parent->adjust_nested_auth_pins( -1 ); } void CInode::adjust_nested_auth_pins(int a) { if (!parent) return; nested_auth_pins += a; - parent->get_dir()->adjust_nested_auth_pins(a); + parent->adjust_nested_auth_pins(a); } @@ -613,8 +648,6 @@ void CInode::adjust_nested_auth_pins(int a) pair CInode::authority() { - //if (is_root()) - //return CDIR_AUTH_ROOTINODE; // root _inode_ is locked to mds0. 
if (force_auth.first >= 0) return force_auth; diff --git a/trunk/ceph/mds/CInode.h b/trunk/ceph/mds/CInode.h index d32236e326753..6cd1e69c51241 100644 --- a/trunk/ceph/mds/CInode.h +++ b/trunk/ceph/mds/CInode.h @@ -55,24 +55,21 @@ ostream& operator<<(ostream& out, CInode& in); class CInode : public MDSCacheObject { public: // -- pins -- - //static const int PIN_REPLICATED = 1; - static const int PIN_DIR = 2; - static const int PIN_CAPS = 7; // client caps - static const int PIN_AUTHPIN = 8; - static const int PIN_IMPORTING = -9; // importing - static const int PIN_ANCHORING = 12; - static const int PIN_UNANCHORING = 13; - static const int PIN_OPENINGDIR = 14; - static const int PIN_REMOTEPARENT = 15; - static const int PIN_BATCHOPENJOURNAL = 16; - static const int PIN_SCATTERED = 17; - static const int PIN_STICKYDIRS = 18; + static const int PIN_DIRFRAG = -1; + static const int PIN_CAPS = 2; // client caps + static const int PIN_IMPORTING = -4; // importing + static const int PIN_ANCHORING = 5; + static const int PIN_UNANCHORING = 6; + static const int PIN_OPENINGDIR = 7; + static const int PIN_REMOTEPARENT = 8; + static const int PIN_BATCHOPENJOURNAL = 9; + static const int PIN_SCATTERED = 10; + static const int PIN_STICKYDIRS = 11; const char *pin_name(int p) { switch (p) { - case PIN_DIR: return "dir"; + case PIN_DIRFRAG: return "dirfrag"; case PIN_CAPS: return "caps"; - case PIN_AUTHPIN: return "authpin"; case PIN_IMPORTING: return "importing"; case PIN_ANCHORING: return "anchoring"; case PIN_UNANCHORING: return "unanchoring"; @@ -123,7 +120,8 @@ class CInode : public MDSCacheObject { fragtree_t dirfragtree; // dir frag tree, if any. always consistent with our dirfrag map. 
//map dirfrag_size; // size of each dirfrag - off_t last_open_journaled; // log offset for the last journaled EOpen + off_t last_journaled; // log offset for the last time i was journaled + off_t last_open_journaled; // log offset for the last journaled EOpen // projected values (only defined while dirty) list projected_inode; @@ -148,12 +146,14 @@ public: frag_t pick_dirfrag(const string &dn); bool has_dirfrags() { return !dirfrags.empty(); } CDir* get_dirfrag(frag_t fg) { - if (dirfrags.count(fg)) + if (dirfrags.count(fg)) { + assert(dirfragtree.is_leaf(fg)); return dirfrags[fg]; - else + } else return 0; } void get_dirfrags_under(frag_t fg, list& ls); + CDir* get_approx_dirfrag(frag_t fg); void get_dirfrags(list& ls); void get_nested_dirfrags(list& ls); void get_subtree_dirfrags(list& ls); @@ -166,8 +166,6 @@ public: void get_stickydirs(); void put_stickydirs(); - void fragment_dir(frag_t basefrag, int bits, list& subs, list& waiters); - protected: // parent dentries in cache CDentry *parent; // primary link @@ -190,7 +188,7 @@ protected: int nested_auth_pins; public: - meta_load_t popularity[MDS_NPOP]; + inode_load_vec_t pop; // friends friend class Server; @@ -205,7 +203,7 @@ protected: // --------------------------- CInode(MDCache *c, bool auth=true) : mdcache(c), - last_open_journaled(0), + last_journaled(0), last_open_journaled(0), stickydir_ref(0), parent(0), force_auth(CDIR_AUTH_DEFAULT), replica_caps_wanted(0), @@ -248,8 +246,6 @@ protected: return ino() < ((CInode*)r)->ino(); } - - // -- misc -- void make_path(string& s); void make_anchor_trace(vector& trace); @@ -278,7 +274,7 @@ public: LocalLock versionlock; SimpleLock authlock; SimpleLock linklock; - SimpleLock dirfragtreelock; + ScatterLock dirfragtreelock; FileLock filelock; ScatterLock dirlock; @@ -289,7 +285,7 @@ public: case LOCK_OTYPE_ILINK: return &linklock; case LOCK_OTYPE_IDIRFRAGTREE: return &dirfragtreelock; case LOCK_OTYPE_IDIR: return &dirlock; - default: assert(0); + default: assert(0); 
return 0; } } void set_object_info(MDSCacheObjectInfo &info); @@ -565,8 +561,8 @@ class CInodeExport { struct st_ { inode_t inode; - meta_load_t popularity_justme; - meta_load_t popularity_curdom; + inode_load_vec_t pop; + bool is_dirty; // dirty inode? int num_caps; @@ -582,7 +578,7 @@ class CInodeExport { public: CInodeExport() {} - CInodeExport(CInode *in) { + CInodeExport(CInode *in, utime_t now) { st.inode = in->inode; symlink = in->symlink; dirfragtree = in->dirfragtree; @@ -596,10 +592,8 @@ public: in->filelock._encode(locks); in->dirlock._encode(locks); - st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] ); - st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); - in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; - in->popularity[MDS_POP_NESTED] -= st.popularity_curdom; + st.pop = in->pop; + in->pop.zero(now); // steal WRITER caps from inode in->take_client_caps(cap_map); @@ -619,10 +613,7 @@ public: in->symlink = symlink; in->dirfragtree = dirfragtree; - in->popularity[MDS_POP_JUSTME] += st.popularity_justme; - in->popularity[MDS_POP_CURDOM] += st.popularity_curdom; - in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; - in->popularity[MDS_POP_NESTED] += st.popularity_curdom; + in->pop = st.pop; if (st.is_dirty) in->_mark_dirty(); diff --git a/trunk/ceph/mds/Capability.h b/trunk/ceph/mds/Capability.h index eab6aa84b08bc..2c2241870650a 100644 --- a/trunk/ceph/mds/Capability.h +++ b/trunk/ceph/mds/Capability.h @@ -76,7 +76,6 @@ public: last_sent(s), last_recv(s), suppress(false) { - //cap_history[last_sent] = 0; } Capability(Export& other) : wanted_caps(other.wanted), diff --git a/trunk/ceph/mds/FileLock.h b/trunk/ceph/mds/FileLock.h index adb2130e86541..09868f7563fb6 100644 --- a/trunk/ceph/mds/FileLock.h +++ b/trunk/ceph/mds/FileLock.h @@ -67,7 +67,7 @@ inline const char *get_filelock_state_name(int n) { case LOCK_LONER: return "loner"; case LOCK_GLONERR: return "glonerr"; case LOCK_GLONERM: return "glonerm"; - default: assert(0); 
+ default: assert(0); return 0; } } diff --git a/trunk/ceph/mds/Hasher.cc b/trunk/ceph/mds/Hasher.cc deleted file mode 100644 index 308aaa0dc976c..0000000000000 --- a/trunk/ceph/mds/Hasher.cc +++ /dev/null @@ -1,1582 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab - - -// ======================================================================= -// HASHING - - -void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth) -{ - int off = 0; - - for (; nden>0; nden--) { - // dentry - string dname; - _decode(dname, bl, off); - dout(15) << "dname is " << dname << endl; - - char icode; - bl.copy(off, 1, &icode); - off++; - - CDentry *dn = dir->lookup(dname); - if (!dn) - dn = dir->add_dentry(dname); // null - - // mark dn dirty _after_ we link the inode (scroll down) - - if (icode == 'N') { - - // null dentry - assert(dn->is_null()); - - // fall thru - } - else if (icode == 'L') { - // remote link - inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - dir->link_inode(dn, ino); - } - else if (icode == 'I') { - // inode - decode_import_inode(dn, bl, off, oldauth); - - // fix up subdir export? - if (dn->inode->dir) { - assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTBOUND)); - dn->inode->dir->put(CDir::PIN_IMPORTBOUND); - dn->inode->dir->state_clear(CDIR_STATE_IMPORTBOUND); - - if (dn->inode->dir->is_auth()) { - // mine. must have been an import. 
- assert(dn->inode->dir->is_import()); - dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl; - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - cache->imports.erase(dn->inode->dir); - dn->inode->dir->put(CDir::PIN_IMPORT); - dn->inode->dir->state_clear(CDIR_STATE_IMPORT); - - // move nested under hashdir - for (set::iterator it = cache->nested_exports[dn->inode->dir].begin(); - it != cache->nested_exports[dn->inode->dir].end(); - it++) - cache->nested_exports[dir].insert(*it); - cache->nested_exports.erase(dn->inode->dir); - - // now it matches the inode - dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - else { - // not mine. make it an export. - dout(7) << "making subdir into export " << *dn->inode->dir << endl; - dn->inode->dir->get(CDir::PIN_EXPORT); - dn->inode->dir->state_set(CDIR_STATE_EXPORT); - cache->exports.insert(dn->inode->dir); - cache->nested_exports[dir].insert(dn->inode->dir); - - if (dn->inode->dir->get_dir_auth().first == CDIR_AUTH_PARENT) - dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode - assert(dn->inode->dir->get_dir_auth().first >= 0); - } - } - } - - // mark dentry dirty? (only _after_ we link the inode!) - dn->_mark_dirty(); // fixme - } -} - -/* - - notes on interaction of hashing and export/import: - - - dir->is_auth() is completely independent of hashing. for a hashed dir, - - all nodes are partially authoritative - - all nodes dir->is_hashed() == true - - all nodes dir->inode->dir_is_hashed() == true - - one node dir->is_auth() == true, the rest == false - - dir_auth for all subdirs in a hashed dir will (likely?) be explicit. - - - remember simple rule: dir auth follows inode, unless dir_auth is explicit. - - - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export) - - on export, -1 is changed to mds->get_nodeid() - - on import, nothing special, actually. 
- - - hashed dir files aren't included in export; subdirs are converted to imports - or exports as necessary. - - hashed dir subdirs are discovered on export. this is important - because dirs are needed to tie together auth hierarchy, for auth to know about - imports/exports, etc. - - - dir state is maintained on auth. - - COMPLETE and HASHED are transfered to importers. - - DIRTY is set everywhere. - - - hashed dir is like an import: hashed dir used for nested_exports map. - - nested_exports is updated appropriately on auth and replicas. - - a subtree terminates as a hashed dir, since the hashing explicitly - redelegates all inodes. thus export_dir_walk includes hashed dirs, but - not their inodes. -*/ - -// HASH on auth - -class C_MDC_HashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->hash_dir_frozen(dir); - } -}; - -class C_MDC_HashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_HashComplete(Migrator *mig, CDir *dir) { - this->mig = mig; - this->dir = dir; - } - virtual void finish(int r) { - mig->hash_dir_complete(dir); - } -}; - - -/** hash_dir(dir) - * start hashing a directory. - */ -void Migrator::hash_dir(CDir *dir) -{ - dout(-7) << "hash_dir " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't hash, freezing|frozen." << endl; - return; - } - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "hash_dir couldn't pin path, failing." 
<< endl; - return; - } - - // ok, go - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // discover on all mds - assert(hash_gather.count(dir) == 0); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; // except me - hash_gather[dir].insert(i); - mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR); - } - dir->auth_pin(); // pin until discovers are all acked. - - // start freeze - dir->freeze_dir(new C_MDC_HashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_HashComplete(this, dir)); - } else - hash_dir_complete(dir); -} - - -/* - * wait for everybody to discover and open the hashing dir - * then auth_unpin, to let the freeze happen - */ -void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl; - dir->auth_unpin(); // unpin to allow freeze to complete - } else { - dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl; - } - - delete m; // done -} - - - -/* - * once the dir is completely in memory, - * mark all migrating inodes dirty (to pin in cache) - */ -void Migrator::hash_dir_complete(CDir *dir) -{ - dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - in->_mark_dirty(); // fixme - } 
- - if (dir->is_frozen_dir()) - hash_dir_go(dir); -} - - -/* - * once the dir is frozen, - * make sure it's complete - * send the prep messages! - */ -void Migrator::hash_dir_frozen(CDir *dir) -{ - dout(7) << "hash_dir_frozen " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl; - return; - } - - // send prep messages w/ export directories to open - vector msgs(mds->get_mds_map()->get_num_mds()); - - // check for subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) continue; - - // msg? - if (msgs[dentryhashcode] == 0) { - msgs[dentryhashcode] = new MHashDirPrep(dir->ino()); - } - msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode)); - } - - // send them! - assert(hash_gather[dir].empty()); - for (unsigned i=0; isend_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - hash_gather[dir].insert(i); - } - } - - if (hash_gather[dir].empty()) { - // no subdirs! continue! - hash_gather.erase(dir); - hash_dir_go(dir); - } else { - // wait! 
- } -} - -/* - * wait for peers to open all subdirs - */ -void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl; - hash_dir_go(dir); - } else { - dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * once the dir is frozen, - * make sure it's complete - * do the hashing! - */ -void Migrator::hash_dir_go(CDir *dir) -{ - dout(7) << "hash_dir_go " << *dir << endl; - - assert(!dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - // get messages to other nodes ready - vector msgs(mds->get_mds_map()->get_num_mds()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - msgs[i] = new MHashDir(dir->ino()); - } - - // pick a hash seed. - dir->inode->inode.hash_seed = 1;//dir->ino(); - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // get containing import. might be me. - CDir *containing_import = cache->get_auth_container(dir); - assert(containing_import != dir || dir->is_import()); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode == mds->get_nodeid()) { - continue; // still mine! 
- } - - bufferlist *bl = msgs[dentryhashcode]->get_state_ptr(); - assert(bl); - - // -- dentry - dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, *bl); - - // null dentry? - if (dn->is_null()) { - bl->append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl->append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl->append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl->append("I", 1); // inode dentry - - encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export - msgs[dentryhashcode]->inc_nden(); - - if (dn->is_dirty()) - dn->mark_clean(); - - // add to proxy - hash_proxy_inos[dir].push_back(in); - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - - // fix up subdirs - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. - dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - - // fix nested bits - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == in->dir) { - dout(10) << "moving nested export " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[in->dir].insert(ex); - } - } - } - else { - // not mine. 
- dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[containing_import].erase(in->dir); - if (in->dir->authority() == dentryhashcode) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - in->dir->set_dir_auth( in->dir->authority() ); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // inode state - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // fix up nested_exports? - if (containing_import != dir) { - dout(7) << "moving nested exports under hashed dir" << endl; - for (set::iterator it = cache->nested_exports[containing_import].begin(); - it != cache->nested_exports[containing_import].end(); ) { - CDir *ex = *it; - it++; - if (cache->get_auth_container(ex) == dir) { - dout(7) << " moving nested export under hashed dir: " << *ex << endl; - cache->nested_exports[containing_import].erase(ex); - cache->nested_exports[dir].insert(ex); - } else { - dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl; - } - } - } - - // send hash messages - assert(hash_gather[dir].empty()); - assert(hash_notify_gather[dir].empty()); - assert(dir->hashed_subset.empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - // all nodes hashed locally.. 
- dir->hashed_subset.insert(i); - - if (i == mds->get_nodeid()) continue; - - // init hash_gather and hash_notify_gather sets - hash_gather[dir].insert(i); - - assert(hash_notify_gather[dir][i].empty()); - for (int j=0; jget_mds_map()->get_num_mds(); j++) { - if (j == mds->get_nodeid()) continue; - if (j == i) continue; - hash_notify_gather[dir][i].insert(j); - } - - mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); - } - - // wait for all the acks. -} - - -void Migrator::handle_hash_dir_ack(MHashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl; - - if (hash_notify_gather[dir].empty()) { - dout(7) << "got notifies too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "waiting on notifies " << endl; - } - - } else { - dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -void Migrator::hash_dir_finish(CDir *dir) -{ - dout(7) << "hash_dir_finish finishing " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_hashing()); - - // dir state - hash_gather.erase(dir); - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - - // unproxy inodes - // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds. 
- for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - hash_proxy_inos.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // unfreeze - dir->unfreeze_dir(); - - show_imports(); - assert(hash_gather.count(dir) == 0); - - // stats - //if (mds->logger) mds->logger->inc("nh", 1); - -} - - - - -// HASH on auth and non-auth - -void Migrator::handle_hash_dir_notify(MHashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir_notify " << *dir << endl; - int from = m->get_from(); - - int source = m->get_source().num(); - if (dir->is_auth()) { - // gather notifies - assert(dir->is_hashed()); - - assert( hash_notify_gather[dir][from].count(source) ); - hash_notify_gather[dir][from].erase(source); - - if (hash_notify_gather[dir][from].empty()) { - dout(7) << "last notify from " << from << endl; - hash_notify_gather[dir].erase(from); - - if (hash_notify_gather[dir].empty()) { - dout(7) << "last notify!" 
<< endl; - hash_notify_gather.erase(dir); - - if (hash_gather[dir].empty()) { - dout(7) << "got acks too, all done" << endl; - hash_dir_finish(dir); - } else { - dout(7) << "still waiting on acks from " << hash_gather[dir] << endl; - } - } else { - dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl; - } - } else { - dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl; - } - - // delete msg - delete m; - } else { - // update dir hashed_subset - assert(dir->hashed_subset.count(from) == 0); - dir->hashed_subset.insert(from); - - // update open subdirs - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->get_inode(); - if (!in) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != from) continue; // we'll import these in a minute - - if (in->dir->authority() != dentryhashcode) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - - // remove from notify gather set - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - // last notify? 
- if (hash_gather[dir].empty()) { - dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl; - hash_gather.erase(dir); - - dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDir::PIN_HASHING); - dir->hashed_subset.clear(); - } else { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - } - - // fw notify to auth - mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR); - } -} - - - - -// HASH on non-auth - -/* - * discover step: - * each peer needs to open up the directory and pin it before we start - */ -class C_MDC_HashDirDiscover : public Context { - Migrator *mig; - MHashDirDiscover *m; -public: - vector trace; - C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) { - this->mig = mig; - this->m = m; - } - void finish(int r) { - CInode *in = 0; - if (r >= 0) { - if (trace.size()) - in = trace[trace.size()-1]->get_inode(); - else - in = mig->cache->get_root(); - } - mig->handle_hash_dir_discover_2(m, in, r); - } -}; - -void Migrator::handle_hash_dir_discover(MHashDirDiscover *m) -{ - assert(m->get_source().num() != mds->get_nodeid()); - - dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl; - - // must discover it! - C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m); - filepath fpath(m->get_path()); - cache->path_traverse(fpath, onfinish->trace, true, - m, new C_MDS_RetryMessage(mds,m), // on delay/retry - MDS_TRAVERSE_DISCOVER, - onfinish); // on completion|error -} - -void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r) -{ - // yay! - if (in) { - dout(7) << "handle_hash_dir_discover_2 has " << *in << endl; - } - - if (r < 0 || !in->is_dir()) { - dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; - assert(0); // this shouldn't happen if the auth pins his path properly!!!! - } - assert(in->is_dir()); - - // is dir open? 
- if (!in->dir) { - dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - return; - } - CDir *dir = in->dir; - - // pin dir, set hashing flag - dir->state_set(CDIR_STATE_HASHING); - dir->get(CDir::PIN_HASHING); - assert(dir->hashed_subset.empty()); - - // inode state - dir->inode->inode.hash_seed = 1;// dir->ino(); - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash dirty fixme")); - } - - // get gather set ready for notifies - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == dir->authority()) continue; - hash_gather[dir].insert(i); - } - - // reply - dout(7) << " sending hash_dir_discover_ack on " << *dir << endl; - mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - delete m; -} - -/* - * prep step: - * peers need to open up all subdirs of the hashed dir - */ - -void Migrator::handle_hash_dir_prep(MHashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_hash_dir_prep " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! - - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! 
- if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! - int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTBOUND); - in->dir->state_set(CDIR_STATE_IMPORTBOUND); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ack! - mds->send_message_mds(new MHashDirPrepAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; -} - - -/* - * hash step: - */ - -void Migrator::handle_hash_dir(MHashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - assert(!dir->is_auth()); - assert(!dir->is_hashed()); - assert(dir->is_hashing()); - - dout(5) << "handle_hash_dir " << *dir << endl; - int oldauth = m->get_source().num(); - - // content - import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth); - - // dir state - dir->state_set(CDIR_STATE_HASHED); - dir->get(CDir::PIN_HASHED); - cache->hashdirs.insert(dir); - dir->hashed_subset.insert(mds->get_nodeid()); - - // dir is complete - dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdlog->submit_entry(new EString("dirty dir fixme")); - - // commit - mds->mdstore->commit_dir(dir, 0); - - // send notifies - dout(7) << "sending notifies" << endl; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - if (i == m->get_source().num()) 
continue; - mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()), - i, MDS_PORT_MIGRATOR); - } - - // ack - dout(7) << "acking" << endl; - mds->send_message_mds(new MHashDirAck(dir->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); - - // done. - delete m; - - show_imports(); -} - - - - - -// UNHASH on auth - -class C_MDC_UnhashFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_frozen(dir); - } -}; - -class C_MDC_UnhashComplete : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashComplete(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_complete(dir); - } -}; - - -void Migrator::unhash_dir(CDir *dir) -{ - dout(-7) << "unhash_dir " << *dir << endl; - - assert(dir->is_hashed()); - assert(!dir->is_unhashing()); - assert(dir->is_auth()); - assert(hash_gather.count(dir)==0); - - // pin path? - vector trace; - cache->make_trace(trace, dir->inode); - if (!cache->path_pin(trace, 0, 0)) { - dout(7) << "unhash_dir couldn't pin path, failing." << endl; - return; - } - - // twiddle state - dir->state_set(CDIR_STATE_UNHASHING); - - // first, freeze the dir. 
- dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else - unhash_dir_complete(dir); - -} - -void Migrator::unhash_dir_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep(dir); -} - - -/* - * ask peers to freeze and complete hashed dir - */ -void Migrator::unhash_dir_prep(CDir *dir) -{ - dout(7) << "unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - assert(dir->is_complete()); - - if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDirPrep(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * wait for peers to freeze and complete hashed dirs - */ -void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - int from = m->get_source().num(); - dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl; - - if (!m->did_assim()) { - m->mark_assim(); // only do this the first time! 
- - // assimilate dentry+inodes for exports - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - if (in) { - it->second->update_inode(in); - dout(5) << " updated " << *in << endl; - } else { - in = new CInode(mds->mdcache, false); - it->second->update_inode(in); - cache->add_inode(in); - - // link - dir->add_dentry( it->first, in ); - dout(5) << " added " << *in << endl; - } - - // open! - if (!in->dir) { - dout(5) << " opening nested export on " << *in << endl; - cache->open_remote_dir(in, - new C_MDS_RetryMessage(mds, m)); - } - } - } - - // verify! - int waiting_for = 0; - for (map::iterator it = m->get_inodes().begin(); - it != m->get_inodes().end(); - it++) { - CInode *in = cache->get_inode( it->second->get_ino() ); - assert(in); - - if (in->dir) { - if (!in->dir->state_test(CDIR_STATE_IMPORTBOUND)) { - dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDir::PIN_IMPORTBOUND); - in->dir->state_set(CDIR_STATE_IMPORTBOUND); - } else { - dout(5) << " already pinned nested export " << *in << endl; - } - } else { - dout(5) << " waiting for nested export dir on " << *in << endl; - waiting_for++; - } - } - - if (waiting_for) { - dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; - return; - } - - // ok, done with this PrepAck - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - - if (hash_gather[dir].empty()) { - hash_gather.erase(dir); - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl; - unhash_dir_go(dir); - } else { - dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; - } - - delete m; -} - - -/* - * auth: - * send out MHashDir's to peers - */ -void Migrator::unhash_dir_go(CDir *dir) -{ - dout(7) << "unhash_dir_go " << *dir << endl; - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(dir->is_frozen_dir()); - 
assert(dir->is_complete()); - - // send unhash prep to all peers - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - hash_gather[dir].insert(i); - mds->send_message_mds(new MUnhashDir(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - -/* - * auth: - * assimilate unhashing content - */ -void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(dir->is_hashed()); - - // assimilate content - int from = m->get_source().num(); - import_hashed_content(dir, m->get_state(), m->get_nden(), from); - delete m; - - // done? - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl; - return; - } - - // done! - - // fix up nested_exports - CDir *containing_import = cache->get_auth_container(dir); - if (containing_import != dir) { - for (set::iterator it = cache->nested_exports[dir].begin(); - it != cache->nested_exports[dir].end(); - it++) { - dout(7) << "moving nested export out from under hashed dir : " << **it << endl; - cache->nested_exports[containing_import].insert(*it); - } - cache->nested_exports.erase(dir); - } - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); //later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - - // commit! 
- assert(dir->is_complete()); - //dir->mark_complete(); - dir->mark_dirty(dir->pre_dirty()); // fixme - mds->mdstore->commit_dir(dir, 0); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("hash inode dirty fixme")); - } - - // notify - assert(hash_gather[dir].empty()); - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == mds->get_nodeid()) continue; - - hash_gather[dir].insert(i); - - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), - i, MDS_PORT_MIGRATOR); - } -} - - -/* - * sent by peer to flush mds links. unfreeze when all gathered. - */ -void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_ack " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(dir->is_frozen_dir()); - - // done? - int from = m->get_source().num(); - assert(hash_gather[dir].count(from)); - hash_gather[dir].erase(from); - delete m; - - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl; - } else { - unhash_dir_finish(dir); - } -} - - -/* - * all mds links are flushed. unfreeze dir! - */ -void Migrator::unhash_dir_finish(CDir *dir) -{ - dout(7) << "unhash_dir_finish " << *dir << endl; - hash_gather.erase(dir); - - // unpin path - vector trace; - cache->make_trace(trace, dir->inode); - cache->path_unpin(trace, 0); - - // state - dir->state_clear(CDIR_STATE_UNHASHING); - - // unfreeze - dir->unfreeze_dir(); - -} - - - -// UNHASH on all - -/* - * hashed dir is complete. 
- * mark all migrating inodes dirty (to pin in cache) - * if frozen too, then go to next step (depending on auth) - */ -void Migrator::unhash_dir_complete(CDir *dir) -{ - dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl; - - assert(dir->is_hashed()); - assert(dir->is_complete()); - - // mark dirty to pin in cache - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CInode *in = it->second->inode; - if (in->is_auth()) { - in->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash dirty fixme")); - } - } - - if (!dir->is_frozen_dir()) { - dout(7) << "dir complete but !frozen, waiting " << *dir << endl; - } else { - if (dir->is_auth()) - unhash_dir_prep(dir); // auth - else - unhash_dir_prep_finish(dir); // nonauth - } -} - - -// UNHASH on non-auth - -class C_MDC_UnhashPrepFreeze : public Context { -public: - Migrator *mig; - CDir *dir; - C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} - virtual void finish(int r) { - mig->unhash_dir_prep_frozen(dir); - } -}; - - -/* - * peers need to freeze their dir and make them complete - */ -void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_prep " << *dir << endl; - assert(dir->is_hashed()); - - // freeze - dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir)); - - // make complete - if (!dir->is_complete()) { - dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; - mds->mdstore->fetch_dir(dir, - new C_MDC_UnhashComplete(this, dir)); - } else { - unhash_dir_complete(dir); - } - - delete m; -} - -/* - * peer has hashed dir frozen. - * complete too? 
- */ -void Migrator::unhash_dir_prep_frozen(CDir *dir) -{ - dout(7) << "unhash_dir_prep_frozen " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_frozen_dir()); - assert(!dir->is_auth()); - - if (!dir->is_complete()) { - dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl; - } else - unhash_dir_prep_finish(dir); -} - -/* - * peer has hashed dir complete and frozen. ack. - */ -void Migrator::unhash_dir_prep_finish(CDir *dir) -{ - dout(7) << "unhash_dir_prep_finish " << *dir << endl; - assert(dir->is_hashed()); - assert(!dir->is_auth()); - assert(dir->is_frozen()); - assert(dir->is_complete()); - - // twiddle state - if (dir->is_unhashing()) - return; // already replied. - dir->state_set(CDIR_STATE_UNHASHING); - - // send subdirs back to auth - MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino()); - int auth = dir->authority(); - - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - if (!in->is_dir()) continue; - if (!in->dir) continue; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) continue; - - // msg? - ack->add_inode(it->first, in->replicate_to(auth)); - } - - // ack - mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR); -} - - - -/* - * peer needs to send hashed dir content back to auth. - * unhash dir. - */ -void Migrator::handle_unhash_dir(MUnhashDir *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir " << *dir << endl;//" .. 
hash_seed is " << dir->inode->inode.hash_seed << endl; - assert(dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - // get message ready - bufferlist bl; - int nden = 0; - - // suck up all waiters - C_Contexts *fin = new C_Contexts; - list waiting; - dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters - fin->take(waiting); - - // divy up contents - for (CDir_map_t::iterator it = dir->begin(); - it != dir->end(); - it++) { - CDentry *dn = it->second; - CInode *in = dn->inode; - - int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); - if (dentryhashcode != mds->get_nodeid()) { - // not mine! - // twiddle dir_auth? - if (in->dir) { - if (in->dir->authority() != dir->authority()) - in->dir->set_dir_auth( in->dir->authority() ); - else - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - } - continue; - } - - // -- dentry - dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; - _encode(it->first, bl); - - // null dentry? - if (dn->is_null()) { - bl.append("N", 1); // null dentry - assert(dn->is_sync()); - continue; - } - - if (dn->is_remote()) { - // remote link - bl.append("L", 1); // remote link - - inodeno_t ino = dn->get_remote_ino(); - bl.append((char*)&ino, sizeof(ino)); - continue; - } - - // primary link - // -- inode - bl.append("I", 1); // inode dentry - - encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export - nden++; - - if (dn->is_dirty()) - dn->mark_clean(); - - // proxy - in->state_set(CInode::STATE_PROXY); - in->get(CInode::PIN_PROXY); - hash_proxy_inos[dir].push_back(in); - - if (in->dir) { - if (in->dir->is_auth()) { - // mine. make it into an import. - dout(7) << "making subdir into import " << *in->dir << endl; - in->dir->set_dir_auth( mds->get_nodeid() ); - cache->imports.insert(in->dir); - in->dir->get(CDir::PIN_IMPORT); - in->dir->state_set(CDIR_STATE_IMPORT); - } - else { - // not mine. 
- dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl; - assert(in->dir->is_export()); - in->dir->put(CDir::PIN_EXPORT); - in->dir->state_clear(CDIR_STATE_EXPORT); - cache->exports.erase(in->dir); - cache->nested_exports[dir].erase(in->dir); - } - } - - // waiters - list waiters; - in->take_waiting(CINODE_WAIT_ANY, waiters); - fin->take(waiters); - } - - // we should have no nested exports; we're not auth for the dir! - assert(cache->nested_exports[dir].empty()); - cache->nested_exports.erase(dir); - - // dir state - //dir->state_clear(CDIR_STATE_UNHASHING); // later - dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDir::PIN_HASHED); - cache->hashdirs.erase(dir); - dir->mark_clean(); - - // inode state - dir->inode->inode.hash_seed = 0; - if (dir->inode->is_auth()) { - dir->inode->_mark_dirty(); // fixme - mds->mdlog->submit_entry(new EString("unhash inode dirty fixme")); - } - - // init gather set - mds->get_mds_map()->get_active_mds_set( hash_gather[dir] ); - hash_gather[dir].erase(mds->get_nodeid()); - - // send unhash message - mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden), - dir->authority(), MDS_PORT_MIGRATOR); -} - - -/* - * first notify comes from auth. - * send notifies to all other peers, with peer = self - * if we get notify from peer=other, remove from our gather list. - * when we've gotten notifies from everyone, - * unpin proxies, - * send notify_ack to auth. - * this ensures that all mds links are flushed of cache_expire type messages. - */ -void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(7) << "handle_unhash_dir_finish " << *dir << endl; - assert(!dir->is_hashed()); - assert(dir->is_unhashing()); - assert(!dir->is_auth()); - - int from = m->get_source().num(); - assert(hash_gather[dir].count(from) == 1); - hash_gather[dir].erase(from); - delete m; - - // did we send our shout out? 
- if (from == dir->authority()) { - // send notify to everyone else in weird chatter storm - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i == from) continue; - if (i == mds->get_nodeid()) continue; - mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR); - } - } - - // are we done? - if (!hash_gather[dir].empty()) { - dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; - return; - } - hash_gather.erase(dir); - - // all done! - dout(7) << "all mds links flushed, unpinning unhash proxies" << endl; - - // unpin proxies - for (list::iterator it = hash_proxy_inos[dir].begin(); - it != hash_proxy_inos[dir].end(); - it++) { - CInode *in = *it; - assert(in->state_test(CInode::STATE_PROXY)); - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - } - - // unfreeze - dir->unfreeze_dir(); - - // ack - dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl; - mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); - -} diff --git a/trunk/ceph/mds/IdAllocator.cc b/trunk/ceph/mds/IdAllocator.cc index da9019f423f9e..5fa09339b0acd 100644 --- a/trunk/ceph/mds/IdAllocator.cc +++ b/trunk/ceph/mds/IdAllocator.cc @@ -135,8 +135,14 @@ void IdAllocator::reset() // use generic range. FIXME THIS IS CRAP free.clear(); +#ifdef __LP64__ uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 40; uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 40) - 1; +#else +# warning this looks like a 32-bit system, using small inode numbers. 
+ uint64_t start = (uint64_t)(mds->get_nodeid()+1) << 25; + uint64_t end = ((uint64_t)(mds->get_nodeid()+2) << 25) - 1; +#endif free.insert(start, end); state = STATE_ACTIVE; diff --git a/trunk/ceph/mds/Locker.cc b/trunk/ceph/mds/Locker.cc index 190bd5618faf8..ea87fe8569b0b 100644 --- a/trunk/ceph/mds/Locker.cc +++ b/trunk/ceph/mds/Locker.cc @@ -337,6 +337,7 @@ bool Locker::rdlock_start(SimpleLock *lock, MDRequest *mdr) switch (lock->get_type()) { case LOCK_OTYPE_IFILE: return file_rdlock_start((FileLock*)lock, mdr); + case LOCK_OTYPE_IDIRFRAGTREE: case LOCK_OTYPE_IDIR: return scatter_rdlock_start((ScatterLock*)lock, mdr); default: @@ -349,6 +350,7 @@ void Locker::rdlock_finish(SimpleLock *lock, MDRequest *mdr) switch (lock->get_type()) { case LOCK_OTYPE_IFILE: return file_rdlock_finish((FileLock*)lock, mdr); + case LOCK_OTYPE_IDIRFRAGTREE: case LOCK_OTYPE_IDIR: return scatter_rdlock_finish((ScatterLock*)lock, mdr); default: @@ -359,18 +361,21 @@ void Locker::rdlock_finish(SimpleLock *lock, MDRequest *mdr) bool Locker::wrlock_start(SimpleLock *lock, MDRequest *mdr) { switch (lock->get_type()) { + case LOCK_OTYPE_IDIRFRAGTREE: case LOCK_OTYPE_IDIR: return scatter_wrlock_start((ScatterLock*)lock, mdr); case LOCK_OTYPE_IVERSION: return local_wrlock_start((LocalLock*)lock, mdr); default: - assert(0); + assert(0); + return false; } } void Locker::wrlock_finish(SimpleLock *lock, MDRequest *mdr) { switch (lock->get_type()) { + case LOCK_OTYPE_IDIRFRAGTREE: case LOCK_OTYPE_IDIR: return scatter_wrlock_finish((ScatterLock*)lock, mdr); case LOCK_OTYPE_IVERSION: @@ -387,6 +392,7 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mdr) return file_xlock_start((FileLock*)lock, mdr); case LOCK_OTYPE_IVERSION: return local_xlock_start((LocalLock*)lock, mdr); + case LOCK_OTYPE_IDIRFRAGTREE: case LOCK_OTYPE_IDIR: assert(0); default: @@ -401,6 +407,7 @@ void Locker::xlock_finish(SimpleLock *lock, MDRequest *mdr) return file_xlock_finish((FileLock*)lock, mdr); case 
LOCK_OTYPE_IVERSION: return local_xlock_finish((LocalLock*)lock, mdr); + case LOCK_OTYPE_IDIRFRAGTREE: case LOCK_OTYPE_IDIR: assert(0); default: @@ -462,7 +469,7 @@ Capability* Locker::issue_new_caps(CInode *in, cap->set_suppress(true); } else { // make sure it has sufficient caps - if (cap->wanted() & ~my_want) { + if (my_want & ~cap->wanted()) { // augment wanted caps for this client cap->set_wanted( cap->wanted() | my_want ); } @@ -672,8 +679,21 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) << endl; // update wanted - if (cap->wanted() != wanted) - cap->set_wanted(wanted); + if (cap->wanted() != wanted) { + if (m->get_seq() < cap->get_last_seq()) { + /* this is awkward. + client may be trying to release caps (i.e. inode closed, etc.) by setting reducing wanted + set. + but it may also be opening the same filename, not sure that it'll map to the same inode. + so, we don't want wanted reductions to clobber mds's notion of wanted unless we're + sure the client has seen all the latest caps. + */ + dout(-10) << "handle_client_file_caps ignoring wanted " << cap_string(m->get_wanted()) + << " bc seq " << m->get_seq() << " < " << cap->get_last_seq() << endl; + } else { + cap->set_wanted(wanted); + } + } // confirm caps int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); @@ -811,7 +831,6 @@ void Locker::handle_lock(MLock *m) case LOCK_OTYPE_DN: case LOCK_OTYPE_IAUTH: case LOCK_OTYPE_ILINK: - case LOCK_OTYPE_IDIRFRAGTREE: handle_simple_lock(lock, m); break; @@ -819,6 +838,7 @@ void Locker::handle_lock(MLock *m) handle_file_lock((FileLock*)lock, m); break; + case LOCK_OTYPE_IDIRFRAGTREE: case LOCK_OTYPE_IDIR: handle_scatter_lock((ScatterLock*)lock, m); break; @@ -1319,7 +1339,8 @@ bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) !lock->is_rdlocked() && !lock->is_xlocked() && lock->get_state() == LOCK_SYNC) - scatter_lock(lock); + lock->set_state(LOCK_SCATTER); + //scatter_scatter(lock); // can wrlock? 
if (lock->can_wrlock()) { @@ -1330,7 +1351,8 @@ bool Locker::scatter_wrlock_start(ScatterLock *lock, MDRequest *mdr) } // wait for write. - lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mdr)); + lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, + new C_MDS_RetryRequest(mdcache, mdr)); // initiate scatter or lock? if (lock->is_stable()) { @@ -1526,7 +1548,7 @@ void Locker::scatter_writebehind(ScatterLock *lock) inode_t *pi = in->project_inode(); pi->version = in->pre_dirty(); - EUpdate *le = new EUpdate("dir.mtime writebehind"); + EUpdate *le = new EUpdate(mds->mdlog, "scatter writebehind"); le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); diff --git a/trunk/ceph/mds/LogEvent.h b/trunk/ceph/mds/LogEvent.h index dca883e9f386a..fb2ccf2664fb2 100644 --- a/trunk/ceph/mds/LogEvent.h +++ b/trunk/ceph/mds/LogEvent.h @@ -49,10 +49,12 @@ class LogEvent { private: int _type; off_t _start_off,_end_off; + friend class MDLog; public: - LogEvent(int t) : _type(t), _start_off(0), _end_off(0) { } + LogEvent(int t) : + _type(t), _start_off(0), _end_off(0) { } virtual ~LogEvent() { } int get_type() { return _type; } @@ -69,7 +71,6 @@ class LogEvent { out << "event(" << _type << ")"; } - /*** live journal ***/ /* obsolete() - is this entry committed to primary store, such that diff --git a/trunk/ceph/mds/MDBalancer.cc b/trunk/ceph/mds/MDBalancer.cc index 58de2647753e7..a227886f23ba1 100644 --- a/trunk/ceph/mds/MDBalancer.cc +++ b/trunk/ceph/mds/MDBalancer.cc @@ -68,6 +68,12 @@ void MDBalancer::tick() utime_t elapsed = now; elapsed -= first; + // sample? + if ((double)now - (double)last_sample > g_conf.mds_bal_sample_interval) { + dout(15) << "tick last_sample now " << now << endl; + last_sample = now; + } + // balance? if (true && mds->get_nodeid() == 0 && @@ -84,10 +90,9 @@ void MDBalancer::tick() // hash? 
if (true && - g_conf.num_mds > 1 && - now.sec() - last_hash.sec() > g_conf.mds_bal_hash_interval) { - last_hash = now; - do_hashing(); + now.sec() - last_fragment.sec() > g_conf.mds_bal_fragment_interval) { + last_fragment = now; + do_fragmenting(); } } @@ -108,11 +113,17 @@ public: mds_load_t MDBalancer::get_load() { mds_load_t load; - if (mds->mdcache->get_root()) - load.root = - mds->mdcache->get_root()->popularity[MDS_POP_ANYDOM]; - // + - // mds->mdcache->get_root()->popularity[MDS_POP_NESTED]; + + if (mds->mdcache->get_root()) { + list ls; + mds->mdcache->get_root()->get_dirfrags(ls); + for (list::iterator p = ls.begin(); + p != ls.end(); + p++) { + load.auth += (*p)->pop_auth_subtree_nested; + load.all += (*p)->pop_nested; + } + } load.req_rate = mds->get_req_rate(); load.queue_len = mds->messenger->get_dispatch_queue_len(); @@ -121,6 +132,7 @@ mds_load_t MDBalancer::get_load() void MDBalancer::send_heartbeat() { + utime_t now = g_clock.now(); if (!mds->mdcache->get_root()) { dout(5) << "no root on send_heartbeat" << endl; mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds)); @@ -146,7 +158,7 @@ void MDBalancer::send_heartbeat() int from = im->inode->authority().first; if (from == mds->get_nodeid()) continue; if (im->get_inode()->is_stray()) continue; - import_map[from] += im->popularity[MDS_POP_CURDOM].meta_load(); + import_map[from] += im->pop_auth_subtree.meta_load(now); } mds_import_map[ mds->get_nodeid() ] = import_map; @@ -176,7 +188,7 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m) dout(25) << "=== got heartbeat " << m->get_beat() << " from " << m->get_source().num() << " " << m->get_load() << endl; if (!mds->mdcache->get_root()) { - dout(10) << "no root on handle" << endl; + dout(10) << "opening root on handle_heartbeat" << endl; mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m)); return; } @@ -211,17 +223,20 @@ void MDBalancer::handle_heartbeat(MHeartbeat *m) void MDBalancer::export_empties() { dout(5) << "export_empties checking for empty 
imports" << endl; - dout(0) << "IMPLEMENT ME" << endl; - /* - for (set::iterator it = mds->mdcache->subtrees.begin(); + + for (map >::iterator it = mds->mdcache->subtrees.begin(); it != mds->mdcache->subtrees.end(); it++) { - CDir *dir = *it; + CDir *dir = it->first; + if (!dir->is_auth() || + dir->is_ambiguous_auth() || + dir->is_freezing() || + dir->is_frozen()) + continue; if (!dir->inode->is_root() && dir->get_size() == 0) mds->mdcache->migrator->export_empty_import(dir); } - */ } @@ -250,31 +265,26 @@ double MDBalancer::try_match(int ex, double& maxex, -void MDBalancer::do_hashing() +void MDBalancer::do_fragmenting() { - if (hash_queue.empty()) { - dout(20) << "do_hashing has nothing to do" << endl; + if (split_queue.empty()) { + dout(20) << "do_fragmenting has nothing to do" << endl; return; } - dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl; + dout(0) << "do_fragmenting " << split_queue.size() << " dirs marked for possible splitting" << endl; - for (set::iterator i = hash_queue.begin(); - i != hash_queue.end(); + for (set::iterator i = split_queue.begin(); + i != split_queue.end(); i++) { - inodeno_t dirino = *i; - CInode *in = mds->mdcache->get_inode(dirino); - if (!in) continue; - /* - CDir *dir = in->dir; + CDir *dir = mds->mdcache->get_dirfrag(*i); if (!dir) continue; if (!dir->is_auth()) continue; - dout(0) << "do_hashing hashing " << *dir << endl; - mds->mdcache->migrator->hash_dir(dir); - */ + dout(0) << "do_fragmenting splitting " << *dir << endl; + mds->mdcache->split_dir(dir, 3); } - hash_queue.clear(); + split_queue.clear(); } @@ -283,6 +293,9 @@ void MDBalancer::do_rebalance(int beat) { int cluster_size = mds->get_mds_map()->get_num_mds(); int whoami = mds->get_nodeid(); + utime_t now = g_clock.now(); + + dump_pop_map(); // reset my_targets.clear(); @@ -293,25 +306,25 @@ void MDBalancer::do_rebalance(int beat) // rescale! 
turn my mds_load back into meta_load units double load_fac = 1.0; - if (mds_load[whoami].mds_load() > 0) { - load_fac = mds_load[whoami].root.meta_load() / mds_load[whoami].mds_load(); + if (mds_load[whoami].mds_load(now) > 0) { + load_fac = mds_load[whoami].auth.meta_load(now) / mds_load[whoami].mds_load(now); dout(7) << " load_fac is " << load_fac - << " <- " << mds_load[whoami].root.meta_load() - << " / " << mds_load[whoami].mds_load() - << endl; + << " <- " << mds_load[whoami].auth.meta_load(now) + << " / " << mds_load[whoami].mds_load(now) + << endl; } double total_load = 0; multimap load_map; for (int i=0; i " << l << endl; + << " " << mds_load[i] + << " = " << mds_load[i].mds_load(now) + << " ~ " << l << endl; if (whoami == i) my_load = l; total_load += l; @@ -327,13 +340,13 @@ void MDBalancer::do_rebalance(int beat) << endl; // under or over? - if (my_load < target_load) { - dout(5) << " i am underloaded, doing nothing." << endl; + if (my_load < target_load * (1.0 + g_conf.mds_bal_min_rebalance)) { + dout(5) << " i am underloaded or barely overloaded, doing nothing." 
<< endl; show_imports(); return; } - dout(5) << " i am overloaded" << endl; + dout(5) << " i am sufficiently overloaded" << endl; // first separate exporters and importers @@ -362,7 +375,7 @@ void MDBalancer::do_rebalance(int beat) if (true) { // analyze import_map; do any matches i can - dout(5) << " matching exporters to import sources" << endl; + dout(15) << " matching exporters to import sources" << endl; // big -> small exporters for (multimap::reverse_iterator ex = exporters.rbegin(); @@ -388,7 +401,7 @@ void MDBalancer::do_rebalance(int beat) if (1) { if (beat % 2 == 1) { // old way - dout(5) << " matching big exporters to big importers" << endl; + dout(15) << " matching big exporters to big importers" << endl; // big exporters to big importers multimap::reverse_iterator ex = exporters.rbegin(); multimap::iterator im = importers.begin(); @@ -404,7 +417,7 @@ void MDBalancer::do_rebalance(int beat) } } else { // new way - dout(5) << " matching small exporters to big importers" << endl; + dout(15) << " matching small exporters to big importers" << endl; // small exporters to big importers multimap::iterator ex = exporters.begin(); multimap::iterator im = importers.begin(); @@ -435,7 +448,7 @@ void MDBalancer::do_rebalance(int beat) CDir *im = *it; if (im->get_inode()->is_stray()) continue; - double pop = im->popularity[MDS_POP_CURDOM].meta_load(); + double pop = im->pop_auth_subtree.meta_load(now); if (pop < g_conf.mds_bal_idle_threshold && im->inode != mds->mdcache->get_root() && im->inode->authority().first != mds->get_nodeid()) { @@ -445,6 +458,7 @@ void MDBalancer::do_rebalance(int beat) mds->mdcache->migrator->export_dir(im, im->inode->authority().first); continue; } + import_pop_map[ pop ] = im; int from = im->inode->authority().first; dout(15) << " map: i imported " << *im << " from " << from << endl; @@ -478,7 +492,7 @@ void MDBalancer::do_rebalance(int beat) if (amount < MIN_OFFLOAD) continue; - dout(-5) << " sending " << amount << " to mds" << target 
+ dout(5) << "want to send " << amount << " to mds" << target //<< " .. " << (*it).second << " * " << load_fac << " -> " << amount << endl;//" .. fudge is " << fudge << endl; @@ -498,7 +512,7 @@ void MDBalancer::do_rebalance(int beat) if (dir->inode->is_root()) continue; if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress - double pop = dir->popularity[MDS_POP_CURDOM].meta_load(); + double pop = dir->pop_auth_subtree.meta_load(now); assert(dir->inode->authority().first == target); // cuz that's how i put it in the map, dummy if (pop <= amount-have) { @@ -555,23 +569,20 @@ void MDBalancer::do_rebalance(int beat) pot != candidates.end(); pot++) { if ((*pot)->get_inode()->is_stray()) continue; - find_exports(*pot, amount, exports, have, already_exporting); - if (have > amount-MIN_OFFLOAD) { + find_exports(*pot, amount, exports, have, already_exporting, now); + if (have > amount-MIN_OFFLOAD) break; - } } //fudge = amount - have; total_sent += have; for (list::iterator it = exports.begin(); it != exports.end(); it++) { - dout(-5) << " exporting to mds" << target - << " fragment " << **it - << " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load() + dout(-5) << " - exporting " + << (*it)->pop_auth_subtree.meta_load(now) + << " to mds" << target + << " " << **it << endl; mds->mdcache->migrator->export_dir(*it, target); - - // hack! only do one dir. 
- break; } } @@ -586,7 +597,8 @@ void MDBalancer::find_exports(CDir *dir, double amount, list& exports, double& have, - set& already_exporting) + set& already_exporting, + utime_t now) { double need = amount - have; if (need < amount * g_conf.mds_bal_min_start) @@ -596,13 +608,13 @@ void MDBalancer::find_exports(CDir *dir, double midchunk = need * g_conf.mds_bal_midchunk; double minchunk = need * g_conf.mds_bal_minchunk; - list bigger; + list bigger_rep, bigger_unrep; multimap smaller; - double dir_pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - double dir_sum = 0; - dout(-7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl; + double dir_pop = dir->pop_auth_subtree.meta_load(now); + dout(7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl; + double subdir_sum = 0; for (CDir_map_t::iterator it = dir->begin(); it != dir->end(); it++) { @@ -615,34 +627,37 @@ void MDBalancer::find_exports(CDir *dir, for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - if (!dir->is_auth()) continue; - if (already_exporting.count(dir)) continue; + CDir *subdir = *p; + if (!subdir->is_auth()) continue; + if (already_exporting.count(subdir)) continue; - if (dir->is_frozen()) continue; // can't export this right now! - //if (in->dir->get_size() == 0) continue; // don't export empty dirs, even if they're not complete. for now! + if (subdir->is_frozen()) continue; // can't export this right now! // how popular? - double pop = dir->popularity[MDS_POP_CURDOM].meta_load(); - dir_sum += pop; - dout(20) << " pop " << pop << " " << *dir << endl; + double pop = subdir->pop_auth_subtree.meta_load(now); + subdir_sum += pop; + dout(15) << " subdir pop " << pop << " " << *subdir << endl; if (pop < minchunk) continue; // lucky find? 
if (pop > needmin && pop < needmax) { - exports.push_back(dir); + exports.push_back(subdir); + already_exporting.insert(subdir); have += pop; return; } - if (pop > need) - bigger.push_back(dir); - else - smaller.insert(pair(pop, dir)); + if (pop > need) { + if (subdir->is_rep()) + bigger_rep.push_back(subdir); + else + bigger_unrep.push_back(subdir); + } else + smaller.insert(pair(pop, subdir)); } } - dout(7) << " .. sum " << dir_sum << " / " << dir_pop << endl; + dout(15) << " sum " << subdir_sum << " / " << dir_pop << endl; // grab some sufficiently big small items multimap::reverse_iterator it; @@ -653,7 +668,7 @@ void MDBalancer::find_exports(CDir *dir, if ((*it).first < midchunk) break; // try later - dout(7) << " taking smaller " << *(*it).second << endl; + dout(7) << " taking smaller " << *(*it).second << endl; exports.push_back((*it).second); already_exporting.insert((*it).second); @@ -663,12 +678,11 @@ void MDBalancer::find_exports(CDir *dir, } // apprently not enough; drill deeper into the hierarchy (if non-replicated) - for (list::iterator it = bigger.begin(); - it != bigger.end(); + for (list::iterator it = bigger_unrep.begin(); + it != bigger_unrep.end(); it++) { - if ((*it)->is_rep()) continue; - dout(7) << " descending into " << **it << endl; - find_exports(*it, amount, exports, have, already_exporting); + dout(15) << " descending into " << **it << endl; + find_exports(*it, amount, exports, have, already_exporting, now); if (have > needmin) return; } @@ -677,8 +691,7 @@ void MDBalancer::find_exports(CDir *dir, for (; it != smaller.rend(); it++) { - - dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl; + dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl; exports.push_back((*it).second); already_exporting.insert((*it).second); @@ -687,13 +700,12 @@ void MDBalancer::find_exports(CDir *dir, return; } - // ok fine, drill inot replicated dirs - for (list::iterator it = bigger.begin(); - it != 
bigger.end(); + // ok fine, drill into replicated dirs + for (list::iterator it = bigger_rep.begin(); + it != bigger_rep.end(); it++) { - if (!(*it)->is_rep()) continue; - dout(7) << " descending into replicated " << **it << endl; - find_exports(*it, amount, exports, have, already_exporting); + dout(7) << " descending into replicated " << **it << endl; + find_exports(*it, amount, exports, have, already_exporting, now); if (have > needmin) return; } @@ -703,172 +715,173 @@ void MDBalancer::find_exports(CDir *dir, -void MDBalancer::hit_inode(CInode *in, int type) +void MDBalancer::hit_inode(utime_t now, CInode *in, int type) { + // hit inode + in->pop.get(type).hit(now); + + if (in->get_parent_dir()) + hit_dir(now, in->get_parent_dir(), type); +} +/* // hit me - float me = in->popularity[MDS_POP_JUSTME].pop[type].hit(); - float nested = in->popularity[MDS_POP_NESTED].pop[type].hit(); - float curdom = 0; - float anydom = 0; + in->popularity[MDS_POP_JUSTME].pop[type].hit(now); + in->popularity[MDS_POP_NESTED].pop[type].hit(now); if (in->is_auth()) { - curdom = in->popularity[MDS_POP_CURDOM].pop[type].hit(); - anydom = in->popularity[MDS_POP_ANYDOM].pop[type].hit(); + in->popularity[MDS_POP_CURDOM].pop[type].hit(now); + in->popularity[MDS_POP_ANYDOM].pop[type].hit(now); + + dout(20) << "hit_inode " << type << " pop " + << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, " + << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, " + << in->popularity[MDS_POP_CURDOM].pop[type].get(now) << " curdom, " + << in->popularity[MDS_POP_CURDOM].pop[type].get(now) << " anydom" + << " on " << *in + << endl; + } else { + dout(20) << "hit_inode " << type << " pop " + << in->popularity[MDS_POP_JUSTME].pop[type].get(now) << " me, " + << in->popularity[MDS_POP_NESTED].pop[type].get(now) << " nested, " + << " on " << *in + << endl; } - dout(20) << "hit_inode " << type << " pop " << me << " me, " - << nested << " nested, " - << curdom << " curdom, " - << anydom << " 
anydom" - << " on " << *in - << endl; - // hit auth up to import CDir *dir = in->get_parent_dir(); - if (dir) hit_dir(dir, type); -} + if (dir) hit_dir(now, dir, type); +*/ -void MDBalancer::hit_dir(CDir *dir, int type) +void MDBalancer::hit_dir(utime_t now, CDir *dir, int type, double amount) { // hit me - float v = dir->popularity[MDS_POP_JUSTME].pop[type].hit(); + dir->pop_me.get(type).hit(now, amount); + + /* + dir->popularity[MDS_POP_JUSTME].pop[type].hit(now, amount); // hit modify counter, if this was a modify if (g_conf.num_mds > 2 && // FIXME >2 thing !dir->inode->is_root() && // not root (for now at least) dir->is_auth()) { + float v = dir->popularity[MDS_POP_JUSTME].pop[type].get(); + dout(20) << "hit_dir " << type << " pop " << v << " me " << *dir << endl; - // hash this dir? (later?) - if (((v > g_conf.mds_bal_hash_rd && type == META_POP_IRD) || - //(v > g_conf.mds_bal_hash_wr && type == META_POP_IWR) || - (v > g_conf.mds_bal_hash_wr && type == META_POP_DWR)) && - hash_queue.count(dir->ino()) == 0) { - dout(0) << "hit_dir " << type << " pop is " << v << ", putting in hash_queue: " << *dir << endl; - hash_queue.insert(dir->ino()); + // fragment this dir? (later?) + if (((g_conf.mds_bal_split_size > 0 && + dir->get_size() > (unsigned)g_conf.mds_bal_split_size) || + (v > g_conf.mds_bal_split_rd && type == META_POP_IRD) || + //(v > g_conf.mds_bal_split_wr && type == META_POP_IWR) || + (v > g_conf.mds_bal_split_wr && type == META_POP_DWR)) && + split_queue.count(dir->dirfrag()) == 0) { + dout(0) << "hit_dir " << type << " pop is " << v << ", putting in split_queue: " << *dir << endl; + split_queue.insert(dir->dirfrag()); } } + */ - hit_recursive(dir, type); -} - - - -void MDBalancer::hit_recursive(CDir *dir, int type) -{ - bool anydom = dir->is_auth(); - bool curdom = dir->is_auth(); - - float rd_adj = 0.0; - // replicate? - float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get(); // hmm?? 
+ double rd_adj = 0; + if (type == META_POP_IRD && + dir->last_popularity_sample < last_sample) { + float dir_pop = dir->pop_auth_subtree.get(type).get(now); // hmm?? + dir->last_popularity_sample = last_sample; - dout(20) << "hit_recursive " << type << " pop " << dir_pop << " curdom " << *dir << endl; - - if (dir->is_auth()) { - if (!dir->is_rep() && - dir_pop >= g_conf.mds_bal_replicate_threshold) { - // replicate - float rdp = dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].get(); - rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; - rd_adj /= 2.0; // temper somewhat - - dout(2) << "replicating dir " << *dir << " pop " << dir_pop << " .. rdp " << rdp << " adj " << rd_adj << endl; - - dir->dir_rep = CDir::REP_ALL; - mds->mdcache->send_dir_updates(dir, true); - - dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].adjust(rd_adj); - dir->popularity[MDS_POP_CURDOM].pop[META_POP_IRD].adjust(rd_adj); - } - - if (!dir->ino() != 1 && - dir->is_rep() && - dir_pop < g_conf.mds_bal_unreplicate_threshold) { - // unreplicate - dout(2) << "unreplicating dir " << *dir << " pop " << dir_pop << endl; + dout(20) << "hit_dir " << type << " pop " << dir_pop << " in " << *dir << endl; + + if (dir->is_auth()) { + if (!dir->is_rep() && + dir_pop >= g_conf.mds_bal_replicate_threshold) { + // replicate + float rdp = dir->pop_me.get(META_POP_IRD).get(now); + rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; + rd_adj /= 2.0; // temper somewhat + + dout(2) << "replicating dir " << *dir << " pop " << dir_pop << " .. 
rdp " << rdp << " adj " << rd_adj << endl; + + dir->dir_rep = CDir::REP_ALL; + mds->mdcache->send_dir_updates(dir, true); + + dir->pop_me.get(META_POP_IRD).adjust(rd_adj); + dir->pop_auth_subtree.get(META_POP_IRD).adjust(rd_adj); + } - dir->dir_rep = CDir::REP_NONE; - mds->mdcache->send_dir_updates(dir); + if (!dir->ino() != 1 && + dir->is_rep() && + dir_pop < g_conf.mds_bal_unreplicate_threshold) { + // unreplicate + dout(2) << "unreplicating dir " << *dir << " pop " << dir_pop << endl; + + dir->dir_rep = CDir::REP_NONE; + mds->mdcache->send_dir_updates(dir); + } } } + // adjust ancestors + bool hit_subtree = dir->is_auth(); // current auth subtree (if any) + bool hit_subtree_nested = dir->is_auth(); // all nested auth subtrees while (dir) { - CInode *in = dir->inode; - - dir->popularity[MDS_POP_NESTED].pop[type].hit(); - in->popularity[MDS_POP_NESTED].pop[type].hit(); + dir->pop_nested.get(type).hit(now, amount); + if (rd_adj != 0.0) + dir->pop_nested.get(META_POP_IRD).adjust(now, rd_adj); - if (rd_adj != 0.0) dir->popularity[MDS_POP_NESTED].pop[META_POP_IRD].adjust(rd_adj); - - if (anydom) { - dir->popularity[MDS_POP_ANYDOM].pop[type].hit(); - in->popularity[MDS_POP_ANYDOM].pop[type].hit(); - } - - if (curdom) { - dir->popularity[MDS_POP_CURDOM].pop[type].hit(); - in->popularity[MDS_POP_CURDOM].pop[type].hit(); + if (hit_subtree) { + dir->pop_auth_subtree.get(type).hit(now, amount); + if (rd_adj != 0.0) + dir->pop_auth_subtree.get(META_POP_IRD).adjust(now, rd_adj); } + + if (hit_subtree_nested) { + dir->pop_auth_subtree_nested.get(type).hit(now, amount); + if (rd_adj != 0.0) + dir->pop_auth_subtree_nested.get(META_POP_IRD).adjust(now, rd_adj); + } if (dir->is_subtree_root()) - curdom = false; // end of auth domain, stop hitting auth counters. + hit_subtree = false; // end of auth domain, stop hitting auth counters. dir = dir->inode->get_parent_dir(); } } /* - * subtract off an exported chunk + * subtract off an exported chunk. 
+ * this excludes *dir itself (encode_export_dir should have take care of that) + * we _just_ do the parents' nested counters. + * + * NOTE: call me _after_ forcing *dir into a subtree root, + * but _before_ doing the encode_export_dirs. */ void MDBalancer::subtract_export(CDir *dir) { - meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - - bool in_domain = !dir->is_subtree_root(); - + dirfrag_load_vec_t subload = dir->pop_auth_subtree; + while (true) { - CInode *in = dir->inode; - - in->popularity[MDS_POP_ANYDOM] -= curdom; - if (in_domain) in->popularity[MDS_POP_CURDOM] -= curdom; - - dir = in->get_parent_dir(); + dir = dir->inode->get_parent_dir(); if (!dir) break; - if (dir->is_subtree_root()) in_domain = false; - - dir->popularity[MDS_POP_ANYDOM] -= curdom; - if (in_domain) dir->popularity[MDS_POP_CURDOM] -= curdom; + dir->pop_nested -= subload; + dir->pop_auth_subtree_nested -= subload; } } void MDBalancer::add_import(CDir *dir) { - meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; - - bool in_domain = !dir->is_subtree_root(); + dirfrag_load_vec_t subload = dir->pop_auth_subtree; while (true) { - CInode *in = dir->inode; - - in->popularity[MDS_POP_ANYDOM] += curdom; - if (in_domain) in->popularity[MDS_POP_CURDOM] += curdom; - - dir = in->get_parent_dir(); + dir = dir->inode->get_parent_dir(); if (!dir) break; - if (dir->is_subtree_root()) in_domain = false; - - dir->popularity[MDS_POP_ANYDOM] += curdom; - if (in_domain) dir->popularity[MDS_POP_CURDOM] += curdom; - } - + dir->pop_nested += subload; + dir->pop_auth_subtree_nested += subload; + } } @@ -882,6 +895,69 @@ void MDBalancer::show_imports(bool external) } +void MDBalancer::dump_pop_map() +{ + char fn[20]; + sprintf(fn, "popdump.%d.mds%d", beat_epoch, mds->get_nodeid()); + + dout(1) << "dump_pop_map to " << fn << endl; + + ofstream myfile; + myfile.open(fn); + + list iq; + if (mds->mdcache->root) + iq.push_back(mds->mdcache->root); + + utime_t now = g_clock.now(); + while (!iq.empty()) { + 
CInode *in = iq.front(); + iq.pop_front(); + + // pop stats + /*for (int a=0; apopularity[a].pop[b].get(now) << "\t"; + */ + + // recurse, depth-first. + if (in->is_dir()) { + + list dirs; + in->get_dirfrags(dirs); + for (list::iterator p = dirs.begin(); + p != dirs.end(); + ++p) { + CDir *dir = *p; + + myfile << (int)dir->pop_me.meta_load(now) << "\t"; + myfile << (int)dir->pop_nested.meta_load(now) << "\t"; + myfile << (int)dir->pop_auth_subtree.meta_load(now) << "\t"; + myfile << (int)dir->pop_auth_subtree_nested.meta_load(now) << "\t"; + + // filename last + string p; + in->make_path(p); + myfile << "." << p; + if (dir->get_frag() != frag_t()) + myfile << "___" << (unsigned)dir->get_frag(); + myfile << endl; //"/" << dir->get_frag() << endl; + + // add contents + for (map::iterator q = dir->items.begin(); + q != dir->items.end(); + q++) + if (q->second->is_primary()) + iq.push_front(q->second->get_inode()); + } + } + + } + + myfile.close(); +} + + /* replicate? diff --git a/trunk/ceph/mds/MDBalancer.h b/trunk/ceph/mds/MDBalancer.h index e6a9488e04f7b..bdc61c2abed37 100644 --- a/trunk/ceph/mds/MDBalancer.h +++ b/trunk/ceph/mds/MDBalancer.h @@ -43,10 +43,12 @@ class MDBalancer { int beat_epoch; utime_t last_heartbeat; - utime_t last_hash; + utime_t last_fragment; + utime_t last_sample; + // todo - set hash_queue; + set split_queue; // per-epoch scatter/gathered info hash_map mds_load; @@ -82,7 +84,7 @@ class MDBalancer { void tick(); - void do_hashing(); + void do_fragmenting(); void export_empties(); void do_rebalance(int beat); @@ -90,18 +92,20 @@ class MDBalancer { double amount, list& exports, double& have, - set& already_exporting); + set& already_exporting, + utime_t now); void subtract_export(class CDir *ex); void add_import(class CDir *im); - void hit_inode(class CInode *in, int type=0); - void hit_dir(class CDir *dir, int type=0); - void hit_recursive(class CDir *dir, int type=0); + void hit_inode(utime_t now, class CInode *in, int type); + void 
hit_dir(utime_t now, class CDir *dir, int type, double amount=1.0); + void hit_recursive(utime_t now, class CDir *dir, int type, double amount, double rd_adj); void show_imports(bool external=false); + void dump_pop_map(); }; diff --git a/trunk/ceph/mds/MDCache.cc b/trunk/ceph/mds/MDCache.cc index 8451c0836bc30..0ee95f853b372 100644 --- a/trunk/ceph/mds/MDCache.cc +++ b/trunk/ceph/mds/MDCache.cc @@ -43,6 +43,7 @@ #include "events/EString.h" #include "events/EPurgeFinish.h" #include "events/EImportFinish.h" +#include "events/EFragment.h" #include "messages/MGenericMessage.h" @@ -67,6 +68,9 @@ #include "messages/MMDSSlaveRequest.h" +#include "messages/MMDSFragmentNotify.h" + + #include "IdAllocator.h" #include "common/Timer.h" @@ -110,8 +114,9 @@ MDCache::~MDCache() void MDCache::log_stat(Logger *logger) { if (get_root()) { - logger->set("popanyd", (int)get_root()->popularity[MDS_POP_ANYDOM].meta_load()); - logger->set("popnest", (int)get_root()->popularity[MDS_POP_NESTED].meta_load()); + utime_t now = g_clock.now(); + //logger->set("pop", (int)get_root()->pop_nested.meta_load(now)); + //logger->set("popauth", (int)get_root()->pop_auth_subtree_nested.meta_load(now)); } logger->set("c", lru.lru_get_size()); logger->set("cpin", lru.lru_get_num_pinned()); @@ -309,7 +314,7 @@ CDentry *MDCache::get_or_create_stray_dentry(CInode *in) CDentry *straydn = straydir->lookup(straydname); if (!straydn) - straydn = straydir->add_dentry(straydname, 0); + straydn = straydir->add_null_dentry(straydname); return straydn; } @@ -402,6 +407,16 @@ void MDCache::adjust_subtree_auth(CDir *dir, pair auth) // i am now the subtree root. 
root = dir; + // adjust recursive pop counters + if (dir->is_auth()) { + CDir *p = dir->get_parent_dir(); + while (p) { + p->pop_auth_subtree -= dir->pop_auth_subtree; + if (p->is_subtree_root()) break; + p = p->inode->get_parent_dir(); + } + } + eval_subtree_root(dir); } @@ -464,6 +479,16 @@ void MDCache::try_subtree_merge(CDir *dir) try_subtree_merge_at(*p); } +class C_MDC_SubtreeMergeWB : public Context { + MDCache *mdcache; + CInode *in; +public: + C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i) : mdcache(mdc), in(i) {} + void finish(int r) { + mdcache->subtree_merge_writebehind_finish(in); + } +}; + void MDCache::try_subtree_merge_at(CDir *dir) { dout(10) << "try_subtree_merge_at " << *dir << endl; @@ -492,12 +517,51 @@ void MDCache::try_subtree_merge_at(CDir *dir) subtrees.erase(dir); subtrees[parent].erase(dir); + // adjust popularity? + if (dir->is_auth()) { + CDir *p = dir->get_parent_dir(); + while (p) { + p->pop_auth_subtree += dir->pop_auth_subtree; + if (p->is_subtree_root()) break; + p = p->inode->get_parent_dir(); + } + } + eval_subtree_root(dir); + + // journal inode? + // (this is a large hammer to ensure that dirfragtree updates will + // hit the disk before the relevant dirfrags ever close) + if (dir->inode->is_auth() && + dir->inode->can_auth_pin()) { + CInode *in = dir->inode; + dout(10) << "try_subtree_merge_at journaling merged bound " << *in << endl; + + in->auth_pin(); + + // journal write-behind. 
+ inode_t *pi = in->project_inode(); + pi->version = in->pre_dirty(); + + EUpdate *le = new EUpdate(mds->mdlog, "subtree merge writebehind"); + le->metablob.add_dir_context(in->get_parent_dn()->get_dir()); + le->metablob.add_primary_dentry(in->get_parent_dn(), true, 0, pi); + + mds->mdlog->submit_entry(le); + mds->mdlog->wait_for_sync(new C_MDC_SubtreeMergeWB(this, in)); + } } show_subtrees(15); } +void MDCache::subtree_merge_writebehind_finish(CInode *in) +{ + dout(10) << "subtree_merge_writebehind_finish on " << in << endl; + in->pop_and_dirty_projected_inode(); + in->auth_unpin(); +} + void MDCache::eval_subtree_root(CDir *dir) { // evaluate subtree inode dirlock? @@ -509,7 +573,8 @@ void MDCache::eval_subtree_root(CDir *dir) mds->locker->scatter_eval(&dir->inode->dirlock); else mds->locker->try_scatter_eval(&dir->inode->dirlock); // ** may or may not be auth_pinned ** - } + } + } @@ -629,6 +694,7 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, set& bounds, pair& bound_dfs, pair auth) { dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth @@ -649,6 +715,34 @@ void MDCache::adjust_bounded_subtree_auth(CDir *dir, list& bound_dfs, adjust_bounded_subtree_auth(dir, bounds, auth); } +void MDCache::map_dirfrag_set(list& dfs, set& result) +{ + // group by inode + map ino_fragset; + for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) + ino_fragset[p->ino].insert(p->frag); + + // get frags + for (map::iterator p = ino_fragset.begin(); + p != ino_fragset.end(); + ++p) { + CInode *in = get_inode(p->first); + if (!in) continue; + + list fglist; + for (set::iterator q = p->second.begin(); q != p->second.end(); ++q) + in->dirfragtree.get_leaves_under(*q, fglist); + + dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist + << " on " << *in << endl; + + for (list::iterator q = fglist.begin(); q != fglist.end(); ++q) { + CDir *dir = in->get_dirfrag(*q); + if (dir) result.insert(dir); + } + } +} + CDir 
*MDCache::get_subtree_root(CDir *dir) @@ -904,7 +998,7 @@ void MDCache::log_subtree_map(Context *onsync) << num_subtrees_fullauth() << " fullauth" << endl; - ESubtreeMap *le = new ESubtreeMap; + ESubtreeMap *le = new ESubtreeMap(); // include all auth subtrees, and their bounds. // and a spanning tree to tie it to the root. @@ -1282,11 +1376,20 @@ void MDCache::handle_resolve(MMDSResolve *m) for (map >::iterator pi = m->subtrees.begin(); pi != m->subtrees.end(); ++pi) { - CDir *im = get_dirfrag(pi->first); - if (im) { - adjust_bounded_subtree_auth(im, pi->second, from); - try_subtree_merge(im); + CInode *diri = get_inode(pi->first.ino); + if (!diri) continue; + bool forced = diri->dirfragtree.force_to_leaf(pi->first.frag); + if (forced) { + dout(10) << " forced frag " << pi->first.frag << " to leaf in " + << diri->dirfragtree + << " on " << pi->first << endl; } + + CDir *dir = diri->get_dirfrag(pi->first.frag); + if (!dir) continue; + + adjust_bounded_subtree_auth(dir, pi->second, from); + try_subtree_merge(dir); } // am i a surviving ambiguous importer? @@ -1380,7 +1483,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) uncommitted_slave_updates[from][*p].replay(mds); uncommitted_slave_updates[from].erase(*p); // log commit - mds->mdlog->submit_entry(new ESlaveUpdate("unknown", *p, from, ESlaveUpdate::OP_COMMIT)); + mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_COMMIT)); } else { MDRequest *mdr = request_get(*p); assert(mdr->slave_request == 0); // shouldn't be doing anything! 
@@ -1396,7 +1499,7 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack) if (mds->is_resolve()) { assert(uncommitted_slave_updates[from].count(*p)); uncommitted_slave_updates[from].erase(*p); - mds->mdlog->submit_entry(new ESlaveUpdate("unknown", *p, from, ESlaveUpdate::OP_ROLLBACK)); + mds->mdlog->submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from, ESlaveUpdate::OP_ROLLBACK)); } else { MDRequest *mdr = request_get(*p); if (mdr->slave_commit) { @@ -1750,7 +1853,8 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) rejoin->add_weak_primary_dentry(dir->dirfrag(), p->first, dn->get_inode()->ino()); dn->get_inode()->get_nested_dirfrags(nested); } else if (dn->is_remote()) - rejoin->add_weak_remote_dentry(dir->dirfrag(), p->first, dn->get_remote_ino()); + rejoin->add_weak_remote_dentry(dir->dirfrag(), p->first, + dn->get_remote_ino(), dn->get_remote_d_type()); else assert(0); // i shouldn't have a non-auth null dentry after replay + trim_non_auth() } @@ -1762,9 +1866,10 @@ void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) p != dir->items.end(); ++p) { CDentry *dn = p->second; - rejoin->add_strong_dentry(dir->dirfrag(), p->first, + rejoin->add_strong_dentry(dir->dirfrag(), p->first, dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), + dn->is_remote() ? dn->get_remote_d_type():0, dn->get_replica_nonce(), dn->lock.get_state()); if (dn->is_primary()) { @@ -1928,9 +2033,10 @@ void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak) int nonce = dn->add_replica(from); dout(10) << " have " << *dn << endl; if (ack) - ack->add_strong_dentry(p->first, q->first, + ack->add_strong_dentry(p->first, q->first, dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), + dn->is_remote() ? dn->get_remote_d_type():0, nonce, dn->lock.get_replica_state()); // inode? 
@@ -2147,13 +2253,13 @@ void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) CDentry *dn = dir->lookup(q->first); if (!dn) { if (q->second.is_remote()) { - dn = dir->add_dentry(q->first, q->second.remote_ino); + dn = dir->add_remote_dentry(q->first, q->second.remote_ino, q->second.remote_d_type); } else if (q->second.is_null()) { - dn = dir->add_dentry(q->first); + dn = dir->add_null_dentry(q->first); } else { CInode *in = get_inode(q->second.ino); if (!in) in = rejoin_invent_inode(q->second.ino); - dn = dir->add_dentry(q->first, in); + dn = dir->add_primary_dentry(q->first, in); dout(10) << " missing " << q->second.ino << endl; if (!missing) missing = new MMDSCacheRejoin(MMDSCacheRejoin::OP_MISSING); @@ -2642,6 +2748,7 @@ void MDCache::rejoin_send_acks() ack[r->first]->add_strong_dentry(dir->dirfrag(), dn->name, dn->is_primary() ? dn->get_inode()->ino():inodeno_t(0), dn->is_remote() ? dn->get_remote_ino():inodeno_t(0), + dn->is_remote() ? dn->get_remote_d_type():0, r->second, dn->lock.get_replica_state()); @@ -3151,8 +3258,10 @@ void MDCache::handle_cache_expire(MCacheExpire *m) ++p) { // check container? if (p->first.ino > 0) { - CDir *con = get_dirfrag(p->first); - assert(con); // we had better have this. + CInode *coni = get_inode(p->first.ino); + assert(coni); // we had better have this. 
+ CDir *con = coni->get_approx_dirfrag(p->first.frag); + assert(con); if (!con->is_auth() || (con->is_auth() && con->is_exporting() && @@ -3236,21 +3345,32 @@ void MDCache::handle_cache_expire(MCacheExpire *m) for (map >::iterator pd = p->second.dentries.begin(); pd != p->second.dentries.end(); ++pd) { - dout(0) << " dn expires in dir " << pd->first << endl; - CDir *dir = get_dirfrag(pd->first); + dout(10) << " dn expires in dir " << pd->first << endl; + CInode *diri = get_inode(pd->first.ino); + assert(diri); + CDir *dir = diri->get_dirfrag(pd->first.frag); if (!dir) { - dout(0) << " dn expires on " << pd->first << " from " << from << ", don't have it" << endl; - assert(dir); - } - assert(dir->is_auth()); + dout(0) << " dn expires on " << pd->first << " from " << from << ", must have refragmented" << endl; + } else { + assert(dir->is_auth()); + } for (map::iterator p = pd->second.begin(); p != pd->second.end(); ++p) { int nonce = p->second; + CDentry *dn; - CDentry *dn = dir->lookup(p->first); + if (dir) { + dn = dir->lookup(p->first); + } else { + // which dirfrag for this dentry? + CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first)); + assert(dir->is_auth()); + dn = dir->lookup(p->first); + } + if (!dn) dout(0) << " missing dentry for " << p->first << " in " << *dir << endl; assert(dn); @@ -3534,7 +3654,7 @@ bool MDCache::shutdown_pass() } // done! - dout(1) << "shutdown done." << endl; + dout(2) << "shutdown done." << endl; return true; } @@ -3594,6 +3714,9 @@ void MDCache::dispatch(Message *m) break; + case MSG_MDS_FRAGMENTNOTIFY: + handle_fragment_notify((MMDSFragmentNotify*)m); + break; @@ -3686,15 +3809,15 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, // who // discover? 
assert(!cur->is_auth()); if (cur->is_ambiguous_auth()) { - dout(10) << "traverse: need dir, waiting for single auth on " << *cur << endl; + dout(10) << "traverse: need dirfrag " << fg << ", waiting for single auth on " << *cur << endl; cur->add_waiter(CInode::WAIT_SINGLEAUTH, _get_waiter(mdr, req)); return 1; } else if (dir_discovers.count(cur->ino())) { - dout(10) << "traverse: need dir, already doing discover for " << *cur << endl; + dout(10) << "traverse: need dirfrag " << fg << ", already doing discover for " << *cur << endl; assert(cur->is_waiter_for(CInode::WAIT_DIR)); } else { filepath want = path.postfixpath(depth); - dout(10) << "traverse: need dir, doing discover, want " << want.get_path() + dout(10) << "traverse: need dirfrag " << fg << ", doing discover, want " << want.get_path() << " from " << *cur << endl; mds->send_message_mds(new MDiscover(mds->get_nodeid(), cur->ino(), @@ -4440,7 +4563,7 @@ void MDCache::_anchor_create_prepared(CInode *in, version_t atid) // predirty, prepare log entry version_t pdv = in->pre_dirty(); - EUpdate *le = new EUpdate("anchor_create"); + EUpdate *le = new EUpdate(mds->mdlog, "anchor_create"); le->metablob.add_dir_context(in->get_parent_dir()); // update the logged inode copy @@ -4547,7 +4670,7 @@ void MDCache::_anchor_destroy_prepared(CInode *in, version_t atid) // predirty, prepare log entry version_t pdv = in->pre_dirty(); - EUpdate *le = new EUpdate("anchor_destroy"); + EUpdate *le = new EUpdate(mds->mdlog, "anchor_destroy"); le->metablob.add_dir_context(in->get_parent_dir()); // update the logged inode copy @@ -4638,7 +4761,7 @@ void MDCache::_purge_stray(CDentry *dn) // log removal version_t pdv = dn->pre_dirty(); - EUpdate *le = new EUpdate("purge_stray"); + EUpdate *le = new EUpdate(mds->mdlog, "purge_stray"); le->metablob.add_dir_context(dn->dir); le->metablob.add_null_dentry(dn, true); le->metablob.add_inode_truncate(dn->inode->inode, 0); @@ -4770,14 +4893,13 @@ void MDCache::handle_discover(MDiscover *dis) } 
CDir *curdir = cur->get_dirfrag(fg); - // am i dir auth (or if no dir, at least the inode auth) if ((!curdir && !cur->is_auth()) || (curdir && !curdir->is_auth())) { if (curdir) { - dout(7) << *curdir << " not dirfrag auth, setting dir_auth_hint" << endl; + dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << endl; reply->set_dir_auth_hint(curdir->authority().first); } else { - dout(7) << *cur << " dirfrag not open, not inode auth, setting dir_auth_hint" << endl; + dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " << *cur << endl; reply->set_dir_auth_hint(cur->authority().first); } reply->set_wanted_xlocks_hint(dis->wants_xlocked()); @@ -4856,7 +4978,7 @@ void MDCache::handle_discover(MDiscover *dis) // send null dentry dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in " << *curdir << endl; - dn = curdir->add_dentry(dis->get_dentry(i), 0); + dn = curdir->add_null_dentry(dis->get_dentry(i)); } assert(dn); @@ -5068,6 +5190,13 @@ CDir *MDCache::add_replica_dir(CInode *diri, dis.update_dir(dir); dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << endl; } else { + // force frag to leaf in the diri tree + if (!diri->dirfragtree.is_leaf(fg)) { + dout(7) << "add_replica_dir forcing frag " << fg << " to leaf in the fragtree " + << diri->dirfragtree << endl; + diri->dirfragtree.force_to_leaf(fg); + } + // add replica. 
dir = diri->add_dirfrag( new CDir(diri, fg, this, false) ); dis.update_dir(dir); @@ -5112,7 +5241,7 @@ CDentry *MDCache::add_replica_dentry(CDir *dir, CDentryDiscover &dis, listadd_dentry( dis.get_dname(), 0 ); + dn = dir->add_null_dentry(dis.get_dname()); dis.update_dentry(dn); dis.init_dentry_lock(dn); dout(7) << "add_replica_dentry added " << *dn << endl; @@ -5121,7 +5250,7 @@ CDentry *MDCache::add_replica_dentry(CDir *dir, CDentryDiscover &dis, listis_null()) - dir->link_inode(dn, dis.get_remote_ino()); + dir->link_remote_inode(dn, dis.get_remote_ino(), dis.get_remote_d_type()); // hrm. yeah. assert(dn->is_remote() && dn->get_remote_ino() == dis.get_remote_ino()); @@ -5142,7 +5271,7 @@ CInode *MDCache::add_replica_inode(CInodeDiscover& dis, CDentry *dn) add_inode(in); dout(10) << "add_replica_inode had " << *in << endl; if (dn && dn->is_null()) - dn->dir->link_inode(dn, in); + dn->dir->link_primary_inode(dn, in); } else { dis.update_inode(in); dout(10) << "add_replica_inode added " << *in << endl; @@ -5360,7 +5489,7 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m) CInode *in = dn->inode; dn->dir->unlink_inode(dn); assert(straydn); - straydn->dir->link_inode(straydn, in); + straydn->dir->link_primary_inode(straydn, in); } else { assert(dn->is_remote()); dn->dir->unlink_inode(dn); @@ -5381,6 +5510,374 @@ void MDCache::handle_dentry_unlink(MDentryUnlink *m) +// =================================================================== +// FRAGMENT + + +/** + * adjust_dir_fragments -- adjust fragmentation for a directory + * + * @diri - directory inode + * @basefrag - base fragment + * @bits - bit adjustment. positive for split, negative for merge. + */ +void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, + list& resultfrags, + list& waiters) +{ + dout(10) << "adjust_dir_fragments " << basefrag << " " << bits + << " on " << *diri << endl; + + // yuck. we may have discovered the inode while it was being fragmented. 
+ if (!diri->dirfragtree.is_leaf(basefrag)) + diri->dirfragtree.force_to_leaf(basefrag); + + CDir *base = diri->get_or_open_dirfrag(this, basefrag); + + // adjust fragtree + diri->dirfragtree.split(basefrag, bits); + dout(10) << " new fragtree is " << diri->dirfragtree << endl; + + if (bits > 0) { + if (base) { + CDir *baseparent = base->get_parent_dir(); + + base->split(bits, resultfrags, waiters); + + // did i change the subtree map? + if (base->is_subtree_root()) { + // am i a bound? + if (baseparent) { + CDir *parent = get_subtree_root(baseparent); + assert(subtrees[parent].count(base)); + subtrees[parent].erase(base); + for (list::iterator p = resultfrags.begin(); + p != resultfrags.end(); + ++p) { + subtrees[parent].insert(*p); + subtrees[*p].clear(); // new frag is now its own subtree + } + } + + // adjust my bounds. + set bounds; + bounds.swap(subtrees[base]); + subtrees.erase(base); + for (set::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *frag = get_subtree_root((*p)->get_parent_dir()); + subtrees[frag].insert(*p); + } + + show_subtrees(10); + } + } + } else { + assert(base); + base->merge(bits, waiters); + resultfrags.push_back(base); + assert(0); // FIXME adjust subtree map! and clean up this code, probably. 
+ } +} + +class C_MDC_FragmentGo : public Context { + MDCache *mdcache; + CInode *diri; + list dirs; + frag_t basefrag; + int bits; +public: + C_MDC_FragmentGo(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : + mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } + virtual void finish(int r) { + mdcache->fragment_go(diri, dirs, basefrag, bits); + } +}; + +void MDCache::split_dir(CDir *dir, int bits) +{ + dout(7) << "split_dir " << *dir << " bits " << bits << endl; + assert(dir->is_auth()); + + if (mds->mdsmap->is_degraded()) { + dout(7) << "cluster degraded, no fragmenting for now" << endl; + return; + } + if (dir->inode->is_root()) { + dout(7) << "i won't fragment root" << endl; + //assert(0); + return; + } + if (dir->state_test(CDir::STATE_FRAGMENTING)) { + dout(7) << "already fragmenting" << endl; + return; + } + if (!dir->can_auth_pin()) { + dout(7) << "not authpinnable on " << *dir << endl; + return; + } + + list startfrags; + startfrags.push_back(dir); + + dir->state_set(CDir::STATE_FRAGMENTING); + + fragment_freeze(dir->get_inode(), startfrags, dir->get_frag(), bits); + fragment_mark_and_complete(dir->get_inode(), startfrags, dir->get_frag(), bits); +} + +/* + * initial the freeze, blocking with an auth_pin. + * + * some reason(s) we have to freeze: + * - on merge, version/projected version are unified from all fragments; + * concurrent pipelined updates in the directory will have divergent + * versioning... and that's no good. 
+ */ +void MDCache::fragment_freeze(CInode *diri, list& frags, frag_t basefrag, int bits) +{ + C_Gather *gather = new C_Gather(new C_MDC_FragmentGo(this, diri, frags, basefrag, bits)); + + // freeze the dirs + for (list::iterator p = frags.begin(); + p != frags.end(); + ++p) { + CDir *dir = *p; + dir->auth_pin(); // this will block the freeze + dir->freeze_dir(gather->new_sub()); + } +} + +class C_MDC_FragmentMarking : public Context { + MDCache *mdcache; + CInode *diri; + list dirs; + frag_t basefrag; + int bits; +public: + C_MDC_FragmentMarking(MDCache *m, CInode *di, list& dls, frag_t bf, int b) : + mdcache(m), diri(di), dirs(dls), basefrag(bf), bits(b) { } + virtual void finish(int r) { + mdcache->fragment_mark_and_complete(diri, dirs, basefrag, bits); + } +}; + +void MDCache::fragment_mark_and_complete(CInode *diri, + list& startfrags, + frag_t basefrag, int bits) +{ + dout(10) << "fragment_mark_and_complete " << basefrag << " by " << bits + << " on " << *diri << endl; + + C_Gather *gather = 0; + + for (list::iterator p = startfrags.begin(); + p != startfrags.end(); + ++p) { + CDir *dir = *p; + + if (!dir->is_complete()) { + dout(15) << " fetching incomplete " << *dir << endl; + if (!gather) gather = new C_Gather(new C_MDC_FragmentMarking(this, diri, startfrags, basefrag, bits)); + dir->fetch(gather->new_sub(), + true); // ignore authpinnability + } + else if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { + dout(15) << " marking " << *dir << endl; + for (map::iterator p = dir->items.begin(); + p != dir->items.end(); + ++p) { + p->second->get(CDentry::PIN_FRAGMENTING); + p->second->state_set(CDentry::STATE_FRAGMENTING); + } + dir->state_set(CDir::STATE_DNPINNEDFRAG); + dir->auth_unpin(); // allow our freeze to complete + } + else { + dout(15) << " marked " << *dir << endl; + } + } +} + + +class C_MDC_FragmentStored : public Context { + MDCache *mdcache; + CInode *diri; + frag_t basefrag; + int bits; + list resultfrags; +public: + C_MDC_FragmentStored(MDCache 
*m, CInode *di, frag_t bf, int b, + list& rf) : + mdcache(m), diri(di), basefrag(bf), bits(b), resultfrags(rf) { } + virtual void finish(int r) { + mdcache->fragment_stored(diri, basefrag, bits, resultfrags); + } +}; + +void MDCache::fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits) +{ + dout(10) << "fragment_go " << basefrag << " by " << bits + << " on " << *diri << endl; + + // refragment + list resultfrags; + list waiters; + adjust_dir_fragments(diri, basefrag, bits, resultfrags, waiters); + mds->queue_waiters(waiters); + + C_Gather *gather = new C_Gather(new C_MDC_FragmentStored(this, diri, basefrag, bits, resultfrags)); + + // freeze, store resulting frags + for (list::iterator p = resultfrags.begin(); + p != resultfrags.end(); + p++) { + CDir *dir = *p; + dout(10) << " result frag " << *dir << endl; + dir->state_set(CDir::STATE_FRAGMENTING); + dir->commit(0, gather->new_sub()); + dir->_freeze_dir(); + } +} + +class C_MDC_FragmentLogged : public Context { + MDCache *mdcache; + CInode *diri; + frag_t basefrag; + int bits; + list resultfrags; + vector pvs; +public: + C_MDC_FragmentLogged(MDCache *m, CInode *di, frag_t bf, int b, + list& rf, vector& p) : + mdcache(m), diri(di), basefrag(bf), bits(b) { + resultfrags.swap(rf); + pvs.swap(p); + } + virtual void finish(int r) { + mdcache->fragment_logged(diri, basefrag, bits, + resultfrags, pvs); + } +}; + +void MDCache::fragment_stored(CInode *diri, frag_t basefrag, int bits, + list& resultfrags) +{ + dout(10) << "fragment_stored " << basefrag << " by " << bits + << " on " << *diri << endl; + + EFragment *le = new EFragment(mds->mdlog, diri->ino(), basefrag, bits); + + set peers; + vector pvs; + for (list::iterator p = resultfrags.begin(); + p != resultfrags.end(); + p++) { + CDir *dir = *p; + dout(10) << " result frag " << *dir << endl; + + if (p == resultfrags.begin()) { + le->metablob.add_dir_context(dir); + // note peers + // only do this once: all frags have identical replica_maps. 
+ if (peers.empty()) + for (map::iterator p = dir->replica_map.begin(); + p != dir->replica_map.end(); + ++p) + peers.insert(p->first); + } + + pvs.push_back(dir->pre_dirty()); + le->metablob.add_dir(dir, true); + } + + mds->mdlog->submit_entry(le, + new C_MDC_FragmentLogged(this, diri, basefrag, bits, + resultfrags, pvs)); + + // announcelist& resultfrags, + for (set::iterator p = peers.begin(); + p != peers.end(); + ++p) { + MMDSFragmentNotify *notify = new MMDSFragmentNotify(diri->ino(), basefrag, bits); + if (bits < 0) { + // freshly replicate basedir to peer on merge + CDir *base = resultfrags.front(); + CDirDiscover *basedis = base->replicate_to(*p); + basedis->_encode(notify->basebl); + delete basedis; + } + mds->send_message_mds(notify, *p, MDS_PORT_CACHE); + } + +} + +void MDCache::fragment_logged(CInode *diri, frag_t basefrag, int bits, + list& resultfrags, + vector& pvs) +{ + dout(10) << "fragment_logged " << basefrag << " bits " << bits + << " on " << *diri << endl; + + + // dirty resulting frags + set peers; + vector::iterator pv = pvs.begin(); + for (list::iterator p = resultfrags.begin(); + p != resultfrags.end(); + p++) { + CDir *dir = *p; + dout(10) << " result frag " << *dir << endl; + + // dirty, unpin, unfreeze + dir->state_clear(CDir::STATE_FRAGMENTING); + dir->mark_dirty(*pv); + pv++; + + for (map::iterator p = dir->items.begin(); + p != dir->items.end(); + ++p) { + CDentry *dn = p->second; + if (dn->state_test(CDentry::STATE_FRAGMENTING)) + dn->put(CDentry::PIN_FRAGMENTING); + } + + dir->unfreeze_dir(); + } +} + + + +void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) +{ + dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << endl; + + CInode *diri = get_inode(notify->get_ino()); + if (diri) { + list waiters; + + // add replica dir (for merge)? 
+ // (adjust_dir_fragments expects base to already exist, if non-auth) + if (notify->get_bits() < 0) { + CDirDiscover basedis; + int off = 0; + basedis._decode(notify->basebl, off); + add_replica_dir(diri, notify->get_basefrag(), basedis, + notify->get_source().num(), waiters); + } + + // refragment + list resultfrags; + adjust_dir_fragments(diri, notify->get_basefrag(), notify->get_bits(), + resultfrags, waiters); + mds->queue_waiters(waiters); + } + + delete notify; +} + + diff --git a/trunk/ceph/mds/MDCache.h b/trunk/ceph/mds/MDCache.h index 88513bd8c34c9..10073e3e19a95 100644 --- a/trunk/ceph/mds/MDCache.h +++ b/trunk/ceph/mds/MDCache.h @@ -55,6 +55,8 @@ class Message; class MClientRequest; class MMDSSlaveRequest; +class MMDSFragmentNotify; + // MDCache //typedef const char* pchar; @@ -244,9 +246,11 @@ public: void adjust_bounded_subtree_auth(CDir *dir, list& bounds, int a) { adjust_bounded_subtree_auth(dir, bounds, pair(a, CDIR_AUTH_UNKNOWN)); } + void map_dirfrag_set(list& dfs, set& result); void adjust_export_state(CDir *dir); void try_subtree_merge(CDir *root); void try_subtree_merge_at(CDir *root); + void subtree_merge_writebehind_finish(CInode *in); void eval_subtree_root(CDir *dir); CDir *get_subtree_root(CDir *dir); void remove_subtree(CDir *dir); @@ -593,6 +597,29 @@ protected: void handle_dentry_unlink(MDentryUnlink *m); + // -- fragmenting -- +private: + void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, + list& frags, list& waiters); + friend class EFragment; + +public: + void split_dir(CDir *dir, int byn); + +private: + void fragment_freeze(CInode *diri, list& startfrags, frag_t basefrag, int bits); + void fragment_mark_and_complete(CInode *diri, list& startfrags, frag_t basefrag, int bits); + void fragment_go(CInode *diri, list& startfrags, frag_t basefrag, int bits); + void fragment_stored(CInode *diri, frag_t basefrag, int bits, list& resultfrags); + void fragment_logged(CInode *diri, frag_t basefrag, int bits, list& resultfrags, 
vector& pvs); + friend class C_MDC_FragmentGo; + friend class C_MDC_FragmentMarking; + friend class C_MDC_FragmentStored; + friend class C_MDC_FragmentLogged; + + void handle_fragment_notify(MMDSFragmentNotify *m); + + // -- updates -- //int send_inode_updates(CInode *in); //void handle_inode_update(MInodeUpdate *m); diff --git a/trunk/ceph/mds/MDLog.cc b/trunk/ceph/mds/MDLog.cc index f16047612fc7b..129aa1e9fe2f9 100644 --- a/trunk/ceph/mds/MDLog.cc +++ b/trunk/ceph/mds/MDLog.cc @@ -31,28 +31,6 @@ LogType mdlog_logtype; -/* -MDLog::MDLog(MDS *m) : replay_thread(this) -{ - mds = m; - num_events = 0; - waiting_for_read = false; - - last_import_map = 0; - writing_import_map = false; - seen_import_map = false; - - max_events = g_conf.mds_log_max_len; - - capped = false; - - unflushed = 0; - - journaler = 0; - logger = 0; -} -*/ - MDLog::~MDLog() { @@ -70,6 +48,7 @@ void MDLog::init_journaler() static bool didit = false; if (!didit) { + didit = true; mdlog_logtype.add_inc("add"); mdlog_logtype.add_inc("expire"); mdlog_logtype.add_inc("obs"); @@ -141,12 +120,14 @@ void MDLog::submit_entry( LogEvent *le, Context *c ) dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl; // encode it, with event type - bufferlist bl; - bl.append((char*)&le->_type, sizeof(le->_type)); - le->encode_payload(bl); - - // journal it. - journaler->append_entry(bl); + { + bufferlist bl; + bl.append((char*)&le->_type, sizeof(le->_type)); + le->encode_payload(bl); + + // journal it. + journaler->append_entry(bl); // bl is destroyed. + } assert(!capped); @@ -283,8 +264,11 @@ void MDLog::trim(Context *c) utime_t stop = g_clock.now(); stop += 2.0; - while (num_events > max_events && - stop > g_clock.now()) { + while (num_events > max_events) { + // don't check the clock on _every_ event, here! 
+ if (num_events % 100 == 0 && + stop < g_clock.now()) + break; off_t gap = journaler->get_write_pos() - journaler->get_read_pos(); dout(5) << "trim num_events " << num_events << " > max " << max_events @@ -346,17 +330,6 @@ void MDLog::trim(Context *c) std::list finished; finished.swap(trim_waiters); finish_contexts(finished, 0); - - // hmm, are we at the end? - /* - if (journaler->get_read_pos() == journaler->get_write_pos() && - trimming.size() == import_map_expire_waiters.size()) { - dout(5) << "trim log is empty, allowing import_map to expire" << endl; - list ls; - ls.swap(import_map_expire_waiters); - finish_contexts(ls); - } - */ } diff --git a/trunk/ceph/mds/MDS.cc b/trunk/ceph/mds/MDS.cc index 8c267271edfbf..97bb9e2353ca2 100644 --- a/trunk/ceph/mds/MDS.cc +++ b/trunk/ceph/mds/MDS.cc @@ -59,8 +59,6 @@ #include "messages/MClientRequestForward.h" -LogType mds_logtype, mds_cache_logtype; - #include "config.h" #undef dout #define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " " @@ -97,7 +95,6 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) : server = new Server(this); locker = new Locker(this, mdcache); - // clients last_client_mdsmap_bcast = 0; @@ -144,6 +141,48 @@ MDS::~MDS() { void MDS::reopen_logger() { + static LogType mds_logtype, mds_cache_logtype; + static bool didit = false; + if (!didit) { + didit = true; + + mds_logtype.add_inc("req"); + mds_logtype.add_inc("reply"); + mds_logtype.add_inc("fw"); + mds_logtype.add_inc("cfw"); + + mds_logtype.add_set("l"); + mds_logtype.add_set("q"); + mds_logtype.add_set("popanyd"); + mds_logtype.add_set("popnest"); + + mds_logtype.add_inc("lih"); + mds_logtype.add_inc("lif"); + + mds_logtype.add_set("c"); + mds_logtype.add_set("ctop"); + mds_logtype.add_set("cbot"); + mds_logtype.add_set("cptail"); + mds_logtype.add_set("cpin"); + mds_logtype.add_inc("cex"); + mds_logtype.add_inc("dis"); + mds_logtype.add_inc("cmiss"); + + mds_logtype.add_set("buf"); + 
mds_logtype.add_inc("cdir"); + mds_logtype.add_inc("fdir"); + + mds_logtype.add_inc("iex"); + mds_logtype.add_inc("iim"); + mds_logtype.add_inc("ex"); + mds_logtype.add_inc("im"); + mds_logtype.add_inc("imex"); + mds_logtype.add_set("nex"); + mds_logtype.add_set("nim"); + } + + if (whoami < 0) return; + // flush+close old log if (logger) { logger->flush(true); @@ -154,7 +193,6 @@ void MDS::reopen_logger() delete logger2; } - // log string name; name = "mds"; @@ -166,44 +204,11 @@ void MDS::reopen_logger() logger = new Logger(name, (LogType*)&mds_logtype); - mds_logtype.add_inc("req"); - mds_logtype.add_inc("reply"); - mds_logtype.add_inc("fw"); - mds_logtype.add_inc("cfw"); - - mds_logtype.add_set("l"); - mds_logtype.add_set("q"); - mds_logtype.add_set("popanyd"); - mds_logtype.add_set("popnest"); - - mds_logtype.add_inc("lih"); - mds_logtype.add_inc("lif"); - - mds_logtype.add_set("c"); - mds_logtype.add_set("ctop"); - mds_logtype.add_set("cbot"); - mds_logtype.add_set("cptail"); - mds_logtype.add_set("cpin"); - mds_logtype.add_inc("cex"); - mds_logtype.add_inc("dis"); - mds_logtype.add_inc("cmiss"); - - mds_logtype.add_set("buf"); - mds_logtype.add_inc("cdir"); - mds_logtype.add_inc("fdir"); - - mds_logtype.add_inc("iex"); - mds_logtype.add_inc("iim"); - mds_logtype.add_inc("ex"); - mds_logtype.add_inc("im"); - mds_logtype.add_inc("imex"); - mds_logtype.add_set("nex"); - mds_logtype.add_set("nim"); - - char n[80]; sprintf(n, "mds%d.cache", whoami); logger2 = new Logger(n, (LogType*)&mds_cache_logtype); + + server->reopen_logger(); } void MDS::send_message_mds(Message *m, int mds, int port, int fromport) @@ -331,7 +336,7 @@ void MDS::tick() if (logger) { req_rate = logger->get("req"); - logger->set("l", (int)load.mds_load()); + logger->set("l", (int)load.mds_load(g_clock.now())); logger->set("q", messenger->get_dispatch_queue_len()); logger->set("buf", buffer_total_alloc); @@ -395,7 +400,8 @@ void MDS::beacon_send() beacon_seq_stamp[beacon_last_seq] = 
g_clock.now(); int mon = monmap->pick_mon(); - messenger->send_message(new MMDSBeacon(messenger->get_myinst(), want_state, beacon_last_seq), + messenger->send_message(new MMDSBeacon(messenger->get_myinst(), mdsmap->get_epoch(), + want_state, beacon_last_seq), monmap->get_inst(mon)); // schedule next sender @@ -446,8 +452,7 @@ void MDS::beacon_kill(utime_t lab) dout(0) << "beacon_kill last_acked_stamp " << lab << ", killing myself." << endl; - messenger->suicide(); - //exit(0); + suicide(); } else { dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp << " != my " << lab @@ -503,7 +508,7 @@ void MDS::handle_mds_map(MMDSMap *m) whoami = mdsmap->get_addr_rank(messenger->get_myaddr()); if (whoami < 0) { dout(1) << "handle_mds_map i'm not in the mdsmap, killing myself" << endl; - shutdown_final(); + suicide(); return; } if (oldwhoami != whoami) { @@ -552,7 +557,7 @@ void MDS::handle_mds_map(MMDSMap *m) // contemplate suicide if (mdsmap->get_inst(whoami) != messenger->get_myinst()) { dout(1) << "apparently i've been replaced by " << mdsmap->get_inst(whoami) << ", committing suicide." 
<< endl; - shutdown_final(); + suicide(); return; } @@ -562,8 +567,6 @@ void MDS::handle_mds_map(MMDSMap *m) if (oldstate == MDSMap::STATE_REJOIN || oldstate == MDSMap::STATE_RECONNECT) recovery_done(); - - dout(1) << "now active" << endl; finish_contexts(waiting_for_active); // kick waiters } else if (is_replay()) { replay_start(); @@ -577,7 +580,7 @@ void MDS::handle_mds_map(MMDSMap *m) } else if (is_stopped()) { assert(oldstate == MDSMap::STATE_STOPPING); dout(1) << "now stopped, sending down:out and exiting" << endl; - shutdown_final(); + suicide(); return; } } @@ -1043,9 +1046,9 @@ void MDS::stopping_done() -int MDS::shutdown_final() +void MDS::suicide() { - dout(1) << "shutdown_final" << endl; + dout(1) << "suicide" << endl; // flush loggers if (logger) logger->flush(true); @@ -1073,8 +1076,6 @@ int MDS::shutdown_final() // shut down messenger messenger->shutdown(); - - return 0; } @@ -1202,7 +1203,22 @@ void MDS::my_dispatch(Message *m) } while (dest == whoami); mdcache->migrator->export_dir(dir,dest); } + // hack: thrash exports + for (int i=0; ihack_pick_random_inode(); + list ls; + in->get_dirfrags(ls); + if (ls.empty()) continue; // must be an open dir. + CDir *dir = ls.front(); + if (!dir->get_parent_dir()) continue; // must be linked. + if (!dir->is_auth()) continue; // must be auth. + mdcache->split_dir(dir, 1);// + (rand() % 3)); + } // hack: force hash root? 
/* @@ -1276,7 +1292,7 @@ void MDS::ms_handle_failure(Message *m, const entity_inst_t& inst) mds_lock.Lock(); dout(10) << "handle_ms_failure to " << inst << " on " << *m << endl; - if (m->get_type() == MSG_CLIENT_RECONNECT) + if (m->get_type() == MSG_MDS_MAP && m->get_dest().is_client()) server->client_reconnect_failure(m->get_dest().num()); delete m; diff --git a/trunk/ceph/mds/MDS.h b/trunk/ceph/mds/MDS.h index ae8fb4d618d4e..6c243a5b53f90 100644 --- a/trunk/ceph/mds/MDS.h +++ b/trunk/ceph/mds/MDS.h @@ -247,7 +247,7 @@ class MDS : public Dispatcher { void shutdown_start(); void stopping_start(); void stopping_done(); - int shutdown_final(); + void suicide(); void tick(); diff --git a/trunk/ceph/mds/Migrator.cc b/trunk/ceph/mds/Migrator.cc index 796ba73e31e9f..bb049b59bebe5 100644 --- a/trunk/ceph/mds/Migrator.cc +++ b/trunk/ceph/mds/Migrator.cc @@ -31,7 +31,6 @@ #include "events/EExport.h" #include "events/EImportStart.h" #include "events/EImportFinish.h" -#include "events/EFragment.h" #include "msg/Messenger.h" @@ -50,7 +49,7 @@ #include "config.h" #undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds || l <= g_conf.debug_mds_migrator) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " @@ -308,7 +307,7 @@ void Migrator::handle_mds_failure_or_stop(int who) CDir *dir = mds->mdcache->get_dirfrag(df); if (import_peer[df] == who) { - switch (import_state[df]) { + switch (q->second) { case IMPORT_DISCOVERING: dout(10) << "import state=discovering : clearing state" << endl; import_state.erase(df); @@ -325,7 +324,7 @@ void Migrator::handle_mds_failure_or_stop(int who) break; case IMPORT_PREPPING: - if (import_state[df] == IMPORT_PREPPING) { + if (q->second == IMPORT_PREPPING) { dout(10) << "import state=prepping : unpinning base+bounds " << *dir << endl; } assert(dir); @@ -632,7 +631,7 @@ void 
Migrator::export_frozen(CDir *dir) // include the dirfrag? only if it's not the bounding subtree root. if (cur != bound) { assert(cur->is_auth()); - prep->add_dirfrag( new CDirDiscover(cur, cur->add_replica(dest)) ); // yay! + prep->add_dirfrag( cur->replicate_to(dest) ); // yay! dout(7) << " added " << *cur << endl; } @@ -643,9 +642,11 @@ void Migrator::export_frozen(CDir *dir) it != inode_trace.end(); it++) { CInode *in = *it; + dout(7) << " added " << *in->parent << endl; dout(7) << " added " << *in << endl; prep->add_inode( in->parent->get_dir()->dirfrag(), - in->parent->get_name(), + in->parent->get_name(), + in->parent->replicate_to(dest), in->replicate_to(dest) ); } @@ -724,8 +725,12 @@ void Migrator::export_go(CDir *dir) // set ambiguous auth cache->adjust_subtree_auth(dir, dest, mds->get_nodeid()); + + // take away the popularity we're sending. + mds->balancer->subtract_export(dir); // fill export message with cache data + utime_t now = g_clock.now(); C_Contexts *fin = new C_Contexts; // collect all the waiters map exported_client_map; int num_exported_inodes = encode_export_dir( export_data[dir], @@ -733,7 +738,8 @@ void Migrator::export_go(CDir *dir) dir, // base dir, // recur start point dest, - exported_client_map ); + exported_client_map, + now ); bufferlist bl; ::_encode(exported_client_map, bl); export_data[dir].push_front(bl); @@ -758,9 +764,6 @@ void Migrator::export_go(CDir *dir) // queue up the finisher dir->add_waiter( CDir::WAIT_UNFREEZE, fin ); - // take away the popularity we're sending. FIXME: do this later? 
- mds->balancer->subtract_export(dir); - // stats if (mds->logger) mds->logger->inc("ex"); if (mds->logger) mds->logger->inc("iex", num_exported_inodes); @@ -775,7 +778,8 @@ void Migrator::export_go(CDir *dir) * used by: encode_export_dir, file_rename (if foreign) */ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth, - map& exported_client_map) + map& exported_client_map, + utime_t now) { // tell (all) clients about migrating caps.. mark STALE for (map::iterator it = in->client_caps.begin(); @@ -798,7 +802,7 @@ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_au // add inode assert(!in->is_replica(mds->get_nodeid())); - CInodeExport istate( in ); + CInodeExport istate(in, now); istate._encode( enc_state ); // we're export this inode; fix inode state @@ -834,7 +838,8 @@ int Migrator::encode_export_dir(list& dirstatelist, CDir *basedir, CDir *dir, int newauth, - map& exported_client_map) + map& exported_client_map, + utime_t now) { int num_exported = 0; @@ -845,7 +850,7 @@ int Migrator::encode_export_dir(list& dirstatelist, // dir bufferlist enc_dir; - CDirExport dstate(dir); + CDirExport dstate(dir, now); dstate._encode( enc_dir ); // release open_by @@ -881,7 +886,7 @@ int Migrator::encode_export_dir(list& dirstatelist, dout(7) << "encode_export_dir exporting " << *dn << endl; // name - _encode(it->first, enc_dir); + ::_encode(it->first, enc_dir); // state it->second->encode_export_state(enc_dir); @@ -899,7 +904,9 @@ int Migrator::encode_export_dir(list& dirstatelist, enc_dir.append("L", 1); // remote link inodeno_t ino = dn->get_remote_ino(); - enc_dir.append((char*)&ino, sizeof(ino)); + unsigned char d_type = dn->get_remote_d_type(); + ::_encode(ino, enc_dir); + ::_encode(d_type, enc_dir); continue; } @@ -907,7 +914,7 @@ int Migrator::encode_export_dir(list& dirstatelist, // -- inode enc_dir.append("I", 1); // inode dentry - encode_export_inode(in, enc_dir, newauth, exported_client_map); // encode, and 
(update state for) export + encode_export_inode(in, enc_dir, newauth, exported_client_map, now); // encode, and (update state for) export // directory? list dfs; @@ -935,7 +942,7 @@ int Migrator::encode_export_dir(list& dirstatelist, // subdirs for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) num_exported += encode_export_dir(dirstatelist, fin, basedir, *it, newauth, - exported_client_map); + exported_client_map, now); return num_exported; } @@ -973,7 +980,8 @@ void Migrator::handle_export_ack(MExportDirAck *m) cache->get_subtree_bounds(dir, bounds); // log completion - EExport *le = new EExport(dir); + EExport *le = new EExport(mds->mdlog, dir); + le->metablob.add_dir_context(dir); le->metablob.add_dir( dir, false ); for (set::iterator p = bounds.begin(); p != bounds.end(); @@ -1148,7 +1156,7 @@ void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m) void Migrator::export_finish(CDir *dir) { - dout(7) << "export_finish " << *dir << endl; + dout(5) << "export_finish " << *dir << endl; if (export_state.count(dir) == 0) { dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << endl; @@ -1261,27 +1269,19 @@ void Migrator::handle_export_discover(MExportDirDiscover *m) dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; assert(0); // this shouldn't happen if the auth pins his path properly!!!! } - - CInode *in; - if (trace.empty()) { - in = cache->get_root(); - if (!in) { - cache->open_root(new C_MDS_RetryMessage(mds, m)); - return; - } - } else { - in = trace[trace.size()-1]->inode; - } + + assert(0); // this shouldn't happen; the get_inode above would have succeeded. 
} // yay - dout(7) << "handle_export_discover have " << df << " inode " << *in << endl; + import_state[m->get_dirfrag()] = IMPORT_DISCOVERED; + // pin inode in the cache (for now) assert(in->is_dir()); in->get(CInode::PIN_IMPORTING); - + // reply dout(7) << " sending export_discover_ack on " << *in << endl; mds->send_message_mds(new MExportDirDiscoverAck(df), @@ -1314,7 +1314,8 @@ void Migrator::handle_export_prep(MExportDirPrep *m) // make sure we didn't abort if (import_state.count(m->get_dirfrag()) == 0 || - import_state[m->get_dirfrag()] != IMPORT_DISCOVERED || + (import_state[m->get_dirfrag()] != IMPORT_DISCOVERED && + import_state[m->get_dirfrag()] != IMPORT_PREPPING) || import_peer[m->get_dirfrag()] != oldauth) { dout(10) << "handle_export_prep import has aborted, dropping" << endl; delete m; @@ -1347,8 +1348,10 @@ void Migrator::handle_export_prep(MExportDirPrep *m) map bound_dirfragset; for (list::iterator p = m->get_bounds().begin(); p != m->get_bounds().end(); - ++p) + ++p) { + dout(10) << " bound " << *p << endl; bound_dirfragset[p->ino].insert(p->frag); + } // assimilate contents? 
if (!m->did_assim()) { @@ -1384,7 +1387,7 @@ void Migrator::handle_export_prep(MExportDirPrep *m) CDir *condir = cache->get_dirfrag( m->get_containing_dirfrag(in->ino()) ); assert(condir); cache->add_inode( in ); - condir->add_dentry( m->get_dentry(in->ino()), in ); + condir->add_primary_dentry( m->get_dentry(in->ino()), in ); dout(7) << " added " << *in << endl; } @@ -1448,10 +1451,10 @@ void Migrator::handle_export_prep(MExportDirPrep *m) dout(7) << " pinning import bound " << *bound << endl; bound->get(CDir::PIN_IMPORTBOUND); bound->state_set(CDir::STATE_IMPORTBOUND); - import_bounds.insert(bound); } else { dout(7) << " already pinned import bound " << *bound << endl; } + import_bounds.insert(bound); } } @@ -1769,7 +1772,7 @@ void Migrator::import_finish(CDir *dir, bool now) cache->process_delayed_expire(dir); // ok now finish contexts - dout(5) << "finishing any waiters on imported data" << endl; + dout(10) << "finishing any waiters on imported data" << endl; dir->finish_waiting(CDir::WAIT_IMPORTED); cache->show_subtrees(); @@ -1809,7 +1812,7 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol // link before state -- or not! -sage if (dn->inode != in) { assert(!dn->inode); - dn->dir->link_inode(dn, in); + dn->dir->link_primary_inode(dn, in); } // add inode? 
@@ -1827,18 +1830,6 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol if (in->is_replica(mds->get_nodeid())) in->remove_replica(mds->get_nodeid()); - // twiddle locks - /* - if (in->authlock.do_import(oldauth, mds->get_nodeid())) - mds->locker->simple_eval(&in->authlock); - if (in->linklock.do_import(oldauth, mds->get_nodeid())) - mds->locker->simple_eval(&in->linklock); - if (in->dirfragtreelock.do_import(oldauth, mds->get_nodeid())) - mds->locker->simple_eval(&in->dirfragtreelock); - if (in->dirlock.do_import(oldauth, mds->get_nodeid())) - mds->locker->scatter_eval(&in->dirlock); - */ - // caps for (set::iterator it = merged_client_caps.begin(); it != merged_client_caps.end(); @@ -1851,12 +1842,6 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol caps->set_mds( oldauth ); // reap from whom? mds->send_message_client_maybe_open(caps, imported_client_map[*it]); } - - // filelock - /* - if (in->filelock.do_import(oldauth, mds->get_nodeid())) - mds->locker->simple_eval(&in->filelock); - */ } @@ -1917,16 +1902,15 @@ int Migrator::decode_import_dir(bufferlist& bl, long nden = dstate.get_nden(); for (; nden>0; nden--) { - num_imported++; // dentry string dname; - _decode(dname, bl, off); + ::_decode(dname, bl, off); CDentry *dn = dir->lookup(dname); if (!dn) - dn = dir->add_dentry(dname); // null + dn = dir->add_null_dentry(dname); // decode state dn->decode_import_state(bl, off, oldauth, mds->get_nodeid()); @@ -1946,12 +1930,13 @@ int Migrator::decode_import_dir(bufferlist& bl, else if (icode == 'L') { // remote link inodeno_t ino; - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); + unsigned char d_type; + ::_decode(ino, bl, off); + ::_decode(d_type, bl, off); if (dn->is_remote()) { assert(dn->get_remote_ino() == ino); } else { - dir->link_inode(dn, ino); + dir->link_remote_inode(dn, ino, d_type); } } else if (icode == 'I') { @@ -1993,7 +1978,9 @@ void 
Migrator::handle_export_notify(MExportDirNotify *m) dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth << " on " << *dir << endl; // adjust auth - cache->adjust_bounded_subtree_auth(dir, m->get_bounds(), new_auth); + set have; + cache->map_dirfrag_set(m->get_bounds(), have); + cache->adjust_bounded_subtree_auth(dir, have, new_auth); // induce a merge? cache->try_subtree_merge(dir); @@ -2023,117 +2010,3 @@ void Migrator::handle_export_notify(MExportDirNotify *m) - -// =================================================================== -// FRAGMENT - -class C_MDC_FragmentFreeze : public Context { - Migrator *mig; - CDir *dir; - int bits; -public: - C_MDC_FragmentFreeze(Migrator *m, CDir *d, int b) : mig(m), dir(d), bits(b) {} - virtual void finish(int r) { - if (r >= 0) - mig->fragment_frozen(dir, bits); - } -}; - -void Migrator::fragment_dir(CDir *dir, int bits) -{ - dout(7) << "fragment_dir " << *dir << " bits " << bits << endl; - assert(dir->is_auth()); - - if (mds->mdsmap->is_degraded()) { - dout(7) << "cluster degraded, no fragmenting for now" << endl; - return; - } - if (dir->inode->is_root()) { - dout(7) << "i won't fragment root" << endl; - //assert(0); - return; - } - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << endl; - return; - } - if (dir->state_test(CDir::STATE_FRAGMENTING)) { - dout(7) << "already fragmenting" << endl; - return; - } - - dir->state_set(CDir::STATE_FRAGMENTING); - dir->get(CDir::PIN_FRAGMENTING); - - // first, freeze. 
- dir->freeze_dir(new C_MDC_FragmentFreeze(this, dir, bits)); -} - -class C_MDC_FragmentLogged : public Context { - Migrator *mig; - list dirs; - int bits; - version_t pv; -public: - C_MDC_FragmentLogged(Migrator *m, list& dls, int b, version_t v) : mig(m), dirs(dls), bits(b), pv(v) {} - virtual void finish(int r) { - if (r >= 0) - mig->fragment_logged(dirs, bits, pv); - } -}; - -void Migrator::fragment_frozen(CDir *dir, int bits) -{ - dout(7) << "fragment_frozen " << *dir << " bits " << bits << endl; - - CInode *diri = dir->get_inode(); - - // journal it. - EFragment *le = new EFragment(dir->ino(), dir->get_frag(), bits); - - list subfrags; - list waiters; - version_t pv = dir->pre_dirty(); - diri->fragment_dir(dir->get_frag(), bits, subfrags, waiters); - - // predirty and journal content - for (list::iterator p = subfrags.begin(); - p != subfrags.end(); - ++p) { - CDir *subfrag = *p; - le->metablob.add_dir_context(subfrag); - for (map::iterator q = subfrag->items.begin(); - q != subfrag->items.end(); - ++q) { - CDentry *dn = q->second; - dn->set_projected_version(pv); - le->metablob.add_dentry(dn, true); - } - } - - // go - mds->mdlog->submit_entry(le); - mds->mdlog->wait_for_sync(new C_MDC_FragmentLogged(this, subfrags, bits, pv)); -} - -void Migrator::fragment_logged(list& dirs, int bits, version_t pv) -{ - CInode *diri = dirs.front()->get_inode(); - dout(10) << "fragment_logged " << diri->ino() << " bits " << bits << " pv " << pv << endl; - - for (list::iterator p = dirs.begin(); - p != dirs.end(); - p++) { - CDir *dir = *p; - dout(10) << " subfrag " << *dir << endl; - - // dirty everything - for (map::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) - p->second->mark_dirty(pv); - - dir->unfreeze_dir(); - } -} diff --git a/trunk/ceph/mds/Migrator.h b/trunk/ceph/mds/Migrator.h index 25851635bced4..e224faf0d04fb 100644 --- a/trunk/ceph/mds/Migrator.h +++ b/trunk/ceph/mds/Migrator.h @@ -41,8 +41,6 @@ class MExportDirNotify; class 
MExportDirNotifyAck; class MExportDirFinish; -class MFragmentDirNotify; - class EImportStart; @@ -184,13 +182,15 @@ public: void export_empty_import(CDir *dir); void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth, - map& exported_client_map); + map& exported_client_map, + utime_t now); int encode_export_dir(list& dirstatelist, class C_Contexts *fin, CDir *basedir, CDir *dir, int newauth, - map& exported_client_map); + map& exported_client_map, + utime_t now); void add_export_finish_waiter(CDir *dir, Context *c) { export_finish_waiters[dir].push_back(c); @@ -247,14 +247,6 @@ protected: void handle_export_notify(MExportDirNotify *m); - // -- fragmenting -- - void fragment_dir(CDir *dir, int byn); - void fragment_frozen(CDir *dir, int byn); - friend class C_MDC_FragmentFreeze; - void fragment_logged(list& dirs, int bits, version_t pv); - friend class C_MDC_FragmentLogged; - - void handle_fragment_notify(MFragmentDirNotify *m); }; diff --git a/trunk/ceph/mds/Renamer.cc b/trunk/ceph/mds/Renamer.cc deleted file mode 100644 index 534a608b8e6bd..0000000000000 --- a/trunk/ceph/mds/Renamer.cc +++ /dev/null @@ -1,905 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#include "MDCache.h" -#include "CInode.h" -#include "CDir.h" -#include "MDS.h" -#include "MDSMap.h" -#include "MDLog.h" -#include "AnchorClient.h" -#include "Migrator.h" -#include "Renamer.h" - -#include "include/filepath.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "events/EString.h" -#include "events/EUnlink.h" - -#include "messages/MRenameWarning.h" -#include "messages/MRenameNotify.h" -#include "messages/MRenameNotifyAck.h" -#include "messages/MRename.h" -#include "messages/MRenameAck.h" -#include "messages/MRenameReq.h" -#include "messages/MRenamePrep.h" - - - -void Renamer::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_MDS_RENAMEWARNING: - handle_rename_warning((MRenameWarning*)m); - break; - case MSG_MDS_RENAMENOTIFY: - handle_rename_notify((MRenameNotify*)m); - break; - case MSG_MDS_RENAMENOTIFYACK: - handle_rename_notify_ack((MRenameNotifyAck*)m); - break; - case MSG_MDS_RENAME: - handle_rename((MRename*)m); - break; - case MSG_MDS_RENAMEREQ: - handle_rename_req((MRenameReq*)m); - break; - case MSG_MDS_RENAMEPREP: - handle_rename_prep((MRenamePrep*)m); - break; - case MSG_MDS_RENAMEACK: - handle_rename_ack((MRenameAck*)m); - break; - - default: - assert(0); - } -} - - -// renaming! - - -/* - fix_renamed_dir(): - - caller has already: - - relinked inode in new location - - fixed in->is_auth() - - set dir_auth, if appropriate - - caller has not: - - touched in->dir - - updated import/export tables -*/ -void Renamer::fix_renamed_dir(CDir *srcdir, - CInode *in, - CDir *destdir, - bool authchanged, // _inode_ auth - int dir_auth) // dir auth (for certain cases) -{ - dout(7) << "fix_renamed_dir on " << *in << endl; - dout(7) << "fix_renamed_dir on " << *in->dir << endl; - - - assert(0); // rewrite . - - // 1- fix subtree tree. - // 2- adjust subtree auth. 
- - /* - if (in->dir->is_auth()) { - // dir ours - dout(7) << "dir is auth" << endl; - assert(!in->dir->is_export()); - - if (in->is_auth()) { - // inode now ours - if (authchanged) { - // inode _was_ replica, now ours - dout(7) << "inode was replica, now ours." << endl; - cache->adjust_subtree_auth(dir, mds->get_nodeid()); - } else { - // inode was ours, still ours. - dout(7) << "inode was ours, still ours." << endl; - - assert(!in->dir->is_import()); - assert(in->dir->get_dir_auth().first == CDIR_AUTH_PARENT); - - // move any exports nested beneath me? - CDir *newcon = cache->get_auth_container(in->dir); - assert(newcon); - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - if (newcon != oldcon) { - dout(7) << "moving nested exports under new container" << endl; - set nested; - cache->find_nested_exports_under(oldcon, in->dir, nested); - for (set::iterator it = nested.begin(); - it != nested.end(); - it++) { - dout(7) << "moving nested export " << *it << " under new container" << endl; - cache->nested_exports[oldcon].erase(*it); - cache->nested_exports[newcon].insert(*it); - } - } - } - - } else { - // inode now replica - - if (authchanged) { - // inode was ours, but now replica - dout(7) << "inode was ours, now replica. adding to import list." 
<< endl; - - // i am now an import - cache->imports.insert(in->dir); - in->dir->state_set(CDir::STATE_IMPORT); - in->dir->get(CDir::PIN_IMPORT); - - in->dir->set_dir_auth( mds->get_nodeid() ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - // find old import - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - dout(7) << " oldcon is " << *oldcon << endl; - - // move nested exports under me - set nested; - cache->find_nested_exports_under(oldcon, in->dir, nested); - for (set::iterator it = nested.begin(); - it != nested.end(); - it++) { - dout(7) << "moving nested export " << *it << " under me" << endl; - cache->nested_exports[oldcon].erase(*it); - cache->nested_exports[in->dir].insert(*it); - } - - } else { - // inode was replica, still replica - dout(7) << "inode was replica, still replica. doing nothing." << endl; - assert(in->dir->is_import()); - - // verify dir_auth - assert(in->dir->get_dir_auth().first == mds->get_nodeid()); // me, because i'm auth for dir. - assert(in->authority() != in->dir->get_dir_auth()); // inode not me. - } - - assert(in->dir->is_import()); - } - - } else { - // dir is not ours - dout(7) << "dir is not auth" << endl; - - if (in->is_auth()) { - // inode now ours - - if (authchanged) { - // inode was replica, now ours - dout(7) << "inode was replica, now ours. now an export." << endl; - assert(!in->dir->is_export()); - - // now export - cache->exports.insert(in->dir); - in->dir->state_set(CDir::STATE_EXPORT); - in->dir->get(CDir::PIN_EXPORT); - - assert(dir_auth >= 0); // better be defined - in->dir->set_dir_auth( dir_auth ); - dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; - - CDir *newcon = cache->get_auth_container(in->dir); - assert(newcon); - cache->nested_exports[newcon].insert(in->dir); - - } else { - // inode was ours, still ours - dout(7) << "inode was ours, still ours. did my import change?" 
<< endl; - - // sanity - assert(in->dir->is_export()); - assert(in->dir->get_dir_auth().first >= 0); - assert(in->dir->get_dir_auth() != in->authority()); - - // moved under new import? - CDir *oldcon = cache->get_auth_container(srcdir); - CDir *newcon = cache->get_auth_container(in->dir); - if (oldcon != newcon) { - dout(7) << "moving myself under new import " << *newcon << endl; - cache->nested_exports[oldcon].erase(in->dir); - cache->nested_exports[newcon].insert(in->dir); - } - } - - assert(in->dir->is_export()); - } else { - // inode now replica - - if (authchanged) { - // inode was ours, now replica - dout(7) << "inode was ours, now replica. removing from export list." << endl; - assert(in->dir->is_export()); - - // remove from export list - cache->exports.erase(in->dir); - in->dir->state_clear(CDir::STATE_EXPORT); - in->dir->put(CDir::PIN_EXPORT); - - CDir *oldcon = cache->get_auth_container(srcdir); - assert(oldcon); - assert(cache->nested_exports[oldcon].count(in->dir) == 1); - cache->nested_exports[oldcon].erase(in->dir); - - // simplify dir_auth - if (in->authority() == in->dir->authority()) { - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl; - } else { - assert(in->dir->get_dir_auth().first >= 0); // someone else's export, - } - - } else { - // inode was replica, still replica - dout(7) << "inode was replica, still replica. do nothing." << endl; - - // fix dir_auth? - if (in->authority().first == dir_auth) - in->dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - in->dir->set_dir_auth( dir_auth ); - dout(7) << " fixing dir_auth to be " << dir_auth << endl; - - // do nothing. 
- } - - assert(!in->dir->is_export()); - } - } - */ - cache->show_subtrees(); -} - -/* - * when initiator gets an ack back for a foreign rename - */ - -class C_MDC_RenameNotifyAck : public Context { - Renamer *rn; - CInode *in; - int initiator; - -public: - C_MDC_RenameNotifyAck(Renamer *r, - CInode *i, int init) : rn(r), in(i), initiator(init) {} - void finish(int r) { - rn->file_rename_ack(in, initiator); - } -}; - - - -/************** initiator ****************/ - -/* - * when we get MRenameAck (and rename is done, notifies gone out+acked, etc.) - */ -class C_MDC_RenameAck : public Context { - Renamer *mdc; - CDir *srcdir; - CInode *in; - Context *c; -public: - C_MDC_RenameAck(Renamer *mdc, CDir *srcdir, CInode *in, Context *c) { - this->mdc = mdc; - this->srcdir = srcdir; - this->in = in; - this->c = c; - } - void finish(int r) { - mdc->file_rename_finish(srcdir, in, c); - } -}; - - -void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) -{ - assert(srcdn->is_xlocked()); // by me - assert(destdn->is_xlocked()); // by me - - CDir *srcdir = srcdn->dir; - string srcname = srcdn->name; - - CDir *destdir = destdn->dir; - string destname = destdn->name; - - CInode *in = srcdn->inode; - //Message *req = srcdn->xlockedby; - - - // determine the players - int srcauth = srcdir->dentry_authority(srcdn->name).first; - int destauth = destdir->dentry_authority(destname).first; - - - // FOREIGN rename? - if (srcauth != mds->get_nodeid() || - destauth != mds->get_nodeid()) { - dout(7) << "foreign rename. srcauth " << srcauth << ", destauth " << destauth << ", isdir " << srcdn->inode->is_dir() << endl; - - string destpath; - destdn->make_path(destpath); - - if (destauth != mds->get_nodeid()) { - // make sure dest has dir open. - dout(7) << "file_rename i'm not dest auth. sending MRenamePrep to " << destauth << endl; - - // prep dest first, they must have the dir open! rest will follow. 
- string srcpath; - srcdn->make_path(srcpath); - - MRenamePrep *m = new MRenamePrep(mds->get_nodeid(), // i'm the initiator - srcdir->ino(), srcname, srcpath, - destdir->ino(), destname, destpath, - srcauth); // tell dest who src is (maybe even me) - mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - - cache->show_subtrees(); - - } - - else if (srcauth != mds->get_nodeid()) { - if (destauth == mds->get_nodeid()) { - dout(7) << "file_rename dest auth, not src auth. sending MRenameReq" << endl; - } else { - dout(7) << "file_rename neither src auth nor dest auth. sending MRenameReq" << endl; - } - - // srcdn not important on destauth, just request - MRenameReq *m = new MRenameReq(mds->get_nodeid(), // i'm the initiator - srcdir->ino(), srcname, - destdir->ino(), destname, destpath, destauth); // tell src who dest is (they may not know) - mds->send_message_mds(m, srcauth, MDS_PORT_CACHE); - } - - else - assert(0); - - // set waiter on the inode (is this the best place?) - in->add_waiter(CInode::WAIT_RENAMEACK, - new C_MDC_RenameAck(this, - srcdir, in, onfinish)); - return; - } - - // LOCAL rename! - assert(srcauth == mds->get_nodeid() && destauth == mds->get_nodeid()); - dout(7) << "file_rename src and dest auth, renaming locally (easy!)" << endl; - - // update our cache - if (destdn->inode && destdn->inode->is_dirty()) - destdn->inode->mark_clean(); - - cache->rename_file(srcdn, destdn); - - // update imports/exports? 
- if (in->is_dir() && in->dir) - fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change - - // mark dentries dirty - srcdn->_mark_dirty(); // fixme - destdn->_mark_dirty(); // fixme - in->_mark_dirty(); // fixme - - - // local, restrict notify to ppl with open dirs - set notify; - for (map::iterator it = srcdir->replicas_begin(); - it != srcdir->replicas_end(); - ++it) - notify.insert(it->first); - for (map::iterator it = destdir->replicas_begin(); - it != destdir->replicas_end(); - it++) - if (notify.count(it->first) == 0) notify.insert(it->first); - - if (notify.size()) { - // warn + notify - file_rename_warn(in, notify); - file_rename_notify(in, srcdir, srcname, destdir, destname, notify, mds->get_nodeid()); - - // wait for MRenameNotifyAck's - in->add_waiter(CInode::WAIT_RENAMENOTIFYACK, - new C_MDC_RenameNotifyAck(this, in, mds->get_nodeid())); // i am initiator - - // wait for finish - in->add_waiter(CInode::WAIT_RENAMEACK, - new C_MDC_RenameAck(this, srcdir, in, onfinish)); - } else { - // sweet, no notify necessary, we're done! - file_rename_finish(srcdir, in, onfinish); - } -} - -void Renamer::handle_rename_ack(MRenameAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_rename_ack on " << *in << endl; - - // all done! - in->finish_waiting(CInode::WAIT_RENAMEACK); - - delete m; -} - -void Renamer::file_rename_finish(CDir *srcdir, CInode *in, Context *c) -{ - dout(10) << "file_rename_finish on " << *in << endl; - - // did i empty out an imported dir? FIXME this check should go somewhere else??? - if (srcdir->is_auth() && !srcdir->inode->is_auth() && srcdir->get_size() == 0) - cache->migrator->export_empty_import(srcdir); - - // finish our caller - if (c) { - c->finish(0); - delete c; - } -} - - -/************* src **************/ - - -/** handle_rename_req - * received by auth of src dentry (from init, or destauth if dir). - * src may not have dest dir open. 
- * src will export inode, unlink|rename, and send MRename to dest. - */ -void Renamer::handle_rename_req(MRenameReq *m) -{ - // i am auth, i will have it. - CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = srcdiri->dir; - CDentry *srcdn = srcdir->lookup(m->get_srcname()); - assert(srcdn); - - // do it - file_rename_foreign_src(srcdn, - m->get_destdirino(), m->get_destname(), m->get_destpath(), m->get_destauth(), - m->get_initiator()); - delete m; -} - - -void Renamer::file_rename_foreign_src(CDentry *srcdn, - inodeno_t destdirino, string& destname, string& destpath, int destauth, - int initiator) -{ - dout(7) << "file_rename_foreign_src " << *srcdn << endl; - - CDir *srcdir = srcdn->dir; - string srcname = srcdn->name; - - // (we're basically exporting this inode) - CInode *in = srcdn->inode; - assert(in); - assert(in->is_auth()); - - if (in->is_dir()) cache->show_subtrees(); - - // encode and export inode state - bufferlist inode_state; - cache->migrator->encode_export_inode(in, inode_state, destauth); - - // send - MRename *m = new MRename(initiator, - srcdir->ino(), srcdn->name, destdirino, destname, - inode_state); - mds->send_message_mds(m, destauth, MDS_PORT_CACHE); - - // have dest? - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = 0; - if (destdiri) destdir = destdiri->dir; - CDentry *destdn = 0; - if (destdir) destdn = destdir->lookup(m->get_destname()); - - // discover src - if (!destdn) { - dout(7) << "file_rename_foreign_src doesn't have destdn, discovering " << destpath << endl; - - filepath destfilepath = destpath; - vector trace; - int r = cache->path_traverse(destfilepath, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - return; - } - - assert(destdn); - - // update our cache - cache->rename_file(srcdn, destdn); - - // update imports/exports? 
- if (in->is_dir() && in->dir) - fix_renamed_dir(srcdir, in, destdir, true); // auth changed - - srcdn->_mark_dirty(); // fixme - - // proxy! - //in->state_set(CInode::STATE_PROXY); - //in->get(CInode::PIN_PROXY); - - // generate notify list (everybody but src|dst) and send warnings - set notify; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i != mds->get_nodeid() && // except the source - i != destauth) // and the dest - notify.insert(i); - } - file_rename_warn(in, notify); - - - // wait for MRenameNotifyAck's - in->add_waiter(CInode::WAIT_RENAMENOTIFYACK, - new C_MDC_RenameNotifyAck(this, in, initiator)); -} - -void Renamer::file_rename_warn(CInode *in, - set& notify) -{ - // note gather list - rename_waiting_for_ack[in->ino()] = notify; - - // send - for (set::iterator it = notify.begin(); - it != notify.end(); - it++) { - dout(10) << "file_rename_warn to " << *it << " for " << *in << endl; - mds->send_message_mds(new MRenameWarning(in->ino()), *it, MDS_PORT_CACHE); - } -} - - -void Renamer::handle_rename_notify_ack(MRenameNotifyAck *m) -{ - CInode *in = cache->get_inode(m->get_ino()); - assert(in); - dout(7) << "handle_rename_notify_ack on " << *in << endl; - - int source = m->get_source().num(); - rename_waiting_for_ack[in->ino()].erase(source); - if (rename_waiting_for_ack[in->ino()].empty()) { - // last one! - rename_waiting_for_ack.erase(in->ino()); - in->finish_waiting(CInode::WAIT_RENAMENOTIFYACK, 0); - } else { - dout(7) << "still waiting for " << rename_waiting_for_ack[in->ino()] << endl; - } -} - - -void Renamer::file_rename_ack(CInode *in, int initiator) -{ - // we got all our MNotifyAck's. - - // was i proxy (if not, it's cuz this was a local rename) - /*if (in->state_test(CInode::STATE_PROXY)) { - dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl; - in->state_clear(CInode::STATE_PROXY); - in->put(CInode::PIN_PROXY); - }*/ - - // done! 
- if (initiator == mds->get_nodeid()) { - // it's me, finish - dout(7) << "file_rename_ack i am initiator, finishing" << endl; - in->finish_waiting(CInode::WAIT_RENAMEACK); - } else { - // send ack - dout(7) << "file_rename_ack sending MRenameAck to initiator " << initiator << endl; - mds->send_message_mds(new MRenameAck(in->ino()), initiator, MDS_PORT_CACHE); - } -} - - - - -/************ dest *************/ - -/** handle_rename_prep - * received by auth of dest dentry to make sure they have src + dir open. - * this is so that when they get the inode and dir, they can update exports etc properly. - * will send MRenameReq to src. - */ -void Renamer::handle_rename_prep(MRenamePrep *m) -{ - // open src - filepath srcpath = m->get_srcpath(); - vector trace; - int r = cache->path_traverse(srcpath, trace, false, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - - if (r>0) return; - - // ok! - CInode *srcin = trace[trace.size()-1]->inode; - assert(srcin); - - dout(7) << "handle_rename_prep have srcin " << *srcin << endl; - - if (srcin->is_dir()) { - if (!srcin->dir) { - dout(7) << "handle_rename_prep need to open dir" << endl; - cache->open_remote_dir(srcin, frag_t(), // FIXME dirfrag - new C_MDS_RetryMessage(mds,m)); - return; - } - - dout(7) << "handle_rename_prep have dir " << *srcin->dir << endl; - } - - // pin - srcin->get(CInode::PIN_RENAMESRC); - - // send rename request - MRenameReq *req = new MRenameReq(m->get_initiator(), // i'm the initiator - m->get_srcdirino(), m->get_srcname(), - m->get_destdirino(), m->get_destname(), m->get_destpath(), - mds->get_nodeid()); // i am dest - mds->send_message_mds(req, m->get_srcauth(), MDS_PORT_CACHE); - delete m; - return; -} - - - -/** handle_rename - * received by auth of dest dentry. includes exported inode info. - * dest may not have srcdir open. 
- */ -void Renamer::handle_rename(MRename *m) -{ - // srcdn (required) - CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = srcdiri->dir; - CDentry *srcdn = srcdir->lookup(m->get_srcname()); - string srcname = srcdn->name; - assert(srcdn && srcdn->inode); - - dout(7) << "handle_rename srcdn " << *srcdn << endl; - - // destdn (required). i am auth, so i will have it. - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = destdiri->dir; - CDentry *destdn = destdir->lookup(m->get_destname()); - string destname = destdn->name; - assert(destdn); - - dout(7) << "handle_rename destdn " << *destdn << endl; - - // note old dir auth - int old_dir_auth = -1; - if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority().first; - - // rename replica into position - if (destdn->inode && destdn->inode->is_dirty()) - destdn->inode->mark_clean(); - - cache->rename_file(srcdn, destdn); - - // decode + import inode (into new location start) - int off = 0; - // HACK - bufferlist bufstate; - bufstate.claim_append(m->get_inode_state()); - cache->migrator->decode_import_inode(destdn, bufstate, off, m->get_source().num()); - - CInode *in = destdn->inode; - assert(in); - - // update imports/exports? - if (in->is_dir()) { - assert(in->dir); // i had better already ahve it open.. see MRenamePrep - fix_renamed_dir(srcdir, in, destdir, true, // auth changed - old_dir_auth); // src is possibly new dir auth. - } - - // mark dirty - destdn->_mark_dirty(); // fixme - in->_mark_dirty(); // fixme - - // unpin - in->put(CInode::PIN_RENAMESRC); - - // ok, send notifies. 
- set notify; - for (int i=0; iget_mds_map()->get_num_mds(); i++) { - if (i != m->get_source().num() && // except the source - i != mds->get_nodeid()) // and the dest - notify.insert(i); - } - file_rename_notify(in, srcdir, srcname, destdir, destname, notify, m->get_source().num()); - - delete m; -} - - -void Renamer::file_rename_notify(CInode *in, - CDir *srcdir, string& srcname, CDir *destdir, string& destname, - set& notify, - int srcauth) -{ - /* NOTE: notify list might include myself */ - - // tell - string destdirpath; - destdir->inode->make_path(destdirpath); - - for (set::iterator it = notify.begin(); - it != notify.end(); - it++) { - dout(10) << "file_rename_notify to " << *it << " for " << *in << endl; - mds->send_message_mds(new MRenameNotify(in->ino(), - srcdir->ino(), - srcname, - destdir->ino(), - destdirpath, - destname, - srcauth), - *it, MDS_PORT_CACHE); - } -} - - - -/************** bystanders ****************/ - -void Renamer::handle_rename_warning(MRenameWarning *m) -{ - // add to warning list - stray_rename_warnings.insert( m->get_ino() ); - - // did i already see the notify? - if (stray_rename_notifies.count(m->get_ino())) { - // i did, we're good. - dout(7) << "handle_rename_warning on " << m->get_ino() << ". already got notify." << endl; - - handle_rename_notify(stray_rename_notifies[m->get_ino()]); - stray_rename_notifies.erase(m->get_ino()); - } else { - dout(7) << "handle_rename_warning on " << m->get_ino() << ". waiting for notify." << endl; - } - - // done - delete m; -} - - -void Renamer::handle_rename_notify(MRenameNotify *m) -{ - // FIXME: when we do hard links, i think we need to - // have srcdn and destdn both, or neither, always! - - // did i see the warning yet? - if (!stray_rename_warnings.count(m->get_ino())) { - // wait for it. - dout(7) << "handle_rename_notify on " << m->get_ino() << ", waiting for warning." 
<< endl; - stray_rename_notifies[m->get_ino()] = m; - return; - } - - dout(7) << "handle_rename_notify dir " << m->get_srcdirino() << " dn " << m->get_srcname() << " to dir " << m->get_destdirino() << " dname " << m->get_destname() << endl; - - // src - CInode *srcdiri = cache->get_inode(m->get_srcdirino()); - CDir *srcdir = 0; - if (srcdiri) srcdir = srcdiri->dir; - CDentry *srcdn = 0; - if (srcdir) srcdn = srcdir->lookup(m->get_srcname()); - - // dest - CInode *destdiri = cache->get_inode(m->get_destdirino()); - CDir *destdir = 0; - if (destdiri) destdir = destdiri->dir; - CDentry *destdn = 0; - if (destdir) destdn = destdir->lookup(m->get_destname()); - - // have both? - list finished; - if (srcdn && destdir) { - CInode *in = srcdn->inode; - - int old_dir_auth = -1; - if (in && in->dir) old_dir_auth = in->dir->authority().first; - - if (!destdn) { - destdn = destdir->add_dentry(m->get_destname()); // create null dentry - destdn->lockstate = DN_LOCK_XLOCK; // that's xlocked! - } - - dout(7) << "handle_rename_notify renaming " << *srcdn << " to " << *destdn << endl; - - if (in) { - cache->rename_file(srcdn, destdn); - - // update imports/exports? 
- if (in && in->is_dir() && in->dir) { - fix_renamed_dir(srcdir, in, destdir, false, old_dir_auth); // auth didnt change - } - } else { - dout(7) << " i don't have the inode (just null dentries)" << endl; - } - - } - - else if (srcdn) { - dout(7) << "handle_rename_notify no dest, but have src" << endl; - dout(7) << "srcdn is " << *srcdn << endl; - - if (destdiri) { - dout(7) << "have destdiri, opening dir " << *destdiri << endl; - cache->open_remote_dir(destdiri, frag_t(), // FIXME dirfrag - new C_MDS_RetryMessage(mds,m)); - } else { - filepath destdirpath = m->get_destdirpath(); - dout(7) << "don't have destdiri even, doing traverse+discover on " << destdirpath << endl; - - vector trace; - int r = cache->path_traverse(destdirpath, trace, true, - m, new C_MDS_RetryMessage(mds, m), - MDS_TRAVERSE_DISCOVER); - assert(r>0); - } - return; - } - - else if (destdn) { - dout(7) << "handle_rename_notify unlinking dst only " << *destdn << endl; - if (destdn->inode) { - destdir->unlink_inode(destdn); - } - } - - else { - dout(7) << "handle_rename_notify didn't have srcdn or destdn" << endl; - assert(srcdn == 0 && destdn == 0); - } - - mds->queue_finished(finished); - - - // ack - dout(10) << "sending RenameNotifyAck back to srcauth " << m->get_srcauth() << endl; - MRenameNotifyAck *ack = new MRenameNotifyAck(m->get_ino()); - mds->send_message_mds(ack, m->get_srcauth(), MDS_PORT_CACHE); - - - stray_rename_warnings.erase( m->get_ino() ); - delete m; -} - - - - diff --git a/trunk/ceph/mds/Renamer.h b/trunk/ceph/mds/Renamer.h deleted file mode 100644 index f6f82c31ba9fc..0000000000000 --- a/trunk/ceph/mds/Renamer.h +++ /dev/null @@ -1,99 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, 
as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_RENAMER_H -#define __MDS_RENAMER_H - -#include "include/types.h" - -#include -#include -using std::map; -using std::set; - -class MDS; -class MDCache; -class CDentry; -class CInode; -class CDir; - -class Message; -class MRenameWarning; -class MRenameNotify; -class MRenameNotifyAck; -class MRename; -class MRenamePrep; -class MRenameReq; -class MRenameAck; - -class Renamer { - MDS *mds; - MDCache *cache; - - // rename fun - set stray_rename_warnings; // notifies i haven't seen - map stray_rename_notifies; - - map > rename_waiting_for_ack; - - - - void fix_renamed_dir(CDir *srcdir, - CInode *in, - CDir *destdir, - bool authchanged, // _inode_ auth changed - int dirauth=-1); // dirauth (for certain cases) - - -public: - Renamer(MDS *m, MDCache *c) : mds(m), cache(c) {} - - void dispatch(Message *m); - - // RENAME - // initiator - public: - void file_rename(CDentry *srcdn, CDentry *destdn, Context *c); - protected: - void handle_rename_ack(MRenameAck *m); // dest -> init (almost always) - void file_rename_finish(CDir *srcdir, CInode *in, Context *c); - friend class C_MDC_RenameAck; - - // src - void handle_rename_req(MRenameReq *m); // dest -> src - void file_rename_foreign_src(CDentry *srcdn, - inodeno_t destdirino, string& destname, string& destpath, int destauth, - int initiator); - void file_rename_warn(CInode *in, set& notify); - void handle_rename_notify_ack(MRenameNotifyAck *m); // bystanders -> src - void file_rename_ack(CInode *in, int initiator); - friend class C_MDC_RenameNotifyAck; - - // dest - void handle_rename_prep(MRenamePrep *m); // init -> dest - void handle_rename(MRename *m); // src -> dest - void file_rename_notify(CInode *in, - CDir *srcdir, string& srcname, CDir *destdir, string& destname, - set& notify, int srcauth); - - // bystander - void handle_rename_warning(MRenameWarning *m); // src -> bystanders - void handle_rename_notify(MRenameNotify *m); // 
dest -> bystanders - - -}; - -#endif - - diff --git a/trunk/ceph/mds/ScatterLock.h b/trunk/ceph/mds/ScatterLock.h index 56153ebef8409..5f5085e59b82a 100644 --- a/trunk/ceph/mds/ScatterLock.h +++ b/trunk/ceph/mds/ScatterLock.h @@ -58,7 +58,7 @@ inline const char *get_scatterlock_state_name(int s) { case LOCK_GLOCKT: return "gLockT"; case LOCK_TEMPSYNC: return "Tempsync"; - default: assert(0); + default: assert(0); return 0; } } @@ -95,6 +95,7 @@ public: return LOCK_LOCK; default: assert(0); + return 0; } } diff --git a/trunk/ceph/mds/Server.cc b/trunk/ceph/mds/Server.cc index 6ffae50603490..f37ace7f30488 100644 --- a/trunk/ceph/mds/Server.cc +++ b/trunk/ceph/mds/Server.cc @@ -60,6 +60,31 @@ using namespace std; #define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".server " +void Server::reopen_logger() +{ + static LogType mdserver_logtype; + static bool didit = false; + if (!didit) { + didit = true; + mdserver_logtype.add_inc("hcreq"); // handle client req + mdserver_logtype.add_inc("hsreq"); // slave + mdserver_logtype.add_inc("hcsess"); // client session + mdserver_logtype.add_inc("dcreq"); // dispatch client req + mdserver_logtype.add_inc("dsreq"); // slave + } + + if (logger) { + logger->flush(); + delete logger; + } + + // logger + char name[80]; + sprintf(name, "mds%d.server", mds->get_nodeid()); + logger = new Logger(name, &mdserver_logtype); +} + + void Server::dispatch(Message *m) { switch (m->get_type()) { @@ -152,6 +177,8 @@ void Server::handle_client_session(MClientSession *m) mdlog->submit_entry(new ESession(m->get_source_inst(), open, cmapv), new C_MDS_session_finish(mds, m->get_source_inst(), open, cmapv)); delete m; + + if (logger) logger->inc("hcsess"); } void Server::_session_logged(entity_inst_t client_inst, bool open, version_t cmapv) @@ -391,6 +418,8 @@ void Server::handle_client_request(MClientRequest *req) dout(4) << "handle_client_request " << *req << endl; int client = 
req->get_client(); + if (logger) logger->inc("hcreq"); + if (!mds->is_active()) { dout(5) << " not active, discarding client request." << endl; delete req; @@ -471,6 +500,8 @@ void Server::dispatch_client_request(MDRequest *mdr) { MClientRequest *req = mdr->client_request; + if (logger) logger->inc("dcreq"); + if (mdr->ref) { dout(7) << "dispatch_client_request " << *req << " ref " << *mdr->ref << endl; } else { @@ -508,8 +539,7 @@ void Server::dispatch_client_request(MDRequest *mdr) // funky. case MDS_OP_OPEN: - if ((req->args.open.flags & O_CREAT) && - !mdr->ref) + if (req->args.open.flags & O_CREAT) handle_client_openc(mdr); else handle_client_open(mdr); @@ -553,6 +583,8 @@ void Server::handle_slave_request(MMDSSlaveRequest *m) dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << endl; int from = m->get_source().num(); + if (logger) logger->inc("hsreq"); + // reply? if (m->is_reply()) { @@ -647,6 +679,8 @@ void Server::dispatch_slave_request(MDRequest *mdr) return; } + if (logger) logger->inc("dsreq"); + switch (mdr->slave_request->get_op()) { case MMDSSlaveRequest::OP_XLOCK: { @@ -921,7 +955,7 @@ CDentry* Server::prepare_null_dentry(MDRequest *mdr, CDir *dir, const string& dn } // create - dn = dir->add_dentry(dname, 0); + dn = dir->add_null_dentry(dname); dn->mark_new(); dout(10) << "prepare_null_dentry added " << *dn << endl; @@ -941,9 +975,6 @@ CInode* Server::prepare_new_inode(MDRequest *mdr, CDir *dir) in->inode.ctime = in->inode.mtime = in->inode.atime = mdr->now; // now dout(10) << "prepare_new_inode " << *in << endl; - // bump modify pop - mds->balancer->hit_dir(dir, META_POP_DWR); - return in; } @@ -1286,12 +1317,12 @@ void Server::handle_client_stat(MDRequest *mdr) set xlocks = mdr->xlocks; int mask = req->args.stat.mask; - if (mask & INODE_MASK_LINK) rdlocks.insert(&ref->linklock); - if (mask & INODE_MASK_AUTH) rdlocks.insert(&ref->authlock); + if (mask & STAT_MASK_LINK) rdlocks.insert(&ref->linklock); + if (mask & 
STAT_MASK_AUTH) rdlocks.insert(&ref->authlock); if (ref->is_file() && - mask & INODE_MASK_FILE) rdlocks.insert(&ref->filelock); + mask & STAT_MASK_FILE) rdlocks.insert(&ref->filelock); if (ref->is_dir() && - mask & INODE_MASK_MTIME) rdlocks.insert(&ref->dirlock); + mask & STAT_MASK_MTIME) rdlocks.insert(&ref->dirlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; @@ -1325,6 +1356,8 @@ public: // apply in->pop_and_dirty_projected_inode(); + mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); + // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply->set_result(0); @@ -1349,8 +1382,6 @@ void Server::handle_client_utime(MDRequest *mdr) if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; - mds->balancer->hit_inode(cur, META_POP_IWR); - // project update inode_t *pi = cur->project_inode(); pi->mtime = req->args.utime.mtime; @@ -1359,7 +1390,7 @@ void Server::handle_client_utime(MDRequest *mdr) pi->ctime = g_clock.real_now(); // log + wait - EUpdate *le = new EUpdate("utime"); + EUpdate *le = new EUpdate(mdlog, "utime"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_primary_dentry(cur->parent, true, 0, pi); @@ -1385,8 +1416,6 @@ void Server::handle_client_chmod(MDRequest *mdr) if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; - mds->balancer->hit_inode(cur, META_POP_IWR); - // project update inode_t *pi = cur->project_inode(); pi->mode = @@ -1396,7 +1425,7 @@ void Server::handle_client_chmod(MDRequest *mdr) pi->ctime = g_clock.real_now(); // log + wait - EUpdate *le = new EUpdate("chmod"); + EUpdate *le = new EUpdate(mdlog, "chmod"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_primary_dentry(cur->parent, true, 0, pi); @@ -1422,8 +1451,6 @@ void Server::handle_client_chown(MDRequest *mdr) if (!mds->locker->acquire_locks(mdr, rdlocks, 
wrlocks, xlocks)) return; - mds->balancer->hit_inode(cur, META_POP_IWR); - // project update inode_t *pi = cur->project_inode(); pi->uid = MAX(req->args.chown.uid, 0); @@ -1432,7 +1459,7 @@ void Server::handle_client_chown(MDRequest *mdr) pi->ctime = g_clock.real_now(); // log + wait - EUpdate *le = new EUpdate("chown"); + EUpdate *le = new EUpdate(mdlog, "chown"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_primary_dentry(cur->parent, true, 0, pi); @@ -1450,8 +1477,8 @@ void Server::handle_client_chown(MDRequest *mdr) // READDIR int Server::encode_dir_contents(CDir *dir, - list& inls, - list& dnls) + list& dnls, + list& inls) { int numfiles = 0; @@ -1463,15 +1490,25 @@ int Server::encode_dir_contents(CDir *dir, if (dn->is_null()) continue; CInode *in = dn->inode; - if (!in) - continue; // hmm, fixme!, what about REMOTE links? - - dout(12) << "including inode " << *in << endl; + InodeStat *st; + if (in) { + dout(12) << "including inode " << *in << endl; + + // add this item + // note: InodeStat makes note of whether inode data is readable. + st = new InodeStat(in, mds->get_nodeid()); + } else { + assert(dn->is_remote()); + dout(12) << "including inode-less (remote) dentry " << *dn << endl; + st = new InodeStat; + st->mask = STAT_MASK_INO | STAT_MASK_TYPE; + memset(&st->inode, 0, sizeof(st->inode)); + st->inode.ino = dn->get_remote_ino(); + st->inode.mode = DT_TO_MODE(dn->get_remote_d_type()); + } - // add this item - // note: InodeStat makes note of whether inode data is readable. 
dnls.push_back( it->first ); - inls.push_back( new InodeStat(in, mds->get_nodeid()) ); + inls.push_back(st); numfiles++; } return numfiles; @@ -1488,7 +1525,7 @@ void Server::handle_client_readdir(MDRequest *mdr) if (!diri->is_dir()) { // not a dir dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl; - reply_request(mdr, -ENOTDIR); + reply_request(mdr, -ENOTDIR, diri); return; } @@ -1498,7 +1535,7 @@ void Server::handle_client_readdir(MDRequest *mdr) // does the frag exist? if (diri->dirfragtree[fg] != fg) { dout(10) << "frag " << fg << " doesn't appear in fragtree " << diri->dirfragtree << endl; - reply_request(mdr, -EAGAIN); + reply_request(mdr, -EAGAIN, diri); return; } @@ -1525,7 +1562,7 @@ void Server::handle_client_readdir(MDRequest *mdr) // build dir contents list inls; list dnls; - int numfiles = encode_dir_contents(dir, inls, dnls); + int numfiles = encode_dir_contents(dir, dnls, inls); // . too //dnls.push_back("."); @@ -1534,12 +1571,13 @@ void Server::handle_client_readdir(MDRequest *mdr) // yay, reply MClientReply *reply = new MClientReply(req); - reply->take_dir_items(inls, dnls, numfiles); + reply->take_dir_items(dnls, inls, numfiles); dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; - reply->set_result(fg); - - //balancer->hit_dir(diri->dir); + reply->set_result(0); + + // bump popularity. NOTE: this doesn't quite capture it. 
+ mds->balancer->hit_dir(g_clock.now(), dir, META_POP_IRD, numfiles); // reply reply_request(mdr, reply, diri); @@ -1565,7 +1603,7 @@ public: assert(r == 0); // link the inode - dn->get_dir()->link_inode(dn, newi); + dn->get_dir()->link_primary_inode(dn, newi); // dirty inode, dn, dir newi->mark_dirty(newi->inode.version + 1); @@ -1574,7 +1612,8 @@ public: mds->server->dirty_dn_diri(dn, dirpv, newi->inode.ctime); // hit pop - mds->balancer->hit_inode(newi, META_POP_IWR); + mds->balancer->hit_inode(mdr->now, newi, META_POP_IWR); + //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); @@ -1596,13 +1635,14 @@ void Server::handle_client_mknod(MDRequest *mdr) assert(newi); // it's a file. + newi->inode.rdev = req->args.mknod.rdev; newi->inode.mode = req->args.mknod.mode; newi->inode.mode &= ~INODE_TYPE_MASK; newi->inode.mode |= INODE_MODE_FILE; newi->inode.version = dn->pre_dirty() - 1; // prepare finisher - EUpdate *le = new EUpdate("mknod"); + EUpdate *le = new EUpdate(mdlog, "mknod"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too @@ -1643,7 +1683,7 @@ void Server::handle_client_mkdir(MDRequest *mdr) newdir->mark_dirty(newdir->pre_dirty()); // prepare finisher - EUpdate *le = new EUpdate("mkdir"); + EUpdate *le = new EUpdate(mdlog, "mkdir"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too @@ -1655,7 +1695,6 @@ void Server::handle_client_mkdir(MDRequest *mdr) mdlog->submit_entry(le); mdlog->wait_for_sync(new C_MDS_mknod_finish(mds, mdr, dn, newi, dirpv)); - /* old export heuristic. pbly need to reimplement this at some point. 
if ( diri->dir->is_auth() && @@ -1692,7 +1731,7 @@ void Server::handle_client_symlink(MDRequest *mdr) newi->inode.version = dn->pre_dirty() - 1; // prepare finisher - EUpdate *le = new EUpdate("symlink"); + EUpdate *le = new EUpdate(mdlog, "symlink"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(newi->ino(), mds->idalloc->get_version()); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir mtime too @@ -1840,7 +1879,7 @@ void Server::_link_local(MDRequest *mdr, CDentry *dn, CInode *targeti) pi->version = tipv; // log + wait - EUpdate *le = new EUpdate("link_local"); + EUpdate *le = new EUpdate(mdlog, "link_local"); le->metablob.add_client_req(mdr->reqid); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime le->metablob.add_dir_context(dn->get_dir()); @@ -1858,7 +1897,7 @@ void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, dout(10) << "_link_local_finish " << *dn << " to " << *targeti << endl; // link and unlock the NEW dentry - dn->dir->link_inode(dn, targeti->ino()); + dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); dn->mark_dirty(dnpv); // target inode @@ -1868,7 +1907,8 @@ void Server::_link_local_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, dirty_dn_diri(dn, dirpv, mdr->now); // bump target popularity - mds->balancer->hit_inode(targeti, META_POP_IWR); + mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); + //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); @@ -1921,7 +1961,7 @@ void Server::_link_remote(MDRequest *mdr, CDentry *dn, CInode *targeti) dn->pre_dirty(); // add to event - EUpdate *le = new EUpdate("link_remote"); + EUpdate *le = new EUpdate(mdlog, "link_remote"); le->metablob.add_client_req(mdr->reqid); version_t dirpv = predirty_dn_diri(mdr, dn, &le->metablob); // dir inode's mtime 
le->metablob.add_dir_context(dn->get_dir()); @@ -1941,14 +1981,15 @@ void Server::_link_remote_finish(MDRequest *mdr, CDentry *dn, CInode *targeti, dout(10) << "_link_remote_finish " << *dn << " to " << *targeti << endl; // link the new dentry - dn->dir->link_inode(dn, targeti->ino()); + dn->dir->link_remote_inode(dn, targeti->ino(), MODE_TO_DT(targeti->inode.mode)); dn->mark_dirty(dpv); // dir inode's mtime dirty_dn_diri(dn, dirpv, mdr->now); // bump target popularity - mds->balancer->hit_inode(targeti, META_POP_IWR); + mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); + //mds->balancer->hit_dir(mdr->now, dn->get_dir(), META_POP_DWR); // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); @@ -2019,7 +2060,7 @@ void Server::handle_slave_link_prep(MDRequest *mdr) dout(10) << " projected inode " << pi << " v " << pi->version << endl; // journal it - ESlaveUpdate *le = new ESlaveUpdate("slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); le->metablob.add_dir_context(targeti->get_parent_dir()); le->metablob.add_primary_dentry(dn, true, targeti, pi); // update old primary mds->mdlog->submit_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, old_ctime, inc)); @@ -2052,6 +2093,9 @@ void Server::_logged_slave_link(MDRequest *mdr, CInode *targeti, utime_t old_cti // update the target targeti->pop_and_dirty_projected_inode(); + // hit pop + mds->balancer->hit_inode(mdr->now, targeti, META_POP_IWR); + // ack MMDSSlaveRequest *reply = new MMDSSlaveRequest(mdr->reqid, MMDSSlaveRequest::OP_LINKPREPACK); mds->send_message_mds(reply, mdr->slave_to_mds, MDS_PORT_SERVER); @@ -2076,9 +2120,9 @@ void Server::_commit_slave_link(MDRequest *mdr, int r, CInode *targeti, ESlaveUpdate *le; if (r == 0) { // write a commit to the journal - le = new ESlaveUpdate("slave_link_commit", mdr->reqid, mdr->slave_to_mds, 
ESlaveUpdate::OP_COMMIT); + le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); } else { - le = new ESlaveUpdate("slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); + le = new ESlaveUpdate(mdlog, "slave_link_rollback", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); // -- rollback in memory -- assert(targeti->inode.ctime == mdr->now); @@ -2264,7 +2308,7 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) // ok, let's do it. // prepare log entry - EUpdate *le = new EUpdate("unlink_local"); + EUpdate *le = new EUpdate(mdlog, "unlink_local"); le->metablob.add_client_req(mdr->reqid); version_t ipv = 0; // dirty inode version @@ -2298,17 +2342,11 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) if (mdr->dst_reanchor_atid) le->metablob.add_anchor_transaction(mdr->dst_reanchor_atid); - // finisher - C_MDS_unlink_local_finish *fin = new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, - dirpv); - - journal_opens(); // journal pending opens, just in case - // log + wait + journal_opens(); // journal pending opens, just in case mdlog->submit_entry(le); - mdlog->wait_for_sync(fin); - - mds->balancer->hit_dir(dn->dir, META_POP_DWR); + mdlog->wait_for_sync(new C_MDS_unlink_local_finish(mds, mdr, dn, straydn, + dirpv)); } void Server::_unlink_local_finish(MDRequest *mdr, @@ -2322,7 +2360,7 @@ void Server::_unlink_local_finish(MDRequest *mdr, dn->dir->unlink_inode(dn); // relink as stray? (i.e. was primary link?) 
- if (straydn) straydn->dir->link_inode(straydn, in); + if (straydn) straydn->dir->link_primary_inode(straydn, in); // nlink--, dirty old dentry in->pop_and_dirty_projected_inode(); @@ -2331,9 +2369,6 @@ void Server::_unlink_local_finish(MDRequest *mdr, // dir inode's mtime dirty_dn_diri(dn, dirpv, mdr->now); - // bump target popularity - mds->balancer->hit_dir(dn->dir, META_POP_DWR); - // share unlink news with replicas for (map::iterator it = dn->replicas_begin(); it != dn->replicas_end(); @@ -2352,6 +2387,9 @@ void Server::_unlink_local_finish(MDRequest *mdr, if (mdr->dst_reanchor_atid) mds->anchorclient->commit(mdr->dst_reanchor_atid); + // bump pop + //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); + // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref @@ -2404,7 +2442,7 @@ void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) // ok, let's do it. // prepare log entry - EUpdate *le = new EUpdate("unlink_remote"); + EUpdate *le = new EUpdate(mdlog, "unlink_remote"); le->metablob.add_client_req(mdr->reqid); // the unlinked dentry @@ -2427,8 +2465,6 @@ void Server::_unlink_remote(MDRequest *mdr, CDentry *dn) // log + wait mdlog->submit_entry(le); mdlog->wait_for_sync(fin); - - mds->balancer->hit_dir(dn->dir, META_POP_DWR); } void Server::_unlink_remote_finish(MDRequest *mdr, @@ -2444,9 +2480,6 @@ void Server::_unlink_remote_finish(MDRequest *mdr, // dir inode's mtime dirty_dn_diri(dn, dirpv, mdr->now); - // bump target popularity - mds->balancer->hit_dir(dn->dir, META_POP_DWR); - // share unlink news with replicas for (map::iterator it = dn->replicas_begin(); it != dn->replicas_end(); @@ -2460,6 +2493,8 @@ void Server::_unlink_remote_finish(MDRequest *mdr, if (mdr->dst_reanchor_atid) mds->anchorclient->commit(mdr->dst_reanchor_atid); + //mds->balancer->hit_dir(mdr->now, dn->dir, META_POP_DWR); + // reply MClientReply *reply = new 
MClientReply(mdr->client_request, 0); reply_request(mdr, reply, dn->dir->get_inode()); // FIXME: imprecise ref @@ -2810,7 +2845,7 @@ void Server::handle_client_rename(MDRequest *mdr) } // -- prepare journal entry -- - EUpdate *le = new EUpdate("rename"); + EUpdate *le = new EUpdate(mdlog, "rename"); le->metablob.add_client_req(mdr->reqid); _rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn); @@ -2840,9 +2875,17 @@ void Server::_rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDe if (mdr->src_reanchor_atid) mds->anchorclient->commit(mdr->src_reanchor_atid); if (mdr->dst_reanchor_atid) mds->anchorclient->commit(mdr->dst_reanchor_atid); + // bump popularity + //if (srcdn->is_auth()) + //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); + // mds->balancer->hit_dir(mdr->now, destdn->get_dir(), META_POP_DWR); + if (destdn->is_remote() && + destdn->inode->is_auth()) + mds->balancer->hit_inode(mdr->now, destdn->get_inode(), META_POP_IWR); + // reply MClientReply *reply = new MClientReply(mdr->client_request, 0); - reply_request(mdr, reply, destdn->dir->get_inode()); // FIXME: imprecise ref + reply_request(mdr, reply, destdn->get_inode()); // FIXME: imprecise ref // clean up? 
if (straydn) @@ -2869,8 +2912,8 @@ void Server::_rename_prepare(MDRequest *mdr, mdr->pvmap[srcdn->dir->inode] = predirty_dn_diri(mdr, srcdn, metablob); } - inode_t *ji = 0; // journaled inode getting nlink-- - version_t ipv; // it's version + inode_t *ji; // journaled inode getting nlink-- + version_t ipv = 0; // it's version if (linkmerge) { dout(10) << "will merge remote+primary links" << endl; @@ -2951,7 +2994,7 @@ void Server::_rename_prepare(MDRequest *mdr, } } - if (ji) { + if (ipv) { // update journaled target inode inode_t *pi = destdn->inode->project_inode(); pi->nlink--; @@ -3007,7 +3050,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen // move inode to dest srcdn->dir->unlink_inode(srcdn); destdn->dir->unlink_inode(destdn); - destdn->dir->link_inode(destdn, oldin); + destdn->dir->link_primary_inode(destdn, oldin); // nlink-- destdn->inode->inode.nlink--; @@ -3030,7 +3073,7 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen // relink oldin to stray dir. destdn was primary. assert(oldin); - straydn->dir->link_inode(straydn, oldin); + straydn->dir->link_primary_inode(straydn, oldin); //assert(straypv == ipv); // nlink-- in stray dir. @@ -3052,13 +3095,13 @@ void Server::_rename_apply(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDen if (srcdn->is_remote()) { // srcdn was remote. srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_inode(destdn, in->ino()); + destdn->dir->link_remote_inode(destdn, in->ino(), MODE_TO_DT(in->inode.mode)); if (destdn->is_auth()) destdn->mark_dirty(mdr->pvmap[destdn]); } else { // srcdn was primary. srcdn->dir->unlink_inode(srcdn); - destdn->dir->link_inode(destdn, in); + destdn->dir->link_primary_inode(destdn, in); // srcdn inode import? if (!srcdn->is_auth() && destdn->is_auth()) { @@ -3171,7 +3214,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) destdn->inode->is_auth() || srcdn->inode->is_any_caps()) { // journal. 
- ESlaveUpdate *le = new ESlaveUpdate("slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); + ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_PREPARE); _rename_prepare(mdr, &le->metablob, srcdn, destdn, straydn); mds->mdlog->submit_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn)); } else { @@ -3201,6 +3244,12 @@ void Server::_logged_slave_rename(MDRequest *mdr, // set up commit waiter mdr->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn); + // bump popularity + //if (srcdn->is_auth()) + //mds->balancer->hit_dir(mdr->now, srcdn->get_dir(), META_POP_DWR); + if (destdn->inode->is_auth()) + mds->balancer->hit_inode(mdr->now, destdn->inode, META_POP_IWR); + // done. delete mdr->slave_request; mdr->slave_request = 0; @@ -3217,10 +3266,10 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, _rename_apply(mdr, srcdn, destdn, straydn); // write a commit to the journal - le = new ESlaveUpdate("slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); + le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT); } else { // abort - le = new ESlaveUpdate("slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); + le = new ESlaveUpdate(mdlog, "slave_rename_abort", mdr->reqid, mdr->slave_to_mds, ESlaveUpdate::OP_ROLLBACK); } mds->mdlog->submit_entry(le); } @@ -3274,7 +3323,8 @@ void Server::handle_slave_rename_get_inode(MDRequest *mdr) map exported_client_map; bufferlist inodebl; mdcache->migrator->encode_export_inode(mdr->srcdn->inode, inodebl, mdr->slave_to_mds, - exported_client_map); + exported_client_map, + mdr->now); ::_encode(exported_client_map, reply->inode_export); reply->inode_export.claim_append(inodebl); @@ -3338,9 +3388,6 @@ public: in->inode.mtime = ctime; in->mark_dirty(pv); - // hit pop - mds->balancer->hit_inode(in, 
META_POP_IWR); - // reply mds->server->reply_request(mdr, 0); } @@ -3397,7 +3444,7 @@ void Server::handle_client_truncate(MDRequest *mdr) pdv, req->args.truncate.length, ctime); // log + wait - EUpdate *le = new EUpdate("truncate"); + EUpdate *le = new EUpdate(mdlog, "truncate"); le->metablob.add_client_req(mdr->reqid); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_inode_truncate(cur->inode, req->args.truncate.length); @@ -3478,11 +3525,12 @@ void Server::_do_open(MDRequest *mdr, CInode *cur) << " on " << *cur << endl; // hit pop + mdr->now = g_clock.now(); if (cmode == FILE_MODE_RW || cmode == FILE_MODE_W) - mds->balancer->hit_inode(cur, META_POP_IWR); + mds->balancer->hit_inode(mdr->now, cur, META_POP_IWR); else - mds->balancer->hit_inode(cur, META_POP_IRD); + mds->balancer->hit_inode(mdr->now, cur, META_POP_IRD); // reply MClientReply *reply = new MClientReply(req, 0); @@ -3526,7 +3574,7 @@ void Server::journal_opens() ++p) { (*p)->put(CInode::PIN_BATCHOPENJOURNAL); if ((*p)->is_any_caps()) { - if (!le) le = new EOpen; + if (!le) le = new EOpen(mdlog); le->add_inode(*p); (*p)->last_open_journaled = mds->mdlog->get_write_pos(); } @@ -3572,9 +3620,6 @@ public: in->inode.mtime = ctime; in->mark_dirty(pv); - // hit pop - mds->balancer->hit_inode(in, META_POP_IWR); - // do the open mds->server->_do_open(mdr, in); } @@ -3594,6 +3639,9 @@ public: void finish(int r) { assert(r == 0); + // hit pop + mds->balancer->hit_inode(mdr->now, in, META_POP_IWR); + // purge also... 
mds->mdcache->purge_inode(&in->inode, 0); mds->mdcache->wait_for_purge(in->inode.ino, 0, @@ -3614,7 +3662,7 @@ void Server::handle_client_opent(MDRequest *mdr) pdv, ctime); // log + wait - EUpdate *le = new EUpdate("open_truncate"); + EUpdate *le = new EUpdate(mdlog, "open_truncate"); le->metablob.add_client_req(mdr->reqid); le->metablob.add_dir_context(cur->get_parent_dir()); le->metablob.add_inode_truncate(cur->inode, 0); @@ -3644,7 +3692,7 @@ public: assert(r == 0); // link the inode - dn->get_dir()->link_inode(dn, newi); + dn->get_dir()->link_primary_inode(dn, newi); // dirty inode, dn, dir newi->mark_dirty(pv); @@ -3656,9 +3704,6 @@ public: mdr->ref = newi; mdr->pin(newi); - // hit pop - mds->balancer->hit_inode(newi, META_POP_IWR); - // ok, do the open. mds->server->handle_client_open(mdr); } @@ -3702,7 +3747,7 @@ void Server::handle_client_openc(MDRequest *mdr) // prepare finisher C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, mdr, dn, in); - EUpdate *le = new EUpdate("openc"); + EUpdate *le = new EUpdate(mdlog, "openc"); le->metablob.add_client_req(req->get_reqid()); le->metablob.add_allocated_ino(in->ino(), mds->idalloc->get_version()); le->metablob.add_dir_context(dn->dir); diff --git a/trunk/ceph/mds/Server.h b/trunk/ceph/mds/Server.h index 2a32c1b41b968..305e9f88a6872 100644 --- a/trunk/ceph/mds/Server.h +++ b/trunk/ceph/mds/Server.h @@ -17,6 +17,7 @@ #include "MDS.h" +class Logger; class LogEvent; class C_MDS_rename_finish; class MDRequest; @@ -29,13 +30,20 @@ class Server { MDCache *mdcache; MDLog *mdlog; Messenger *messenger; + Logger *logger; public: Server(MDS *m) : mds(m), mdcache(mds->mdcache), mdlog(mds->mdlog), - messenger(mds->messenger) { + messenger(mds->messenger), + logger(0) { } + ~Server() { + delete logger; + } + + void reopen_logger(); // message handler void dispatch(Message *m); @@ -93,7 +101,9 @@ public: void handle_client_chmod(MDRequest *mdr); void handle_client_chown(MDRequest *mdr); void handle_client_readdir(MDRequest 
*mdr); - int encode_dir_contents(CDir *dir, list& inls, list& dnls); + int encode_dir_contents(CDir *dir, + list& dnls, + list& inls); void handle_client_truncate(MDRequest *mdr); void handle_client_fsync(MDRequest *mdr); diff --git a/trunk/ceph/mds/SimpleLock.h b/trunk/ceph/mds/SimpleLock.h index 42ab3a596d61f..8e26bec25061f 100644 --- a/trunk/ceph/mds/SimpleLock.h +++ b/trunk/ceph/mds/SimpleLock.h @@ -38,7 +38,7 @@ inline const char *get_lock_type_name(int t) { case LOCK_OTYPE_ILINK: return "ilink"; case LOCK_OTYPE_IDIRFRAGTREE: return "idft"; case LOCK_OTYPE_IDIR: return "idir"; - default: assert(0); + default: assert(0); return 0; } } @@ -58,7 +58,7 @@ inline const char *get_simplelock_state_name(int n) { case LOCK_LOCK: return "lock"; case LOCK_GLOCKR: return "glockr"; case LOCK_REMOTEXLOCK: return "remote_xlock"; - default: assert(0); + default: assert(0); return 0; } } @@ -82,7 +82,7 @@ protected: // lock state int state; - set gather_set; // auth + set gather_set; // auth // local state int num_rdlock; diff --git a/trunk/ceph/mds/events/EExport.h b/trunk/ceph/mds/events/EExport.h index 29d8e0df08f49..89534f12b51bf 100644 --- a/trunk/ceph/mds/events/EExport.h +++ b/trunk/ceph/mds/events/EExport.h @@ -31,11 +31,10 @@ protected: set bounds; public: - EExport(CDir *dir) : LogEvent(EVENT_EXPORT), - base(dir->dirfrag()) { - metablob.add_dir_context(dir); - } EExport() : LogEvent(EVENT_EXPORT) { } + EExport(MDLog *mdlog, CDir *dir) : + LogEvent(EVENT_EXPORT), metablob(mdlog), + base(dir->dirfrag()) { } set &get_bounds() { return bounds; } diff --git a/trunk/ceph/mds/events/EFragment.h b/trunk/ceph/mds/events/EFragment.h index bb68e32891929..64969111193c0 100644 --- a/trunk/ceph/mds/events/EFragment.h +++ b/trunk/ceph/mds/events/EFragment.h @@ -20,30 +20,30 @@ class EFragment : public LogEvent { public: + EMetaBlob metablob; inodeno_t ino; frag_t basefrag; int bits; // positive for split (from basefrag), negative for merge (to basefrag) - EMetaBlob metablob; 
EFragment() : LogEvent(EVENT_FRAGMENT) { } - EFragment(inodeno_t i, frag_t bf, int b) : - LogEvent(EVENT_FRAGMENT), - ino(i), basefrag(bf), bits(b) { } + EFragment(MDLog *mdlog, inodeno_t i, frag_t bf, int b) : + LogEvent(EVENT_FRAGMENT), metablob(mdlog), + ino(i), basefrag(bf), bits(b) { } void print(ostream& out) { out << "EFragment " << ino << " " << basefrag << " by " << bits << " " << metablob; } void encode_payload(bufferlist& bl) { - ::_encode(ino, bl); - ::_encode(basefrag, bl); + ::_encode(ino, bl); + ::_encode(basefrag, bl); ::_encode(bits, bl); - metablob._encode(bl); + metablob._encode(bl); } void decode_payload(bufferlist& bl, int& off) { - ::_decode(ino, bl, off); - ::_decode(basefrag, bl, off); + ::_decode(ino, bl, off); + ::_decode(basefrag, bl, off); ::_decode(bits, bl, off); - metablob._decode(bl, off); + metablob._decode(bl, off); } bool has_expired(MDS *mds); diff --git a/trunk/ceph/mds/events/EMetaBlob.h b/trunk/ceph/mds/events/EMetaBlob.h index e20b2b794b59d..9572c3e315a1c 100644 --- a/trunk/ceph/mds/events/EMetaBlob.h +++ b/trunk/ceph/mds/events/EMetaBlob.h @@ -24,6 +24,7 @@ using namespace std; #include "../CDentry.h" class MDS; +class MDLog; /* * a bunch of metadata in the journal @@ -51,27 +52,26 @@ class EMetaBlob { string symlink; bool dirty; - fullbit(const string& d, version_t v, inode_t& i, bool dr) : dn(d), dnv(v), inode(i), dirty(dr) { } - fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) : dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { } + fullbit(const string& d, version_t v, inode_t& i, bool dr) : + dn(d), dnv(v), inode(i), dirty(dr) { } + fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) : + dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { } fullbit(bufferlist& bl, int& off) { _decode(bl, off); } void _encode(bufferlist& bl) { ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&inode, sizeof(inode)); + ::_encode(dnv, bl); + ::_encode(inode, bl); if 
(inode.is_symlink()) ::_encode(symlink, bl); - bl.append((char*)&dirty, sizeof(dirty)); + ::_encode(dirty, bl); } void _decode(bufferlist& bl, int& off) { ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(inode), (char*)&inode); - off += sizeof(inode); + ::_decode(dnv, bl, off); + ::_decode(inode, bl, off); if (inode.is_symlink()) ::_decode(symlink, bl, off); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); + ::_decode(dirty, bl, off); } void print(ostream& out) { out << " fullbit dn " << dn << " dnv " << dnv @@ -86,24 +86,25 @@ class EMetaBlob { string dn; version_t dnv; inodeno_t ino; + unsigned char d_type; bool dirty; - remotebit(const string& d, version_t v, inodeno_t i, bool dr) : dn(d), dnv(v), ino(i), dirty(dr) { } + remotebit(const string& d, version_t v, inodeno_t i, unsigned char dt, bool dr) : + dn(d), dnv(v), ino(i), d_type(dt), dirty(dr) { } remotebit(bufferlist& bl, int& off) { _decode(bl, off); } void _encode(bufferlist& bl) { ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&ino, sizeof(ino)); - bl.append((char*)&dirty, sizeof(dirty)); + ::_encode(dnv, bl); + ::_encode(ino, bl); + ::_encode(d_type, bl); + ::_encode(dirty, bl); } void _decode(bufferlist& bl, int& off) { ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(ino), (char*)&ino); - off += sizeof(ino); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); + ::_decode(dnv, bl, off); + ::_decode(ino, bl, off); + ::_decode(d_type, bl, off); + ::_decode(dirty, bl, off); } void print(ostream& out) { out << " remotebit dn " << dn << " dnv " << dnv @@ -123,15 +124,13 @@ class EMetaBlob { nullbit(bufferlist& bl, int& off) { _decode(bl, off); } void _encode(bufferlist& bl) { ::_encode(dn, bl); - bl.append((char*)&dnv, sizeof(dnv)); - bl.append((char*)&dirty, sizeof(dirty)); + ::_encode(dnv, bl); + ::_encode(dirty, bl); 
} void _decode(bufferlist& bl, int& off) { ::_decode(dn, bl, off); - bl.copy(off, sizeof(dnv), (char*)&dnv); - off += sizeof(dnv); - bl.copy(off, sizeof(dirty), (char*)&dirty); - off += sizeof(dirty); + ::_decode(dnv, bl, off); + ::_decode(dirty, bl, off); } void print(ostream& out) { out << " nullbit dn " << dn << " dnv " << dnv @@ -142,6 +141,7 @@ class EMetaBlob { /* dirlump - contains metadata for any dir we have contents for. */ +public: struct dirlump { static const int STATE_COMPLETE = (1<<1); static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! @@ -210,7 +210,7 @@ class EMetaBlob { ::_encode(nremote, bl); ::_encode(nnull, bl); _encode_bits(); - ::_encode(dnbl, bl); + ::_encode_destructively(dnbl, bl); } void _decode(bufferlist& bl, int& off) { ::_decode(dirv, bl, off); @@ -222,7 +222,8 @@ class EMetaBlob { dn_decoded = false; // don't decode bits unless we need them. } }; - + +private: // my lumps. preserve the order we added them in a list. list lump_order; map lump_map; @@ -244,6 +245,13 @@ class EMetaBlob { list client_reqs; public: + // soft state + off_t last_subtree_map; + off_t my_offset; + + EMetaBlob() : last_subtree_map(0), my_offset(0) { } + EMetaBlob(MDLog *mdl); // defined in journal.cc + void print(ostream& out) { for (list::iterator p = lump_order.begin(); p != lump_order.end(); @@ -274,9 +282,10 @@ class EMetaBlob { } void add_null_dentry(CDentry *dn, bool dirty) { + add_null_dentry(add_dir(dn->get_dir(), false), dn, dirty); + } + void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) { // add the dir - dirlump& lump = add_dir(dn->get_dir(), false); - lump.nnull++; if (dirty) lump.get_dnull().push_front(nullbit(dn->get_name(), @@ -289,29 +298,41 @@ class EMetaBlob { } void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino=0) { - if (!rino) + add_remote_dentry(add_dir(dn->get_dir(), false), + dn, dirty, rino); + } + void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty, + inodeno_t rino=0, 
unsigned char rdt=0) { + if (!rino) { rino = dn->get_remote_ino(); - - dirlump& lump = add_dir(dn->get_dir(), false); - + rdt = dn->get_remote_d_type(); + } lump.nremote++; if (dirty) lump.get_dremote().push_front(remotebit(dn->get_name(), dn->get_projected_version(), - rino, + rino, rdt, dirty)); else lump.get_dremote().push_back(remotebit(dn->get_name(), dn->get_projected_version(), - rino, + rino, rdt, dirty)); } // return remote pointer to to-be-journaled inode - inode_t *add_primary_dentry(CDentry *dn, bool dirty, CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { - if (!in) in = dn->get_inode(); + inode_t *add_primary_dentry(CDentry *dn, bool dirty, + CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { + return add_primary_dentry(add_dir(dn->get_dir(), false), + dn, dirty, in, pi, pdft); + } + inode_t *add_primary_dentry(dirlump& lump, CDentry *dn, bool dirty, + CInode *in=0, inode_t *pi=0, fragtree_t *pdft=0) { + if (!in) + in = dn->get_inode(); - dirlump& lump = add_dir(dn->get_dir(), false); + // make note of where this inode was last journaled + in->last_journaled = my_offset; lump.nfull++; if (dirty) { @@ -333,6 +354,10 @@ class EMetaBlob { // convenience: primary or remote? figure it out. 
inode_t *add_dentry(CDentry *dn, bool dirty) { + dirlump& lump = add_dir(dn->get_dir(), false); + return add_dentry(lump, dn, dirty); + } + inode_t *add_dentry(dirlump& lump, CDentry *dn, bool dirty) { // primary or remote if (dn->is_remote()) { add_remote_dentry(dn, dirty); @@ -347,10 +372,12 @@ class EMetaBlob { dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) { - dirfrag_t df = dir->dirfrag(); + return add_dir(dir->dirfrag(), dir->get_projected_version(), dirty, complete); + } + dirlump& add_dir(dirfrag_t df, version_t pv, bool dirty, bool complete=false) { if (lump_map.count(df) == 0) { lump_order.push_back(df); - lump_map[df].dirv = dir->get_projected_version(); + lump_map[df].dirv = pv; } dirlump& l = lump_map[df]; if (complete) l.mark_complete(); @@ -366,16 +393,22 @@ class EMetaBlob { if (lump_map.count(dir->dirfrag())) return; - // stop at subtree root? - if (mode == TO_AUTH_SUBTREE_ROOT && - dir->is_subtree_root() && dir->is_auth()) - return; + if (mode == TO_AUTH_SUBTREE_ROOT) { + // subtree root? + if (dir->is_subtree_root() && dir->is_auth()) + return; + // was the inode journaled since the last subtree_map? + if (dir->inode->last_journaled >= last_subtree_map) + return; + } // stop at root/stray CInode *diri = dir->get_inode(); if (!diri->get_parent_dn()) return; + // journaled? 
+ // add parent dn CDentry *parent = diri->get_parent_dn(); add_dir_context(parent->get_dir(), mode); diff --git a/trunk/ceph/mds/events/EOpen.h b/trunk/ceph/mds/events/EOpen.h index c0ce5218e7146..b0b8911e414dd 100644 --- a/trunk/ceph/mds/events/EOpen.h +++ b/trunk/ceph/mds/events/EOpen.h @@ -24,9 +24,9 @@ public: list inos; EOpen() : LogEvent(EVENT_OPEN) { } - EOpen(CInode *in) : LogEvent(EVENT_OPEN) { - add_inode(in); - } + EOpen(MDLog *mdlog) : + LogEvent(EVENT_OPEN), metablob(mdlog) { } + void print(ostream& out) { out << "EOpen " << metablob; } diff --git a/trunk/ceph/mds/events/ESlaveUpdate.h b/trunk/ceph/mds/events/ESlaveUpdate.h index 51539234d4617..23d280a7b831c 100644 --- a/trunk/ceph/mds/events/ESlaveUpdate.h +++ b/trunk/ceph/mds/events/ESlaveUpdate.h @@ -24,15 +24,15 @@ public: const static int OP_COMMIT = 2; const static int OP_ROLLBACK = 3; + EMetaBlob metablob; string type; metareqid_t reqid; int master; int op; // prepare, commit, abort - EMetaBlob metablob; ESlaveUpdate() : LogEvent(EVENT_SLAVEUPDATE) { } - ESlaveUpdate(const char *s, metareqid_t ri, int mastermds, int o) : - LogEvent(EVENT_SLAVEUPDATE), + ESlaveUpdate(MDLog *mdlog, const char *s, metareqid_t ri, int mastermds, int o) : + LogEvent(EVENT_SLAVEUPDATE), metablob(mdlog), type(s), reqid(ri), master(mastermds), diff --git a/trunk/ceph/mds/events/EUpdate.h b/trunk/ceph/mds/events/EUpdate.h index 02c5d3ece2569..afc0b708bd916 100644 --- a/trunk/ceph/mds/events/EUpdate.h +++ b/trunk/ceph/mds/events/EUpdate.h @@ -24,8 +24,9 @@ public: string type; EUpdate() : LogEvent(EVENT_UPDATE) { } - EUpdate(const char *s) : LogEvent(EVENT_UPDATE), - type(s) { } + EUpdate(MDLog *mdlog, const char *s) : + LogEvent(EVENT_UPDATE), metablob(mdlog), + type(s) { } void print(ostream& out) { if (type.length()) diff --git a/trunk/ceph/mds/journal.cc b/trunk/ceph/mds/journal.cc index 7770afed866fa..e09a4f1eaf1e7 100644 --- a/trunk/ceph/mds/journal.cc +++ b/trunk/ceph/mds/journal.cc @@ -68,6 +68,13 @@ void 
EString::replay(MDS *mds) // ----------------------- // EMetaBlob +EMetaBlob::EMetaBlob(MDLog *mdlog) : + last_subtree_map(mdlog->get_last_subtree_map_offset()), + my_offset(mdlog->get_write_pos()) +{ +} + + /* * we need to ensure that a journaled item has either * @@ -99,7 +106,8 @@ bool EMetaBlob::has_expired(MDS *mds) << " for " << *dir << endl; continue; // not our problem } - if (dir->get_committed_version() >= lp->second.dirv) { + if (dir->get_committed_version() >= lp->second.dirv || + dir->get_committed_version_equivalent() >= lp->second.dirv) { dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv << " for " << *dir << endl; continue; // yay @@ -217,7 +225,8 @@ void EMetaBlob::expire(MDS *mds, Context *c) << " for " << *dir << endl; continue; // not our problem } - if (dir->get_committed_version() >= lp->second.dirv) { + if (dir->get_committed_version() >= lp->second.dirv || + dir->get_committed_version_equivalent() >= lp->second.dirv) { dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv << " on " << *dir << endl; continue; // yay @@ -238,15 +247,12 @@ void EMetaBlob::expire(MDS *mds, Context *c) continue; } } - if (dir->get_committed_version() < lp->second.dirv) { - dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv - << ", committing " << *dir << endl; - commit[dir] = MAX(commit[dir], lp->second.dirv); - ncommit++; - continue; - } - - assert(0); // hrm + + assert(dir->get_committed_version() < lp->second.dirv); + dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv + << ", committing " << *dir << endl; + commit[dir] = MAX(commit[dir], lp->second.dirv); + ncommit++; } // set up gather context @@ -384,7 +390,7 @@ void EMetaBlob::replay(MDS *mds) p++) { CDentry *dn = dir->lookup(p->dn); if (!dn) { - dn = dir->add_dentry( p->dn ); + dn = dir->add_null_dentry(p->dn); dn->set_version(p->dnv); if (p->dirty) dn->_mark_dirty(); dout(10) << "EMetaBlob.replay added " << *dn << endl; @@ -400,7 +406,7 @@ void 
EMetaBlob::replay(MDS *mds) in->inode = p->inode; if (in->inode.is_symlink()) in->symlink = p->symlink; mds->mdcache->add_inode(in); - dir->link_inode(dn, in); + dir->link_primary_inode(dn, in); if (p->dirty) in->_mark_dirty(); dout(10) << "EMetaBlob.replay added " << *in << endl; } else { @@ -410,7 +416,7 @@ void EMetaBlob::replay(MDS *mds) } in->inode = p->inode; if (in->inode.is_symlink()) in->symlink = p->symlink; - dir->link_inode(dn, in); + dir->link_primary_inode(dn, in); if (p->dirty) in->_mark_dirty(); dout(10) << "EMetaBlob.replay linked " << *in << endl; } @@ -422,8 +428,7 @@ void EMetaBlob::replay(MDS *mds) p++) { CDentry *dn = dir->lookup(p->dn); if (!dn) { - dn = dir->add_dentry(p->dn, p->ino); - dn->set_remote_ino(p->ino); + dn = dir->add_remote_dentry(p->dn, p->ino, p->d_type); dn->set_version(p->dnv); if (p->dirty) dn->_mark_dirty(); dout(10) << "EMetaBlob.replay added " << *dn << endl; @@ -432,7 +437,7 @@ void EMetaBlob::replay(MDS *mds) dout(10) << "EMetaBlob.replay unlinking " << *dn << endl; dir->unlink_inode(dn); } - dn->set_remote_ino(p->ino); + dn->set_remote(p->ino, p->d_type); dn->set_version(p->dnv); if (p->dirty) dn->_mark_dirty(); dout(10) << "EMetaBlob.replay had " << *dn << endl; @@ -445,7 +450,7 @@ void EMetaBlob::replay(MDS *mds) p++) { CDentry *dn = dir->lookup(p->dn); if (!dn) { - dn = dir->add_dentry(p->dn); + dn = dir->add_null_dentry(p->dn); dn->set_version(p->dnv); if (p->dirty) dn->_mark_dirty(); dout(10) << "EMetaBlob.replay added " << *dn << endl; @@ -767,6 +772,7 @@ bool ESlaveUpdate::has_expired(MDS *mds) default: assert(0); + return false; } } @@ -898,7 +904,10 @@ void EFragment::replay(MDS *mds) CInode *in = mds->mdcache->get_inode(ino); assert(in); - //in->fragment_dir(basefrag, bits); + list resultfrags; + list waiters; + mds->mdcache->adjust_dir_fragments(in, basefrag, bits, resultfrags, waiters); + metablob.replay(mds); } diff --git a/trunk/ceph/mds/mdstypes.h b/trunk/ceph/mds/mdstypes.h index 
c790fde44aac2..aaf0ea33e47b0 100644 --- a/trunk/ceph/mds/mdstypes.h +++ b/trunk/ceph/mds/mdstypes.h @@ -35,6 +35,7 @@ using namespace std; #define MDS_INO_ROOT 1 #define MDS_INO_PGTABLE 2 #define MDS_INO_ANCHORTABLE 3 +#define MDS_INO_PG 4 // this should match osd/osd_types.h PG_INO #define MDS_INO_LOG_OFFSET 0x100 #define MDS_INO_IDS_OFFSET 0x200 #define MDS_INO_CLIENTMAP_OFFSET 0x300 @@ -51,7 +52,7 @@ using namespace std; struct metareqid_t { - int client; + int32_t client; tid_t tid; metareqid_t() : client(-1), tid(0) {} metareqid_t(int c, tid_t t) : client(c), tid(t) {} @@ -114,8 +115,10 @@ struct dirfrag_t { dirfrag_t(inodeno_t i, frag_t f) : ino(i), frag(f) { } }; -inline ostream& operator<<(ostream& out, const dirfrag_t& df) { - return out << df.ino << "#" << df.frag; +inline ostream& operator<<(ostream& out, const dirfrag_t df) { + out << df.ino; + if (!df.frag.is_root()) out << "." << df.frag; + return out; } inline bool operator<(dirfrag_t l, dirfrag_t r) { if (l.ino < r.ino) return true; @@ -129,23 +132,85 @@ inline bool operator==(dirfrag_t l, dirfrag_t r) { // ================================================================ +#define META_POP_IRD 0 +#define META_POP_IWR 1 +#define META_POP_READDIR 2 +#define META_POP_FETCH 3 +#define META_POP_STORE 4 +#define META_NPOP 5 + +class inode_load_vec_t { + static const int NUM = 2; + DecayCounter vec[NUM]; +public: + DecayCounter &get(int t) { + assert(t < NUM); + return vec[t]; + } + void zero(utime_t now) { + for (int i=0; i"; + return out << ""; } @@ -177,7 +246,7 @@ inline meta_load_t& operator+=(meta_load_t& l, meta_load_t& r) l.pop[i].adjust(r.pop[i].get()); return l; } - +*/ /* mds_load_t @@ -187,14 +256,14 @@ inline meta_load_t& operator+=(meta_load_t& l, meta_load_t& r) // popularity classes #define MDS_POP_JUSTME 0 // just me (this dir or inode) #define MDS_POP_NESTED 1 // me + children, auth or not -#define MDS_POP_CURDOM 2 // me + children in current auth domain -#define MDS_POP_ANYDOM 3 // me + 
children in any (nested) auth domain -//#define MDS_POP_DIRMOD 4 // just this dir, modifications only +#define MDS_POP_CURDOM 2 // (if auth) me + children in current auth domain +#define MDS_POP_ANYDOM 3 // (if auth) me + children in any (nested) auth domain #define MDS_NPOP 4 class mds_load_t { public: - meta_load_t root; + dirfrag_load_vec_t auth; + dirfrag_load_vec_t all; double req_rate; double cache_hit_rate; @@ -203,12 +272,14 @@ class mds_load_t { mds_load_t() : req_rate(0), cache_hit_rate(0), queue_len(0) { } - double mds_load() { + double mds_load(utime_t now) { switch(g_conf.mds_bal_mode) { case 0: - return root.meta_load() - + req_rate - + 10.0*queue_len; + return + .8 * auth.meta_load(now) + + .2 * all.meta_load(now) + + req_rate + + 10.0 * queue_len; case 1: return req_rate + 10.0*queue_len; @@ -222,7 +293,7 @@ class mds_load_t { inline ostream& operator<<( ostream& out, mds_load_t& load ) { - return out << "mdsload<" << load.root + return out << "mdsload<" << load.auth << "/" << load.all << ", req " << load.req_rate << ", hr " << load.cache_hit_rate << ", qlen " << load.queue_len @@ -310,7 +381,8 @@ class MDSCacheObject { const static int PIN_REQUEST = -1003; const static int PIN_WAITER = 1004; const static int PIN_DIRTYSCATTERED = 1005; - + static const int PIN_AUTHPIN = 1006; + const char *generic_pin_name(int p) { switch (p) { case PIN_REPLICATED: return "replicated"; @@ -319,7 +391,8 @@ class MDSCacheObject { case PIN_REQUEST: return "request"; case PIN_WAITER: return "waiter"; case PIN_DIRTYSCATTERED: return "dirtyscattered"; - default: assert(0); + case PIN_AUTHPIN: return "authpin"; + default: assert(0); return 0; } } diff --git a/trunk/ceph/messages/MClientReply.h b/trunk/ceph/messages/MClientReply.h index e88c31ca47400..24b7eae8976b7 100644 --- a/trunk/ceph/messages/MClientReply.h +++ b/trunk/ceph/messages/MClientReply.h @@ -51,11 +51,11 @@ class CInode; */ class InodeStat { - public: inode_t inode; string symlink; // symlink content (if 
symlink) fragtree_t dirfragtree; + uint32_t mask; // mds distribution hints map dirfrag_auth; @@ -65,13 +65,13 @@ class InodeStat { public: InodeStat() {} InodeStat(CInode *in, int whoami) : - inode(in->inode) + inode(in->inode), + mask(STAT_MASK_INO|STAT_MASK_TYPE|STAT_MASK_BASE) { - // inode.mask - inode.mask = INODE_MASK_BASE; - if (in->authlock.can_rdlock(0)) inode.mask |= INODE_MASK_AUTH; - if (in->linklock.can_rdlock(0)) inode.mask |= INODE_MASK_LINK; - if (in->filelock.can_rdlock(0)) inode.mask |= INODE_MASK_FILE; + // mask + if (in->authlock.can_rdlock(0)) mask |= STAT_MASK_AUTH; + if (in->linklock.can_rdlock(0)) mask |= STAT_MASK_LINK; + if (in->filelock.can_rdlock(0)) mask |= STAT_MASK_FILE; // symlink content? if (in->is_symlink()) @@ -96,6 +96,7 @@ class InodeStat { } void _encode(bufferlist &bl) { + ::_encode(mask, bl); ::_encode(inode, bl); ::_encode(dirfrag_auth, bl); ::_encode(dirfrag_dist, bl); @@ -105,6 +106,7 @@ class InodeStat { } void _decode(bufferlist &bl, int& off) { + ::_decode(mask, bl, off); ::_decode(inode, bl, off); ::_decode(dirfrag_auth, bl, off); ::_decode(dirfrag_dist, bl, off); @@ -133,8 +135,8 @@ class MClientReply : public Message { list trace_in; list trace_dn; + list dir_dn; list dir_in; - list dir_dn; public: long get_tid() { return st.tid; } @@ -210,13 +212,12 @@ class MClientReply : public Message { trace_in.push_back(ci); } + // dir contents + ::_decode(dir_dn, payload, off); for (int i=0; i_decode(payload, off); dir_in.push_back(ci); - string dn; - ::_decode(dn, payload, off); - dir_dn.push_back(dn); } } virtual void encode_payload() { @@ -237,13 +238,11 @@ class MClientReply : public Message { } // dir contents - pdn = dir_dn.begin(); + ::_encode(dir_dn, payload); for (pin = dir_in.begin(); pin != dir_in.end(); - ++pin, ++pdn) { + ++pin) (*pin)->_encode(payload); - ::_encode(*pdn, payload); - } } // builders @@ -253,13 +252,14 @@ class MClientReply : public Message { dir_in.push_back(in); ++st._dir_size; }*/ - void 
take_dir_items(list& inls, - list& dnls, - int num) { - dir_in.swap(inls); + void take_dir_items(list& dnls, + list& inls, + int num) { dir_dn.swap(dnls); + dir_in.swap(inls); st._dir_size = num; } + /* void copy_dir_items(const list& inls, const list& dnls) { list::const_iterator pdn = dnls.begin(); @@ -275,6 +275,7 @@ class MClientReply : public Message { ++st._dir_size; } } + */ void set_trace_dist(CInode *in, int whoami) { st._num_trace_in = 0; diff --git a/trunk/ceph/messages/MClientRequest.h b/trunk/ceph/messages/MClientRequest.h index d97695a2477ab..8f03044cf5a4f 100644 --- a/trunk/ceph/messages/MClientRequest.h +++ b/trunk/ceph/messages/MClientRequest.h @@ -119,6 +119,7 @@ class MClientRequest : public Message { } chown; struct { mode_t mode; + dev_t rdev; } mknod; struct { mode_t mode; @@ -200,6 +201,7 @@ class MClientRequest : public Message { default: assert(0); + return false; } } diff --git a/trunk/ceph/messages/MClientSession.h b/trunk/ceph/messages/MClientSession.h index c84eadbccb117..dc4252ac73d8e 100644 --- a/trunk/ceph/messages/MClientSession.h +++ b/trunk/ceph/messages/MClientSession.h @@ -29,7 +29,7 @@ public: case OP_OPEN: return "open"; case OP_REQUEST_CLOSE: return "request_close"; case OP_CLOSE: return "close"; - default: assert(0); + default: assert(0); return 0; } } diff --git a/trunk/ceph/messages/MExportDirPrep.h b/trunk/ceph/messages/MExportDirPrep.h index 8d54276f0bd83..5789e301e8b11 100644 --- a/trunk/ceph/messages/MExportDirPrep.h +++ b/trunk/ceph/messages/MExportDirPrep.h @@ -31,6 +31,7 @@ class MExportDirPrep : public Message { list bounds; list inodes; + list dentries; map inode_dirfrag; map inode_dentry; @@ -45,6 +46,7 @@ class MExportDirPrep : public Message { dirfrag_t get_dirfrag() { return dirfrag; } list& get_bounds() { return bounds; } list& get_inodes() { return inodes; } + list& get_dentries() { return dentries; } list& get_inode_dirfrags(inodeno_t ino) { return frags_by_ino[ino]; } @@ -77,6 +79,10 @@ class 
MExportDirPrep : public Message { iit != inodes.end(); iit++) delete *iit; + for (list::iterator p = dentries.begin(); + p != dentries.end(); + p++) + delete *p; for (map::iterator dit = dirfrags.begin(); dit != dirfrags.end(); dit++) @@ -92,10 +98,11 @@ class MExportDirPrep : public Message { void add_export(dirfrag_t df) { bounds.push_back( df ); } - void add_inode(dirfrag_t df, const string& dentry, CInodeDiscover *in) { + void add_inode(dirfrag_t df, const string& name, CDentryDiscover *dn, CInodeDiscover *in) { inodes.push_back(in); + dentries.push_back(dn); inode_dirfrag[in->get_ino()] = df; - inode_dentry[in->get_ino()] = dentry; + inode_dentry[in->get_ino()] = name; } void add_dirfrag(CDirDiscover *dir) { dirfrags[dir->get_dirfrag()] = dir; @@ -121,6 +128,11 @@ class MExportDirPrep : public Message { CInodeDiscover *in = new CInodeDiscover; in->_decode(payload, off); inodes.push_back(in); + + // dentry + CDentryDiscover *dn = new CDentryDiscover; + dn->_decode(payload, off); + dentries.push_back(dn); // dentry string d; @@ -158,12 +170,13 @@ class MExportDirPrep : public Message { // inodes int ni = inodes.size(); payload.append((char*)&ni, sizeof(int)); - for (list::iterator iit = inodes.begin(); - iit != inodes.end(); - iit++) { + list::iterator dit = dentries.begin(); + list::iterator iit = inodes.begin(); + while (iit != inodes.end()) { (*iit)->_encode(payload); - - // dentry + (*dit)->_encode(payload); + + // dentry name _encode(inode_dentry[(*iit)->get_ino()], payload); // dir ino @@ -172,6 +185,9 @@ class MExportDirPrep : public Message { // child frags ::_encode(frags_by_ino[(*iit)->get_ino()], payload); + + iit++; + dit++; } // dirs diff --git a/trunk/ceph/messages/MLock.h b/trunk/ceph/messages/MLock.h index 62f5b174de702..208b5b6e8a003 100644 --- a/trunk/ceph/messages/MLock.h +++ b/trunk/ceph/messages/MLock.h @@ -47,7 +47,7 @@ static const char *get_lock_action_name(int a) { case LOCK_AC_MIXEDACK: return "mixedack"; case LOCK_AC_LOCKACK: return 
"lockack"; case LOCK_AC_REQSCATTER: return "reqscatter"; - default: assert(0); + default: assert(0); return 0; } } diff --git a/trunk/ceph/messages/MMDSBeacon.h b/trunk/ceph/messages/MMDSBeacon.h index d8b73a45a3122..c18a05e77f1a8 100644 --- a/trunk/ceph/messages/MMDSBeacon.h +++ b/trunk/ceph/messages/MMDSBeacon.h @@ -23,16 +23,18 @@ class MMDSBeacon : public Message { entity_inst_t inst; + epoch_t last_epoch_seen; // include last mdsmap epoch mds has seen to avoid race with monitor decree int state; version_t seq; public: MMDSBeacon() : Message(MSG_MDS_BEACON) {} - MMDSBeacon(entity_inst_t i, int st, version_t se) : + MMDSBeacon(entity_inst_t i, epoch_t les, int st, version_t se) : Message(MSG_MDS_BEACON), - inst(i), state(st), seq(se) { } + inst(i), last_epoch_seen(les), state(st), seq(se) { } entity_inst_t& get_mds_inst() { return inst; } + epoch_t get_last_epoch_seen() { return last_epoch_seen; } int get_state() { return state; } version_t get_seq() { return seq; } char *get_type_name() { return "mdsbeacon"; } @@ -45,12 +47,14 @@ class MMDSBeacon : public Message { void encode_payload() { ::_encode(inst, payload); + ::_encode(last_epoch_seen, payload); ::_encode(state, payload); ::_encode(seq, payload); } void decode_payload() { int off = 0; ::_decode(inst, payload, off); + ::_decode(last_epoch_seen, payload, off); ::_decode(state, payload, off); ::_decode(seq, payload, off); } diff --git a/trunk/ceph/messages/MMDSCacheRejoin.h b/trunk/ceph/messages/MMDSCacheRejoin.h index c0303fd1af455..78d9072cc767c 100644 --- a/trunk/ceph/messages/MMDSCacheRejoin.h +++ b/trunk/ceph/messages/MMDSCacheRejoin.h @@ -37,7 +37,7 @@ class MMDSCacheRejoin : public Message { case OP_ACK: return "ack"; case OP_MISSING: return "missing"; case OP_FULL: return "full"; - default: assert(0); + default: assert(0); return 0; } } @@ -84,11 +84,13 @@ class MMDSCacheRejoin : public Message { struct dn_strong { inodeno_t ino; inodeno_t remote_ino; + unsigned char remote_d_type; int32_t nonce; 
int32_t lock; - dn_strong() : ino(0), remote_ino(0), nonce(0), lock(0) {} - dn_strong(inodeno_t pi, inodeno_t ri, int n, int l) : - ino(pi), remote_ino(ri), nonce(n), lock(l) {} + dn_strong() : + ino(0), remote_ino(0), remote_d_type(0), nonce(0), lock(0) {} + dn_strong(inodeno_t pi, inodeno_t ri, unsigned char rdt, int n, int l) : + ino(pi), remote_ino(ri), remote_d_type(rdt), nonce(n), lock(l) {} bool is_primary() { return ino > 0; } bool is_remote() { return remote_ino > 0; } bool is_null() { return ino == 0 && remote_ino == 0; } @@ -97,8 +99,11 @@ class MMDSCacheRejoin : public Message { struct dn_weak { inodeno_t ino; inodeno_t remote_ino; - dn_weak() : ino(0), remote_ino(0) {} - dn_weak(inodeno_t pi, inodeno_t ri) : ino(pi), remote_ino(ri) {} + unsigned char remote_d_type; + dn_weak() : + ino(0), remote_ino(0), remote_d_type(0) {} + dn_weak(inodeno_t pi, inodeno_t ri, unsigned char rdt) : + ino(pi), remote_ino(ri), remote_d_type(rdt) {} bool is_primary() { return ino > 0; } bool is_remote() { return remote_ino > 0; } bool is_null() { return ino == 0 && remote_ino == 0; } @@ -178,16 +183,16 @@ class MMDSCacheRejoin : public Message { weak[df][dname] = dnw; } void add_weak_null_dentry(dirfrag_t df, const string& dname) { - weak[df][dname] = dn_weak(0, 0); + weak[df][dname] = dn_weak(0, 0, 0); } void add_weak_primary_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { - weak[df][dname] = dn_weak(ino, 0); + weak[df][dname] = dn_weak(ino, 0, 0); } - void add_weak_remote_dentry(dirfrag_t df, const string& dname, inodeno_t ino) { - weak[df][dname] = dn_weak(0, ino); + void add_weak_remote_dentry(dirfrag_t df, const string& dname, inodeno_t ino, unsigned char rdt) { + weak[df][dname] = dn_weak(0, ino, rdt); } - void add_strong_dentry(dirfrag_t df, const string& dname, inodeno_t pi, inodeno_t ri, int n, int ls) { - strong_dentries[df][dname] = dn_strong(pi, ri, n, ls); + void add_strong_dentry(dirfrag_t df, const string& dname, inodeno_t pi, inodeno_t ri, 
unsigned char rdt, int n, int ls) { + strong_dentries[df][dname] = dn_strong(pi, ri, rdt, n, ls); } void add_dentry_authpin(dirfrag_t df, const string& dname, const metareqid_t& ri) { authpinned_dentries[df][dname] = ri; diff --git a/trunk/ceph/messages/MMDSFragmentNotify.h b/trunk/ceph/messages/MMDSFragmentNotify.h new file mode 100644 index 0000000000000..232cce92427bb --- /dev/null +++ b/trunk/ceph/messages/MMDSFragmentNotify.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MMDSFRAGMENTNOTIFY_H +#define __MMDSFRAGMENTNOTIFY_H + +#include "msg/Message.h" +#include +using namespace std; + +class MMDSFragmentNotify : public Message { + inodeno_t ino; + frag_t basefrag; + int8_t bits; + + public: + inodeno_t get_ino() { return ino; } + frag_t get_basefrag() { return basefrag; } + int get_bits() { return bits; } + + bufferlist basebl; + + MMDSFragmentNotify() {} + MMDSFragmentNotify(inodeno_t i, frag_t bf, int b) : + Message(MSG_MDS_FRAGMENTNOTIFY), + ino(i), basefrag(bf), bits(b) { } + + virtual char *get_type_name() { return "fragment_notify"; } + void print(ostream& o) { + o << "fragment_notify(" << ino << "#" << basefrag + << " " << (int)bits << ")"; + } + + virtual void decode_payload() { + int off = 0; + ::_decode(ino, payload, off); + ::_decode(basefrag, payload, off); + ::_decode(bits, payload, off); + ::_decode(basebl, payload, off); + } + virtual void encode_payload() { + ::_encode(ino, payload); + ::_encode(basefrag, payload); + ::_encode(bits, payload); + ::_encode(basebl, payload); + } +}; + +#endif diff --git a/trunk/ceph/messages/MOSDOp.h 
b/trunk/ceph/messages/MOSDOp.h index 96b389b119a7d..9dce4bdd00b27 100644 --- a/trunk/ceph/messages/MOSDOp.h +++ b/trunk/ceph/messages/MOSDOp.h @@ -104,9 +104,8 @@ private: eversion_t pg_trim_to; // primary->replica: trim to here - int op; - size_t length; - off_t offset; + int32_t op; + off_t offset, length; eversion_t version; eversion_t old_version; @@ -164,7 +163,7 @@ private: return st.op < 10; } - const size_t get_length() { return st.length; } + const off_t get_length() { return st.length; } const off_t get_offset() { return st.offset; } map& get_attrset() { return attrset; } @@ -187,7 +186,7 @@ private: bufferlist& get_data() { return data; } - size_t get_data_len() { return data.length(); } + off_t get_data_len() { return data.length(); } MOSDOp(entity_inst_t asker, int inc, long tid, @@ -216,7 +215,7 @@ private: void set_layout(const ObjectLayout& l) { st.layout = l; } - void set_length(size_t l) { st.length = l; } + void set_length(off_t l) { st.length = l; } void set_offset(off_t o) { st.offset = o; } void set_version(eversion_t v) { st.version = v; } void set_old_version(eversion_t ov) { st.old_version = ov; } @@ -243,6 +242,7 @@ private: out << "osd_op(" << st.reqid << " " << get_opname(st.op) << " " << st.oid; + if (st.length) out << " " << st.offset << "~" << st.length; if (st.retry_attempt) out << " RETRY"; out << ")"; } diff --git a/trunk/ceph/messages/MOSDOpReply.h b/trunk/ceph/messages/MOSDOpReply.h index e81f14d4558b1..bfe1674c07d3c 100644 --- a/trunk/ceph/messages/MOSDOpReply.h +++ b/trunk/ceph/messages/MOSDOpReply.h @@ -39,13 +39,13 @@ class MOSDOpReply : public Message { object_t oid; ObjectLayout layout; // pgid, etc. 
- int op; + int32_t op; // reply - int result; + int32_t result; bool commit; - size_t length, offset; - size_t object_size; + off_t length, offset; + off_t object_size; eversion_t version; eversion_t pg_complete_thru; @@ -66,9 +66,9 @@ class MOSDOpReply : public Message { bool get_commit() { return st.commit; } int get_result() { return st.result; } - size_t get_length() { return st.length; } - size_t get_offset() { return st.offset; } - size_t get_object_size() { return st.object_size; } + off_t get_length() { return st.length; } + off_t get_offset() { return st.offset; } + off_t get_object_size() { return st.object_size; } eversion_t get_version() { return st.version; } map& get_attrset() { return attrset; } @@ -76,9 +76,9 @@ class MOSDOpReply : public Message { void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; } void set_result(int r) { st.result = r; } - void set_length(size_t s) { st.length = s; } - void set_offset(size_t o) { st.offset = o; } - void set_object_size(size_t s) { st.object_size = s; } + void set_length(off_t s) { st.length = s; } + void set_offset(off_t o) { st.offset = o; } + void set_object_size(off_t s) { st.object_size = s; } void set_version(eversion_t v) { st.version = v; } void set_attrset(map &as) { attrset = as; } @@ -139,6 +139,7 @@ public: out << "osd_op_reply(" << st.reqid << " " << MOSDOp::get_opname(st.op) << " " << st.oid; + if (st.length) out << " " << st.offset << "~" << st.length; if (st.commit) out << " commit"; else diff --git a/trunk/ceph/messages/MOSDPGLog.h b/trunk/ceph/messages/MOSDPGLog.h index b7ed19dd64d4b..950c7df7eb1b2 100644 --- a/trunk/ceph/messages/MOSDPGLog.h +++ b/trunk/ceph/messages/MOSDPGLog.h @@ -38,6 +38,9 @@ public: } char *get_type_name() { return "PGlog"; } + void print(ostream& out) { + out << "pg_log(" << pgid << " e" << epoch << ")"; + } void encode_payload() { payload.append((char*)&epoch, sizeof(epoch)); diff --git a/trunk/ceph/messages/MOSDPGSummary.h 
b/trunk/ceph/messages/MOSDPGSummary.h index f41c6954b4c27..0dcebffaf74da 100644 --- a/trunk/ceph/messages/MOSDPGSummary.h +++ b/trunk/ceph/messages/MOSDPGSummary.h @@ -42,6 +42,9 @@ public: } char *get_type_name() { return "PGsum"; } + void print(ostream& out) { + out << "pg_summary(" << pgid << " e" << epoch << ")"; + } void encode_payload() { payload.append((char*)&epoch, sizeof(epoch)); diff --git a/trunk/ceph/messages/MOSDPGUpdate.h b/trunk/ceph/messages/MOSDPGUpdate.h index 20453b3e73e2f..869c02e18c156 100644 --- a/trunk/ceph/messages/MOSDPGUpdate.h +++ b/trunk/ceph/messages/MOSDPGUpdate.h @@ -42,6 +42,12 @@ class MOSDPGUpdate : public Message { } char *get_type_name() { return "PGUp"; } + void print(ostream& out) { + out << "pg_update(" << pgid << " e" << map_version; + if (complete) out << " complete"; + out << " lac=" << last_any_complete; + out << ")"; + } void encode_payload() { payload.append((char*)&map_version, sizeof(map_version)); diff --git a/trunk/ceph/messages/MPGStats.h b/trunk/ceph/messages/MPGStats.h new file mode 100644 index 0000000000000..838ab54219ccb --- /dev/null +++ b/trunk/ceph/messages/MPGStats.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MPGSTATS_H +#define __MPGSTATS_H + +#include "osd/osd_types.h" + +class MPGStats : public Message { +public: + map pg_stat; + + MPGStats() : Message(MSG_PGSTATS) {} + + char *get_type_name() { return "pg_stats"; } + void print(ostream& out) { + out << "pg_stats" << endl; + } + + void encode_payload() { + ::_encode(pg_stat, payload); + } + void decode_payload() { + int off = 0; + ::_decode(pg_stat, payload, off); + } +}; + +#endif diff --git a/trunk/ceph/messages/MStatfs.h b/trunk/ceph/messages/MStatfs.h new file mode 100644 index 0000000000000..2274707a0e128 --- /dev/null +++ b/trunk/ceph/messages/MStatfs.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MSTATFS_H +#define __MSTATFS_H + +#include /* or */ + +class MStatfs : public Message { +public: + struct statvfs stfs; + + MStatfs() : Message(MSG_STATFS) {} + + char *get_type_name() { return "statfs"; } + void print(ostream& out) { + out << "statfs" << endl; + } + + void encode_payload() { + ::_encode(stfs, payload); + } + void decode_payload() { + int off = 0; + ::_decode(stfs, payload, off); + } +}; + +#endif diff --git a/trunk/ceph/mon/ClientMonitor.cc b/trunk/ceph/mon/ClientMonitor.cc index 7b3a8917c1e16..018cbcadc6bf9 100644 --- a/trunk/ceph/mon/ClientMonitor.cc +++ b/trunk/ceph/mon/ClientMonitor.cc @@ -102,7 +102,6 @@ void ClientMonitor::encode_pending(bufferlist &bl) dout(10) << "encode_pending v " << pending_inc.version << ", next is " << pending_inc.next_client << endl; - assert(paxos->get_version() + 1 == pending_inc.version); pending_inc._encode(bl); } diff --git a/trunk/ceph/mon/MDSMonitor.cc b/trunk/ceph/mon/MDSMonitor.cc index 763ee31c43ec1..f4ded2330d20e 100644 --- a/trunk/ceph/mon/MDSMonitor.cc +++ b/trunk/ceph/mon/MDSMonitor.cc @@ -190,14 +190,20 @@ bool MDSMonitor::preprocess_beacon(MMDSBeacon *m) // reply to beacon? if (state != MDSMap::STATE_STOPPED) { last_beacon[from] = g_clock.now(); // note time - mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), state, seq), + mon->messenger->send_message(new MMDSBeacon(m->get_mds_inst(), mdsmap.get_epoch(), state, seq), m->get_mds_inst()); } // is there a state change here? - if (mdsmap.mds_state.count(from) == 0 || - mdsmap.mds_state[from] != state) - return false; // yep, need to update map. 
+ if (mdsmap.mds_state.count(from) == 0) { + if (state == MDSMap::STATE_BOOT) + return false; // need to add to map + dout(1) << "mds_beacon " << *m << " announcing non-boot state, ignoring" << endl; + } else if (mdsmap.mds_state[from] != state) { + if (mdsmap.get_epoch() == m->get_last_epoch_seen()) + return false; // need to update map + dout(10) << "mds_beacon " << *m << " ignoring requested state, because mds hasn't seen latest map" << endl; + } // we're done. delete m; @@ -248,15 +254,30 @@ bool MDSMonitor::handle_beacon(MMDSBeacon *m) // assign a name. if (from >= 0) { // wants to be (or already is) a specific MDS. - if (mdsmap.is_failed(from)) { - dout(10) << "mds_beacon boot: mds" << from << " was failed, replaying" << endl; - state = MDSMap::STATE_REPLAY; - } else if (mdsmap.is_stopped(from)) { - dout(10) << "mds_beacon boot: mds" << from << " was stopped, starting" << endl; - state = MDSMap::STATE_STARTING; - } else if (!mdsmap.have_inst(from) || mdsmap.get_inst(from) != m->get_mds_inst()) { + if (!g_conf.mon_allow_mds_bully && + (!mdsmap.have_inst(from) || mdsmap.get_inst(from) != m->get_mds_inst())) { dout(10) << "mds_beacon boot: mds" << from << " is someone else" << endl; from = -1; + } else { + switch (mdsmap.get_state(from)) { + case MDSMap::STATE_STOPPED: + case MDSMap::STATE_STARTING: + case MDSMap::STATE_STANDBY: + state = MDSMap::STATE_STARTING; + break; + case MDSMap::STATE_DNE: + case MDSMap::STATE_CREATING: + state = MDSMap::STATE_CREATING; + break; + case MDSMap::STATE_FAILED: + default: + state = MDSMap::STATE_REPLAY; + break; + } + dout(10) << "mds_beacon boot: mds" << from + << " was " << MDSMap::get_state_name(mdsmap.get_state(from)) + << ", " << MDSMap::get_state_name(state) + << endl; } } if (from < 0) { @@ -535,9 +556,28 @@ void MDSMonitor::do_stop() print_map(mdsmap); for (map::iterator p = mdsmap.mds_state.begin(); p != mdsmap.mds_state.end(); - ++p) - if (mdsmap.is_active(p->first)) + ++p) { + switch (p->second) { + case 
MDSMap::STATE_ACTIVE: + case MDSMap::STATE_STOPPING: pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPING; + break; + case MDSMap::STATE_CREATING: + case MDSMap::STATE_STANDBY: + pending_mdsmap.mds_state[p->first] = MDSMap::STATE_DNE; + break; + case MDSMap::STATE_STARTING: + pending_mdsmap.mds_state[p->first] = MDSMap::STATE_STOPPED; + break; + case MDSMap::STATE_REPLAY: + case MDSMap::STATE_RESOLVE: + case MDSMap::STATE_RECONNECT: + case MDSMap::STATE_REJOIN: + // BUG: hrm, if this is the case, the STOPPING gusy won't be able to stop, will they? + pending_mdsmap.mds_state[p->first] = MDSMap::STATE_FAILED; + break; + } + } propose_pending(); } diff --git a/trunk/ceph/mon/MonitorStore.cc b/trunk/ceph/mon/MonitorStore.cc index d260dfd7604e4..7f00bf83921df 100644 --- a/trunk/ceph/mon/MonitorStore.cc +++ b/trunk/ceph/mon/MonitorStore.cc @@ -42,7 +42,7 @@ void MonitorStore::mount() string old = dir; char *cwd = get_current_dir_name(); dir = cwd; - delete cwd; + free(cwd); dir += "/"; dir += old; } diff --git a/trunk/ceph/mon/PGMap.h b/trunk/ceph/mon/PGMap.h index dc6b500111df0..38b6db494255d 100644 --- a/trunk/ceph/mon/PGMap.h +++ b/trunk/ceph/mon/PGMap.h @@ -18,13 +18,80 @@ #include "osd/osd_types.h" class PGMap { - public: + // the map + version_t version; + hash_map pg_stat; + class Incremental { + public: + version_t version; + map pg_stat_updates; + void _encode(bufferlist &bl) { + ::_encode(version, bl); + ::_encode(pg_stat_updates, bl); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(version, bl, off); + ::_decode(pg_stat_updates, bl, off); + } }; + void apply_incremental(Incremental& inc) { + assert(inc.version == version+1); + version++; + for (map::iterator p = inc.pg_stat_updates.begin(); + p != inc.pg_stat_updates.end(); + ++p) { + if (pg_stat.count(p->first)) + stat_sub(pg_stat[p->first]); + pg_stat[p->first] = p->second; + stat_add(p->second); + } + } + + // aggregate stats (soft state) + hash_map num_pg_by_state; + int64_t num_pg; + 
int64_t total_size; + int64_t total_num_blocks; + + void stat_zero() { + num_pg = 0; + num_pg_by_state.clear(); + total_size = 0; + total_num_blocks = 0; + } + void stat_add(pg_stat_t &s) { + num_pg++; + num_pg_by_state[s.state]++; + total_size += s.size; + total_num_blocks += s.num_blocks; + } + void stat_sub(pg_stat_t &s) { + num_pg--; + num_pg_by_state[s.state]--; + total_size -= s.size; + total_num_blocks -= s.num_blocks; + } + + PGMap() : version(0), + num_pg(0), total_size(0), total_num_blocks(0) {} + void _encode(bufferlist &bl) { + ::_encode(version, bl); + ::_encode(pg_stat, bl); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(version, bl, off); + ::_decode(pg_stat, bl, off); + stat_zero(); + for (hash_map::iterator p = pg_stat.begin(); + p != pg_stat.end(); + ++p) + stat_add(p->second); + } }; #endif diff --git a/trunk/ceph/mon/PGMonitor.cc b/trunk/ceph/mon/PGMonitor.cc index 8280b87df3e9d..68a75f5f5ee0f 100644 --- a/trunk/ceph/mon/PGMonitor.cc +++ b/trunk/ceph/mon/PGMonitor.cc @@ -19,6 +19,9 @@ #include "OSDMonitor.h" #include "MonitorStore.h" +#include "messages/MPGStats.h" +#include "messages/MStatfs.h" + #include "common/Timer.h" #include "config.h" @@ -30,29 +33,154 @@ void PGMonitor::create_initial() { + dout(1) << "create_initial -- creating initial map" << endl; } bool PGMonitor::update_from_paxos() { + version_t paxosv = paxos->get_version(); + if (paxosv == pg_map.version) return true; + assert(paxosv >= pg_map.version); + + if (pg_map.version == 0 && paxosv > 1 && + mon->store->exists_bl_ss("pgmap","latest")) { + // starting up: load latest + dout(7) << "update_from_paxos startup: loading latest full pgmap" << endl; + bufferlist bl; + mon->store->get_bl_ss(bl, "pgmap", "latest"); + int off = 0; + pg_map._decode(bl, off); + } + + // walk through incrementals + while (paxosv > pg_map.version) { + bufferlist bl; + bool success = paxos->read(pg_map.version+1, bl); + if (success) { + dout(7) << "update_from_paxos applying incremental " << 
pg_map.version+1 << endl; + PGMap::Incremental inc; + int off = 0; + inc._decode(bl, off); + pg_map.apply_incremental(inc); + + } else { + dout(7) << "update_from_paxos couldn't read incremental " << pg_map.version+1 << endl; + return false; + } + } + + // save latest + bufferlist bl; + pg_map._encode(bl); + mon->store->put_bl_ss(bl, "pgmap", "latest"); + return true; } void PGMonitor::create_pending() { - + pending_inc = PGMap::Incremental(); + pending_inc.version = pg_map.version + 1; + dout(10) << "create_pending v " << pending_inc.version << endl; } void PGMonitor::encode_pending(bufferlist &bl) { - + assert(mon->is_leader()); + dout(10) << "encode_pending v " << pending_inc.version << endl; + assert(paxos->get_version() + 1 == pending_inc.version); + pending_inc._encode(bl); } bool PGMonitor::preprocess_query(Message *m) { - return true; + dout(10) << "preprocess_query " << *m << " from " << m->get_source_inst() << endl; + + switch (m->get_type()) { + case MSG_STATFS: + handle_statfs((MStatfs*)m); + return true; + + case MSG_PGSTATS: + { + MPGStats *stats = (MPGStats*)m; + for (map::iterator p = stats->pg_stat.begin(); + p != stats->pg_stat.end(); + p++) { + if (pg_map.pg_stat.count(p->first) == 0 || + pg_map.pg_stat[p->first].reported < p->second.reported) + return false; + } + dout(10) << " message contains no new pg stats" << endl; + return true; + } + + default: + assert(0); + delete m; + return true; + } } bool PGMonitor::prepare_update(Message *m) { + dout(10) << "prepare_update " << *m << " from " << m->get_source_inst() << endl; + switch (m->get_type()) { + case MSG_PGSTATS: + return handle_pg_stats((MPGStats*)m); + + default: + assert(0); + delete m; + return false; + } +} + + +void PGMonitor::handle_statfs(MStatfs *statfs) +{ + dout(10) << "handle_statfs " << *statfs << " from " << statfs->get_source() << endl; + + // fill out stfs + memset(&statfs->stfs, 0, sizeof(statfs->stfs)); + statfs->stfs.f_blocks = pg_map.total_num_blocks; + 
statfs->stfs.f_fsid = 0; // hmm. + statfs->stfs.f_flag = ST_NOATIME|ST_NODIRATIME; // for now. + + // reply + mon->messenger->send_message(statfs, statfs->get_source_inst()); +} + +bool PGMonitor::handle_pg_stats(MPGStats *stats) +{ + dout(10) << "handle_pg_stats " << *stats << " from " << stats->get_source() << endl; + + for (map::iterator p = stats->pg_stat.begin(); + p != stats->pg_stat.end(); + p++) { + pg_t pgid; + if ((pg_map.pg_stat.count(pgid) && + pg_map.pg_stat[pgid].reported >= p->second.reported)) { + dout(15) << " had " << pgid << " from " << pg_map.pg_stat[pgid].reported << endl; + continue; + } + if (pending_inc.pg_stat_updates.count(pgid) && + pending_inc.pg_stat_updates[pgid].reported >= p->second.reported) { + dout(15) << " had " << pgid << " from " << pending_inc.pg_stat_updates[pgid].reported + << " (pending)" << endl; + continue; + } + + dout(15) << " got " << pgid << " reported at " << p->second.reported << endl; + pending_inc.pg_stat_updates[pgid] = p->second; + + // we don't care about consistency; apply to live map. + if (pg_map.pg_stat.count(pgid)) + pg_map.stat_sub(pg_map.pg_stat[pgid]); + pg_map.pg_stat[pgid] = p->second; + pg_map.stat_add(pg_map.pg_stat[pgid]); + } + + delete stats; return true; } diff --git a/trunk/ceph/mon/PGMonitor.h b/trunk/ceph/mon/PGMonitor.h index 917d6e272a756..e243d0851430d 100644 --- a/trunk/ceph/mon/PGMonitor.h +++ b/trunk/ceph/mon/PGMonitor.h @@ -25,10 +25,12 @@ using namespace std; #include "PGMap.h" +class MPGStats; +class MStatfs; + class PGMonitor : public PaxosService { public: - private: PGMap pg_map; PGMap::Incremental pending_inc; @@ -41,7 +43,9 @@ private: bool preprocess_query(Message *m); // true if processed. 
bool prepare_update(Message *m); - + void handle_statfs(MStatfs *statfs); + bool handle_pg_stats(MPGStats *stats); + public: PGMonitor(Monitor *mn, Paxos *p) : PaxosService(mn, p) { } diff --git a/trunk/ceph/msg/FakeMessenger.cc b/trunk/ceph/msg/FakeMessenger.cc index 62e347b02c89d..f99c6aa7e30e4 100644 --- a/trunk/ceph/msg/FakeMessenger.cc +++ b/trunk/ceph/msg/FakeMessenger.cc @@ -78,7 +78,7 @@ void *fakemessenger_thread(void *ptr) if (fm_shutdown) break; fakemessenger_do_loop_2(); - if (directory.empty()) break; + if (directory.empty() && nranks > 0) break; dout(20) << "thread waiting" << endl; if (fm_shutdown) break; @@ -137,7 +137,7 @@ int fakemessenger_do_loop_2() { //lock.Lock(); dout(18) << "do_loop begin." << endl; - + while (1) { bool didone = false; @@ -270,7 +270,7 @@ FakeMessenger::FakeMessenger(entity_name_t me) : Messenger(me) _myinst.name = me; _myinst.addr.port = nranks++; //if (!me.is_mon()) - //_myinst.addr.nonce = getpid(); + _myinst.addr.nonce = getpid(); // add to directory directory[ _myinst.addr ] = this; @@ -360,7 +360,8 @@ int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fr m->set_source(get_myname(), fromport); m->set_source_addr(get_myaddr()); - m->set_dest(inst.name, port); + m->set_dest_inst(inst); + m->set_dest_port(port); lock.Lock(); @@ -379,11 +380,12 @@ int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fr // queue if (directory.count(inst.addr) && shutdown_set.count(inst.addr) == 0) { - dout(1) << "--> " << get_myname() << " -> " << inst.name << " --- " << *m << endl; + dout(1) << "--> " << get_myname() << " -> " << inst.name << " --- " << *m << " -- " << m + << endl; directory[inst.addr]->queue_incoming(m); } else { dout(0) << "--> " << get_myname() << " -> " << inst.name << " " << *m << " -- " << m - << " *** destination DNE ***" + << " *** destination " << inst.addr << " DNE ***" << endl; for (map::iterator p = directory.begin(); p != directory.end(); diff --git 
a/trunk/ceph/msg/Message.cc b/trunk/ceph/msg/Message.cc index d6363a4c2ad11..fa63838a2d51f 100644 --- a/trunk/ceph/msg/Message.cc +++ b/trunk/ceph/msg/Message.cc @@ -11,6 +11,9 @@ using namespace std; #include "messages/MGenericMessage.h" +#include "messages/MPGStats.h" +#include "messages/MStatfs.h" + #include "messages/MMonCommand.h" #include "messages/MMonCommandAck.h" #include "messages/MMonPaxos.h" @@ -59,6 +62,8 @@ using namespace std; #include "messages/MDiscover.h" #include "messages/MDiscoverReply.h" +#include "messages/MMDSFragmentNotify.h" + #include "messages/MExportDirDiscover.h" #include "messages/MExportDirDiscoverAck.h" #include "messages/MExportDirCancel.h" @@ -104,6 +109,13 @@ decode_message(msg_envelope_t& env, bufferlist& payload) // -- with payload -- + case MSG_PGSTATS: + m = new MPGStats; + break; + case MSG_STATFS: + m = new MStatfs; + break; + case MSG_MON_COMMAND: m = new MMonCommand; break; @@ -241,6 +253,10 @@ decode_message(msg_envelope_t& env, bufferlist& payload) m = new MDiscoverReply(); break; + case MSG_MDS_FRAGMENTNOTIFY: + m = new MMDSFragmentNotify; + break; + case MSG_MDS_EXPORTDIRDISCOVER: m = new MExportDirDiscover(); break; diff --git a/trunk/ceph/msg/Message.h b/trunk/ceph/msg/Message.h index 58f5da03f3943..f593f1aac2cf3 100644 --- a/trunk/ceph/msg/Message.h +++ b/trunk/ceph/msg/Message.h @@ -17,6 +17,9 @@ #define MSG_CLOSE 0 +#define MSG_STATFS 1 +#define MSG_PGSTATS 2 + #define MSG_PING 10 #define MSG_PING_ACK 11 @@ -107,6 +110,8 @@ #define MSG_MDS_ANCHOR 130 +#define MSG_MDS_FRAGMENTNOTIFY 140 + #define MSG_MDS_EXPORTDIRDISCOVER 149 #define MSG_MDS_EXPORTDIRDISCOVERACK 150 #define MSG_MDS_EXPORTDIRCANCEL 151 @@ -154,10 +159,10 @@ using std::list; typedef struct { - int type; + int32_t type; entity_inst_t src, dst; - int source_port, dest_port; - int nchunks; + int32_t source_port, dest_port; + int32_t nchunks; } msg_envelope_t; #define MSG_ENVELOPE_LEN sizeof(msg_envelope_t) diff --git a/trunk/ceph/msg/SimpleMessenger.cc 
b/trunk/ceph/msg/SimpleMessenger.cc index f54b6e206c0de..a38e3eaf7bd88 100644 --- a/trunk/ceph/msg/SimpleMessenger.cc +++ b/trunk/ceph/msg/SimpleMessenger.cc @@ -551,7 +551,7 @@ Message *Rank::Pipe::read_message() // payload bufferlist blist; for (int i=0; iget_dest() << endl; @@ -635,7 +635,7 @@ int Rank::Pipe::write_message(Message *m) } #else // one big chunk - int size = blist.length(); + int32_t size = blist.length(); r = tcp_write( sd, (char*)&size, sizeof(size) ); if (r < 0) { derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; diff --git a/trunk/ceph/msg/msg_types.h b/trunk/ceph/msg/msg_types.h index f20ffe8ed3720..48d8af6955f70 100644 --- a/trunk/ceph/msg/msg_types.h +++ b/trunk/ceph/msg/msg_types.h @@ -21,8 +21,8 @@ // new typed msg_addr_t way! class entity_name_t { - int _type; - int _num; + int32_t _type; + int32_t _num; public: static const int TYPE_MON = 1; diff --git a/trunk/ceph/msg/tcp.cc b/trunk/ceph/msg/tcp.cc index 232ee03fa5d09..5f449ab83605b 100644 --- a/trunk/ceph/msg/tcp.cc +++ b/trunk/ceph/msg/tcp.cc @@ -7,6 +7,10 @@ * tcp crap */ +/* +inlined, see tcp.h + + bool tcp_read(int sd, char *buf, int len) { while (len > 0) { @@ -45,7 +49,7 @@ int tcp_write(int sd, char *buf, int len) } return 0; } - +*/ int tcp_hostlookup(char *str, tcpaddr_t& ta) { diff --git a/trunk/ceph/msg/tcp.h b/trunk/ceph/msg/tcp.h index 7a866af7f9d86..e09238c2e77e5 100644 --- a/trunk/ceph/msg/tcp.h +++ b/trunk/ceph/msg/tcp.h @@ -24,8 +24,38 @@ inline ostream& operator<<(ostream& out, const tcpaddr_t &a) return out; } -extern bool tcp_read(int sd, char *buf, int len); -extern int tcp_write(int sd, char *buf, int len); +inline bool tcp_read(int sd, char *buf, int len) { + while (len > 0) { + int got = ::recv( sd, buf, len, 0 ); + if (got <= 0) { + //dout(18) << "tcp_read socket " << sd << " closed" << endl; + return false; + } + len -= got; + buf += got; + //dout(DBL) << "tcp_read got " << got 
<< ", " << len << " left" << endl; + } + return true; +} + +inline int tcp_write(int sd, char *buf, int len) { + //dout(DBL) << "tcp_write writing " << len << endl; + assert(len > 0); + while (len > 0) { + int did = ::send( sd, buf, len, 0 ); + if (did < 0) { + //dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; + //cerr << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; + return did; + } + len -= did; + buf += did; + //dout(DBL) << "tcp_write did " << did << ", " << len << " left" << endl; + } + return 0; +} + + extern int tcp_hostlookup(char *str, tcpaddr_t& ta); inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) { diff --git a/trunk/ceph/newsyn.cc b/trunk/ceph/newsyn.cc index 5eadd85bfdb6f..fcca655c46747 100644 --- a/trunk/ceph/newsyn.cc +++ b/trunk/ceph/newsyn.cc @@ -396,8 +396,15 @@ int main(int argc, char **argv) if (started) cerr << "newsyn finishing" << endl; - return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). + // cd on exit, so that gmon.out (if any) goes into a separate directory for each node. + char s[20]; + sprintf(s, "gmon/%d", myrank); + mkdir(s, 0755); + chdir(s); + + + return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). // cleanup for (map::iterator i = mds.begin(); i != mds.end(); i++) @@ -427,6 +434,8 @@ int main(int argc, char **argv) } */ + + return 0; } diff --git a/trunk/ceph/osd/FakeStore.cc b/trunk/ceph/osd/FakeStore.cc index f6c7cc116c3ed..683c24634defd 100644 --- a/trunk/ceph/osd/FakeStore.cc +++ b/trunk/ceph/osd/FakeStore.cc @@ -307,10 +307,11 @@ int FakeStore::write(object_t oid, const bufferlist& bl, Context *onsafe) { - dout(20) << "write " << oid << " len " << len << " off " << offset << endl; - char fn[200]; get_oname(oid,fn); + + dout(20) << "write " << fn << " len " << len << " off " << offset << endl; + ::mknod(fn, 0644, 0); // in case it doesn't exist yet. 
@@ -391,7 +392,7 @@ void FakeStore::sync() void FakeStore::sync(Context *onsafe) { - if (g_conf.fakestore_fake_sync) { + if (g_conf.fakestore_fake_sync > 0.0) { g_timer.add_event_after((float)g_conf.fakestore_fake_sync, new C_FakeSync(onsafe, &unsync, &synclock, &synccond)); @@ -547,10 +548,9 @@ int FakeStore::list_collections(list& ls) struct dirent *de; while ((de = ::readdir(dir)) != 0) { // parse + errno = 0; coll_t c = strtoll(de->d_name, 0, 16); - dout(0) << " got " << c << " errno " << errno << " on " << de->d_name << endl; - if (errno) continue; - ls.push_back(c); + if (c) ls.push_back(c); } ::closedir(dir); diff --git a/trunk/ceph/osd/OSD.cc b/trunk/ceph/osd/OSD.cc index cab4d2e34c2a9..6285c0b340203 100644 --- a/trunk/ceph/osd/OSD.cc +++ b/trunk/ceph/osd/OSD.cc @@ -74,7 +74,7 @@ char *osd_base_path = "./osddata"; char *ebofs_base_path = "./dev"; -object_t SUPERBLOCK_OBJECT(0,0); +static const object_t SUPERBLOCK_OBJECT(0,0); // force remount hack for performance testing FakeStore @@ -870,10 +870,6 @@ void OSD::handle_osd_ping(MOSDPing *m) peer_qlen[from] = m->avg_qlen; peer_read_time[from] = m->read_mean_time; - //if (!m->ack) - //messenger->send_message(new MOSDPing(osdmap->get_epoch(), true), - //m->get_source()); - delete m; } diff --git a/trunk/ceph/osd/OSDMap.h b/trunk/ceph/osd/OSDMap.h index f0e0ff301f813..6bf3f10670ed4 100644 --- a/trunk/ceph/osd/OSDMap.h +++ b/trunk/ceph/osd/OSDMap.h @@ -92,12 +92,12 @@ public: bufferlist fullmap; // in leiu of below. 
// incremental - map new_up; - map new_down; - list new_in; - list new_out; - map new_overload; // updated overload value - list old_overload; // no longer overload + map new_up; + map new_down; + list new_in; + list new_out; + map new_overload; // updated overload value + list old_overload; // no longer overload void encode(bufferlist& bl) { ::_encode(epoch, bl); @@ -129,16 +129,16 @@ private: epoch_t epoch; // what epoch of the osd cluster descriptor is this epoch_t mon_epoch; // monitor epoch (election iteration) utime_t ctime; // epoch start time - int pg_num; // placement group count - int pg_num_mask; // bitmask for above - int localized_pg_num; // localized place group count - int localized_pg_num_mask; // ditto + int32_t pg_num; // placement group count + int32_t pg_num_mask; // bitmask for above + int32_t localized_pg_num; // localized place group count + int32_t localized_pg_num_mask; // ditto - set osds; // all osds - set down_osds; // list of down disks - set out_osds; // list of unmapped disks - map overload_osds; - map osd_inst; + set osds; // all osds + set down_osds; // list of down disks + set out_osds; // list of unmapped disks + map overload_osds; + map osd_inst; public: Crush crush; // hierarchical map @@ -220,7 +220,7 @@ private: } // nope, incremental. 
- for (map::iterator i = inc.new_down.begin(); + for (map::iterator i = inc.new_down.begin(); i != inc.new_down.end(); i++) { assert(down_osds.count(i->first) == 0); @@ -230,21 +230,21 @@ private: osd_inst.erase(i->first); //cout << "epoch " << epoch << " down osd" << i->first << endl; } - for (list::iterator i = inc.new_out.begin(); + for (list::iterator i = inc.new_out.begin(); i != inc.new_out.end(); i++) { assert(out_osds.count(*i) == 0); out_osds.insert(*i); //cout << "epoch " << epoch << " out osd" << *i << endl; } - for (list::iterator i = inc.old_overload.begin(); + for (list::iterator i = inc.old_overload.begin(); i != inc.old_overload.end(); i++) { assert(overload_osds.count(*i)); overload_osds.erase(*i); } - for (map::iterator i = inc.new_up.begin(); + for (map::iterator i = inc.new_up.begin(); i != inc.new_up.end(); i++) { assert(down_osds.count(i->first)); @@ -253,14 +253,14 @@ private: osd_inst[i->first] = i->second; //cout << "epoch " << epoch << " up osd" << i->first << endl; } - for (list::iterator i = inc.new_in.begin(); + for (list::iterator i = inc.new_in.begin(); i != inc.new_in.end(); i++) { assert(out_osds.count(*i)); out_osds.erase(*i); //cout << "epoch " << epoch << " in osd" << *i << endl; } - for (map::iterator i = inc.new_overload.begin(); + for (map::iterator i = inc.new_overload.begin(); i != inc.new_overload.end(); i++) { overload_osds[i->first] = i->second; diff --git a/trunk/ceph/osd/ObjectStore.h b/trunk/ceph/osd/ObjectStore.h index c373ba32899b9..1bdb9955b4237 100644 --- a/trunk/ceph/osd/ObjectStore.h +++ b/trunk/ceph/osd/ObjectStore.h @@ -95,16 +95,14 @@ public: static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval static const int OP_COLL_RMATTR = 25; // cid, attrname - list ops; + private: + list ops; list bls; list oids; - list cids; - list offsets; - list lengths; + list cids; + list lengths; list attrnames; list attrnames2; - //list< pair > attrvals; - list attrbls; // for reads only (not encoded) list pbls; @@ 
-112,24 +110,59 @@ public: list< pair > pattrvals; list< map* > pattrsets; - const char *get_attrname() { - if (attrnames.empty()) - return attrnames2.front().c_str(); - else - return attrnames.front(); + public: + bool have_op() { + return !ops.empty(); + } + int get_num_ops() { return ops.size(); } + int get_op() { + int op = ops.front(); + ops.pop_front(); + return op; + } + void get_bl(bufferlist& bl) { + bl.claim(bls.front()); + bls.pop_front(); + } + void get_oid(object_t& oid) { + oid = oids.front(); + oids.pop_front(); + } + void get_cid(coll_t& cid) { + cid = cids.front(); + cids.pop_front(); + } + void get_length(off_t& len) { + len = lengths.front(); + lengths.pop_front(); + } + void get_attrname(const char * &p) { + p = attrnames.front(); + attrnames.pop_front(); } - void pop_attrname() { - if (attrnames.empty()) - attrnames2.pop_front(); - else - attrnames.pop_front(); + void get_pbl(bufferlist* &pbl) { + pbl = pbls.front(); + pbls.pop_front(); } + void get_pstat(struct stat* &pst) { + pst = psts.front(); + psts.pop_front(); + } + void get_pattrval(pair& p) { + p = pattrvals.front(); + pattrvals.pop_front(); + } + void get_pattrset(map* &ps) { + ps = pattrsets.front(); + pattrsets.pop_front(); + } + void read(object_t oid, off_t off, size_t len, bufferlist *pbl) { int op = OP_READ; ops.push_back(op); oids.push_back(oid); - offsets.push_back(off); + lengths.push_back(off); lengths.push_back(len); pbls.push_back(pbl); } @@ -157,7 +190,7 @@ public: int op = OP_WRITE; ops.push_back(op); oids.push_back(oid); - offsets.push_back(off); + lengths.push_back(off); lengths.push_back(len); bls.push_back(bl); } @@ -165,14 +198,14 @@ public: int op = OP_TRIMCACHE; ops.push_back(op); oids.push_back(oid); - offsets.push_back(off); + lengths.push_back(off); lengths.push_back(len); } void truncate(object_t oid, off_t off) { int op = OP_TRUNCATE; ops.push_back(op); oids.push_back(oid); - offsets.push_back(off); + lengths.push_back(off); } void remove(object_t oid) { int 
op = OP_REMOVE; @@ -187,7 +220,7 @@ public: //attrvals.push_back(pair(val,len)); bufferlist bl; bl.append((char*)val,len); - attrbls.push_back(bl); + bls.push_back(bl); } void setattrs(object_t oid, map& attrset) { int op = OP_SETATTRS; @@ -234,10 +267,9 @@ public: ops.push_back(op); cids.push_back(cid); attrnames.push_back(name); - //attrvals.push_back(pair(val,len)); bufferlist bl; bl.append((char*)val, len); - attrbls.push_back(bl); + bls.push_back(bl); } void collection_rmattr(coll_t cid, const char* name) { int op = OP_COLL_RMATTR; @@ -253,20 +285,20 @@ public: ::_encode(bls, bl); ::_encode(oids, bl); ::_encode(cids, bl); - ::_encode(offsets, bl); ::_encode(lengths, bl); ::_encode(attrnames, bl); - ::_encode(attrbls, bl); } void _decode(bufferlist& bl, int& off) { ::_decode(ops, bl, off); ::_decode(bls, bl, off); ::_decode(oids, bl, off); ::_decode(cids, bl, off); - ::_decode(offsets, bl, off); ::_decode(lengths, bl, off); ::_decode(attrnames2, bl, off); - ::_decode(attrbls, bl, off); + for (list::iterator p = attrnames2.begin(); + p != attrnames2.end(); + ++p) + attrnames.push_back((*p).c_str()); } }; @@ -277,164 +309,195 @@ public: */ virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) { // non-atomic implementation - for (list::iterator p = t.ops.begin(); - p != t.ops.end(); - p++) { - switch (*p) { + while (t.have_op()) { + int op = t.get_op(); + switch (op) { case Transaction::OP_READ: { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); + object_t oid; + off_t offset, len; + t.get_oid(oid); + t.get_length(offset); + t.get_length(len); + bufferlist *pbl; + t.get_pbl(pbl); read(oid, offset, len, *pbl); } break; case Transaction::OP_STAT: { - object_t oid = t.oids.front(); t.oids.pop_front(); - struct stat *st = t.psts.front(); t.psts.pop_front(); + object_t oid; + 
t.get_oid(oid); + struct stat *st; + t.get_pstat(st); stat(oid, st); } break; case Transaction::OP_GETATTR: { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); + object_t oid; + t.get_oid(oid); + const char *attrname; + t.get_attrname(attrname); + pair pattrval; + t.get_pattrval(pattrval); *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second); } break; case Transaction::OP_GETATTRS: { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); + object_t oid; + t.get_oid(oid); + map *pset; + t.get_pattrset(pset); getattrs(oid, *pset); } break; case Transaction::OP_WRITE: { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); - bufferlist bl = t.bls.front(); t.bls.pop_front(); + object_t oid; + t.get_oid(oid); + off_t offset, len; + t.get_length(offset); + t.get_length(len); + bufferlist bl; + t.get_bl(bl); write(oid, offset, len, bl, 0); } break; case Transaction::OP_TRIMCACHE: { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t offset = t.offsets.front(); t.offsets.pop_front(); - size_t len = t.lengths.front(); t.lengths.pop_front(); + object_t oid; + t.get_oid(oid); + off_t offset, len; + t.get_length(offset); + t.get_length(len); trim_from_cache(oid, offset, len); } break; case Transaction::OP_TRUNCATE: { - object_t oid = t.oids.front(); t.oids.pop_front(); - off_t len = t.offsets.front(); t.offsets.pop_front(); + object_t oid; + t.get_oid(oid); + off_t len; + t.get_length(len); truncate(oid, len, 0); } break; case Transaction::OP_REMOVE: { - object_t oid = t.oids.front(); t.oids.pop_front(); + object_t oid; + t.get_oid(oid); remove(oid, 0); } break; case Transaction::OP_SETATTR: { - object_t oid = t.oids.front(); t.oids.pop_front(); - 
const char *attrname = t.get_attrname(); t.pop_attrname(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); + object_t oid; + t.get_oid(oid); + const char *attrname; + t.get_attrname(attrname); bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); + t.get_bl(bl); setattr(oid, attrname, bl.c_str(), bl.length(), 0); } break; case Transaction::OP_SETATTRS: { - object_t oid = t.oids.front(); t.oids.pop_front(); - map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); + object_t oid; + t.get_oid(oid); + map *pattrset; + t.get_pattrset(pattrset); setattrs(oid, *pattrset, 0); } break; case Transaction::OP_RMATTR: { - object_t oid = t.oids.front(); t.oids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); + object_t oid; + t.get_oid(oid); + const char *attrname; + t.get_attrname(attrname); rmattr(oid, attrname, 0); } break; case Transaction::OP_CLONE: { - object_t oid = t.oids.front(); t.oids.pop_front(); - object_t noid = t.oids.front(); t.oids.pop_front(); + object_t oid; + t.get_oid(oid); + object_t noid; + t.get_oid(noid); clone(oid, noid); } break; case Transaction::OP_MKCOLL: { - coll_t cid = t.cids.front(); t.cids.pop_front(); + coll_t cid; + t.get_cid(cid); create_collection(cid, 0); } break; case Transaction::OP_RMCOLL: { - coll_t cid = t.cids.front(); t.cids.pop_front(); + coll_t cid; + t.get_cid(cid); destroy_collection(cid, 0); } break; case Transaction::OP_COLL_ADD: { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); + coll_t cid; + t.get_cid(cid); + object_t oid; + t.get_oid(oid); collection_add(cid, oid, 0); } break; case Transaction::OP_COLL_REMOVE: { - coll_t cid = t.cids.front(); t.cids.pop_front(); - object_t oid = t.oids.front(); t.oids.pop_front(); + coll_t cid; + t.get_cid(cid); + object_t oid; + t.get_oid(oid); collection_remove(cid, oid, 0); } break; case Transaction::OP_COLL_SETATTR: { - coll_t cid = t.cids.front(); 
t.cids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); - //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); + coll_t cid; + t.get_cid(cid); + const char *attrname; + t.get_attrname(attrname); bufferlist bl; - bl.claim( t.attrbls.front() ); - t.attrbls.pop_front(); + t.get_bl(bl); collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0); } break; case Transaction::OP_COLL_RMATTR: { - coll_t cid = t.cids.front(); t.cids.pop_front(); - const char *attrname = t.get_attrname(); t.pop_attrname(); + coll_t cid; + t.get_cid(cid); + const char *attrname; + t.get_attrname(attrname); collection_rmattr(cid, attrname, 0); } break; default: - cerr << "bad op " << *p << endl; + cerr << "bad op " << op << endl; assert(0); } } diff --git a/trunk/ceph/osd/PG.cc b/trunk/ceph/osd/PG.cc index c2d1290102e8b..93cad0d06654c 100644 --- a/trunk/ceph/osd/PG.cc +++ b/trunk/ceph/osd/PG.cc @@ -1033,6 +1033,7 @@ void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) off_t trim = p->first; dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; + assert(trim >= ondisklog.bottom); ondisklog.bottom = trim; // adjust block_map @@ -1082,9 +1083,9 @@ void PG::read_log(ObjectStore *store) // load bounds ondisklog.bottom = ondisklog.top = 0; r = store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - assert(r == sizeof(ondisklog.bottom)); + //assert(r == sizeof(ondisklog.bottom)); r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - assert(r == sizeof(ondisklog.top)); + //assert(r == sizeof(ondisklog.top)); dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << dendl; @@ -1095,6 +1096,10 @@ void PG::read_log(ObjectStore *store) // read bufferlist bl; store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); + if (bl.length() < 
ondisklog.top-ondisklog.bottom) { + dout(0) << "read_log data doesn't match attrs" << dendl; + assert(0); + } PG::Log::Entry e; off_t pos = ondisklog.bottom; diff --git a/trunk/ceph/osd/osd_types.h b/trunk/ceph/osd/osd_types.h index 5bd13902ab721..b761ae36d3d0e 100644 --- a/trunk/ceph/osd/osd_types.h +++ b/trunk/ceph/osd/osd_types.h @@ -24,7 +24,7 @@ class osdreqid_t { public: entity_name_t name; // who - int inc; // incarnation + int32_t inc; // incarnation tid_t tid; osdreqid_t() : inc(0), tid(0) {} osdreqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} @@ -67,7 +67,7 @@ typedef uint64_t coll_t; // collection id // pg stuff -#define PG_INO 1 +#define PG_INO 4 // this should match mds/mdstypes.h MDS_INO_PG typedef uint16_t ps_t; typedef uint8_t pruleset_t; @@ -162,6 +162,9 @@ namespace __gnu_cxx { } + + + /** ObjectLayout * * describes an object's placement and layout in the storage cluster. @@ -170,7 +173,7 @@ namespace __gnu_cxx { */ struct ObjectLayout { pg_t pgid; // what pg do i belong to - int stripe_unit; // for object raid in raid pgs + int32_t stripe_unit; // for object raid in raid pgs ObjectLayout() : pgid(0), stripe_unit(0) { } ObjectLayout(pg_t p, int su=0) : pgid(p), stripe_unit(su) { } @@ -218,6 +221,24 @@ inline ostream& operator<<(ostream& out, const eversion_t e) { +/** pg_stat + * aggregate stats for a single PG. + */ +struct pg_stat_t { + const static int STATE_UNKNOWN = 0; + const static int STATE_OK = 1; + const static int STATE_RECOVERING = 2; + const static int STATE_OFFLINE = 3; + + eversion_t reported; + + int32_t state; + int64_t size; // in bytes + int64_t num_blocks; // in 4k blocks + + pg_stat_t() : state(0), size(0), num_blocks(0) {} +}; + // ----------------------------------------- @@ -255,7 +276,7 @@ public: const static uint64_t MAGIC = 0xeb0f505dULL; uint64_t magic; uint64_t fsid; // unique fs id (random number) - int whoami; // my role in this fs. + int32_t whoami; // my role in this fs. 
epoch_t current_epoch; // most recent epoch epoch_t oldest_map, newest_map; // oldest/newest maps we have. OSDSuperblock(uint64_t f=0, int w=0) : diff --git a/trunk/ceph/osdc/Journaler.cc b/trunk/ceph/osdc/Journaler.cc index 788188c84aea4..2b0a62b5d9df9 100644 --- a/trunk/ceph/osdc/Journaler.cc +++ b/trunk/ceph/osdc/Journaler.cc @@ -20,8 +20,8 @@ #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " -#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) cout << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " +#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_journaler) cerr << g_clock.now() << " " << objecter->messenger->get_myname() << ".journaler " @@ -236,7 +236,7 @@ void Journaler::_finish_flush(int r, off_t start) off_t Journaler::append_entry(bufferlist& bl, Context *onsync) { - size_t s = bl.length(); + uint32_t s = bl.length(); if (!g_conf.journaler_allow_split_entries) { // will we span a stripe boundary? @@ -261,11 +261,24 @@ off_t Journaler::append_entry(bufferlist& bl, Context *onsync) } } - dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(size_t)) << endl; + dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(uint32_t)) << endl; + // cache? + // NOTE: this is a dumb thing to do; this is used for a benchmarking + // purposes only. 
+ if (g_conf.journaler_cache && + write_pos == read_pos + read_buf.length()) { + dout(10) << "append_entry caching in read_buf too" << endl; + assert(requested_pos == received_pos); + assert(requested_pos == read_pos + read_buf.length()); + read_buf.append((char*)&s, sizeof(s)); + read_buf.append(bl); + requested_pos = received_pos = write_pos + sizeof(s) + s; + } + // append write_buf.append((char*)&s, sizeof(s)); - write_buf.append(bl); + write_buf.claim_append(bl); write_pos += sizeof(s) + s; // flush now? @@ -482,7 +495,7 @@ bool Journaler::is_readable() if (read_pos == write_pos) return false; // have enough for entry size? - size_t s = 0; + uint32_t s = 0; if (read_buf.length() >= sizeof(s)) read_buf.copy(0, sizeof(s), (char*)&s); @@ -527,7 +540,7 @@ bool Journaler::try_read_entry(bufferlist& bl) return false; } - size_t s; + uint32_t s; assert(read_buf.length() >= sizeof(s)); read_buf.copy(0, sizeof(s), (char*)&s); assert(read_buf.length() >= sizeof(s) + s); diff --git a/trunk/ceph/osdc/ObjectCacher.cc b/trunk/ceph/osdc/ObjectCacher.cc index 592c8116b5b32..7f5bc439c6bca 100644 --- a/trunk/ceph/osdc/ObjectCacher.cc +++ b/trunk/ceph/osdc/ObjectCacher.cc @@ -255,11 +255,6 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr BufferHead *bh = p->second; dout(10) << "map_write bh " << *bh << " intersected" << endl; - /*if (bh->is_dirty()) { - // already dirty, let's use it. - final = bh; - } else { - */ if (p->first < cur) { assert(final == 0); if (cur + max >= p->first + p->second->length()) { @@ -275,10 +270,6 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr split(final, cur+max); } } else if (p->first == cur) { - /*if (bh->is_dirty()) { - // already dirty, use it. - } - else*/ if (p->second->length() <= max) { // whole bufferhead, piece of cake. 
} else { @@ -286,7 +277,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr split(bh, cur + max); // just split } if (final) - merge_left(final,bh); + merge_left(final, bh); else final = bh; } @@ -394,7 +385,16 @@ void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, buff dout(7) << "bh_read_finish " << oid << " " << start << "~" << length + << " (bl is " << bl.length() << ")" << endl; + + if (bl.length() < length) { + bufferptr bp(length - bl.length()); + bp.zero(); + dout(7) << "bh_read_finish " << oid << " padding " << start << "~" << length + << " with " << bp.length() << " bytes of zeroes" << endl; + bl.push_back(bp); + } if (objects.count(oid) == 0) { dout(7) << "bh_read_finish no object cache" << endl; @@ -743,9 +743,9 @@ int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) for (map::iterator bh_it = hits.begin(); bh_it != hits.end(); bh_it++) { - dout(10) << "readx hit bh " << *bh_it->second << endl; + dout(10) << "readx hit bh " << *bh_it->second << endl; hit_ls.push_back(bh_it->second); - } + } // create reverse map of buffer offset -> object for the eventual result. 
// this is over a single ObjectExtent, so we know that @@ -753,7 +753,7 @@ int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) // - the buffer frags need not be (and almost certainly aren't) off_t opos = ex_it->start; map::iterator bh_it = hits.begin(); - assert(bh_it->second->start() <= opos); + assert(bh_it->second->start() <= opos); size_t bhoff = opos - bh_it->second->start(); map::iterator f_it = ex_it->buffer_extents.begin(); size_t foff = 0; @@ -768,9 +768,13 @@ int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) size_t len = MIN(f_it->second - foff, bh->length() - bhoff); - stripe_map[f_it->first].substr_of(bh->bl, - opos - bh->start(), - len); + bufferlist bit; // put substr here first, since substr_of clobbers, and + // we may get multiple bh's at this stripe_map position + bit.substr_of(bh->bl, + opos - bh->start(), + len); + stripe_map[f_it->first].claim_append(bit); + opos += len; bhoff += len; foff += len; @@ -812,6 +816,7 @@ int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << endl; pos += i->second.length(); rd->bl->claim_append(i->second); + assert(rd->bl->length() == pos); } dout(10) << "readx result is " << rd->bl->length() << endl; @@ -870,24 +875,25 @@ int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino) touch_bh(bh); bh->last_write = now; - // recombine with left? + // combine with left? map::iterator p = o->data.find(bh->start()); + assert(p->second == bh); if (p != o->data.begin()) { - assert(p->second == bh); p--; - if (p->second->is_dirty()) { + if (p->second->is_dirty() && + p->second->end() == bh->start()) { o->merge_left(p->second, bh); bh = p->second; - } + } else + p++; } - // right? - while (1) { - p = o->data.find(bh->start()); - assert(p->second == bh); - p++; - if (p == o->data.end() || !p->second->is_dirty()) break; + // combine to the right? 
+ assert(p->second == bh); + p++; + if (p != o->data.end() && + !p->second->is_dirty() && + p->second->start() > bh->end()) o->merge_left(bh, p->second); - } } delete wr; diff --git a/trunk/ceph/osdc/ObjectCacher.h b/trunk/ceph/osdc/ObjectCacher.h index 15109ab782167..054953e4670cc 100644 --- a/trunk/ceph/osdc/ObjectCacher.h +++ b/trunk/ceph/osdc/ObjectCacher.h @@ -100,8 +100,8 @@ class ObjectCacher { ObjectCacher *oc; object_t oid; // this _always_ is oid.rev=0 inodeno_t ino; - objectrev_t rev; // last rev we're written - ObjectLayout layout; + objectrev_t rev; // last rev we're written + ObjectLayout layout; public: map data; @@ -538,6 +538,7 @@ inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh) if (bh.is_dirty()) out << " dirty"; if (bh.is_clean()) out << " clean"; if (bh.is_missing()) out << " missing"; + if (bh.bl.length() > 0) out << " firstbyte=" << (int)bh.bl[0]; out << "]"; return out; } diff --git a/trunk/ceph/osdc/Objecter.cc b/trunk/ceph/osdc/Objecter.cc index fc7c530824e6d..8d1d58929c4ad 100644 --- a/trunk/ceph/osdc/Objecter.cc +++ b/trunk/ceph/osdc/Objecter.cc @@ -179,13 +179,13 @@ void Objecter::kick_requests(set& changed_pgs) !g_conf.objecter_buffer_uncommitted) { dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << endl; } else { - dout(0) << "kick_requests missing commit, replay write " << tid + dout(3) << "kick_requests missing commit, replay write " << tid << " v " << wr->tid_version[tid] << endl; modifyx_submit(wr, wr->waitfor_commit[tid], tid); } } else if (wr->waitfor_ack.count(tid)) { - dout(0) << "kick_requests missing ack, resub write " << tid << endl; + dout(3) << "kick_requests missing ack, resub write " << tid << endl; modifyx_submit(wr, wr->waitfor_ack[tid], tid); } } @@ -194,22 +194,22 @@ void Objecter::kick_requests(set& changed_pgs) // READ OSDRead *rd = op_read[tid]; op_read.erase(tid); - dout(0) << "kick_requests resub read " << tid << endl; + dout(3) << 
"kick_requests resub read " << tid << endl; // resubmit readx_submit(rd, rd->ops[tid], true); rd->ops.erase(tid); } - else if (op_stat.count(tid)) { - OSDStat *st = op_stat[tid]; - op_stat.erase(tid); - - dout(0) << "kick_requests resub stat " << tid << endl; + else if (op_stat.count(tid)) { + OSDStat *st = op_stat[tid]; + op_stat.erase(tid); + + dout(3) << "kick_requests resub stat " << tid << endl; // resubmit stat_submit(st); - } + } else assert(0); diff --git a/trunk/ceph/test/fg.cc b/trunk/ceph/test/fg.cc new file mode 100644 index 0000000000000..02807a0e50467 --- /dev/null +++ b/trunk/ceph/test/fg.cc @@ -0,0 +1,19 @@ + +#include "include/types.h" +#include "include/frag.h" + +int main(int argc, char **argv) +{ + fragtree_t tree; + tree.split(frag_t(),2); + tree.split(frag_t(0,2),1); + tree.split(frag_t(1,2),1); + tree.split(frag_t(2,2),1); + tree.split(frag_t(1,3),1); + + cout << "tree is " << tree << endl; + frag_t fg(2,4); + cout << "fg is " << fg << endl; + tree.force_to_leaf(fg); + +} diff --git a/trunk/ceph/test/testcounter.cc b/trunk/ceph/test/testcounter.cc new file mode 100644 index 0000000000000..a3194489e4886 --- /dev/null +++ b/trunk/ceph/test/testcounter.cc @@ -0,0 +1,70 @@ + +#include "common/DecayCounter.h" + +#include +using namespace std; + +struct RealCounter { +public: + list hits; + + void hit(int ms) { + hits.push_back(ms); + } + + int get(double hl, int now) { + trim(now-hl); + return hits.size(); + } + + void trim(int to) { + while (!hits.empty() && + hits.front() < to) + hits.pop_front(); + } + + +}; + +int main(int argc, char **argv) +{ + int target; + double hl = atof(argv[1]); + cerr << "halflife " << hl << endl; + + DecayCounter dc(hl); + RealCounter rc; + + utime_t now = g_clock.now(); + + for (int ms=0; ms < 300*1000; ms++) { + if (ms % 30000 == 0) { + target = 1 + (rand() % 10) * 10; + if (ms > 200000) target = 0; + } + + if (target && + (rand() % (1000/target) == 0)) { + dc.hit(); + rc.hit(ms); + } + + if (ms % 500 == 0) 
dc.get(now); + if (ms % 100 == 0) { + //dc.get(now); + DecayCounter o = dc; + cout << ms << "\t" + << target*hl << "\t" + << rc.get(hl*1000, ms) << "\t" + << o.get(now) << "\t" + << dc.val << "\t" + // << dc.delta << "\t" + << o.get_last_vel() << "\t" + << o.get_last() + o.get_last_vel() << "\t" + << endl; + } + + now += .001; + } + +} -- 2.39.5