From dbac3102a4f56151865de263ea1d67b1be299db6 Mon Sep 17 00:00:00 2001 From: sageweil Date: Thu, 1 Mar 2007 19:56:54 +0000 Subject: [PATCH] merge trunk changes r1058:1150 into branches/aleung/security1/ceph - new monitor key bootstrapping git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1151 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/aleung/security1/ceph/Makefile | 115 +- branches/aleung/security1/ceph/TODO | 233 +-- .../aleung/security1/ceph/client/Client.cc | 67 +- .../aleung/security1/ceph/client/Client.h | 11 +- .../aleung/security1/ceph/client/FileCache.cc | 2 +- .../security1/ceph/client/SyntheticClient.cc | 103 +- .../security1/ceph/client/SyntheticClient.h | 5 +- branches/aleung/security1/ceph/client/fuse.cc | 11 - .../ceph/client/hadoop/CephClientInterface.cc | 217 +++ .../ceph/client/hadoop/CephClientInterface.h | 115 ++ branches/aleung/security1/ceph/cmds.cc | 20 +- branches/aleung/security1/ceph/cmon.cc | 18 +- branches/aleung/security1/ceph/common/Clock.h | 4 + .../aleung/security1/ceph/common/Timer.cc | 165 +- branches/aleung/security1/ceph/common/Timer.h | 54 +- branches/aleung/security1/ceph/config.cc | 116 +- branches/aleung/security1/ceph/config.h | 27 +- branches/aleung/security1/ceph/cosd.cc | 2 +- .../aleung/security1/ceph/crypto/ExtCap.h | 1 + .../aleung/security1/ceph/doc/journal.txt | 16 + .../security1/ceph/ebofs/BufferCache.cc | 17 + .../aleung/security1/ceph/ebofs/BufferCache.h | 18 +- branches/aleung/security1/ceph/ebofs/Ebofs.cc | 129 +- branches/aleung/security1/ceph/ebofs/Ebofs.h | 9 +- branches/aleung/security1/ceph/fakesyn.cc | 24 +- .../aleung/security1/ceph/include/buffer.h | 128 +- .../aleung/security1/ceph/include/object.h | 1 + .../aleung/security1/ceph/include/reqid.h | 64 + .../aleung/security1/ceph/include/types.h | 246 +-- .../aleung/security1/ceph/mds/AnchorClient.cc | 8 +- .../aleung/security1/ceph/mds/AnchorTable.cc | 2 +- branches/aleung/security1/ceph/mds/CDentry.cc | 116 +- 
branches/aleung/security1/ceph/mds/CDentry.h | 140 +- branches/aleung/security1/ceph/mds/CDir.cc | 250 ++- branches/aleung/security1/ceph/mds/CDir.h | 347 ++-- branches/aleung/security1/ceph/mds/CInode.cc | 197 +-- branches/aleung/security1/ceph/mds/CInode.h | 351 ++--- .../aleung/security1/ceph/mds/ClientMap.h | 11 + branches/aleung/security1/ceph/mds/Lock.h | 58 +- branches/aleung/security1/ceph/mds/Locker.cc | 337 ++-- branches/aleung/security1/ceph/mds/Locker.h | 4 + .../aleung/security1/ceph/mds/LogEvent.cc | 55 +- branches/aleung/security1/ceph/mds/LogEvent.h | 31 +- .../aleung/security1/ceph/mds/MDBalancer.cc | 119 +- .../aleung/security1/ceph/mds/MDBalancer.h | 13 +- branches/aleung/security1/ceph/mds/MDCache.cc | 1394 +++++++++++++--- branches/aleung/security1/ceph/mds/MDCache.h | 104 +- branches/aleung/security1/ceph/mds/MDLog.cc | 110 +- branches/aleung/security1/ceph/mds/MDLog.h | 44 +- branches/aleung/security1/ceph/mds/MDS.cc | 727 ++++++--- branches/aleung/security1/ceph/mds/MDS.h | 78 +- branches/aleung/security1/ceph/mds/MDSMap.h | 213 ++- branches/aleung/security1/ceph/mds/MDStore.cc | 104 +- .../aleung/security1/ceph/mds/Migrator.cc | 1134 +++++++++----- branches/aleung/security1/ceph/mds/Migrator.h | 102 +- .../aleung/security1/ceph/mds/OSDMonitor.cc | 523 ------ .../aleung/security1/ceph/mds/OSDMonitor.h | 85 - branches/aleung/security1/ceph/mds/Renamer.cc | 49 +- branches/aleung/security1/ceph/mds/Server.cc | 832 ++++++---- branches/aleung/security1/ceph/mds/Server.h | 13 + .../aleung/security1/ceph/mds/events/EAlloc.h | 44 +- .../security1/ceph/mds/events/EDirUpdate.h | 97 -- .../security1/ceph/mds/events/EExportFinish.h | 59 + .../security1/ceph/mds/events/EExportStart.h | 68 + .../security1/ceph/mds/events/EImportFinish.h | 59 + .../security1/ceph/mds/events/EImportMap.h | 66 + .../security1/ceph/mds/events/EImportStart.h | 60 + .../security1/ceph/mds/events/EInodeUpdate.h | 55 - .../security1/ceph/mds/events/EMetaBlob.h | 339 ++++ 
.../aleung/security1/ceph/mds/events/EMkdir.h | 62 - .../aleung/security1/ceph/mds/events/EMknod.h | 60 - .../security1/ceph/mds/events/EPurgeFinish.h | 5 +- .../security1/ceph/mds/events/EString.h | 5 +- .../aleung/security1/ceph/mds/events/ETrace.h | 119 -- .../security1/ceph/mds/events/EUnlink.h | 19 +- .../security1/ceph/mds/events/EUpdate.h | 49 + branches/aleung/security1/ceph/mds/journal.cc | 658 +++++--- branches/aleung/security1/ceph/mds/mdstypes.h | 157 +- .../security1/ceph/mds/oldcachestuff.cc | 944 ----------- .../security1/ceph/messages/MCacheExpire.h | 85 +- .../security1/ceph/messages/MClientBoot.h | 11 +- .../security1/ceph/messages/MClientRequest.h | 1 + .../security1/ceph/messages/MDiscoverReply.h | 70 +- .../security1/ceph/messages/MExportDir.h | 74 +- .../security1/ceph/messages/MExportDirPrep.h | 2 +- .../aleung/security1/ceph/messages/MFailure.h | 6 +- .../security1/ceph/messages/MFailureAck.h | 4 +- .../security1/ceph/messages/MMDSBeacon.h | 54 + .../security1/ceph/messages/MMDSCacheRejoin.h | 62 + .../ceph/messages/MMDSCacheRejoinAck.h | 82 + .../security1/ceph/messages/MMDSImportMap.h | 59 + .../aleung/security1/ceph/messages/MMDSMap.h | 19 +- .../security1/ceph/messages/MMonElectionAck.h | 25 +- .../ceph/messages/MMonElectionPropose.h | 32 + .../ceph/messages/MMonElectionVictory.h | 40 + .../security1/ceph/messages/MMonPaxos.h | 80 + .../security1/ceph/messages/MNSLookup.h | 6 +- .../security1/ceph/messages/MNSLookupReply.h | 2 +- .../security1/ceph/messages/MNSRegister.h | 6 +- .../security1/ceph/messages/MNSRegisterAck.h | 6 +- .../aleung/security1/ceph/messages/MOSDBoot.h | 1 + .../security1/ceph/messages/MOSDFailure.h | 13 +- .../aleung/security1/ceph/messages/MOSDOp.h | 99 +- .../security1/ceph/messages/MOSDOpReply.h | 70 +- branches/aleung/security1/ceph/mkmonmap.cc | 50 +- .../security1/ceph/mon/ClientMonitor.cc | 12 +- .../aleung/security1/ceph/mon/ClientMonitor.h | 4 +- branches/aleung/security1/ceph/mon/Elector.cc | 327 ++-- 
branches/aleung/security1/ceph/mon/Elector.h | 141 +- .../aleung/security1/ceph/mon/MDSMonitor.cc | 355 ++++- .../aleung/security1/ceph/mon/MDSMonitor.h | 31 +- branches/aleung/security1/ceph/mon/MonMap.h | 13 +- branches/aleung/security1/ceph/mon/Monitor.cc | 190 ++- branches/aleung/security1/ceph/mon/Monitor.h | 78 +- .../aleung/security1/ceph/mon/MonitorStore.cc | 198 +++ .../aleung/security1/ceph/mon/MonitorStore.h | 69 + .../aleung/security1/ceph/mon/OSDMonitor.cc | 170 +- .../aleung/security1/ceph/mon/OSDMonitor.h | 12 +- branches/aleung/security1/ceph/mon/Paxos.cc | 182 +++ branches/aleung/security1/ceph/mon/Paxos.h | 73 + .../aleung/security1/ceph/msg/Dispatcher.h | 9 +- .../security1/ceph/msg/FakeMessenger.cc | 127 +- .../aleung/security1/ceph/msg/FakeMessenger.h | 17 +- .../aleung/security1/ceph/msg/HostMonitor.cc | 10 +- .../aleung/security1/ceph/msg/HostMonitor.h | 20 +- .../aleung/security1/ceph/msg/MPIMessenger.cc | 4 +- .../aleung/security1/ceph/msg/MPIMessenger.h | 6 +- .../aleung/security1/ceph/msg/MTMessenger.cc | 2 +- .../aleung/security1/ceph/msg/MTMessenger.h | 2 +- branches/aleung/security1/ceph/msg/Message.cc | 52 +- branches/aleung/security1/ceph/msg/Message.h | 179 +-- .../aleung/security1/ceph/msg/Messenger.cc | 46 - .../aleung/security1/ceph/msg/Messenger.h | 50 +- .../aleung/security1/ceph/msg/NewMessenger.cc | 50 +- .../aleung/security1/ceph/msg/NewMessenger.h | 44 +- .../security1/ceph/msg/NewerMessenger.cc | 50 +- .../security1/ceph/msg/NewerMessenger.h | 36 +- .../security1/ceph/msg/SerialMessenger.h | 4 +- .../security1/ceph/msg/SimpleMessenger.cc | 497 +++--- .../security1/ceph/msg/SimpleMessenger.h | 85 +- .../aleung/security1/ceph/msg/TCPDirectory.cc | 8 +- .../aleung/security1/ceph/msg/TCPDirectory.h | 8 +- .../aleung/security1/ceph/msg/TCPMessenger.cc | 32 +- .../aleung/security1/ceph/msg/TCPMessenger.h | 6 +- .../aleung/security1/ceph/msg/msg_types.h | 186 +++ branches/aleung/security1/ceph/msg/tcp.h | 2 +- 
branches/aleung/security1/ceph/newsyn.cc | 24 +- branches/aleung/security1/ceph/osbdb/OSBDB.cc | 1395 +++++++++++++++++ branches/aleung/security1/ceph/osbdb/OSBDB.h | 507 ++++++ .../aleung/security1/ceph/osd/FakeStore.cc | 521 ++++-- .../aleung/security1/ceph/osd/FakeStore.h | 73 +- branches/aleung/security1/ceph/osd/OSD.cc | 187 ++- branches/aleung/security1/ceph/osd/OSD.h | 15 +- branches/aleung/security1/ceph/osd/OSDMap.h | 1 + .../aleung/security1/ceph/osd/ObjectStore.h | 29 +- branches/aleung/security1/ceph/osd/PG.cc | 51 +- branches/aleung/security1/ceph/osd/PG.h | 44 +- .../aleung/security1/ceph/osd/osd_types.h | 174 ++ .../aleung/security1/ceph/osdc/Journaler.cc | 23 +- .../security1/ceph/osdc/ObjectCacher.cc | 5 + .../aleung/security1/ceph/osdc/Objecter.cc | 34 +- .../aleung/security1/ceph/osdc/Objecter.h | 10 +- branches/aleung/security1/ceph/test/testos.cc | 308 ++++ 163 files changed, 13058 insertions(+), 7103 deletions(-) create mode 100644 branches/aleung/security1/ceph/client/hadoop/CephClientInterface.cc create mode 100644 branches/aleung/security1/ceph/client/hadoop/CephClientInterface.h create mode 100644 branches/aleung/security1/ceph/include/reqid.h delete mode 100644 branches/aleung/security1/ceph/mds/OSDMonitor.cc delete mode 100644 branches/aleung/security1/ceph/mds/OSDMonitor.h delete mode 100644 branches/aleung/security1/ceph/mds/events/EDirUpdate.h create mode 100644 branches/aleung/security1/ceph/mds/events/EExportFinish.h create mode 100644 branches/aleung/security1/ceph/mds/events/EExportStart.h create mode 100644 branches/aleung/security1/ceph/mds/events/EImportFinish.h create mode 100644 branches/aleung/security1/ceph/mds/events/EImportMap.h create mode 100644 branches/aleung/security1/ceph/mds/events/EImportStart.h delete mode 100644 branches/aleung/security1/ceph/mds/events/EInodeUpdate.h create mode 100644 branches/aleung/security1/ceph/mds/events/EMetaBlob.h delete mode 100644 branches/aleung/security1/ceph/mds/events/EMkdir.h 
delete mode 100644 branches/aleung/security1/ceph/mds/events/EMknod.h delete mode 100644 branches/aleung/security1/ceph/mds/events/ETrace.h create mode 100644 branches/aleung/security1/ceph/mds/events/EUpdate.h delete mode 100644 branches/aleung/security1/ceph/mds/oldcachestuff.cc create mode 100644 branches/aleung/security1/ceph/messages/MMDSBeacon.h create mode 100644 branches/aleung/security1/ceph/messages/MMDSCacheRejoin.h create mode 100644 branches/aleung/security1/ceph/messages/MMDSCacheRejoinAck.h create mode 100644 branches/aleung/security1/ceph/messages/MMDSImportMap.h create mode 100644 branches/aleung/security1/ceph/messages/MMonElectionPropose.h create mode 100644 branches/aleung/security1/ceph/messages/MMonElectionVictory.h create mode 100644 branches/aleung/security1/ceph/messages/MMonPaxos.h create mode 100644 branches/aleung/security1/ceph/mon/MonitorStore.cc create mode 100644 branches/aleung/security1/ceph/mon/MonitorStore.h create mode 100644 branches/aleung/security1/ceph/mon/Paxos.cc create mode 100644 branches/aleung/security1/ceph/mon/Paxos.h create mode 100644 branches/aleung/security1/ceph/msg/msg_types.h create mode 100644 branches/aleung/security1/ceph/osbdb/OSBDB.cc create mode 100644 branches/aleung/security1/ceph/osbdb/OSBDB.h create mode 100644 branches/aleung/security1/ceph/osd/osd_types.h create mode 100644 branches/aleung/security1/ceph/test/testos.cc diff --git a/branches/aleung/security1/ceph/Makefile b/branches/aleung/security1/ceph/Makefile index 225d4b6c5f311..5be7292e1294a 100644 --- a/branches/aleung/security1/ceph/Makefile +++ b/branches/aleung/security1/ceph/Makefile @@ -1,12 +1,13 @@ -# mpicxx must be on your path; on googoo, this means that -# /usr/local/mpich2-1.0.2/bin must be on your path. +# mpicxx must be on your path to build newsyn. on googoo, this means +# that /usr/local/mpich2-1.0.2/bin must be on your path. # For now, use g++ most of the time. 
-# When compiling MPI stuff, specify myfile.cc instead of myfile.o so that ${MPICC} is -# invoked instead of the generic .o rule (or it'll use g++). -# This makes it less annoying to build on non-mpi hosts for dev work, and seems to -# behave just fine... change ${CC} back to mpicxx if you get paranoid. +# When compiling MPI stuff, specify myfile.cc instead of myfile.o so +# that ${MPICC} is invoked instead of the generic .o rule (or it'll +# use g++). This makes it less annoying to build on non-mpi hosts for +# dev work, and seems to behave just fine... change ${CC} back to +# mpicxx if you get paranoid. #CC = g++ #CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE @@ -18,15 +19,22 @@ EXTRA_CFLAGS = ifeq ($(target),darwin) # For Darwin CFLAGS = -ggdb3 -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE -DDARWIN -D__FreeBSD__=10 ${EXTRA_CFLAGS} +LDINC = ar -rc else # For linux CFLAGS = -ggdb3 -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE +LDINC = ld -i -o endif CC = g++ LIBS = -lpthread -lcrypto++5.2 #CRYPTOLIBS = /usr/lib/libcrypto++5.2.so +ifeq ($(want_bdb),yes) +CFLAGS += -DUSE_OSBDB +OSBDB_LIBS = -ldb_cxx +endif + #for normal mpich2 machines MPICC = mpicxx MPICFLAGS = ${CFLAGS} @@ -76,13 +84,14 @@ OSDC_OBJS= \ MON_OBJS= \ mon/Monitor.o\ + mon/Paxos.o\ mon/OSDMonitor.o\ mon/MDSMonitor.o\ mon/ClientMonitor.o\ - mon/Elector.o + mon/Elector.o\ + mon/MonitorStore.o COMMON_OBJS= \ - msg/Messenger.o\ msg/Message.o\ common/Logger.o\ common/Clock.o\ @@ -99,7 +108,14 @@ CLIENT_OBJS= \ CRYPTO_OBJS = \ crypto/CryptoLib.o -TARGETS = cmon cosd cmds cfuse newsyn fakesyn +ifeq ($(want_bdb),yes) +OSBDB_OBJS = \ + osbdb/OSBDB.o + +OSBDB_OBJ = osbdb.o +endif + +TARGETS = cmon cosd cmds cfuse csyn newsyn fakesyn mkmonmap SRCS=*.cc */*.cc *.h */*.h */*/*.h @@ -114,11 +130,11 @@ obfs: depend obfstest mkmonmap: mkmonmap.cc common.o crypto.o ${CC} ${CFLAGS} ${LIBS} $^ 
-o $@ -cmon: cmon.cc mon.o ebofs.o msg/SimpleMessenger.o common.o crypto.o +cmon: cmon.cc mon.o msg/SimpleMessenger.o common.o crypto.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ -cosd: cosd.cc osd.o ebofs.o msg/SimpleMessenger.o common.o crypto.o - ${CC} ${CFLAGS} ${LIBS} $^ -o $@ +cosd: cosd.cc osd.o ebofs.o ${OSBDB_OBJ} msg/SimpleMessenger.o common.o crypto.o + ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ cmds: cmds.cc mds.o osdc.o msg/SimpleMessenger.o common.o crypto.o ${CC} ${CFLAGS} ${LIBS} $^ -o $@ @@ -135,53 +151,44 @@ gprof-helper.so: test/gprof-helper.c gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl +# fake* +fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o ${OSBDB_OBJ} client/fuse.o msg/FakeMessenger.o common.o + ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -lfuse $^ -o $@ -# fuse -fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o msg/FakeMessenger.cc common.o - ${CC} -pg ${CFLAGS} ${LIBS} -lfuse $^ -o $@ +fakesyn: fakesyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/FakeMessenger.o common.o crypto.o + ${CC} -pg ${CFLAGS} ${LIBS} ${OSBDB_LIBS} $^ -o $@ -tcpfuse: tcpfuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o - ${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ -mpifuse: mpifuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o - ${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ +# mpi startup +newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o crypto.o + ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ +newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o ${OSBDB_OBJ} osdc.o msg/SimpleMessenger.o common.o crypto.o + ${MPICC} ${MPICFLAGS} ${MPILIBS} ${OSBDB_LIBS} $^ -o $@ -# synthetic workload -fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o crypto.o - ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ -tcpsyn: tcpsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o ${TCP_OBJS} common.o - 
${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ +# ebofs +mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o + ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ -newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/SimpleMessenger.o common.o crypto.o - ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} $^ -o $@ +test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o + ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ -newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/NewerMessenger.o common.o crypto.o - ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ -# + obfs fakesynobfs: fakesyn.cc mds.o client.o osd_obfs.o msg/FakeMessenger.o common.o ${CC} -DUSE_OBFS ${CFLAGS} ${LIBS} $^ -o $@ tcpsynobfs: tcpsyn.cc mds.o client.o osd_obfs.o ${TCP_OBJS} common.o ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ - -# ebofs - -mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ - -test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o - ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ - +osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o + ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.a # libceph -libceph.o: client/ldceph.o client/Client.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} - ar -rc $@ $^ +libceph.o: client/ldceph.o client/Client.o msg/SimpleMessenger.o ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} + ${LDINC} $^ -o $@ bench/mdtest/mdtest.o: bench/mdtest/mdtest.c mpicc -c $^ -o $@ @@ -192,6 +199,11 @@ mdtest: bench/mdtest/mdtest.o mdtest.ceph: bench/mdtest/mdtest.o libceph.o ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ +# OSD test + +testos: test/testos.o ebofs.o osbdb.o common.o + ${CC} ${CFLAGS} ${LIBS} ${OSBDB_LIBS} -o $@ $^ + # %.so: %.cc @@ -201,31 +213,31 @@ clean: rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} common.o: ${COMMON_OBJS} - ar -rc $@ $^ + ${LDINC} $@ $^ ebofs.o: ${EBOFS_OBJS} - ar -rc $@ $^ + ${LDINC} $@ $^ client.o: ${CLIENT_OBJS} - ar -rc $@ $^ + ${LDINC} $@ $^ 
osd.o: ${OSD_OBJS} - ar -rc $@ $^ + ${LDINC} $@ $^ osdc.o: ${OSDC_OBJS} - ar -rc $@ $^ + ${LDINC} $@ $^ crypto.o: ${CRYPTO_OBJS} ld -i -o $@ $^ -osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o - ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.o - mds.o: ${MDS_OBJS} - ar -rc $@ $^ + ${LDINC} $@ $^ mon.o: ${MON_OBJS} - ar -rc $@ $^ + ${LDINC} $@ $^ + +osbdb.o: ${OSBDB_OBJS} + ${LDINC} $@ $^ %.o: %.cc ${CC} ${CFLAGS} -c $< -o $@ @@ -237,6 +249,9 @@ count: cat ${SRCS} | wc -l cat ${SRCS} | grep -c \; +TAGS: + etags `find . -name "*.[h|cc]"` + .depend: touch .depend diff --git a/branches/aleung/security1/ceph/TODO b/branches/aleung/security1/ceph/TODO index 3c1e1f62b437c..8a64da39dfc8a 100644 --- a/branches/aleung/security1/ceph/TODO +++ b/branches/aleung/security1/ceph/TODO @@ -1,50 +1,38 @@ -- paxos for monitor -- lnet? -- crush - - xml import/export? - - crush tools - -== todo - -1- pipelining writes? -2- intervening reads? - -inode ops - utime -- no concurrency issues - chown/chmod -- should lock - truncate -- should lock - 1-> no. multiple process concurrency on a single inode is not important. - 2-> maybe... intervening stats? probably not important. - -directory ops. parent inode mtime, + dirent xlocks? - mknod - open+create - symlink - unlink - rmdir - rename - 1-> yes. but mtime updates are independent (mtime monotonically increasing), so it's easy. - 2-> yes. - ---> so, make let's make file/hard wrlock exclusive. - -locks - namespace - path pins -- read lock - dentry xlock -- write lock - inode - hard/file rd start/stop -- read lock - hard/file wr start/stop -- write lock - +monitor +- finish generic paxos +osdmon +- distribute w/ paxos framework +- allow fresh replacement osds. add osd_created in osdmap, probably +- monitor needs to monitor some osds... +- monitor pg states, notify on out? 
+- watch osd utilization; adjust overload in cluster map -- integrate revisions into ObjectCacher -- clean up oid.rev vs op.rev in osd+osdc +mdsmon +- distribute w/ paxos framework + +journaler +- fix up for large events (e.g. imports) +- use set_floor_and_read for safe takeover from possibly-not-quite-dead otherguy. +- should we pad with zeros to avoid splitting individual entries? + - make it a g_conf flag? + - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) +- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes + + +crush +- xml import/export? +- crush tools + + +rados+ebofs +- purge replicated writes from cache. (with exception of partial tail blocks.) -rados paper todo +rados paper todo? - better experiments + - berkeleydb objectstore? - flush log only in response to subsequent read or write? - better behaving recovery - justify use of splay. @@ -52,6 +40,9 @@ rados paper todo - snapshots rados snapshots +- integrate revisions into ObjectCacher +- clean up oid.rev vs op.rev in osd+osdc + - attr.crev is rev we were created in. - oid.rev=0 is "live". defined for attr.crev <= rev. - otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.) @@ -75,41 +66,14 @@ rados snapshots - clean up messenger failure modes. - add connection retry. -mds recovery -- multiple passes? - 1- establish import/export map - ?- - 2- replay inode, dir, dentry updates -- single pass - - each event needs to embed inode for trace up to the import - - second stage will reconcile cached items with other active mds nodes - - cached items will be shared with the primary to repopulate it's non-dirty cache - - query clients for their state too? - - mds must journal list of clients with whom we share state? - - -journaler -- should we pad with zeros to avoid splitting individual entries? - - make it a g_conf flag? 
- - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) -- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes - -monitor -?- monitor user lib that handles resending, redirection of mon requests. -- elector -/- organize monitor store - -osdmon -- distribute -- recovery: store elector epochs with maps.. -- monitor needs to monitor some osds... -- monitor pgs, notify on out -- watch osd utilization; adjust overload in cluster map - -mdsmon +objecter +- read+floor_lockout osd/rados +- read+floor_lockout for clean STOGITH-like/fencing semantics after failover. +- separate out replication code into a PG class, to pave way for RAID + - efficiently replicate clone() objects - pg_num instead of pg_bits - flag missing log entries on crash recovery --> WRNOOP? or WRLOST? @@ -121,23 +85,16 @@ osd/rados - pg_bit/pg_num changes - report crashed pgs? -messenger -/- share same tcp socket for sender and receiver -/- graceful connection teardown +simplemessenger - close idle connections -- generalize out a transport layer? - - eg reliable tcp for most things, connectionless unreliable datagrams for monitors? - - or, aggressive connection closing on monitors? or just max_connections and an lru? -- osds: forget idle client addrs - -objecter +- retry, timeout on connection or transmission failure objectcacher - ocacher caps transitions vs locks - test read locks reliability -- heartbeat vs ping +- heartbeat vs ping? - osdmonitor, filter ebofs @@ -153,20 +110,8 @@ ebofs - metadata in nvram? flash? - -bugs/stability -- figure out weird 40ms latency with double log entries - - -general -- timer needs cancel sets, schedulers need to cancel outstanding events on shutdown -- well, just figure out general timer cancellation strategy that avoids races - - use updated Timer as a model? 
- - remaining hard problems - how to cope with file size changes and read/write sharing -- mds failure recovery (of course) crush @@ -178,7 +123,7 @@ mds - distributed client management - anchormgr - 2pc - - independent journal + - independent journal? - distributed? - link count management - also 2pc @@ -206,14 +151,11 @@ mds client -- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache - -- test client caps with meta exports -- some heuristic behavior to consolidate caps to inode auth -- client will re-tx anything it needed to say upon rx of new mds notification (?) - - - +- fstat +- make_request: cope with mds failure +- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache.. how to isolate.. +- test client caps migration w/ mds exports +- some heuristic behavior to consolidate caps to inode auth? @@ -263,11 +205,6 @@ SAGE: -CLIENT TODO - -- statfs - - @@ -305,3 +242,81 @@ IMPLEMENT - dump active config in run output somewhere + + + + + + + + +==== MDS RECOVERY ==== + +- how to reliably deliver cache expire messages? + - how should proxy behave? + - exporter failure + - all cacheexpire info has been passed on up until point where export is permanent. no impact. + - importer failure + - exporter collects expire info, so that it can reverse. + - ??? + - maybe hosts should double-up expires until after export is known to have committed? +--> just send expires to both nodes. dir_auth+dir_auth2. clean up export ack/notify process. :) + +*** dar... no, separate bystander dir_auth updates from the prepare/ack/commit cycle! +- expire should go to both old and new auth +- set_dir_auth should take optional second auth, and authority() should optionally set/return a second possible auth +- does inode need it's own replica list? no! +- dirslices. + + +/- exporter recovery if importer fails during EXPORT_EXPORTING stage +- importer recovery if exporter fails + +/?- delay response to sending import_map if export in progress? 
+/?- finish export before sending import_map? +/- ambiguous imports on active node should include in-progress imports! +/- how to effectively trim cache after resolve but before rejoin +/ - we need to eliminate unneeded non-auth metadata, without hosing potentially useful auth metadata + +- osd needs a set_floor_and_read op for safe failover/STOGITH-like semantics. + +- failures during recovery stages (resolve, rejoin)... make sure rejoin still works! + +- fix mds initial osdmap weirdness (which will currently screw up on standby -> almost anything) + + +importmap only sent after exports have completed. +failures update export ack waitlists, so exports will complete if unrelated nodes fail. +importmap can be sent regardless of import status -- pending import is just flagged ambiguous. +failure of exporter induces some cleanup on importer. importer will disambiguate when it gets an importmap on exporter recovery. +failure of importer induces cleanup on exporter. no ambiguity. + + +/- no new mds may join if cluster is in a recovery state. starting -> standby (unless failed) +/ - make sure creating -> standby, and are not included in recovery set? + + +mdsmap notes +- mds don't care about intervening states, except rejoin > active, and + that transition requires active involvement. thus, no need to worry + about delivering/processing the full sequence of maps. + +blech: +- EMetablob should return 'expired' if they have + higher versions (and are thus described by a newer journal entry) + +mds +- mds failure vs clients + - clean up client op redirection + - idempotent ops + +- journal+recovery + - unlink + - open(wr cap), open+create + - file capabilities i/o + - link + - rename + +- should auth_pins really go to the root? + - FIXME: auth_pins on importer versus import beneath an authpinned region? 
+ diff --git a/branches/aleung/security1/ceph/client/Client.cc b/branches/aleung/security1/ceph/client/Client.cc index 03fd854a640ac..13c65150d2de3 100644 --- a/branches/aleung/security1/ceph/client/Client.cc +++ b/branches/aleung/security1/ceph/client/Client.cc @@ -22,9 +22,7 @@ #include #include -#ifdef DARWIN #include -#endif // DARWIN #include @@ -60,7 +58,7 @@ using namespace std; #include "config.h" #undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "client" << whoami << "." << pthread_self() << " " +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_client) cout << g_clock.now() << " client" << whoami << "." << pthread_self() << " " #define tout if (g_conf.client_trace) cout << "trace: " @@ -101,7 +99,7 @@ public: Client::Client(Messenger *m, MonMap *mm) { // which client am i? - whoami = m->get_myaddr().num(); + whoami = m->get_myname().num(); monmap = mm; mounted = false; @@ -464,7 +462,7 @@ Dentry *Client::lookup(filepath& path) MClientReply *Client::make_request(MClientRequest *req, bool auth_best, - int use_mds) // this param is icky, debug weirdness! 
+ int use_mds) // this param is purely for debug hacking { // assign a unique tid req->set_tid(++last_tid); @@ -504,7 +502,10 @@ MClientReply *Client::make_request(MClientRequest *req, // choose an mds int mds = 0; - if (diri) { + if (!diri || g_conf.client_use_random_mds) { + // no root info, pick a random MDS + mds = rand() % mdsmap->get_num_mds(); + } else { if (auth_best) { // pick the actual auth (as best we can) if (item) { @@ -523,9 +524,6 @@ MClientReply *Client::make_request(MClientRequest *req, else mds = diri->pick_replica(mdsmap); } - } else { - // no root info, pick a random MDS - mds = rand() % mdsmap->get_num_mds(); } dout(20) << "mds is " << mds << endl; @@ -584,7 +582,7 @@ MClientReply* Client::sendrecv(MClientRequest *req, int mds) tid_t tid = req->get_tid(); mds_rpc_cond[tid] = &cond; - messenger->send_message(req, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), MDS_PORT_SERVER); + messenger->send_message(req, mdsmap->get_inst(mds), MDS_PORT_SERVER); // wait while (mds_rpc_reply.count(tid) == 0) { @@ -684,7 +682,7 @@ Ticket *Client::get_user_ticket(uid_t uid, gid_t gid) dout(10) << "get_user_ticket requesting ticket for uid " << uid << " from mon" << mon << endl; messenger->send_message(new MClientAuthUser(username, uid, gid, key), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); + monmap->get_inst(mon)); } else { // don't request, someone else already did. just wait! 
dout(10) << "get_user_ticket waiting for ticket for uid " << uid << endl; @@ -790,16 +788,17 @@ void Client::handle_mds_map(MMDSMap* m) if (whoami < 0) { whoami = m->get_dest().num(); dout(1) << "handle_mds_map i am now " << m->get_dest() << endl; - messenger->reset_myaddr(m->get_dest()); + messenger->reset_myname(m->get_dest()); } - map::reverse_iterator p = m->maps.rbegin(); - - dout(1) << "handle_mds_map epoch " << p->first << endl; - mdsmap->decode(p->second); + dout(1) << "handle_mds_map epoch " << m->get_epoch() << endl; + mdsmap->decode(m->get_encoded()); delete m; + // note our inc # + objecter->set_client_incarnation(0); // fixme + mount_cond.Signal(); // mount might be waiting for this. } @@ -926,7 +925,7 @@ void Client::handle_file_caps(MClientFileCaps *m) << ", which we don't want caps for, releasing." << endl; m->set_caps(0); m->set_wanted(0); - messenger->send_message(m, m->get_source(), m->get_source_inst(), m->get_source_port()); + messenger->send_message(m, m->get_source_inst(), m->get_source_port()); return; } @@ -1033,7 +1032,7 @@ void Client::implemented_caps(MClientFileCaps *m, Inode *in) in->file_wr_size = 0; } - messenger->send_message(m, m->get_source(), m->get_source_inst(), m->get_source_port()); + messenger->send_message(m, m->get_source_inst(), m->get_source_port()); } @@ -1043,6 +1042,7 @@ void Client::release_caps(Inode *in, dout(5) << "releasing caps on ino " << in->inode.ino << dec << " had " << cap_string(in->file_caps()) << " retaining " << cap_string(retain) + << " want " << cap_string(in->file_caps_wanted()) << endl; for (map::iterator it = in->caps.begin(); @@ -1057,7 +1057,7 @@ void Client::release_caps(Inode *in, it->second.seq, it->second.caps, in->file_caps_wanted()); - messenger->send_message(m, MSG_ADDR_MDS(it->first), mdsmap->get_inst(it->first), MDS_PORT_LOCKER); + messenger->send_message(m, mdsmap->get_inst(it->first), MDS_PORT_LOCKER); } } @@ -1082,7 +1082,7 @@ void Client::update_caps_wanted(Inode *in) 
it->second.caps, in->file_caps_wanted()); messenger->send_message(m, - MSG_ADDR_MDS(it->first), mdsmap->get_inst(it->first), MDS_PORT_LOCKER); + mdsmap->get_inst(it->first), MDS_PORT_LOCKER); } } @@ -1104,7 +1104,7 @@ int Client::mount() delete mdsmap; int mon = monmap->pick_mon(); messenger->send_message(new MClientBoot(), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); + monmap->get_inst(mon)); while (!mdsmap) mount_cond.Wait(client_lock); @@ -1114,7 +1114,7 @@ int Client::mount() int who = 0; // mdsmap->get_root(); // mount at root, for now messenger->send_message(m, - MSG_ADDR_MDS(who), mdsmap->get_inst(who), + mdsmap->get_inst(who), MDS_PORT_SERVER); while (!mounted) @@ -1210,7 +1210,7 @@ int Client::unmount() // send unmount! Message *req = new MGenericMessage(MSG_CLIENT_UNMOUNT); - messenger->send_message(req, MSG_ADDR_MDS(0), mdsmap->get_inst(0), MDS_PORT_SERVER); + messenger->send_message(req, mdsmap->get_inst(0), MDS_PORT_SERVER); while (mounted) mount_cond.Wait(client_lock); @@ -2495,13 +2495,15 @@ int Client::open(const char *relpath, int flags, __int64_t uid, __int64_t gid) void Client::close_release(Inode *in) { dout(10) << "close_release on " << in->ino() << endl; + dout(10) << " wr " << in->num_open_wr << " rd " << in->num_open_rd + << " dirty " << in->fc.is_dirty() << " cached " << in->fc.is_cached() << endl; if (!in->num_open_rd) in->fc.release_clean(); int retain = 0; - if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER; - if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER; + if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER | CAP_FILE_WREXTEND; + if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE; release_caps(in, retain); // release caps now. 
} @@ -2992,7 +2994,6 @@ int Client::chdir(const char *path, __int64_t uid, __int64_t gid) return 0; } -#ifdef DARWIN int Client::statfs(const char *path, struct statvfs *stbuf, __int64_t uid, __int64_t gid) { @@ -3010,14 +3011,6 @@ int Client::statfs(const char *path, struct statvfs *stbuf, return 0; } -#else -int Client::statfs(const char *path, struct statfs *stbuf, - __int64_t uid, __int64_t gid) -{ - assert(0); // implement me - return 0; -} -#endif int Client::lazyio_propogate(int fd, off_t offset, size_t count, @@ -3122,15 +3115,17 @@ int Client::lazyio_synchronize(int fd, off_t offset, size_t count, } -void Client::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) +void Client::ms_handle_failure(Message *m, const entity_inst_t& inst) { + entity_name_t dest = inst.name; + if (dest.is_mon()) { // resend to a different monitor. int mon = monmap->pick_mon(true); dout(0) << "ms_handle_failure " << dest << " inst " << inst << ", resending to mon" << mon << endl; - messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon)); + messenger->send_message(m, monmap->get_inst(mon)); } else if (dest.is_osd()) { objecter->ms_handle_failure(m, dest, inst); diff --git a/branches/aleung/security1/ceph/client/Client.h b/branches/aleung/security1/ceph/client/Client.h index 10bed373128f7..b06858fdd74ef 100644 --- a/branches/aleung/security1/ceph/client/Client.h +++ b/branches/aleung/security1/ceph/client/Client.h @@ -351,7 +351,7 @@ class Client : public Dispatcher { int unsafe_sync_write; public: - msg_addr_t get_myaddr() { return messenger->get_myaddr(); } + entity_name_t get_myname() { return messenger->get_myname(); } void hack_sync_write_safe(); protected: @@ -558,13 +558,8 @@ protected: int unmount(); // these shoud (more or less) mirror the actual system calls. 
-#ifdef DARWIN int statfs(const char *path, struct statvfs *stbuf, - __int64_t uid, __int64_t gid); -#else - int statfs(const char *path, struct statfs *stbuf, - __int64_t uid, __int64_t gid); -#endif + __int64_t uid=-1, __int64_t gid=-1); // crap int chdir(const char *s, __int64_t uid, __int64_t gid); @@ -648,7 +643,7 @@ protected: int describe_layout(char *fn, list& result); - void ms_handle_failure(Message*, msg_addr_t dest, const entity_inst_t& inst); + void ms_handle_failure(Message*, const entity_inst_t& inst); }; diff --git a/branches/aleung/security1/ceph/client/FileCache.cc b/branches/aleung/security1/ceph/client/FileCache.cc index 2465c0206c8ef..18a535768b14b 100644 --- a/branches/aleung/security1/ceph/client/FileCache.cc +++ b/branches/aleung/security1/ceph/client/FileCache.cc @@ -78,7 +78,7 @@ void FileCache::check_caps() // check callbacks map >::iterator p = caps_callbacks.begin(); while (p != caps_callbacks.end()) { - if (used == 0 || (~(p->first) & used)) { + if (used == 0 || (~(p->first) & used) == 0) { // implemented. 
dout(10) << "used is " << cap_string(used) << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << endl; diff --git a/branches/aleung/security1/ceph/client/SyntheticClient.cc b/branches/aleung/security1/ceph/client/SyntheticClient.cc index b0569d52e553e..e443c09a139ea 100644 --- a/branches/aleung/security1/ceph/client/SyntheticClient.cc +++ b/branches/aleung/security1/ceph/client/SyntheticClient.cc @@ -12,6 +12,7 @@ */ #include +#include using namespace std; @@ -30,7 +31,7 @@ using namespace std; #include "config.h" #undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "synthetic" << client->get_nodeid() << " " +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << g_clock.now() << " synthetic" << client->get_nodeid() << " " // traces //void trace_include(SyntheticClient *syn, Client *cl, string& prefix); @@ -81,6 +82,9 @@ void parse_syn_options(vector& args) syn_iargs.push_back( atoi(args[++i]) ); syn_iargs.push_back( atoi(args[++i]) ); syn_iargs.push_back( atoi(args[++i]) ); + } else if (strcmp(args[i],"makedirmess") == 0) { + syn_modes.push_back( SYNCLIENT_MODE_MAKEDIRMESS ); + syn_iargs.push_back( atoi(args[++i]) ); } else if (strcmp(args[i],"statdirs") == 0) { syn_modes.push_back( SYNCLIENT_MODE_STATDIRS ); syn_iargs.push_back( atoi(args[++i]) ); @@ -105,7 +109,7 @@ void parse_syn_options(vector& args) syn_iargs.push_back( atoi(args[++i]) ); syn_iargs.push_back( atoi(args[++i]) ); - } else if (strcmp(args[i],"fullwalk") == 0) { + } else if (strcmp(args[i],"walk") == 0) { syn_modes.push_back( SYNCLIENT_MODE_FULLWALK ); //syn_sargs.push_back( atoi(args[++i]) ); } else if (strcmp(args[i],"randomwalk") == 0) { @@ -282,6 +286,16 @@ int SyntheticClient::run() } break; + case SYNCLIENT_MODE_MAKEDIRMESS: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makedirmess " << sarg1 << " " << iarg1 << endl; + make_dir_mess(sarg1.c_str(), iarg1); 
+ } + } + break; case SYNCLIENT_MODE_MAKEDIRS: { string sarg1 = get_sarg(0); @@ -366,7 +380,7 @@ int SyntheticClient::run() case SYNCLIENT_MODE_FULLWALK: { - string sarg1 = get_sarg(0); + string sarg1;// = get_sarg(0); if (run_me()) { dout(2) << "fullwalk" << sarg1 << endl; full_walk(sarg1); @@ -721,27 +735,39 @@ int SyntheticClient::full_walk(string& basedir) { if (time_to_stop()) return -1; - // read dir - map contents; - int r = client->getdir(basedir.c_str(), contents); - if (r < 0) { - dout(1) << "readdir on " << basedir << " returns " << r << endl; - return r; - } + list dirq; + dirq.push_back(basedir); - for (map::iterator it = contents.begin(); - it != contents.end(); - it++) { - string file = basedir + "/" + it->first; + while (!dirq.empty()) { + string dir = dirq.front(); + dirq.pop_front(); - struct stat st; - int r = client->lstat(file.c_str(), &st); + // read dir + map contents; + int r = client->getdir(dir.c_str(), contents); if (r < 0) { - dout(1) << "stat error on " << file << " r=" << r << endl; + dout(1) << "readdir on " << dir << " returns " << r << endl; continue; } - - if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) full_walk(file); + + for (map::iterator it = contents.begin(); + it != contents.end(); + it++) { + if (it->first == ".") continue; + if (it->first == "..") continue; + string file = dir + "/" + it->first; + + struct stat st; + int r = client->lstat(file.c_str(), &st); + if (r < 0) { + dout(1) << "stat error on " << file << " r=" << r << endl; + continue; + } + + if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { + dirq.push_back(file); + } + } } return 0; @@ -1224,3 +1250,42 @@ int SyntheticClient::random_walk(int num_req) } + + +void SyntheticClient::make_dir_mess(const char *basedir, int n) +{ + vector dirs; + + dirs.push_back(basedir); + dirs.push_back(basedir); + + client->mkdir(basedir, 0755); + + // motivation: + // P(dir) ~ subdirs_of(dir) + 2 + // from 5-year metadata workload paper in fast'07 + + // create dirs + 
for (int i=0; i> dir; + + // update dirs + dirs.push_back(parent); + dirs.push_back(dir); + dirs.push_back(dir); + + // do it + client->mkdir(dir.c_str(), 0755); + } + + +} + diff --git a/branches/aleung/security1/ceph/client/SyntheticClient.h b/branches/aleung/security1/ceph/client/SyntheticClient.h index 14720bdd412b2..ebf96386be95c 100644 --- a/branches/aleung/security1/ceph/client/SyntheticClient.h +++ b/branches/aleung/security1/ceph/client/SyntheticClient.h @@ -24,8 +24,9 @@ #define SYNCLIENT_MODE_RANDOMWALK 1 #define SYNCLIENT_MODE_FULLWALK 2 -#define SYNCLIENT_MODE_REPEATWALK 7 +#define SYNCLIENT_MODE_REPEATWALK 3 +#define SYNCLIENT_MODE_MAKEDIRMESS 7 #define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth #define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth #define SYNCLIENT_MODE_READDIRS 10 // dirs files depth @@ -193,6 +194,8 @@ class SyntheticClient { int play_trace(Trace& t, string& prefix); + void make_dir_mess(const char *basedir, int n); + }; #endif diff --git a/branches/aleung/security1/ceph/client/fuse.cc b/branches/aleung/security1/ceph/client/fuse.cc index 8e92962a17862..0eb107526609a 100644 --- a/branches/aleung/security1/ceph/client/fuse.cc +++ b/branches/aleung/security1/ceph/client/fuse.cc @@ -36,11 +36,7 @@ #include #include #include -#ifdef DARWIN #include -#else -#include -#endif // DARWIN // ceph stuff @@ -194,17 +190,10 @@ static int ceph_flush(const char *path, struct fuse_file_info *fi) */ -#ifdef DARWIN static int ceph_statfs(const char *path, struct statvfs *stbuf) { return client->statfs(path, stbuf); } -#else -static int ceph_statfs(const char *path, struct statfs *stbuf) -{ - return client->statfs(path, stbuf); -} -#endif diff --git a/branches/aleung/security1/ceph/client/hadoop/CephClientInterface.cc b/branches/aleung/security1/ceph/client/hadoop/CephClientInterface.cc new file mode 100644 index 0000000000000..6466dd6300891 --- /dev/null +++ b/branches/aleung/security1/ceph/client/hadoop/CephClientInterface.cc @@ -0,0 +1,217 @@ 
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +//#include + + +using namespace std; + +// globals +//Client *client; // the ceph client +//this has to go - the real client will have to hold the pointer. +//Every function will need to take a Client pointer. + +// ------ +// fuse hooks + +static int ceph_getattr(Client* client, const char *path, struct stat *stbuf) +{ + return client->lstat(path, stbuf); +} + +static int ceph_readlink(Client* client, const char *path, char *buf, size_t size) +{ + int res; + + res = client->readlink(path, buf, size - 1); + if (res < 0) return res; + + buf[res] = '\0'; + return 0; +} + +// get rid of the callback thing, perhaps? and return the answer some other way? +/* +static int ceph_getdir(Client* client, const char *path, fuse_dirh_t h, fuse_dirfil_t filler) +{ + map contents; + + int res = client->getdir(path, contents); + if (res < 0) return res; + + // return contents to fuse via callback + for (map::iterator it = contents.begin(); + it != contents.end(); + it++) { + // (immutable) inode contents too. + res = filler(h, // fuse's handle + it->first.c_str(), // dentry as char* + it->second.mode & INODE_TYPE_MASK, // mask type bits from mode + it->second.ino); // ino.. 64->32 bit issue here? 
FIXME + if (res != 0) break; // fuse has had enough + } + return res; +} +*/ + +static int ceph_mknod(Client* client, const char *path, mode_t mode, dev_t rdev) +{ + return client->mknod(path, mode); +} + +static int ceph_mkdir(Client* client, const char *path, mode_t mode) +{ + return client->mkdir(path, mode); +} + +static int ceph_unlink(Client* client, const char *path) +{ + return client->unlink(path); +} + +static int ceph_rmdir(Client* client, const char *path) +{ + return client->rmdir(path); +} + +static int ceph_symlink(Client* client, const char *from, const char *to) +{ + return client->symlink(from, to); +} + + +static int ceph_rename(Client* client, const char *from, const char *to) +{ + return client->rename(from, to); +} + +static int ceph_link(Client* client, const char *from, const char *to) +{ + return client->link(from, to); +} + +static int ceph_chmod(Client* client, const char *path, mode_t mode) +{ + return client->chmod(path, mode); +} + +static int ceph_chown(Client* client, const char *path, uid_t uid, gid_t gid) +{ + return client->chown(path, uid, gid); +} + +static int ceph_truncate(Client* client, const char *path, off_t size) +{ + return client->truncate(path, size); +} + +static int ceph_utime(Client* client, const char *path, struct utimbuf *buf) +{ + return client->utime(path, buf); +} + + +static int ceph_open(Client* client, const char *path, struct fuse_file_info *fi) +{ + int res; + + res = client->open(path, fi->flags); + if (res < 0) return res; + fi->fh = res; + return 0; // fuse wants 0 onsucess +} + +static int ceph_read(Client* client, const char *path, char *buf, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + return client->read(fh, buf, size, offset); +} + +static int ceph_write(Client* client, const char *path, const char *buf, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + return client->write(fh, buf, size, offset); +} + +/* +static int 
ceph_flush(const char *path, struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + return client->flush(fh); +} +*/ + + +#ifdef DARWIN +static int ceph_statfs(Client* client, const char *path, struct statvfs *stbuf) +{ + return client->statfs(path, stbuf); +} +#else +static int ceph_statfs(Client* client, const char *path, struct statfs *stbuf) +{ + return client->statfs(path, stbuf); +} +#endif + + +/* remove fuse stuff from these two +static int ceph_release(Client* client, const char *path, struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + int r = client->close(fh); // close the file + return r; +} + +static int ceph_fsync(Client* client, const char *path, int isdatasync, + struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + return client->fsync(fh, isdatasync ? true:false); +} +*/ + +/* +static struct fuse_operations ceph_oper = { + getattr: ceph_getattr, + readlink: ceph_readlink, + getdir: ceph_getdir, + mknod: ceph_mknod, + mkdir: ceph_mkdir, + unlink: ceph_unlink, + rmdir: ceph_rmdir, + symlink: ceph_symlink, + rename: ceph_rename, + link: ceph_link, + chmod: ceph_chmod, + chown: ceph_chown, + truncate: ceph_truncate, + utime: ceph_utime, + open: ceph_open, + read: ceph_read, + write: ceph_write, + statfs: ceph_statfs, + flush: 0, //ceph_flush, + release: ceph_release, + fsync: ceph_fsync +}; + +*/ + + +// Does this do anything we need? No. All it does is assemble a bunch of +// arguments and call fuse_main. 
+ diff --git a/branches/aleung/security1/ceph/client/hadoop/CephClientInterface.h b/branches/aleung/security1/ceph/client/hadoop/CephClientInterface.h new file mode 100644 index 0000000000000..e0b37c305029e --- /dev/null +++ b/branches/aleung/security1/ceph/client/hadoop/CephClientInterface.h @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include +#include +#include +#ifdef DARWIN +#include +#else +#include +#endif // DARWIN + +// ceph stuff +#include "include/types.h" + +#include "Client.h" + +#include "config.h" + +// stl +#include + + + + + + +// stbuf holds the attributes +static int ceph_getattr(Client* client, const char *path, struct stat *stbuf); + +// reads a symlink +static int ceph_readlink(Client* client, const char *path, char *buf, size_t size); + +// to do: remove fuse stuff from this one +//static int ceph_getdir(Client* client, const char *path, fuse_dirh_t h, fuse_dirfil_t filler); + +// looks irrelevant - it's for special device files +static int ceph_mknod(Client* client, const char *path, mode_t mode, dev_t rdev); + +// mode is the file permission bits +static int ceph_mkdir(Client* client, const char *path, mode_t mode); + +// delete! +static int ceph_unlink(Client* client, const char *path); + +// delete! 
if it's an empty directory +static int ceph_rmdir(Client* client, const char *path); + +// make a symlink +static int ceph_symlink(Client* client, const char *from, const char *to); + +// self-explanatory +static int ceph_rename(Client* client, const char *from, const char *to); + +static int ceph_link(Client* client, const char *from, const char *to); //hard link + +static int ceph_chmod(Client* client, const char *path, mode_t mode); //just chmod + +static int ceph_chown(Client* client, const char *path, uid_t uid, gid_t gid); //duh + +static int ceph_truncate(Client* client, const char *path, off_t size); //chop or zero-pad to size + +// set file access/modification times +static int ceph_utime(Client* client, const char *path, struct utimbuf *buf); + +// ok, gotta figure out what's in fuse_file_info and how to use it. Presumably it includes +// a file descriptor and the open flags? +static int ceph_open(Client* client, const char *path, struct fuse_file_info *fi); + +// read! +static int ceph_read(Client* client, const char *path, char *buf, size_t size, off_t offset, + struct fuse_file_info *fi); + +// write! +static int ceph_write(Client* client, const char *path, const char *buf, size_t size, + off_t offset, struct fuse_file_info *fi); + +/* was already commented out +static int ceph_flush(const char *path, struct fuse_file_info *fi); +*/ + + +// is this statvfs perhaps? we probably don't need it +#ifdef DARWIN +static int ceph_statfs(Client* client, const char *path, struct statvfs *stbuf); +#else +static int ceph_statfs(Client* client, const char *path, struct statfs *stbuf); +#endif + +// Remove fuse stuff from these two +//static int ceph_release(Client* client, const char *path, struct fuse_file_info *fi); + +//static int ceph_fsync(Client* client, const char *path, int isdatasync, struct fuse_file_info *fi); //kinda like flush? + +/* ceph_fuse_main + * - start up fuse glue, attached to Client* cl. 
+ * - argc, argv should include a mount point, and + * any weird fuse options you want. by default, + * we will put fuse in the foreground so that it + * won't fork and we can see stdout. + */ +// int ceph_fuse_main(Client *cl, int argc, char *argv[]); diff --git a/branches/aleung/security1/ceph/cmds.cc b/branches/aleung/security1/ceph/cmds.cc index 743597108e52c..8faf6a5bc6049 100644 --- a/branches/aleung/security1/ceph/cmds.cc +++ b/branches/aleung/security1/ceph/cmds.cc @@ -60,6 +60,20 @@ int main(int argc, char **argv) if (g_conf.debug_after) g_timer.add_event_after(g_conf.debug_after, new C_Debug); + // mds specific args + int whoami = -1; + bool standby = false; // by default, i'll start active. + for (unsigned i=0; iget_myaddr().num(), m, &monmap); - mds->init(); + MDS *mds = new MDS(whoami, m, &monmap); + mds->init(standby); // wait rank.wait(); diff --git a/branches/aleung/security1/ceph/cmon.cc b/branches/aleung/security1/ceph/cmon.cc index 690a3746b3f13..aefc282e6736a 100644 --- a/branches/aleung/security1/ceph/cmon.cc +++ b/branches/aleung/security1/ceph/cmon.cc @@ -76,6 +76,8 @@ int main(int argc, char **argv) MonMap monmap; + string new_private_key; + if (whoami < 0) { // let's assume a standalone monitor cout << "starting standalone mon0" << endl; @@ -86,8 +88,15 @@ int main(int argc, char **argv) cout << "bound to " << rank.get_listen_addr() << endl; // add single mon0 - monmap.add_mon(rank.my_inst); + entity_inst_t inst; + inst.name = MSG_ADDR_MON(0); + inst.addr = rank.my_addr; + monmap.add_mon(inst); + // generate a key pair + cout << "generating a key pair" << endl; + monmap.generate_key_pair(new_private_key); + // write monmap cout << "writing monmap to " << monmap_fn << endl;; int r = monmap.write(monmap_fn); @@ -102,14 +111,17 @@ int main(int argc, char **argv) // bind to a specific port cout << "starting mon" << whoami << " at " << monmap.get_inst(whoami) << endl; - tcpaddr_t addr = monmap.get_inst(whoami).addr; - rank.set_listen_addr(addr); 
+ g_my_addr = monmap.get_inst(whoami).addr; rank.start_rank(); } // start monitor Messenger *m = rank.register_entity(MSG_ADDR_MON(whoami)); Monitor *mon = new Monitor(whoami, m, &monmap); + + if (new_private_key.length()) + mon->set_new_private_key(new_private_key); + mon->init(); // wait diff --git a/branches/aleung/security1/ceph/common/Clock.h b/branches/aleung/security1/ceph/common/Clock.h index 3cfe726ece8fa..92a2b2bddf6d0 100644 --- a/branches/aleung/security1/ceph/common/Clock.h +++ b/branches/aleung/security1/ceph/common/Clock.h @@ -100,6 +100,10 @@ inline utime_t& operator-=(utime_t& l, const utime_t& r) { } return l; } +inline utime_t& operator-=(utime_t& l, double f) { + l += -f; + return l; +} inline bool operator>(const utime_t& a, const utime_t& b) { diff --git a/branches/aleung/security1/ceph/common/Timer.cc b/branches/aleung/security1/ceph/common/Timer.cc index d70259c3e0a08..adacf0c5eb6c6 100644 --- a/branches/aleung/security1/ceph/common/Timer.cc +++ b/branches/aleung/security1/ceph/common/Timer.cc @@ -21,7 +21,8 @@ #include "include/Context.h" #undef dout -#define dout(x) if (x <= g_conf.debug) cout << "Timer: " +#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " TIMER " +#define derr(x) if (x <= g_conf.debug) cerr << g_clock.now() << " TIMER " #define DBL 10 @@ -33,8 +34,23 @@ Timer g_timer; + /**** thread solution *****/ +bool Timer::get_next_due(utime_t& when) +{ + if (scheduled.empty()) { + dout(10) << "get_next_due - nothing scheduled" << endl; + return false; + } else { + map< utime_t, set >::iterator it = scheduled.begin(); + when = it->first; + dout(10) << "get_next_due - " << when << endl; + return true; + } +} + + void Timer::timer_entry() { lock.Lock(); @@ -46,20 +62,20 @@ void Timer::timer_entry() // any events due? 
utime_t next; - Context *event = get_next_scheduled(next); - - list pending; + bool next_due = get_next_due(next); - if (event && now >= next) { + if (next_due && now >= next) { // move to pending list - map< utime_t, multiset >::iterator it = scheduled.begin(); + list pending; + + map< utime_t, set >::iterator it = scheduled.begin(); while (it != scheduled.end()) { if (it->first > now) break; utime_t t = it->first; dout(DBL) << "queueing event(s) scheduled at " << t << endl; - for (multiset::iterator cit = it->second.begin(); + for (set::iterator cit = it->second.begin(); cit != it->second.end(); cit++) { pending.push_back(*cit); @@ -67,7 +83,7 @@ void Timer::timer_entry() num_event--; } - map< utime_t, multiset >::iterator previt = it; + map< utime_t, set >::iterator previt = it; it++; scheduled.erase(previt); } @@ -75,13 +91,16 @@ void Timer::timer_entry() if (!pending.empty()) { sleeping = false; lock.Unlock(); - { // make sure we're not holding any locks while we do callbacks + { + // make sure we're not holding any locks while we do callbacks // make the callbacks myself. for (list::iterator cit = pending.begin(); cit != pending.end(); cit++) { - dout(DBL) << "doing callback " << *cit << endl; + dout(DBL) << "start callback " << *cit << endl; (*cit)->finish(0); + dout(DBL) << "finish callback " << *cit << endl; + delete *cit; } pending.clear(); assert(pending.empty()); @@ -90,10 +109,9 @@ void Timer::timer_entry() } } - else { // sleep - if (event) { + if (next_due) { dout(DBL) << "sleeping until " << next << endl; timed_sleep = true; sleeping = true; @@ -130,7 +148,7 @@ void Timer::register_timer() else sleep_cond.SignalAll(); } else { - dout(DBL) << "register_timer doing nothing; thread is alive but not sleeping" << endl; + dout(DBL) << "register_timer doing nothing; thread is awake" << endl; // it's probably doing callbacks. 
} } else { @@ -177,19 +195,20 @@ void Timer::add_event_after(float seconds, void Timer::add_event_at(utime_t when, Context *callback) { - // insert + lock.Lock(); + dout(DBL) << "add_event " << callback << " at " << when << endl; - lock.Lock(); - scheduled[ when ].insert(callback); - assert(event_times.count(callback) == 0); // err.. there can be only one (for now!) + // insert + scheduled[when].insert(callback); + assert(event_times.count(callback) == 0); event_times[callback] = when; num_event++; - - // make sure i wake up + + // make sure i wake up on time register_timer(); - + lock.Unlock(); } @@ -200,21 +219,111 @@ bool Timer::cancel_event(Context *callback) dout(DBL) << "cancel_event " << callback << endl; if (!event_times.count(callback)) { - dout(DBL) << "cancel_event " << callback << " wasn't scheduled?" << endl; + dout(DBL) << "cancel_event " << callback << " isn't scheduled (probably executing)" << endl; lock.Unlock(); - assert(0); return false; // wasn't scheduled. } utime_t tp = event_times[callback]; - assert(scheduled.count(tp)); - - multiset::iterator p = scheduled[tp].find(callback); // there may be more than one? 
- assert(p != scheduled[tp].end()); - scheduled[tp].erase(p); - event_times.erase(callback); + + assert(scheduled.count(tp)); + assert(scheduled[tp].count(callback)); + scheduled[tp].erase(callback); + if (scheduled[tp].empty()) + scheduled.erase(tp); lock.Unlock(); return true; } + + +// ------------------------------- + +void SafeTimer::add_event_after(float seconds, Context *c) +{ + assert(lock.is_locked()); + Context *w = new EventWrapper(this, c); + dout(DBL) << "SafeTimer.add_event_after wrapping " << c << " with " << w << endl; + scheduled[c] = w; + g_timer.add_event_after(seconds, w); +} + +void SafeTimer::add_event_at(utime_t when, Context *c) +{ + assert(lock.is_locked()); + Context *w = new EventWrapper(this, c); + dout(DBL) << "SafeTimer.add_event_at wrapping " << c << " with " << w << endl; + scheduled[c] = w; + g_timer.add_event_at(when, w); +} + +void SafeTimer::EventWrapper::finish(int r) +{ + timer->lock.Lock(); + if (timer->scheduled.count(actual)) { + // still scheduled. execute. + actual->finish(r); + timer->scheduled.erase(actual); + } else { + // i was canceled. + assert(timer->canceled.count(actual)); + } + + // did i get canceled? + // (this can happen even if i just executed above. e.g., i may have canceled myself.) + if (timer->canceled.count(actual)) { + timer->canceled.erase(actual); + timer->cond.Signal(); + } + + // delete the original event + delete actual; + + timer->lock.Unlock(); +} + +void SafeTimer::cancel_event(Context *c) +{ + assert(lock.is_locked()); + assert(scheduled.count(c)); + + if (g_timer.cancel_event(scheduled[c])) { + // hosed wrapper. hose original event too. + delete scheduled[c]; + } else { + // clean up later. 
+ canceled[c] = scheduled[c]; + } + scheduled.erase(c); +} + +void SafeTimer::cancel_all() +{ + assert(lock.is_locked()); + + while (!scheduled.empty()) + cancel_event(scheduled.begin()->first); +} + +void SafeTimer::join() +{ + assert(lock.is_locked()); + assert(scheduled.empty()); + + while (!canceled.empty()) { + // wait + dout(-10) << "SafeTimer.join waiting for " << canceled.size() << " to join" << endl; + dout(-10) << canceled << endl; + cond.Wait(lock); + } +} + +SafeTimer::~SafeTimer() +{ + if (!scheduled.empty() && !canceled.empty()) { + derr(0) << "SafeTimer.~SafeTimer " << scheduled.size() << " events scheduled, " + << canceled.size() << " canceled but unflushed" + << endl; + } +} diff --git a/branches/aleung/security1/ceph/common/Timer.h b/branches/aleung/security1/ceph/common/Timer.h index bd63d7173a3d3..88d9929ac5ae1 100644 --- a/branches/aleung/security1/ceph/common/Timer.h +++ b/branches/aleung/security1/ceph/common/Timer.h @@ -50,17 +50,13 @@ namespace __gnu_cxx { class Timer { private: - map< utime_t, multiset > scheduled; // time -> (context ...) + map< utime_t, set > scheduled; // time -> (context ...) 
hash_map< Context*, utime_t > event_times; // event -> time // get time of the next event - Context* get_next_scheduled(utime_t& when) { - if (scheduled.empty()) return 0; - map< utime_t, multiset >::iterator it = scheduled.begin(); - when = it->first; - multiset::iterator sit = it->second.begin(); - return *sit; - } + //Context* get_next_scheduled(utime_t& when); + + bool get_next_due(utime_t &when); void register_timer(); // make sure i get a callback void cancel_timer(); // make sure i get a callback @@ -104,10 +100,10 @@ class Timer { cancel_timer(); // scheduled - for (map< utime_t, multiset >::iterator it = scheduled.begin(); + for (map< utime_t, set >::iterator it = scheduled.begin(); it != scheduled.end(); it++) { - for (multiset::iterator sit = it->second.begin(); + for (set::iterator sit = it->second.begin(); sit != it->second.end(); sit++) delete *sit; @@ -135,6 +131,44 @@ class Timer { }; +/* + * SafeTimer is a wrapper around the raw Timer (or rather, g_timer, it's global + * instantiation) that protects event execution with an existing mutex. It + * provides for, among other things, reliable event cancellation on class + * destruction. The caller just needs to cancel each event (or cancel_all()), + * and then call join() to ensure any concurrently exectuting events (in other + * threads) get flushed. 
+ */ +class SafeTimer { + Mutex& lock; + Cond cond; + map scheduled; // actual -> wrapper + map canceled; + + class EventWrapper : public Context { + SafeTimer *timer; + Context *actual; + public: + EventWrapper(SafeTimer *st, Context *c) : timer(st), + actual(c) {} + void finish(int r); + }; + +public: + SafeTimer(Mutex& l) : lock(l) { } + ~SafeTimer(); + + void add_event_after(float seconds, Context *c); + void add_event_at(utime_t when, Context *c); + void cancel_event(Context *c); + void cancel_all(); + void join(); + + int get_num_scheduled() { return scheduled.size(); } + int get_num_canceled() { return canceled.size(); } +}; + + // single global instance extern Timer g_timer; diff --git a/branches/aleung/security1/ceph/config.cc b/branches/aleung/security1/ceph/config.cc index f8479c1101186..9f676d8f1024a 100644 --- a/branches/aleung/security1/ceph/config.cc +++ b/branches/aleung/security1/ceph/config.cc @@ -51,6 +51,8 @@ FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects std::map g_fake_osd_down; std::map g_fake_osd_out; +entity_addr_t g_my_addr; + md_config_t g_debug_after_conf; md_config_t g_conf = { @@ -127,6 +129,7 @@ md_config_t g_conf = { mon_tick_interval: 5, mon_osd_down_out_interval: 5, // seconds mon_lease: 2.000, // seconds + mon_stop_with_last_mds: true, // --- client --- client_cache_size: 300, @@ -149,7 +152,7 @@ md_config_t g_conf = { objecter_buffer_uncommitted: true, // --- journaler --- - journaler_allow_split_entries: false, + journaler_allow_split_entries: true, // --- mds --- mds_cache_size: MDS_CACHE_SIZE, @@ -157,6 +160,9 @@ md_config_t g_conf = { mds_decay_halflife: 30, + mds_beacon_interval: 5.0, + mds_beacon_grace: 10.0, + mds_log: true, mds_log_max_len: MDS_CACHE_SIZE / 3, mds_log_max_trimming: 10000, @@ -164,7 +170,7 @@ md_config_t g_conf = { mds_log_pad_entry: 128,//256,//64, mds_log_before_reply: true, mds_log_flush_on_shutdown: true, - + mds_log_import_map_interval: 1024*1024, // frequency (in bytes) of 
EImportMap in log mds_bal_replicate_threshold: 2000, mds_bal_unreplicate_threshold: 0,//500, mds_bal_hash_rd: 10000, @@ -186,6 +192,7 @@ md_config_t g_conf = { mds_commit_on_shutdown: true, mds_shutdown_check: 0, //30, + mds_shutdown_on_last_unmount: true, mds_verify_export_dirauth: true, @@ -214,7 +221,8 @@ md_config_t g_conf = { fakestore_fsync: false,//true, fakestore_writesync: false, fakestore_syncthreads: 4, - fakestore_fakeattr: true, + fakestore_fake_attrs: false, + fakestore_fake_collections: false, fakestore_dev: 0, // --- ebofs --- @@ -299,6 +307,17 @@ md_config_t g_conf = { hash_scheme: 0, /* 0=sha-1, 1=sha-256, 2=sha-512, 3 = md5 */ crypt_scheme: 0 /* 0=rijndael, 1=RC5 */ + +#ifdef USE_OSBDB + , + bdbstore: false, + debug_bdbstore: 1, + bdbstore_btree: false, + bdbstore_ffactor: 0, + bdbstore_nelem: 0, + bdbstore_pagesize: 0, + bdbstore_cachesize: 0 +#endif // USE_OSBDB }; @@ -349,12 +368,62 @@ void vec_to_argv(std::vector& args, argv[argc++] = args[i]; } +bool parse_ip_port(const char *s, entity_addr_t& a) +{ + int count = 0; // digit count + int off = 0; + + while (1) { + // parse the #. 
+ int val = 0; + int numdigits = 0; + + while (*s >= '0' && *s <= '9') { + int digit = *s - '0'; + //cout << "digit " << digit << endl; + val *= 10; + val += digit; + numdigits++; + s++; off++; + } + //cout << "val " << val << endl; + + if (numdigits == 0) { + cerr << "no digits at off " << off << endl; + return false; // no digits + } + if (count < 3 && *s != '.') { + cerr << "should period at " << off << endl; + return false; // should have 3 periods + } + if (count == 3 && *s != ':') { + cerr << "expected : at " << off << endl; + return false; // then a colon + } + s++; off++; + + if (count <= 3) + a.ipq[count] = val; + else + a.port = val; + + count++; + if (count == 5) break; + } + + return true; +} + + + void parse_config_options(std::vector& args) { std::vector nargs; for (unsigned i=0; i& args) else if (strcmp(args[i], "--mds_cache_size") == 0) g_conf.mds_cache_size = atoi(args[++i]); + else if (strcmp(args[i], "--mds_beacon_interval") == 0) + g_conf.mds_beacon_interval = atoi(args[++i]); + else if (strcmp(args[i], "--mds_beacon_grace") == 0) + g_conf.mds_beacon_grace = atoi(args[++i]); + else if (strcmp(args[i], "--mds_log") == 0) g_conf.mds_log = atoi(args[++i]); else if (strcmp(args[i], "--mds_log_before_reply") == 0) @@ -515,6 +589,8 @@ void parse_config_options(std::vector& args) g_conf.mds_commit_on_shutdown = atoi(args[++i]); else if (strcmp(args[i], "--mds_shutdown_check") == 0) g_conf.mds_shutdown_check = atoi(args[++i]); + else if (strcmp(args[i], "--mds_shutdown_on_last_unmount") == 0) + g_conf.mds_shutdown_on_last_unmount = atoi(args[++i]); else if (strcmp(args[i], "--mds_log_flush_on_shutdown") == 0) g_conf.mds_log_flush_on_shutdown = atoi(args[++i]); @@ -556,7 +632,9 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], "--mds_local_osd") == 0) g_conf.mds_local_osd = atoi(args[++i]); - + + else if (strcmp(args[i], "--client_use_random_mds") == 0) + g_conf.client_use_random_mds = true; else if (strcmp(args[i], 
"--client_cache_size") == 0) g_conf.client_cache_size = atoi(args[++i]); else if (strcmp(args[i], "--client_cache_stat_ttl") == 0) @@ -570,6 +648,8 @@ void parse_config_options(std::vector& args) else if (strcmp(args[i], "--mon_osd_down_out_interval") == 0) g_conf.mon_osd_down_out_interval = atoi(args[++i]); + else if (strcmp(args[i], "--mon_stop_with_last_mds") == 0) + g_conf.mon_stop_with_last_mds = atoi(args[++i]); else if (strcmp(args[i], "--client_sync_writes") == 0) g_conf.client_sync_writes = atoi(args[++i]); @@ -618,6 +698,10 @@ void parse_config_options(std::vector& args) g_conf.fakestore_writesync = atoi(args[++i]); else if (strcmp(args[i], "--fakestore_dev") == 0) g_conf.fakestore_dev = args[++i]; + else if (strcmp(args[i], "--fakestore_fake_attrs") == 0) + g_conf.fakestore_fake_attrs = true;//atoi(args[++i]); + else if (strcmp(args[i], "--fakestore_fake_collections") == 0) + g_conf.fakestore_fake_collections = true;//atoi(args[++i]); else if (strcmp(args[i], "--obfs") == 0) { g_conf.uofs = 1; @@ -720,6 +804,28 @@ void parse_config_options(std::vector& args) g_conf.mds_log = false; } +#ifdef USE_OSBDB + else if (strcmp(args[i], "--bdbstore") == 0) { + g_conf.bdbstore = true; + g_conf.ebofs = 0; + } + else if (strcmp(args[i], "--bdbstore-btree") == 0) { + g_conf.bdbstore_btree = true; + } + else if (strcmp(args[i], "--bdbstore-hash-ffactor") == 0) { + g_conf.bdbstore_ffactor = atoi(args[++i]); + } + else if (strcmp(args[i], "--bdbstore-hash-nelem") == 0) { + g_conf.bdbstore_nelem = atoi(args[++i]); + } + else if (strcmp(args[i], "--bdbstore-hash-pagesize") == 0) { + g_conf.bdbstore_pagesize = atoi(args[++i]); + } + else if (strcmp(args[i], "--bdbstore-cachesize") == 0) { + g_conf.bdbstore_cachesize = atoi(args[++i]); + } +#endif // USE_OSBDB + else { nargs.push_back(args[i]); } diff --git a/branches/aleung/security1/ceph/config.h b/branches/aleung/security1/ceph/config.h index 504aa0bc33db1..6ab7c4e750837 100644 --- 
a/branches/aleung/security1/ceph/config.h +++ b/branches/aleung/security1/ceph/config.h @@ -28,6 +28,11 @@ extern std::map g_fake_osd_out; #define OSD_REP_SPLAY 1 #define OSD_REP_CHAIN 2 + +#include "msg/msg_types.h" + +extern entity_addr_t g_my_addr; + struct md_config_t { int num_mon; int num_mds; @@ -102,6 +107,7 @@ struct md_config_t { int mon_tick_interval; int mon_osd_down_out_interval; float mon_lease; + bool mon_stop_with_last_mds; // client int client_cache_size; @@ -145,6 +151,9 @@ struct md_config_t { float mds_decay_halflife; + float mds_beacon_interval; + float mds_beacon_grace; + bool mds_log; int mds_log_max_len; int mds_log_max_trimming; @@ -152,6 +161,7 @@ struct md_config_t { int mds_log_pad_entry; bool mds_log_before_reply; bool mds_log_flush_on_shutdown; + off_t mds_log_import_map_interval; float mds_bal_replicate_threshold; float mds_bal_unreplicate_threshold; @@ -174,6 +184,7 @@ struct md_config_t { bool mds_commit_on_shutdown; int mds_shutdown_check; + bool mds_shutdown_on_last_unmount; bool mds_verify_export_dirauth; // debug flag bool mds_local_osd; @@ -200,7 +211,8 @@ struct md_config_t { bool fakestore_fsync; bool fakestore_writesync; int fakestore_syncthreads; // such crap - bool fakestore_fakeattr; + bool fakestore_fake_attrs; + bool fakestore_fake_collections; char *fakestore_dev; // ebofs @@ -284,6 +296,15 @@ struct md_config_t { int hash_scheme; int crypt_scheme; +#ifdef USE_OSBDB + bool bdbstore; + int debug_bdbstore; + bool bdbstore_btree; + int bdbstore_ffactor; + int bdbstore_nelem; + int bdbstore_pagesize; + int bdbstore_cachesize; +#endif // USE_OSBDB }; extern md_config_t g_conf; @@ -300,4 +321,8 @@ void vec_to_argv(std::vector& args, void parse_config_options(std::vector& args); +extern bool parse_ip_port(const char *s, entity_addr_t& addr); + + + #endif diff --git a/branches/aleung/security1/ceph/cosd.cc b/branches/aleung/security1/ceph/cosd.cc index beb5b3f732cd3..93d14348996df 100644 --- 
a/branches/aleung/security1/ceph/cosd.cc +++ b/branches/aleung/security1/ceph/cosd.cc @@ -62,7 +62,7 @@ int main(int argc, char **argv) if (g_conf.debug_after) g_timer.add_event_after(g_conf.debug_after, new C_Debug); - + // osd specific args char *dev; int whoami = -1; for (unsigned i=0; i::iterator p = data.lower_bound(bstart); + p != data.end(); + ++p) { + BufferHead *bh = p->second; + + // don't trim unless it's entirely in our range + if (bh->start() < bstart) continue; + if (bh->end() > blast) break; + + dout(12) << "moving " << *bh << " to bottom of lru" << endl; + bc->touch_bottom(bh); // move to bottom of lru list + } +} + + void ObjectCache::truncate(block_t blocks, version_t super_epoch) { dout(7) << "truncate " << object_id diff --git a/branches/aleung/security1/ceph/ebofs/BufferCache.h b/branches/aleung/security1/ceph/ebofs/BufferCache.h index 922c5e531ee56..846809735103a 100644 --- a/branches/aleung/security1/ceph/ebofs/BufferCache.h +++ b/branches/aleung/security1/ceph/ebofs/BufferCache.h @@ -75,12 +75,15 @@ class BufferHead : public LRUObject { utime_t dirty_stamp; + bool want_to_expire; // wants to be at bottom of lru + public: BufferHead(ObjectCache *o) : oc(o), //cancellable_ioh(0), tx_epoch(0), rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0), shadow_of(0), - ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0) + ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0), + want_to_expire(false) {} ~BufferHead() { unpin_shadows(); @@ -405,6 +408,7 @@ class ObjectCache { interval_set& alloc, map& hits, version_t super_epoch); // can write to these. 
+ void touch_bottom(block_t bstart, block_t blast); BufferHead *split(BufferHead *bh, block_t off); @@ -509,6 +513,13 @@ class BufferCache { } else lru_rest.lru_touch(bh); } + void touch_bottom(BufferHead *bh) { + if (bh->is_dirty()) { + bh->want_to_expire = true; + lru_dirty.lru_bottouch(bh); + } else + lru_rest.lru_bottouch(bh); + } void remove_bh(BufferHead *bh) { bh->get_oc()->remove_bh(bh); stat_sub(bh); @@ -586,7 +597,10 @@ class BufferCache { } if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { lru_dirty.lru_remove(bh); - lru_rest.lru_insert_mid(bh); + if (bh->want_to_expire) + lru_rest.lru_insert_bot(bh); + else + lru_rest.lru_insert_mid(bh); dirty_bh.erase(bh); } diff --git a/branches/aleung/security1/ceph/ebofs/Ebofs.cc b/branches/aleung/security1/ceph/ebofs/Ebofs.cc index e6b505435c949..468a18178bbbd 100644 --- a/branches/aleung/security1/ceph/ebofs/Ebofs.cc +++ b/branches/aleung/security1/ceph/ebofs/Ebofs.cc @@ -16,12 +16,12 @@ #include "Ebofs.h" #include -#ifdef DARWIN + +#ifndef DARWIN +#include +#else #include #include -#include -#else -#include #endif // DARWIN // ******************* @@ -574,7 +574,7 @@ Onode* Ebofs::get_onode(object_t oid) { while (1) { // in cache? 
- if (onode_map.count(oid)) { + if (have_onode(oid)) { // yay Onode *on = onode_map[oid]; on->get(); @@ -1218,8 +1218,10 @@ void Ebofs::kick_idle() void Ebofs::sync(Context *onsafe) { ebofs_lock.Lock(); - if (onsafe) + if (onsafe) { + dirty = true; commit_waiters[super_epoch].push_back(onsafe); + } ebofs_lock.Unlock(); } @@ -1229,22 +1231,14 @@ void Ebofs::sync() if (!dirty) { dout(7) << "sync in " << super_epoch << ", not dirty" << endl; } else { - dout(7) << "sync in " << super_epoch << endl; - - if (!commit_thread_started) { - dout(10) << "sync waiting for commit thread to start" << endl; - sync_cond.Wait(ebofs_lock); - } - - if (mid_commit) { - dout(10) << "sync waiting for commit in progress" << endl; + epoch_t start = super_epoch; + dout(7) << "sync start in " << start << endl; + while (super_epoch == start) { + dout(7) << "sync kicking commit in " << super_epoch << endl; + dirty = true; + commit_cond.Signal(); sync_cond.Wait(ebofs_lock); } - - commit_cond.Signal(); // trigger a commit - - sync_cond.Wait(ebofs_lock); // wait - dout(10) << "sync finish in " << super_epoch << endl; } ebofs_lock.Unlock(); @@ -1830,6 +1824,92 @@ bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, return true; } + +/* + * is_cached -- query whether a object extent is in our cache + * return value of -1 if onode isn't loaded. otherwise, the number + * of extents that need to be read (i.e. # of seeks) + */ +int Ebofs::is_cached(object_t oid, off_t off, size_t len) +{ + ebofs_lock.Lock(); + int r = _is_cached(oid, off, len); + ebofs_lock.Unlock(); + return r; +} + +int Ebofs::_is_cached(object_t oid, off_t off, size_t len) +{ + if (!have_onode(oid)) { + dout(7) << "_is_cached " << oid << " " << off << "~" << len << " ... onode " << endl; + return -1; // object dne? + } + Onode *on = get_onode(oid); + + if (!on->have_oc()) { + // nothing is cached. return # of extents in file. 
+ return on->extent_map.size(); + } + + // map + block_t bstart = off / EBOFS_BLOCK_SIZE; + block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; + block_t blen = blast-bstart+1; + + map hits; + map missing; // read these + map rx; // wait for these + map partials; // ?? + on->get_oc(&bc)->map_read(bstart, blen, hits, missing, rx, partials); + return missing.size() + rx.size() + partials.size(); + + // FIXME: actually, we should calculate if these extents are contiguous. + // and not using map_read, probably... + /* hrmpf + block_t dpos = 0; + block_t opos = bstart; + while (opos < blen) { + if (hits.begin()->first == opos) { + } else { + block_t d; + if (missing.begin()->first == opos) d = missing.begin()->second. + + } + */ +} + +void Ebofs::trim_from_cache(object_t oid, off_t off, size_t len) +{ + ebofs_lock.Lock(); + _trim_from_cache(oid, off, len); + ebofs_lock.Unlock(); +} + +void Ebofs::_trim_from_cache(object_t oid, off_t off, size_t len) +{ + // be careful not to load it if we don't have it + if (!have_onode(oid)) { + dout(7) << "_trim_from_cache " << oid << " " << off << "~" << len << " ... onode not in cache " << endl; + return; + } + + // ok, we have it, get a pointer. + Onode *on = get_onode(oid); + + if (!on->have_oc()) + return; // nothing is cached. 
+ + // map to blocks + block_t bstart = off / EBOFS_BLOCK_SIZE; + block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; + + ObjectCache *oc = on->get_oc(&bc); + oc->touch_bottom(bstart, blast); + + return; +} + + int Ebofs::read(object_t oid, off_t off, size_t len, bufferlist& bl) @@ -1977,6 +2057,15 @@ unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe) } break; + case Transaction::OP_TRIMCACHE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + off_t offset = t.offsets.front(); t.offsets.pop_front(); + size_t len = t.lengths.front(); t.lengths.pop_front(); + _trim_from_cache(oid, offset, len); + } + break; + case Transaction::OP_TRUNCATE: { object_t oid = t.oids.front(); t.oids.pop_front(); diff --git a/branches/aleung/security1/ceph/ebofs/Ebofs.h b/branches/aleung/security1/ceph/ebofs/Ebofs.h index a8efe3b6a6b4c..6d18b7a0204fa 100644 --- a/branches/aleung/security1/ceph/ebofs/Ebofs.h +++ b/branches/aleung/security1/ceph/ebofs/Ebofs.h @@ -118,6 +118,9 @@ class Ebofs : public ObjectStore { map > waitfor_onode; Onode* new_onode(object_t oid); // make new onode. ref++. + bool have_onode(object_t oid) { + return onode_map.count(oid); + } Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. void remove_onode(Onode *on); void put_onode(Onode* o); // put it back down. ref--. 
@@ -242,8 +245,10 @@ class Ebofs : public ObjectStore { bool exists(object_t); int stat(object_t, struct stat*); int read(object_t, off_t off, size_t len, bufferlist& bl); - //int write(object_t oid, off_t off, size_t len, bufferlist& bl, bool fsync=true); + int is_cached(object_t oid, off_t off, size_t len); + int write(object_t oid, off_t off, size_t len, bufferlist& bl, Context *onsafe); + void trim_from_cache(object_t oid, off_t off, size_t len); int truncate(object_t oid, off_t size, Context *onsafe=0); int truncate_front(object_t oid, off_t size, Context *onsafe=0); int remove(object_t oid, Context *onsafe=0); @@ -298,12 +303,14 @@ class Ebofs : public ObjectStore { private: // private interface -- use if caller already holds lock int _read(object_t oid, off_t off, size_t len, bufferlist& bl); + int _is_cached(object_t oid, off_t off, size_t len); int _stat(object_t oid, struct stat *st); int _getattr(object_t oid, const char *name, void *value, size_t size); int _getattrs(object_t oid, map &aset); bool _write_will_block(); int _write(object_t oid, off_t off, size_t len, bufferlist& bl); + void _trim_from_cache(object_t oid, off_t off, size_t len); int _truncate(object_t oid, off_t size); int _truncate_front(object_t oid, off_t size); int _remove(object_t oid); diff --git a/branches/aleung/security1/ceph/fakesyn.cc b/branches/aleung/security1/ceph/fakesyn.cc index d82ff5074aff5..4f23941e4094f 100644 --- a/branches/aleung/security1/ceph/fakesyn.cc +++ b/branches/aleung/security1/ceph/fakesyn.cc @@ -46,6 +46,14 @@ public: } }; +class C_Die : public Context { +public: + void finish(int) { + cerr << "die" << endl; + exit(1); + } +}; + int main(int argc, char **argv) { @@ -72,11 +80,15 @@ int main(int argc, char **argv) assert(nargs.empty()); + if (g_conf.kill_after) + g_timer.add_event_after(g_conf.kill_after, new C_Die); + + g_clock.tare(); if (g_conf.secure_io) { cout << "Testing crypto library" << endl; - + const byte* myMsg = (const byte*)"hash me"; byte 
digestBuf[SHA1DIGESTSIZE]; byte hexBuf[2*SHA1DIGESTSIZE]; @@ -89,8 +101,15 @@ int main(int argc, char **argv) string((const char*)hexBuf,2*SHA1DIGESTSIZE) << endl; } + // need to reload old monmap on !mkfs to make mon private key match monmap. FIXME. + assert(g_conf.mkfs); + MonMap *monmap = new MonMap(g_conf.num_mon); - monmap->mon_inst[0].rank = 0; // hack ; see FakeMessenger.cc + entity_addr_t a; + monmap->mon_inst[0] = entity_inst_t(MSG_ADDR_MON(0), a); // hack ; see FakeMessenger.cc + + string mon_private_key; + monmap->generate_key_pair(mon_private_key); char hostname[100]; gethostname(hostname,100); @@ -100,6 +119,7 @@ int main(int argc, char **argv) Monitor *mon[g_conf.num_mon]; for (int i=0; iset_new_private_key(mon_private_key); } // create mds diff --git a/branches/aleung/security1/ceph/include/buffer.h b/branches/aleung/security1/ceph/include/buffer.h index 82c4f6cbeb67f..4634c2f5b590e 100644 --- a/branches/aleung/security1/ceph/include/buffer.h +++ b/branches/aleung/security1/ceph/include/buffer.h @@ -747,6 +747,7 @@ inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) { // encoder/decode helpers +// -- basic types -- // string inline void _encode(const std::string& s, bufferlist& bl) { @@ -796,18 +797,74 @@ inline void _decode(bufferlist& s, bufferlist& bl, int& off) off += len; } + #include #include #include #include +// set +inline void _encode(const std::set& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (std::set::const_iterator it = s.begin(); + it != s.end(); + it++) { + ::_encode(*it, bl); + n--; + } + assert(n==0); +} +inline void _decode(std::set& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i +inline void _encode(const std::list& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (std::list::const_iterator it = s.begin(); + it != s.end(); + it++) { + ::_encode(*it, bl); 
+ n--; + } + assert(n==0); +} +inline void _decode(std::list& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i template -inline void _encode(std::set& s, bufferlist& bl) +inline void _encode(const std::set& s, bufferlist& bl) { int n = s.size(); bl.append((char*)&n, sizeof(n)); - for (typename std::set::iterator it = s.begin(); + for (typename std::set::const_iterator it = s.begin(); it != s.end(); it++) { T v = *it; @@ -1018,6 +1075,73 @@ inline void _decode(std::map& s, bufferlist& bl, int& off) assert(s.size() == (unsigned)n); } +// map +template +inline void _encode(const std::map& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename std::map::const_iterator it = s.begin(); + it != s.end(); + it++) { + ::_encode(it->first, bl); + U v = it->second; + bl.append((char*)&v, sizeof(v)); + n--; + } + assert(n==0); +} +template +inline void _decode(std::map& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i> +template +inline void _encode(const std::map >& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename std::map >::const_iterator it = s.begin(); + it != s.end(); + it++) { + T k = it->first; + bl.append((char*)&k, sizeof(k)); + ::_encode(it->second, bl); + n--; + } + assert(n==0); +} +template +inline void _decode(std::map >& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i template inline void _encode(const std::map& s, bufferlist& bl) diff --git a/branches/aleung/security1/ceph/include/object.h b/branches/aleung/security1/ceph/include/object.h index 3a66c4ab83d54..9773ecb4b3288 100644 --- a/branches/aleung/security1/ceph/include/object.h +++ b/branches/aleung/security1/ceph/include/object.h @@ -30,6 +30,7 @@ struct object_t { object_t() : ino(0), 
bno(0), rev(0) {} object_t(__uint64_t i, __uint32_t b) : ino(i), bno(b), rev(0) {} + object_t(__uint64_t i, __uint32_t b, __uint32_t r) : ino(i), bno(b), rev(r) {} }; diff --git a/branches/aleung/security1/ceph/include/reqid.h b/branches/aleung/security1/ceph/include/reqid.h new file mode 100644 index 0000000000000..3c71fbae69ab6 --- /dev/null +++ b/branches/aleung/security1/ceph/include/reqid.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __REQID_H +#define __REQID_H + + +#include "include/types.h" +#include "msg/msg_types.h" + +/* reqid_t - caller name + incarnation# + tid to unique identify this request + * use for metadata and osd ops. + */ +class reqid_t { +public: + entity_name_t name; // who + int inc; // incarnation + tid_t tid; + reqid_t() : inc(0), tid(0) {} + reqid_t(const entity_name_t& a, int i, tid_t t) : name(a), inc(i), tid(t) {} +}; + +inline ostream& operator<<(ostream& out, const reqid_t& r) { + return out << r.name << "." 
<< r.inc << ":" << r.tid; +} + +inline bool operator==(const reqid_t& l, const reqid_t& r) { + return (l.name == r.name) && (l.inc == r.inc) && (l.tid == r.tid); +} +inline bool operator!=(const reqid_t& l, const reqid_t& r) { + return (l.name != r.name) || (l.inc != r.inc) || (l.tid != r.tid); +} +inline bool operator<(const reqid_t& l, const reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid < r.tid); +} +inline bool operator<=(const reqid_t& l, const reqid_t& r) { + return (l.name < r.name) || (l.inc < r.inc) || + (l.name == r.name && l.inc == r.inc && l.tid <= r.tid); +} +inline bool operator>(const reqid_t& l, const reqid_t& r) { return !(l <= r); } +inline bool operator>=(const reqid_t& l, const reqid_t& r) { return !(l < r); } + +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(const reqid_t &r) const { + static blobhash H; + return H((const char*)&r, sizeof(r)); + } + }; +} + + +#endif diff --git a/branches/aleung/security1/ceph/include/types.h b/branches/aleung/security1/ceph/include/types.h index 9b7f3d8198e64..b09ee2d4726be 100644 --- a/branches/aleung/security1/ceph/include/types.h +++ b/branches/aleung/security1/ceph/include/types.h @@ -32,10 +32,8 @@ using namespace std; #include using namespace __gnu_cxx; - #include "object.h" - #ifndef MIN # define MIN(a,b) ((a) < (b) ? 
(a):(b)) #endif @@ -44,34 +42,6 @@ using namespace __gnu_cxx; #endif -// md ops -#define MDS_OP_STATFS 1 - -#define MDS_OP_STAT 100 -#define MDS_OP_LSTAT 101 -#define MDS_OP_UTIME 102 -#define MDS_OP_CHMOD 103 -#define MDS_OP_CHOWN 104 - - -#define MDS_OP_READDIR 200 -#define MDS_OP_MKNOD 201 -#define MDS_OP_LINK 202 -#define MDS_OP_UNLINK 203 -#define MDS_OP_RENAME 204 - -#define MDS_OP_MKDIR 220 -#define MDS_OP_RMDIR 221 -#define MDS_OP_SYMLINK 222 - -#define MDS_OP_OPEN 301 -#define MDS_OP_TRUNCATE 306 -#define MDS_OP_FSYNC 307 -//#define MDS_OP_CLOSE 310 -#define MDS_OP_RELEASE 308 - - - // -- stl crap -- /* @@ -80,6 +50,28 @@ using namespace __gnu_cxx; compile now? */ +class blobhash { +public: + size_t operator()(const char *p, unsigned len) { + static hash H; + long acc = 0; + while (len >= sizeof(long)) { + acc ^= *(long*)p; + p += sizeof(long); + len -= sizeof(long); + } + int sh = 0; + while (len) { + acc ^= (long)*p << sh; + sh += 8; + len--; + p++; + } + return H(acc); + } +}; + + namespace __gnu_cxx { template<> struct hash< std::string > { @@ -124,6 +116,17 @@ struct ltstr +// ---------------------- +// some basic types + +typedef __uint64_t tid_t; // transaction id +typedef __uint64_t version_t; +typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) + + + + + /** object layout * how objects are mapped into PGs */ @@ -178,8 +181,6 @@ struct FileLayout { // -- inode -- -//typedef __uint64_t inodeno_t; - struct inodeno_t { __uint64_t val; inodeno_t() : val() {} @@ -203,9 +204,6 @@ namespace __gnu_cxx { }; } -typedef __uint64_t version_t; - - #define INODE_MODE_FILE 0100000 // S_IFREG #define INODE_MODE_SYMLINK 0120000 // S_IFLNK @@ -259,182 +257,6 @@ struct inode_t { -// lame 128-bit value class. -class lame128_t { -public: - __uint64_t hi, lo; - lame128_t(__uint64_t h=0, __uint64_t l=0) : hi(h), lo(l) {} -}; - -inline ostream& operator<<(ostream& out, lame128_t& oid) { - return out << oid.hi << "." 
<< oid.lo; -} - - -// osd types -//typedef __uint32_t ps_t; // placement seed -//typedef __uint32_t pg_t; // placement group -typedef __uint64_t coll_t; // collection id -typedef __uint64_t tid_t; // transaction id - -typedef __uint32_t epoch_t; // map epoch (32bits -> 13 epochs/second for 10 years) - -// pg stuff -typedef __uint16_t ps_t; -typedef __uint8_t pruleset_t; - -// placement group id -struct pg_t { - union { - struct { - int preferred; - ps_t ps; - __uint8_t nrep; - pruleset_t ruleset; - } fields; - __uint64_t val; - } u; - pg_t() { u.val = 0; } - pg_t(const pg_t& o) { u.val = o.u.val; } - pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) { - u.fields.ps = s; - u.fields.preferred = p; - u.fields.nrep = n; - u.fields.ruleset = r; - } - pg_t(__uint64_t v) { u.val = v; } - /* - pg_t operator=(__uint64_t v) { u.val = v; return *this; } - pg_t operator&=(__uint64_t v) { u.val &= v; return *this; } - pg_t operator+=(pg_t o) { u.val += o.val; return *this; } - pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } - pg_t operator++() { ++u.val; return *this; } - */ - operator __uint64_t() const { return u.val; } -}; - -inline ostream& operator<<(ostream& out, pg_t pg) { - //return out << hex << pg.val << dec; - if (pg.u.fields.ruleset) - out << (int)pg.u.fields.ruleset << '.'; - out << (int)pg.u.fields.nrep << '.'; - if (pg.u.fields.preferred) - out << pg.u.fields.preferred << '.'; - out << hex << pg.u.fields.ps << dec; - return out; -} - -namespace __gnu_cxx { - template<> struct hash< pg_t > - { - size_t operator()( const pg_t& x ) const - { - static hash<__uint64_t> H; - return H(x); - } - }; -} - - - -// compound rados version type -class eversion_t { -public: - epoch_t epoch; - version_t version; - eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} -}; - -inline bool operator==(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) && (l.version == r.version); -} -inline bool operator!=(const eversion_t& l, const 
eversion_t& r) { - return (l.epoch != r.epoch) || (l.version != r.version); -} -inline bool operator<(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); -} -inline bool operator<=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); -} -inline bool operator>(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); -} -inline bool operator>=(const eversion_t& l, const eversion_t& r) { - return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); -} -inline ostream& operator<<(ostream& out, const eversion_t e) { - return out << e.epoch << "'" << e.version; -} - - - -#define PG_NONE 0xffffffffL - - -typedef __uint16_t snapv_t; // snapshot version - - -class OSDSuperblock { -public: - const static __uint64_t MAGIC = 0xeb0f505dULL; - __uint64_t magic; - __uint64_t fsid; // unique fs id (random number) - int whoami; // my role in this fs. - epoch_t current_epoch; // most recent epoch - epoch_t oldest_map, newest_map; // oldest/newest maps we have. - OSDSuperblock(__uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), - current_epoch(0), oldest_map(0), newest_map(0) {} -}; - -inline ostream& operator<<(ostream& out, OSDSuperblock& sb) -{ - return out << "sb(fsid " << sb.fsid - << " osd" << sb.whoami - << " e" << sb.current_epoch - << " [" << sb.oldest_map << "," << sb.newest_map - << "])"; -} - -class MonSuperblock { -public: - const static __uint64_t MAGIC = 0x00eb0f5000ULL; - __uint64_t magic; - __uint64_t fsid; - int whoami; // mon # - epoch_t current_epoch; - MonSuperblock(__uint64_t f=0, int w=0) : - magic(MAGIC), fsid(f), whoami(w), current_epoch(0) {} -}; - - -// new types - -class ObjectExtent { - public: - object_t oid; // object id - off_t start; // in object - size_t length; // in object - - objectrev_t rev; // which revision? 
- pg_t pgid; // where to find the object - - map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) - - ObjectExtent() : start(0), length(0), rev(0), pgid(0) {} - ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0), pgid(0) { } -}; - -inline ostream& operator<<(ostream& out, ObjectExtent &ex) -{ - return out << "extent(" - << ex.oid << " in " << hex << ex.pgid << dec - << " " << ex.start << "~" << ex.length - << ")"; -} - - // client types typedef int fh_t; // file handle @@ -446,8 +268,6 @@ typedef int fh_t; // file handle - - // -- io helpers -- template diff --git a/branches/aleung/security1/ceph/mds/AnchorClient.cc b/branches/aleung/security1/ceph/mds/AnchorClient.cc index b330a93cec6ca..af84eb6c2448a 100644 --- a/branches/aleung/security1/ceph/mds/AnchorClient.cc +++ b/branches/aleung/security1/ceph/mds/AnchorClient.cc @@ -104,7 +104,7 @@ void AnchorClient::lookup(inodeno_t ino, vector& trace, Context *onfini pending_lookup_context[ino] = onfinish; messenger->send_message(req, - MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()), + mdsmap->get_inst(mdsmap->get_anchortable()), MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); } @@ -117,7 +117,7 @@ void AnchorClient::create(inodeno_t ino, vector& trace, Context *onfini pending_op[ino] = onfinish; messenger->send_message(req, - MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()), + mdsmap->get_inst(mdsmap->get_anchortable()), MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); } @@ -130,7 +130,7 @@ void AnchorClient::update(inodeno_t ino, vector& trace, Context *onfini pending_op[ino] = onfinish; messenger->send_message(req, - MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()), + mdsmap->get_inst(mdsmap->get_anchortable()), MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); } @@ -142,7 +142,7 @@ void AnchorClient::destroy(inodeno_t ino, Context *onfinish) 
pending_op[ino] = onfinish; messenger->send_message(req, - MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()), + mdsmap->get_inst(mdsmap->get_anchortable()), MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT); } diff --git a/branches/aleung/security1/ceph/mds/AnchorTable.cc b/branches/aleung/security1/ceph/mds/AnchorTable.cc index d2c338513740c..6f380b0908d8d 100644 --- a/branches/aleung/security1/ceph/mds/AnchorTable.cc +++ b/branches/aleung/security1/ceph/mds/AnchorTable.cc @@ -224,7 +224,7 @@ void AnchorTable::handle_anchor_request(class MAnchorRequest *m) } // send reply - mds->messenger->send_message(reply, m->get_source(), m->get_source_inst(), m->get_source_port()); + mds->messenger->send_message(reply, m->get_source_inst(), m->get_source_port()); delete m; } diff --git a/branches/aleung/security1/ceph/mds/CDentry.cc b/branches/aleung/security1/ceph/mds/CDentry.cc index 2cfbbd80b58be..22d292a001e33 100644 --- a/branches/aleung/security1/ceph/mds/CDentry.cc +++ b/branches/aleung/security1/ceph/mds/CDentry.cc @@ -17,32 +17,56 @@ #include "CInode.h" #include "CDir.h" +#include "MDS.h" +#include "MDCache.h" + #include #undef dout -#define dout(x) if ((x) <= g_conf.debug) cout << "mds.dentry " +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << dir->cache->mds->get_nodeid() << ".cache.den(" << dir->ino() << " " << name << ") " // CDentry ostream& operator<<(ostream& out, CDentry& dn) { - out << "[dentry " << dn.get_name(); - if (dn.is_pinned()) out << " " << dn.num_pins() << " pins"; + string path; + dn.make_path(path); + out << "[dentry " << path; + if (dn.is_auth()) { + out << " auth"; + if (dn.is_replicated()) + out << dn.get_replicas(); + } else { + out << " rep@" << dn.authority(); + out << "." 
<< dn.get_replica_nonce(); + assert(dn.get_replica_nonce() >= 0); + } + if (dn.is_null()) out << " NULL"; if (dn.is_remote()) out << " REMOTE"; + if (dn.is_pinned()) out << " " << dn.num_pins() << " pathpins"; + if (dn.get_lockstate() == DN_LOCK_UNPINNING) out << " unpinning"; - if (dn.is_dirty()) out << " dirty"; if (dn.get_lockstate() == DN_LOCK_PREXLOCK) out << " prexlock=" << dn.get_xlockedby() << " g=" << dn.get_gather_set(); if (dn.get_lockstate() == DN_LOCK_XLOCK) out << " xlock=" << dn.get_xlockedby(); - out << " dirv=" << dn.get_parent_dir_version(); + out << " v=" << dn.get_version(); + out << " pv=" << dn.get_projected_version(); out << " inode=" << dn.get_inode(); + + if (dn.get_num_ref()) { + out << " |"; + for(set::iterator it = dn.get_ref_set().begin(); + it != dn.get_ref_set().end(); + it++) + out << " " << CDentry::pin_name(*it); + } + out << " " << &dn; - out << " in " << *dn.get_dir(); out << "]"; return out; } @@ -52,41 +76,73 @@ CDentry::CDentry(const CDentry& m) { } -void CDentry::mark_dirty() +inodeno_t CDentry::get_ino() { - dout(10) << " mark_dirty " << *this << endl; + if (inode) + return inode->ino(); + return inodeno_t(); +} + - // dir is now dirty (if it wasn't already) - dir->mark_dirty(); +int CDentry::authority() +{ + return dir->dentry_authority( name ); +} - // pin inode? - if (is_primary() && !dirty && inode) inode->get(CINODE_PIN_DNDIRTY); - - // i now live in that (potentially newly dirty) version - parent_dir_version = dir->get_version(); - dirty = true; +version_t CDentry::pre_dirty() +{ + // NOTE: in the future, this will dirty a particular slice/subset of the dir. 
+ projected_version = dir->pre_dirty(); + dout(10) << " pre_dirty " << *this << endl; + return projected_version; } + + +void CDentry::_mark_dirty() +{ + // state+pin + if (!state_test(STATE_DIRTY)) { + state_set(STATE_DIRTY); + get(PIN_DIRTY); + } +} + +void CDentry::mark_dirty(version_t pv) +{ + dout(10) << " mark_dirty " << *this << endl; + + // i now live in this new dir version + assert(pv == projected_version); + version = pv; + _mark_dirty(); + + // mark dir too + dir->mark_dirty(pv); +} + void CDentry::mark_clean() { dout(10) << " mark_clean " << *this << endl; - assert(dirty); - assert(parent_dir_version <= dir->get_version()); - - if (parent_dir_version < dir->get_last_committed_version()) - cerr << " bad mark_clean " << *this << endl; + assert(is_dirty()); + assert(version <= dir->get_version()); - assert(parent_dir_version >= dir->get_last_committed_version()); + // this happens on export. + //assert(version <= dir->get_last_committed_version()); - if (is_primary() && dirty && inode) inode->put(CINODE_PIN_DNDIRTY); - dirty = false; + // state+pin + state_clear(STATE_DIRTY); + put(PIN_DIRTY); } void CDentry::make_path(string& s) { - if (dir->inode->get_parent_dn()) - dir->inode->get_parent_dn()->make_path(s); - + if (dir) { + if (dir->inode->get_parent_dn()) + dir->inode->get_parent_dn()->make_path(s); + } else { + s = "???"; + } s += "/"; s += name; } @@ -111,6 +167,12 @@ void CDentry::unlink_remote() } +CDentryDiscover *CDentry::replicate_to(int who) +{ + int nonce = add_replica(who); + return new CDentryDiscover(this, nonce); +} + diff --git a/branches/aleung/security1/ceph/mds/CDentry.h b/branches/aleung/security1/ceph/mds/CDentry.h index a399ef7acfe5a..65b9155ce69f9 100644 --- a/branches/aleung/security1/ceph/mds/CDentry.h +++ b/branches/aleung/security1/ceph/mds/CDentry.h @@ -22,6 +22,9 @@ using namespace std; #include "include/types.h" +#include "include/buffer.h" +#include "include/lru.h" +#include "mdstypes.h" class CInode; class CDir; @@ -29,14 
+32,36 @@ class CDir; #define DN_LOCK_SYNC 0 #define DN_LOCK_PREXLOCK 1 #define DN_LOCK_XLOCK 2 -#define DN_LOCK_UNPINNING 3 // waiting for pins to go away +#define DN_LOCK_UNPINNING 3 // waiting for pins to go away .. FIXME REVIEW THIS CODE .. #define DN_XLOCK_FOREIGN ((Message*)0x1) // not 0, not a valid pointer. class Message; +class CDentryDiscover; // dentry -class CDentry { +class CDentry : public MDSCacheObject, public LRUObject { + public: + // state + static const int STATE_AUTH = (1<<0); + static const int STATE_DIRTY = (1<<1); + + // pins + static const int PIN_INODEPIN = 0; // linked inode is pinned + static const int PIN_REPLICATED = 1; // replicated by another MDS + static const int PIN_DIRTY = 2; // + static const int PIN_PROXY = 3; // + static const char *pin_name(int p) { + switch (p) { + case PIN_INODEPIN: return "inodepin"; + case PIN_REPLICATED: return "replicated"; + case PIN_DIRTY: return "dirty"; + case PIN_PROXY: return "proxy"; + default: assert(0); + } + }; + + protected: string name; CInode *inode; @@ -44,15 +69,15 @@ class CDentry { inodeno_t remote_ino; // if remote dentry - // state - bool dirty; - version_t parent_dir_version; // dir version when last touched. + version_t version; // dir version when last touched. + version_t projected_version; // what it will be when i unlock/commit. 
// locking int lockstate; Message *xlockedby; set gather_set; + // path pins int npins; multiset pinset; @@ -71,8 +96,8 @@ class CDentry { inode(0), dir(0), remote_ino(0), - dirty(0), - parent_dir_version(0), + version(0), + projected_version(0), lockstate(DN_LOCK_SYNC), xlockedby(0), npins(0) { } @@ -81,8 +106,8 @@ class CDentry { inode(in), dir(0), remote_ino(ino), - dirty(0), - parent_dir_version(0), + version(0), + projected_version(0), lockstate(DN_LOCK_SYNC), xlockedby(0), npins(0) { } @@ -91,8 +116,8 @@ class CDentry { inode(in), dir(0), remote_ino(0), - dirty(0), - parent_dir_version(0), + version(0), + projected_version(0), lockstate(DN_LOCK_SYNC), xlockedby(0), npins(0) { } @@ -100,10 +125,21 @@ class CDentry { CInode *get_inode() { return inode; } CDir *get_dir() { return dir; } const string& get_name() { return name; } + inodeno_t get_ino(); inodeno_t get_remote_ino() { return remote_ino; } void set_remote_ino(inodeno_t ino) { remote_ino = ino; } + + // ref counts: pin ourselves in the LRU when we're pinned. 
+ void first_get() { + lru_pin(); + } + void last_put() { + lru_unpin(); + } + + // dentry type is primary || remote || null // inode ptr is required for primary, optional for remote, undefined for null bool is_primary() { return remote_ino == 0 && inode != 0; } @@ -131,18 +167,26 @@ class CDentry { void make_path(string& p); // -- state - __uint64_t get_parent_dir_version() { return parent_dir_version; } - void float_parent_dir_version(__uint64_t ge) { - if (parent_dir_version < ge) - parent_dir_version = ge; - } + version_t get_version() { return version; } + void set_version(version_t v) { projected_version = version = v; } + version_t get_projected_version() { return projected_version; } + void set_projected_version(version_t v) { projected_version = v; } - bool is_dirty() { return dirty; } - bool is_clean() { return !dirty; } + int authority(); + + bool is_auth() { return state & STATE_AUTH; } + bool is_dirty() { return state & STATE_DIRTY; } + bool is_clean() { return !is_dirty(); } - void mark_dirty(); + version_t pre_dirty(); + void _mark_dirty(); + void mark_dirty(version_t projected_dirv); void mark_clean(); + + // -- replication + CDentryDiscover *replicate_to(int rep); + // -- locking int get_lockstate() { return lockstate; } @@ -158,8 +202,23 @@ class CDentry { bool is_prexlockbyother(Message *m) { return (lockstate == DN_LOCK_PREXLOCK) && m != xlockedby; } + + int get_replica_lockstate() { + switch (lockstate) { + case DN_LOCK_XLOCK: + case DN_LOCK_SYNC: + return lockstate; + case DN_LOCK_PREXLOCK: + return DN_LOCK_XLOCK; + case DN_LOCK_UNPINNING: + return DN_LOCK_SYNC; + } + assert(0); + return 0; + } + void set_lockstate(int s) { lockstate = s; } - // pins + // path pins void pin(Message *m) { npins++; pinset.insert(m); @@ -185,4 +244,45 @@ class CDentry { ostream& operator<<(ostream& out, CDentry& dn); +class CDentryDiscover { + string dname; + int replica_nonce; + int lockstate; + + inodeno_t ino; + inodeno_t remote_ino; + +public: + 
CDentryDiscover() {} + CDentryDiscover(CDentry *dn, int nonce) : + dname(dn->get_name()), replica_nonce(nonce), + lockstate(dn->get_replica_lockstate()), + ino(dn->get_ino()), + remote_ino(dn->get_remote_ino()) { } + + string& get_dname() { return dname; } + int get_nonce() { return replica_nonce; } + + void update_dentry(CDentry *dn) { + dn->set_replica_nonce( replica_nonce ); + dn->set_lockstate( lockstate ); + } + + void _encode(bufferlist& bl) { + ::_encode(dname, bl); + bl.append((char*)&replica_nonce, sizeof(replica_nonce)); + bl.append((char*)&lockstate, sizeof(lockstate)); + } + + void _decode(bufferlist& bl, int& off) { + ::_decode(dname, bl, off); + bl.copy(off, sizeof(replica_nonce), (char*)&replica_nonce); + off += sizeof(replica_nonce); + bl.copy(off, sizeof(lockstate), (char*)&lockstate); + off += sizeof(lockstate); + } + +}; + + #endif diff --git a/branches/aleung/security1/ceph/mds/CDir.cc b/branches/aleung/security1/ceph/mds/CDir.cc index a590e6821e1de..c9b9996d91c2d 100644 --- a/branches/aleung/security1/ceph/mds/CDir.cc +++ b/branches/aleung/security1/ceph/mds/CDir.cc @@ -18,6 +18,7 @@ #include "CInode.h" #include "MDS.h" +#include "MDCache.h" #include "MDSMap.h" #include "include/Context.h" @@ -27,28 +28,12 @@ #include "config.h" #undef dout -#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") " +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << cache->mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") " // PINS -int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; - -static char* cdir_pin_names[CDIR_NUM_PINS] = { - "child", - "opened", - "waiter", - "import", - "export", - "freeze", - "proxy", - "authpin", - "imping", - "impex", - "hashed", - "hashing", - "dirty", - "reqpins" -}; +//int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + ostream& 
operator<<(ostream& out, CDir& dir) @@ -56,42 +41,43 @@ ostream& operator<<(ostream& out, CDir& dir) string path; dir.get_inode()->make_path(path); out << "[dir " << dir.ino() << " " << path << "/"; - if (dir.is_dirty()) out << " dirty"; - if (dir.is_import()) out << " import"; - if (dir.is_export()) out << " export"; - if (dir.is_rep()) out << " repl"; - if (dir.is_hashed()) out << " hashed"; //=" << (int)dir.get_inode()->inode.hash_seed; if (dir.is_auth()) { out << " auth"; - if (dir.is_open_by_anyone()) - out << "+" << dir.get_open_by(); + if (dir.is_replicated()) + out << dir.get_replicas(); + + out << " v=" << dir.get_version(); + out << " pv=" << dir.get_projected_version(); + out << " cv=" << dir.get_committing_version(); + out << " lastcv=" << dir.get_last_committed_version(); } else { out << " rep@" << dir.authority(); if (dir.get_replica_nonce() > 1) out << "." << dir.get_replica_nonce(); } - if (dir.is_pinned()) { + if (dir.get_dir_auth() != CDIR_AUTH_PARENT) + out << " dir_auth=" << dir.get_dir_auth(); + + out << " state=" << dir.get_state(); + if (dir.state_test(CDIR_STATE_PROXY)) out << "|proxy"; + if (dir.state_test(CDIR_STATE_COMPLETE)) out << "|complete"; + if (dir.state_test(CDIR_STATE_FREEZINGTREE)) out << "|freezingtree"; + if (dir.state_test(CDIR_STATE_FROZENTREE)) out << "|frozentree"; + if (dir.state_test(CDIR_STATE_FROZENTREELEAF)) out << "|frozentreeleaf"; + if (dir.state_test(CDIR_STATE_FROZENDIR)) out << "|frozendir"; + if (dir.state_test(CDIR_STATE_FREEZINGDIR)) out << "|freezingdir"; + + out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); + + if (dir.get_num_ref()) { out << " |"; for(set::iterator it = dir.get_ref_set().begin(); it != dir.get_ref_set().end(); it++) - if (*it < CDIR_NUM_PINS) - out << " " << cdir_pin_names[*it]; - else - out << " " << *it; + out << " " << CDir::pin_name(*it); } - if (dir.get_dir_auth() != CDIR_AUTH_PARENT) - out << " dir_auth=" << dir.get_dir_auth(); - - out << " state=" << dir.get_state(); - 
out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); - - out << " v=" << dir.get_version(); - out << " cv=" << dir.get_committing_version(); - out << " lastcv=" << dir.get_last_committed_version(); - out << " " << &dir; return out << "]"; } @@ -100,16 +86,16 @@ ostream& operator<<(ostream& out, CDir& dir) // ------------------------------------------------------------------- // CDir -CDir::CDir(CInode *in, MDS *mds, bool auth) +CDir::CDir(CInode *in, MDCache *mdcache, bool auth) { inode = in; - this->mds = mds; + this->cache = mdcache; nitems = 0; nnull = 0; state = CDIR_STATE_INITIAL; - version = 0; + projected_version = version = 0; committing_version = 0; last_committed_version = 0; @@ -141,15 +127,19 @@ CDir::CDir(CInode *in, MDS *mds, bool auth) * linking fun */ -CDentry* CDir::add_dentry( const string& dname, inodeno_t ino) +CDentry* CDir::add_dentry( const string& dname, inodeno_t ino, bool auth) { // foreign assert(lookup(dname) == 0); // create dentry CDentry* dn = new CDentry(dname, ino); + if (auth) + dn->state_set(CDentry::STATE_AUTH); + cache->lru.lru_insert_mid(dn); + dn->dir = this; - dn->parent_dir_version = version; + dn->version = projected_version; // add to dir assert(items.count(dn->name) == 0); @@ -161,7 +151,7 @@ CDentry* CDir::add_dentry( const string& dname, inodeno_t ino) dout(12) << "add_dentry " << *dn << endl; // pin? 
- if (nnull + nitems == 1) get(CDIR_PIN_CHILD); + if (nnull + nitems == 1) get(PIN_CHILD); assert(nnull + nitems == items.size()); assert(nnull == null_items.size()); @@ -169,15 +159,19 @@ CDentry* CDir::add_dentry( const string& dname, inodeno_t ino) } -CDentry* CDir::add_dentry( const string& dname, CInode *in ) +CDentry* CDir::add_dentry( const string& dname, CInode *in, bool auth ) { // primary assert(lookup(dname) == 0); // create dentry CDentry* dn = new CDentry(dname, in); + if (auth) + dn->state_set(CDentry::STATE_AUTH); + cache->lru.lru_insert_mid(dn); + dn->dir = this; - dn->parent_dir_version = version; + dn->version = projected_version; // add to dir assert(items.count(dn->name) == 0); @@ -196,7 +190,7 @@ CDentry* CDir::add_dentry( const string& dname, CInode *in ) dout(12) << "add_dentry " << *dn << endl; // pin? - if (nnull + nitems == 1) get(CDIR_PIN_CHILD); + if (nnull + nitems == 1) get(PIN_CHILD); assert(nnull + nitems == items.size()); assert(nnull == null_items.size()); @@ -214,7 +208,7 @@ void CDir::remove_dentry(CDentry *dn) unlink_inode_work(dn); } else { // remove from null list - assert(null_items.count(dn->name) == 1); + assert(null_items.count(dn->name) == 1); null_items.erase(dn->name); nnull--; } @@ -223,10 +217,11 @@ void CDir::remove_dentry(CDentry *dn) assert(items.count(dn->name) == 1); items.erase(dn->name); + cache->lru.lru_remove(dn); delete dn; // unpin? 
- if (nnull + nitems == 0) put(CDIR_PIN_CHILD); + if (nnull + nitems == 0) put(PIN_CHILD); assert(nnull + nitems == items.size()); assert(nnull == null_items.size()); @@ -234,7 +229,7 @@ void CDir::remove_dentry(CDentry *dn) void CDir::link_inode( CDentry *dn, inodeno_t ino) { - //dout(12) << "link_inode " << *dn << " remote " << ino << endl; + dout(12) << "link_inode " << *dn << " remote " << ino << endl; assert(dn->is_null()); dn->set_remote_ino(ino); @@ -247,10 +242,10 @@ void CDir::link_inode( CDentry *dn, inodeno_t ino) void CDir::link_inode( CDentry *dn, CInode *in ) { + dout(12) << "link_inode " << *dn << " " << *in << endl; assert(!dn->is_remote()); link_inode_work(dn,in); - //dout(12) << "link_inode " << *dn << " " << *in << endl; // remove from null list assert(null_items.count(dn->name) == 1); @@ -269,14 +264,15 @@ void CDir::link_inode_work( CDentry *dn, CInode *in ) nitems++; // adjust dir size // set dir version - in->parent_dir_version = get_version(); + in->inode.version = dn->get_version(); // clear dangling - in->state_clear(CINODE_STATE_DANGLING); - - // dn dirty? - if (dn->is_dirty()) in->get(CINODE_PIN_DNDIRTY); + in->state_clear(CInode::STATE_DANGLING); + // pin dentry? + if (in->get_num_ref()) + dn->get(CDentry::PIN_INODEPIN); + // adjust auth pin count if (in->auth_pins + in->nested_auth_pins) adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins ); @@ -314,17 +310,18 @@ void CDir::unlink_inode_work( CDentry *dn ) // explicitly define auth in->dangling_auth = in->authority(); //dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl; + + // unpin dentry? + if (in->get_num_ref()) + dn->put(CDentry::PIN_INODEPIN); // unlink auth_pin count if (in->auth_pins + in->nested_auth_pins) adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) ); // set dangling flag - in->state_set(CINODE_STATE_DANGLING); + in->state_set(CInode::STATE_DANGLING); - // dn dirty? 
- if (dn->is_dirty()) in->put(CINODE_PIN_DNDIRTY); - // detach inode in->remove_primary_parent(dn); dn->inode = 0; @@ -377,7 +374,7 @@ void CDir::add_waiter(int tag, const string& dentry, Context *c) { if (waiting.empty() && waiting_on_dentry.size() == 0) - get(CDIR_PIN_WAITER); + get(PIN_WAITER); waiting_on_dentry[ dentry ].insert(pair(tag,c)); dout(10) << "add_waiter dentry " << dentry << " tag " << tag << " " << c << " on " << *this << endl; } @@ -398,7 +395,7 @@ void CDir::add_waiter(int tag, Context *c) { // this dir. if (waiting.empty() && waiting_on_dentry.size() == 0) - get(CDIR_PIN_WAITER); + get(PIN_WAITER); waiting.insert(pair(tag,c)); dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; } @@ -434,7 +431,7 @@ void CDir::take_waiting(int mask, // ...whole map? if (waiting_on_dentry.size() == 0 && waiting.empty()) - put(CDIR_PIN_WAITER); + put(PIN_WAITER); } /* NOTE: this checks dentry waiters too */ @@ -465,7 +462,7 @@ void CDir::take_waiting(int mask, } if (waiting_on_dentry.size() == 0 && waiting.empty()) - put(CDIR_PIN_WAITER); + put(PIN_WAITER); } } @@ -491,73 +488,51 @@ void CDir::finish_waiting(int mask, const string& dn, int result) // dirty/clean -void CDir::mark_dirty() +version_t CDir::pre_dirty() +{ + ++projected_version; + dout(10) << "pre_dirty " << projected_version << endl; + return projected_version; +} + +void CDir::_mark_dirty() { if (!state_test(CDIR_STATE_DIRTY)) { - version++; state_set(CDIR_STATE_DIRTY); - dout(10) << "mark_dirty (was clean) " << *this << " new version " << version << endl; - get(CDIR_PIN_DIRTY); - } - else if (state_test(CDIR_STATE_COMMITTING) && - committing_version == version) { - version++; // now dirtier than committing version! 
- dout(10) << "mark_dirty (committing) " << *this << " new version " << version << "/" << committing_version << endl; + dout(10) << "mark_dirty (was clean) " << *this << " version " << version << endl; + get(PIN_DIRTY); } else { dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << endl; } } +void CDir::mark_dirty(version_t pv) +{ + ++version; + assert(pv == version); + _mark_dirty(); +} + void CDir::mark_clean() { dout(10) << "mark_clean " << *this << " version " << version << endl; if (state_test(CDIR_STATE_DIRTY)) { state_clear(CDIR_STATE_DIRTY); - put(CDIR_PIN_DIRTY); + put(PIN_DIRTY); } } -// ref counts - -void CDir::put(int by) { - cdir_pins[by]--; - - // bad? - if (ref == 0 || ref_set.count(by) != 1) { - dout(7) << *this << " bad put by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 1); - assert(ref > 0); - } - - ref--; - ref_set.erase(by); - // inode - if (ref == 0) - inode->put(CINODE_PIN_DIR); - - dout(7) << *this << " put by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl; +void CDir::first_get() +{ + inode->get(CInode::PIN_DIR); } -void CDir::get(int by) { - cdir_pins[by]++; - - // inode - if (ref == 0) - inode->get(CINODE_PIN_DIR); - - // bad? 
- if (ref_set.count(by)) { - dout(7) << *this << " bad get by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 0); - } - - ref++; - ref_set.insert(by); - - dout(7) << *this << " get by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl; +void CDir::last_put() +{ + inode->put(CInode::PIN_DIR); } @@ -571,34 +546,23 @@ void CDir::get(int by) { */ int CDir::authority() { - if (get_dir_auth() >= 0) - return get_dir_auth(); - - /* - CDir *parent = inode->get_parent_dir(); - if (parent) - return parent->authority(); - - // root, or dangling - assert(inode->is_root()); // no dirs under danglers!? - //assert(inode->is_root() || inode->is_dangling()); - */ - - return inode->authority(); + if (dir_auth == CDIR_AUTH_PARENT) + return inode->authority(); + return dir_auth; } int CDir::dentry_authority(const string& dn ) { // hashing -- subset of nodes have hashed the contents if (is_hashing() && !hashed_subset.empty()) { - int hashauth = mds->hash_dentry( inode->ino(), dn ); // hashed + int hashauth = cache->hash_dentry( inode->ino(), dn ); // hashed if (hashed_subset.count(hashauth)) return hashauth; } // hashed if (is_hashed()) { - return mds->hash_dentry( inode->ino(), dn ); // hashed + return cache->hash_dentry( inode->ino(), dn ); // hashed } if (get_dir_auth() == CDIR_AUTH_PARENT) { @@ -624,7 +588,7 @@ void CDir::set_dir_auth(int d) void CDir::auth_pin() { if (auth_pins == 0) - get(CDIR_PIN_AUTHPIN); + get(PIN_AUTHPIN); auth_pins++; dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; @@ -637,7 +601,7 @@ void CDir::auth_pin() { void CDir::auth_unpin() { auth_pins--; if (auth_pins == 0) - put(CDIR_PIN_AUTHPIN); + put(PIN_AUTHPIN); dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; assert(auth_pins >= 0); @@ -760,13 +724,24 @@ void CDir::freeze_tree_finish(Context 
*c) void CDir::unfreeze_tree() { dout(10) << "unfreeze_tree " << *this << endl; - state_clear(CDIR_STATE_FROZENTREE); - - // unpin (may => FREEZEABLE) FIXME: is this order good? - inode->auth_unpin(); - // waiters? - finish_waiting(CDIR_WAIT_UNFREEZE); + if (state_test(CDIR_STATE_FROZENTREE)) { + // frozen. unfreeze. + state_clear(CDIR_STATE_FROZENTREE); + + // unpin (may => FREEZEABLE) FIXME: is this order good? + inode->auth_unpin(); + + // waiters? + finish_waiting(CDIR_WAIT_UNFREEZE); + } else { + // freezing. stop it. + assert(state_test(CDIR_STATE_FREEZINGTREE)); + state_clear(CDIR_STATE_FREEZINGTREE); + + // cancel freeze waiters + finish_waiting(CDIR_WAIT_FREEZEABLE, -1); + } } bool CDir::is_freezing_tree() @@ -790,6 +765,7 @@ bool CDir::is_frozen_tree() if (dir->is_frozen_tree_root()) return true; if (dir->is_import()) return false; if (dir->is_hashed()) return false; + if (dir->is_frozen_tree_leaf()) return false; if (dir->inode->parent) dir = dir->inode->parent->dir; else diff --git a/branches/aleung/security1/ceph/mds/CDir.h b/branches/aleung/security1/ceph/mds/CDir.h index a1e857a72f9f9..6283bef7c0aff 100644 --- a/branches/aleung/security1/ceph/mds/CDir.h +++ b/branches/aleung/security1/ceph/mds/CDir.h @@ -18,6 +18,7 @@ #include "include/types.h" #include "include/buffer.h" +#include "mdstypes.h" #include "config.h" #include "common/DecayCounter.h" @@ -37,7 +38,7 @@ using __gnu_cxx::hash_map; #include "CInode.h" class CDentry; -class MDS; +class MDCache; class MDCluster; class Context; @@ -45,6 +46,7 @@ class Context; // directory authority types // >= 0 is the auth mds #define CDIR_AUTH_PARENT -1 // default +#define CDIR_AUTH_UNKNOWN -2 #define CDIR_NONCE_EXPORT 1 @@ -57,23 +59,24 @@ class Context; #define CDIR_STATE_COMPLETE (1<<2) // the complete contents are in cache #define CDIR_STATE_DIRTY (1<<3) // has been modified since last commit -#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports) -#define CDIR_STATE_FREEZINGTREE 
(1<<5) // in process of freezing -#define CDIR_STATE_FROZENDIR (1<<6) -#define CDIR_STATE_FREEZINGDIR (1<<7) +#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports) +#define CDIR_STATE_FREEZINGTREE (1<<5) // in process of freezing +#define CDIR_STATE_FROZENTREELEAF (1<<6) // outer bound of frozen region (on import) +#define CDIR_STATE_FROZENDIR (1<<7) +#define CDIR_STATE_FREEZINGDIR (1<<8) -#define CDIR_STATE_COMMITTING (1<<8) // mid-commit -#define CDIR_STATE_FETCHING (1<<9) // currenting fetching +#define CDIR_STATE_COMMITTING (1<<9) // mid-commit +#define CDIR_STATE_FETCHING (1<<10) // currenting fetching -#define CDIR_STATE_DELETED (1<<10) +#define CDIR_STATE_DELETED (1<<11) -#define CDIR_STATE_IMPORT (1<<11) // flag set if this is an import. -#define CDIR_STATE_EXPORT (1<<12) -#define CDIR_STATE_IMPORTINGEXPORT (1<<13) +#define CDIR_STATE_IMPORT (1<<12) // flag set if this is an import. +#define CDIR_STATE_EXPORT (1<<13) +#define CDIR_STATE_IMPORTINGEXPORT (1<<14) -#define CDIR_STATE_HASHED (1<<14) // if hashed -#define CDIR_STATE_HASHING (1<<15) -#define CDIR_STATE_UNHASHING (1<<16) +#define CDIR_STATE_HASHED (1<<15) // if hashed +#define CDIR_STATE_HASHING (1<<16) +#define CDIR_STATE_UNHASHING (1<<17) @@ -85,12 +88,15 @@ class Context; |CDIR_STATE_DIRTY) #define CDIR_MASK_STATE_IMPORT_KEPT (CDIR_STATE_IMPORT\ |CDIR_STATE_EXPORT\ - |CDIR_STATE_IMPORTINGEXPORT) + |CDIR_STATE_IMPORTINGEXPORT\ + |CDIR_STATE_FROZENTREE\ + |CDIR_STATE_PROXY) + #define CDIR_MASK_STATE_EXPORT_KEPT (CDIR_STATE_HASHED\ |CDIR_STATE_FROZENTREE\ |CDIR_STATE_FROZENDIR\ |CDIR_STATE_EXPORT\ - |CDIR_STATE_PROXY) + |CDIR_STATE_PROXY) // common states #define CDIR_STATE_CLEAN 0 @@ -103,29 +109,6 @@ class Context; -// pins - -#define CDIR_PIN_CHILD 0 -#define CDIR_PIN_OPENED 1 // open by another node -#define CDIR_PIN_WAITER 2 // waiter(s) - -#define CDIR_PIN_IMPORT 3 -#define CDIR_PIN_EXPORT 4 -#define CDIR_PIN_FREEZE 5 -#define CDIR_PIN_PROXY 6 // auth just changed. 
- -#define CDIR_PIN_AUTHPIN 7 - -#define CDIR_PIN_IMPORTING 8 -#define CDIR_PIN_IMPORTINGEXPORT 9 - -#define CDIR_PIN_HASHED 10 -#define CDIR_PIN_HASHING 11 -#define CDIR_PIN_DIRTY 12 - -#define CDIR_PIN_REQUEST 13 - -#define CDIR_NUM_PINS 14 @@ -185,11 +168,54 @@ ostream& operator<<(ostream& out, class CDir& dir); typedef map CDir_map_t; -extern int cdir_pins[CDIR_NUM_PINS]; +//extern int cdir_pins[CDIR_NUM_PINS]; -class CDir { +class CDir : public MDSCacheObject { public: + // -- pins -- + static const int PIN_CHILD = 0; + static const int PIN_OPENED = 1; // open by another node + static const int PIN_WAITER = 2; // waiter(s) + static const int PIN_IMPORT = 3; + static const int PIN_EXPORT = 4; + //static const int PIN_FREEZE = 5; + static const int PIN_FREEZELEAF = 6; + static const int PIN_PROXY = 7; // auth just changed. + static const int PIN_AUTHPIN = 8; + static const int PIN_IMPORTING = 9; + static const int PIN_IMPORTINGEXPORT = 10; + static const int PIN_HASHED = 11; + static const int PIN_HASHING = 12; + static const int PIN_DIRTY = 13; + static const int PIN_REQUEST = 14; + static const char *pin_name(int p) { + switch (p) { + case PIN_CHILD: return "child"; + case PIN_OPENED: return "opened"; + case PIN_WAITER: return "waiter"; + case PIN_IMPORT: return "import"; + case PIN_EXPORT: return "export"; + //case PIN_FREEZE: return "freeze"; + case PIN_FREEZELEAF: return "freezeleaf"; + case PIN_PROXY: return "proxy"; + case PIN_AUTHPIN: return "authpin"; + case PIN_IMPORTING: return "importing"; + case PIN_IMPORTINGEXPORT: return "importingexport"; + case PIN_HASHED: return "hashed"; + case PIN_HASHING: return "hashing"; + case PIN_DIRTY: return "dirty"; + case PIN_REQUEST: return "request"; + default: assert(0); + } + } + + + public: + // context + MDCache *cache; + + // my inode CInode *inode; protected: @@ -198,25 +224,16 @@ class CDir { CDir_map_t null_items; // null and foreign size_t nitems; // non-null size_t nnull; // null - //size_t nauthitems; - 
//size_t namesize; // state - unsigned state; version_t version; version_t committing_version; - version_t last_committed_version; + version_t last_committed_version; // slight lie; we bump this on import. + version_t projected_version; // authority, replicas - set open_by; // nodes that have me open - map open_by_nonce; - int replica_nonce; int dir_auth; - // reference countin/pins - int ref; // reference count - set ref_set; - // lock nesting, freeze int auth_pins; int nested_auth_pins; @@ -229,8 +246,6 @@ class CDir { map, list > > hashed_readdir; protected: - // context - MDS *mds; // waiters @@ -256,7 +271,7 @@ class CDir { friend class CDirExport; public: - CDir(CInode *in, MDS *mds, bool auth); + CDir(CInode *in, MDCache *mdcache, bool auth); @@ -268,20 +283,10 @@ class CDir { CDir_map_t::iterator begin() { return items.begin(); } CDir_map_t::iterator end() { return items.end(); } size_t get_size() { - - //if ( is_auth() && !is_hashed()) assert(nauthitems == nitems); - //if (!is_auth() && !is_hashed()) assert(nauthitems == 0); - return nitems; } size_t get_nitems() { return nitems; } size_t get_nnull() { return nnull; } - /* - size_t get_auth_size() { - assert(nauthitems <= nitems); - return nauthitems; - } - */ /* float get_popularity() { @@ -300,8 +305,8 @@ class CDir { return iter->second; } - CDentry* add_dentry( const string& dname, CInode *in=0 ); - CDentry* add_dentry( const string& dname, inodeno_t ino ); + CDentry* add_dentry( const string& dname, CInode *in=0, bool auth=true ); + CDentry* add_dentry( const string& dname, inodeno_t ino, bool auth=true ); void remove_dentry( CDentry *dn ); // delete dentry void link_inode( CDentry *dn, inodeno_t ino ); void link_inode( CDentry *dn, CInode *in ); @@ -319,77 +324,23 @@ class CDir { int get_dir_auth() { return dir_auth; } void set_dir_auth(int d); - bool is_open_by_anyone() { return !open_by.empty(); } - bool is_open_by(int mds) { return open_by.count(mds); } - int get_open_by_nonce(int mds) { - 
map::iterator it = open_by_nonce.find(mds); - return it->second; - } - set::iterator open_by_begin() { return open_by.begin(); } - set::iterator open_by_end() { return open_by.end(); } - set& get_open_by() { return open_by; } - - int get_replica_nonce() { assert(!is_auth()); return replica_nonce; } - - int open_by_add(int mds) { - int nonce = 1; - - if (is_open_by(mds)) { // already had it? - nonce = get_open_by_nonce(mds) + 1; // new nonce (+1) - dout(10) << *this << " issuing new nonce " << nonce << " to mds" << mds << endl; - open_by_nonce.erase(mds); - } else { - if (open_by.empty()) - get(CDIR_PIN_OPENED); - open_by.insert(mds); - } - open_by_nonce.insert(pair(mds,nonce)); // first! serial of 1. - return nonce; // default nonce - } - void open_by_remove(int mds) { - //if (!is_open_by(mds)) return; - assert(is_open_by(mds)); - - open_by.erase(mds); - open_by_nonce.erase(mds); - if (open_by.empty()) - put(CDIR_PIN_OPENED); - } - void open_by_clear() { - if (!open_by.empty()) - put(CDIR_PIN_OPENED); - open_by.clear(); - open_by_nonce.clear(); - } - - + // for giving to clients void get_dist_spec(set& ls, int auth) { if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > g_conf.mds_bal_replicate_threshold)) { //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; - ls = open_by; - if (!ls.empty()) ls.insert(auth); + for (map::iterator p = replicas_begin(); + p != replicas_end(); + ++p) + ls.insert(p->first); + if (!ls.empty()) + ls.insert(auth); } } // -- state -- - unsigned get_state() { return state; } - void reset_state(unsigned s) { - state = s; - dout(10) << " cdir:" << *this << " state reset" << endl; - } - void state_clear(unsigned mask) { - state &= ~mask; - dout(10) << " cdir:" << *this << " state -" << mask << " = " << state << endl; - } - void state_set(unsigned mask) { - state |= mask; - dout(10) << " cdir:" << *this << " state +" << mask << " = " << state << endl; - } - unsigned state_test(unsigned mask) { 
return state & mask; } - bool is_complete() { return state & CDIR_STATE_COMPLETE; } bool is_dirty() { return state_test(CDIR_STATE_DIRTY); } @@ -411,18 +362,18 @@ class CDir { // -- dirtyness -- version_t get_version() { return version; } - void float_version(version_t ge) { - if (version < ge) - version = ge; - } - void set_version(version_t v) { version = v; } - + void set_version(version_t v) { projected_version = version = v; } + version_t get_projected_version() { return projected_version; } + version_t get_committing_version() { return committing_version; } version_t get_last_committed_version() { return last_committed_version; } // as in, we're committing the current version. void set_committing_version() { committing_version = version; } void set_last_committed_version(version_t v) { last_committed_version = v; } - void mark_dirty(); + + version_t pre_dirty(); + void _mark_dirty(); + void mark_dirty(version_t pv); void mark_clean(); void mark_complete() { state_set(CDIR_STATE_COMPLETE); } bool is_clean() { return !state_test(CDIR_STATE_DIRTY); } @@ -431,21 +382,16 @@ class CDir { // -- reference counting -- - void put(int by); - void get(int by); - bool is_pinned_by(int by) { - return ref_set.count(by); - } - bool is_pinned() { return ref > 0; } - int get_ref() { return ref; } - set& get_ref_set() { return ref_set; } + void first_get(); + void last_put(); + void request_pin_get() { - if (request_pins == 0) get(CDIR_PIN_REQUEST); + if (request_pins == 0) get(PIN_REQUEST); request_pins++; } void request_pin_put() { request_pins--; - if (request_pins == 0) put(CDIR_PIN_REQUEST); + if (request_pins == 0) put(PIN_REQUEST); } @@ -490,6 +436,7 @@ class CDir { bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); } bool is_frozen_tree(); bool is_frozen_tree_root() { return state & CDIR_STATE_FROZENTREE; } + bool is_frozen_tree_leaf() { return state & CDIR_STATE_FROZENTREELEAF; } bool is_frozen_dir() { return state & CDIR_STATE_FROZENDIR; } bool 
is_freezeable() { @@ -568,25 +515,18 @@ class CDirDiscover { // export -typedef struct { - inodeno_t ino; - __uint64_t nitems; // actual real entries - __uint64_t nden; // num dentries (including null ones) - version_t version; - unsigned state; - meta_load_t popularity_justme; - meta_load_t popularity_curdom; - int dir_auth; - int dir_rep; - int nopen_by; - int nrep_by; - // ints follow -} CDirExport_st; - class CDirExport { - CDirExport_st st; - set open_by; - map open_by_nonce; + struct { + inodeno_t ino; + long nitems; // actual real entries + long nden; // num dentries (including null ones) + version_t version; + unsigned state; + meta_load_t popularity_justme; + meta_load_t popularity_curdom; + int dir_rep; + } st; + map replicas; set rep_by; public: @@ -594,12 +534,13 @@ class CDirExport { CDirExport(CDir *dir) { memset(&st, 0, sizeof(st)); + assert(dir->get_version() == dir->get_projected_version()); + st.ino = dir->ino(); st.nitems = dir->nitems; st.nden = dir->items.size(); st.version = dir->version; st.state = dir->state; - st.dir_auth = dir->dir_auth; st.dir_rep = dir->dir_rep; st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] ); @@ -608,8 +549,7 @@ class CDirExport { dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom; rep_by = dir->dir_rep_by; - open_by = dir->open_by; - open_by_nonce = dir->open_by_nonce; + replicas = dir->replicas; } inodeno_t get_ino() { return st.ino; } @@ -619,13 +559,17 @@ class CDirExport { assert(dir->ino() == st.ino); //dir->nitems = st.nitems; - dir->version = st.version; + + // set last_committed_version at old version + dir->committing_version = dir->last_committed_version = st.version; + dir->projected_version = dir->version = st.version; // this is bumped, below, if dirty + + // twiddle state if (dir->state & CDIR_STATE_HASHED) - dir->state |= CDIR_STATE_AUTH; // just inherit auth flag when hashed + dir->state_set( CDIR_STATE_AUTH ); // just inherit auth flag when hashed else dir->state = (dir->state & 
CDIR_MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. (st.state & CDIR_MASK_STATE_EXPORTED); - dir->dir_auth = st.dir_auth; dir->dir_rep = st.dir_rep; dir->popularity[MDS_POP_JUSTME] += st.popularity_justme; @@ -635,67 +579,34 @@ class CDirExport { dir->replica_nonce = 0; // no longer defined - if (!dir->open_by.empty()) - dout(0) << "open_by not empty non import, " << *dir << ", " << dir->open_by << endl; + if (!dir->replicas.empty()) + dout(0) << "replicas not empty non import, " << *dir << ", " << dir->replicas << endl; dir->dir_rep_by = rep_by; - dir->open_by = open_by; - dout(12) << "open_by in export is " << open_by << ", dir now " << dir->open_by << endl; - dir->open_by_nonce = open_by_nonce; - if (!open_by.empty()) - dir->get(CDIR_PIN_OPENED); - if (dir->is_dirty()) - dir->get(CDIR_PIN_DIRTY); + dir->replicas = replicas; + dout(12) << "replicas in export is " << replicas << ", dir now " << dir->replicas << endl; + if (!replicas.empty()) + dir->get(CDir::PIN_OPENED); + if (dir->is_dirty()) { + dir->get(CDir::PIN_DIRTY); + + // bump dir version + 1 if dirty + dir->projected_version = dir->version = st.version + 1; + } } void _encode(bufferlist& bl) { - st.nrep_by = rep_by.size(); - st.nopen_by = open_by_nonce.size(); bl.append((char*)&st, sizeof(st)); - - // open_by - for (map::iterator it = open_by_nonce.begin(); - it != open_by_nonce.end(); - it++) { - int m = it->first; - bl.append((char*)&m, sizeof(int)); - int n = it->second; - bl.append((char*)&n, sizeof(int)); - } - - // rep_by - for (set::iterator it = rep_by.begin(); - it != rep_by.end(); - it++) { - int m = *it; - bl.append((char*)&m, sizeof(int)); - } + ::_encode(replicas, bl); + ::_encode(rep_by, bl); } int _decode(bufferlist& bl, int off = 0) { bl.copy(off, sizeof(st), (char*)&st); off += sizeof(st); - - // open_by - for (int i=0; i(m,n)); - } - - // rep_by - for (int i=0; imds->get_nodeid() << ".cache.inode(" << inode.ino << ") " +#define dout(x) if (x <= g_conf.debug || x <= 
g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.ino(" << inode.ino << ") " -int cinode_pins[CINODE_NUM_PINS]; // counts +//int cinode_pins[CINODE_NUM_PINS]; // counts ostream& operator<<(ostream& out, CInode& in) @@ -40,18 +40,11 @@ ostream& operator<<(ostream& out, CInode& in) out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" "); if (in.is_auth()) { out << "auth"; - if (in.is_cached_by_anyone()) { - //out << "+" << in.get_cached_by(); - for (set::iterator it = in.cached_by_begin(); - it != in.cached_by_end(); - it++) { - out << "+" << *it << "." << in.get_cached_by_nonce(*it); - } - } + if (in.is_replicated()) + out << in.get_replicas(); } else { out << "rep@" << in.authority(); - //if (in.get_replica_nonce() > 1) - out << "." << in.get_replica_nonce(); + out << "." << in.get_replica_nonce(); assert(in.get_replica_nonce() >= 0); } @@ -62,15 +55,12 @@ ostream& operator<<(ostream& out, CInode& in) out << " hard=" << in.hardlock; out << " file=" << in.filelock; - if (in.is_pinned()) { + if (in.get_num_ref()) { out << " |"; for(set::iterator it = in.get_ref_set().begin(); it != in.get_ref_set().end(); it++) - if (*it < CINODE_NUM_PINS) - out << " " << cinode_pin_names[*it]; - else - out << " " << *it; + out << " " << CInode::pin_name(*it); } // hack: spit out crap on which clients have caps @@ -91,11 +81,12 @@ ostream& operator<<(ostream& out, CInode& in) // ====== CInode ======= -CInode::CInode(MDCache *c, bool auth) : LRUObject() { +CInode::CInode(MDCache *c, bool auth) { mdcache = c; ref = 0; + num_parents = 0; parent = NULL; dir = NULL; // CDir opened separately @@ -106,15 +97,46 @@ CInode::CInode(MDCache *c, bool auth) : LRUObject() { state = 0; - committing_version = committed_version = 0; - - if (auth) state_set(CINODE_STATE_AUTH); + if (auth) state_set(STATE_AUTH); } CInode::~CInode() { if (dir) { delete dir; dir = 0; } } + +// pins + +void CInode::first_get() +{ + // pin my dentry? 
+ if (parent) + parent->get(CDentry::PIN_INODEPIN); +} + +void CInode::last_put() +{ + // unpin my dentry? + if (parent) { + parent->put(CDentry::PIN_INODEPIN); + } + if (num_parents == 0 && get_num_ref() == 0) + mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection +} + +void CInode::get_parent() +{ + num_parents++; +} +void CInode::put_parent() +{ + num_parents--; + if (num_parents == 0 && get_num_ref() == 0) + mdcache->inode_expire_queue.push_back(this); // queue myself for garbage collection +} + + + CDir *CInode::get_parent_dir() { if (parent) @@ -135,7 +157,7 @@ bool CInode::dir_is_auth() { return is_auth(); } -CDir *CInode::get_or_open_dir(MDS *mds) +CDir *CInode::get_or_open_dir(MDCache *mdcache) { assert(is_dir()); @@ -146,7 +168,7 @@ CDir *CInode::get_or_open_dir(MDS *mds) // only auth can open dir alone. assert(is_auth()); - set_dir( new CDir(this, mds, true) ); + set_dir( new CDir(this, mdcache, true) ); dir->dir_auth = -1; return dir; } @@ -158,21 +180,23 @@ CDir *CInode::set_dir(CDir *newdir) return dir; } +void CInode::close_dir() +{ + assert(dir); + assert(dir->get_num_ref() == 0); + delete dir; + dir = 0; +} + + void CInode::set_auth(bool a) { if (!is_dangling() && !is_root() && is_auth() != a) { - /* - CDir *dir = get_parent_dir(); - if (is_auth() && !a) - dir->nauthitems--; - else - dir->nauthitems++; - */ } - if (a) state_set(CINODE_STATE_AUTH); - else state_clear(CINODE_STATE_AUTH); + if (a) state_set(STATE_AUTH); + else state_clear(STATE_AUTH); } @@ -200,7 +224,7 @@ void CInode::make_anchor_trace(vector& trace) parent->dir->inode->ino(), parent->name) ); } - else if (state_test(CINODE_STATE_DANGLING)) { + else if (state_test(STATE_DANGLING)) { dout(7) << "make_anchor_trace dangling " << ino() << " on mds " << dangling_auth << endl; string ref_dn; trace.push_back( new Anchor(ino(), @@ -214,14 +238,25 @@ void CInode::make_anchor_trace(vector& trace) -void CInode::mark_dirty() { +version_t CInode::pre_dirty() +{ + 
assert(parent); + return parent->pre_dirty(); +} + +void CInode::_mark_dirty() +{ + if (!state_test(STATE_DIRTY)) { + state_set(STATE_DIRTY); + get(PIN_DIRTY); + } +} + +void CInode::mark_dirty(version_t pv) { dout(10) << "mark_dirty " << *this << endl; - if (!parent) { - dout(10) << " dangling, not marking dirty!" << endl; - return; - } + assert(parent); /* NOTE: I may already be dirty, but this fn _still_ needs to be called so that @@ -229,32 +264,25 @@ void CInode::mark_dirty() { updated below. */ - // only auth can get dirty. "dirty" async data in replicas is relative to (say) filelock state, not dirty flag. + // only auth can get dirty. "dirty" async data in replicas is relative to + // filelock state, not the dirty flag. assert(is_auth()); - - // touch my private version - inode.version++; - if (!(state & CINODE_STATE_DIRTY)) { - state |= CINODE_STATE_DIRTY; - get(CINODE_PIN_DIRTY); - } - // relative to parent dir: - if (parent) { - // dir is now dirty (if it wasn't already) - parent->dir->mark_dirty(); - - // i now live in that (potentially newly dirty) version - parent_dir_version = parent->dir->get_version(); - } + // touch my private version + assert(inode.version < pv); + inode.version = pv; + _mark_dirty(); + + // mark dentry too + parent->mark_dirty(pv); } void CInode::mark_clean() { dout(10) << " mark_clean " << *this << endl; - if (state & CINODE_STATE_DIRTY) { - state &= ~CINODE_STATE_DIRTY; - put(CINODE_PIN_DIRTY); + if (state_test(STATE_DIRTY)) { + state_clear(STATE_DIRTY); + put(PIN_DIRTY); } } @@ -323,31 +351,6 @@ void CInode::decode_hard_state(bufferlist& r, int& off) } -// old state encoders - -/* -void CInode::encode_basic_state(bufferlist& r) -{ - // inode - r.append((char*)&inode, sizeof(inode)); - ::_encode(cached_by, r); - ::_encode(cached_by_nonce, r); -} - -void CInode::decode_basic_state(bufferlist& r, int& off) -{ - // inode - r.copy(0,sizeof(inode_t), (char*)&inode); - off += sizeof(inode_t); - - bool empty = cached_by.empty(); - 
::_decode(cached_by, r, off); - ::_decode(cached_by_nonce, r, off); - if (!empty) - get(CINODE_PIN_CACHED); -} -*/ - // waiting @@ -386,7 +389,7 @@ void CInode::add_waiter(int tag, Context *c) { // this inode. if (waiting.size() == 0) - get(CINODE_PIN_WAITER); + get(PIN_WAITER); waiting.insert(pair(tag,c)); dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; @@ -410,7 +413,7 @@ void CInode::take_waiting(int mask, list& ls) } if (waiting.empty()) - put(CINODE_PIN_WAITER); + put(PIN_WAITER); } void CInode::finish_waiting(int mask, int result) @@ -432,7 +435,7 @@ bool CInode::can_auth_pin() { void CInode::auth_pin() { if (auth_pins == 0) - get(CINODE_PIN_AUTHPIN); + get(PIN_AUTHPIN); auth_pins++; dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; @@ -444,7 +447,7 @@ void CInode::auth_pin() { void CInode::auth_unpin() { auth_pins--; if (auth_pins == 0) - put(CINODE_PIN_AUTHPIN); + put(PIN_AUTHPIN); dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; @@ -460,11 +463,19 @@ void CInode::auth_unpin() { int CInode::authority() { if (is_dangling()) - return dangling_auth; // explicit - if (is_root()) - return 0; // i am root - assert(parent); - return parent->dir->dentry_authority( parent->name ); + return dangling_auth; // explicit + + if (is_root()) { // i am root + if (dir) + return dir->get_dir_auth(); // bit of a chicken/egg issue here! + else + return CDIR_AUTH_UNKNOWN; + } + + if (parent) + return parent->dir->dentry_authority( parent->name ); + + return -1; // undefined (inode must not be linked yet!) } @@ -473,11 +484,11 @@ CInodeDiscover* CInode::replicate_to( int rep ) assert(is_auth()); // relax locks? 
- if (!is_cached_by_anyone()) + if (!is_replicated()) replicate_relax_locks(); // return the thinger - int nonce = cached_by_add( rep ); + int nonce = add_replica( rep ); return new CInodeDiscover( this, nonce ); } diff --git a/branches/aleung/security1/ceph/mds/CInode.h b/branches/aleung/security1/ceph/mds/CInode.h index 99ba056c31309..8f50f1ef2615f 100644 --- a/branches/aleung/security1/ceph/mds/CInode.h +++ b/branches/aleung/security1/ceph/mds/CInode.h @@ -20,11 +20,12 @@ #include "include/types.h" #include "include/lru.h" +#include "mdstypes.h" + #include "CDentry.h" #include "Lock.h" #include "Capability.h" -#include "mdstypes.h" #include #include @@ -38,54 +39,6 @@ using namespace std; using namespace std; #include "crypto/ExtCap.h" - - -// pins for keeping an item in cache (and debugging) -#define CINODE_PIN_DIR 0 -#define CINODE_PIN_CACHED 1 -#define CINODE_PIN_DIRTY 2 // must flush -#define CINODE_PIN_PROXY 3 // can't expire yet -#define CINODE_PIN_WAITER 4 // waiter - -#define CINODE_PIN_CAPS 5 // local fh's - -#define CINODE_PIN_DNDIRTY 7 // dentry is dirty - -#define CINODE_PIN_AUTHPIN 8 -#define CINODE_PIN_IMPORTING 9 // multipurpose, for importing -#define CINODE_PIN_REQUEST 10 // request is logging, finishing -#define CINODE_PIN_RENAMESRC 11 // pinned on dest for foreign rename -#define CINODE_PIN_ANCHORING 12 - -#define CINODE_PIN_OPENINGDIR 13 - -#define CINODE_PIN_DENTRYLOCK 14 - -#define CINODE_NUM_PINS 15 - -static char *cinode_pin_names[CINODE_NUM_PINS] = { - "dir", - "cached", - "dirty", - "proxy", - "waiter", - "caps", - "--", - "dndirty", - "authpin", - "imping", - "request", - "rensrc", - "anching", - "opdir", - "dnlock" -}; - - - - - - // wait reasons #define CINODE_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE // waiters: write_hard_start, read_file_start, write_file_start (mdcache) @@ -122,30 +75,9 @@ static char *cinode_pin_names[CINODE_NUM_PINS] = { #define CINODE_WAIT_CAPS (1<<30) - - - #define CINODE_WAIT_ANY 0xffffffff -// state -#define 
CINODE_STATE_AUTH (1<<0) -#define CINODE_STATE_ROOT (1<<1) - -#define CINODE_STATE_DIRTY (1<<2) -#define CINODE_STATE_UNSAFE (1<<3) // not logged yet -#define CINODE_STATE_DANGLING (1<<4) // delete me when i expire; i have no dentry -#define CINODE_STATE_UNLINKING (1<<5) -#define CINODE_STATE_PROXY (1<<6) // can't expire yet -#define CINODE_STATE_EXPORTING (1<<7) // on nonauth bystander. - -#define CINODE_STATE_ANCHORING (1<<8) - -#define CINODE_STATE_OPENINGDIR (1<<9) - -//#define CINODE_STATE_RENAMING (1<<8) // moving me -//#define CINODE_STATE_RENAMINGTO (1<<9) // rename target (will be unlinked) - // misc #define CINODE_EXPORT_NONCE 1 // nonce given to replicas created by export @@ -154,22 +86,71 @@ static char *cinode_pin_names[CINODE_NUM_PINS] = { class Context; class CDentry; class CDir; -class MDS; class Message; class CInode; class CInodeDiscover; class MDCache; -//class MInodeSyncStart; ostream& operator<<(ostream& out, CInode& in); -extern int cinode_pins[CINODE_NUM_PINS]; // counts +// cached inode wrapper +class CInode : public MDSCacheObject { + public: + // -- pins -- + static const int PIN_CACHED = 1; + static const int PIN_DIR = 2; + static const int PIN_DIRTY = 4; // must flush + static const int PIN_PROXY = 5; // can't expire yet + static const int PIN_WAITER = 6; // waiter + static const int PIN_CAPS = 7; // local fh's + static const int PIN_AUTHPIN = 8; + static const int PIN_IMPORTING = 9; // multipurpose, for importing + static const int PIN_REQUEST = 10; // request is logging, finishing + static const int PIN_RENAMESRC = 11; // pinned on dest for foreign rename + static const int PIN_ANCHORING = 12; + + static const int PIN_OPENINGDIR = 13; + + static const int PIN_DENTRYLOCK = 14; + + static const char *pin_name(int p) { + switch (p) { + case PIN_CACHED: return "cached"; + case PIN_DIR: return "dir"; + case PIN_DIRTY: return "dirty"; + case PIN_PROXY: return "proxy"; + case PIN_WAITER: return "waiter"; + case PIN_CAPS: return "caps"; + case 
PIN_AUTHPIN: return "authpin"; + case PIN_IMPORTING: return "importing"; + case PIN_REQUEST: return "request"; + case PIN_RENAMESRC: return "renamesrc"; + case PIN_ANCHORING: return "anchoring"; + case PIN_OPENINGDIR: return "openingdir"; + case PIN_DENTRYLOCK: return "dentrylock"; + default: assert(0); + } + } + + // state + static const int STATE_AUTH = (1<<0); + static const int STATE_ROOT = (1<<1); + static const int STATE_DIRTY = (1<<2); + static const int STATE_UNSAFE = (1<<3); // not logged yet + static const int STATE_DANGLING = (1<<4); // delete me when i expire; i have no dentry + static const int STATE_UNLINKING = (1<<5); + static const int STATE_PROXY = (1<<6); // can't expire yet + static const int STATE_EXPORTING = (1<<7); // on nonauth bystander. + static const int STATE_ANCHORING = (1<<8); + static const int STATE_OPENINGDIR = (1<<9); + //static const int STATE_RENAMING = (1<<8); // moving me + //static const int STATE_RENAMINGTO = (1<<9); // rename target (will be unlinked) + + -// cached inode wrapper -class CInode : public LRUObject { public: MDCache *mdcache; @@ -178,31 +159,13 @@ class CInode : public LRUObject { CDir *dir; // directory, if we have it opened. string symlink; // symlink dest, if symlink - // inode metadata locks - CLock hardlock; - CLock filelock; - protected: - int ref; // reference count - set ref_set; - version_t parent_dir_version; // parent dir version when i was last touched. - version_t committing_version; - version_t committed_version; - - unsigned state; - // parent dentries in cache + int num_parents; CDentry *parent; // primary link set remote_parents; // if hard linked // -- distributed caching - set cached_by; // [auth] mds's that cache me. - /* NOTE: on replicas, this doubles as replicated_by, but the - cached_by_* access methods below should NOT be used in those - cases, as the semantics are different! 
*/ - map cached_by_nonce; // [auth] nonce issued to each replica - int replica_nonce; // [replica] defined on replica - int dangling_auth; // explicit auth, when dangling. int num_request_pins; @@ -210,9 +173,15 @@ class CInode : public LRUObject { // waiters multimap waiting; + + // -- distributed state -- +public: + // inode metadata locks + CLock hardlock; + CLock filelock; +protected: // file capabilities map client_caps; // client -> caps - // secure capabilities // will be dependant based on MDS collection policy! @@ -256,13 +225,11 @@ class CInode : public LRUObject { bool is_anchored() { return inode.anchored; } - bool is_root() { return state & CINODE_STATE_ROOT; } - bool is_proxy() { return state & CINODE_STATE_PROXY; } + bool is_root() { return state & STATE_ROOT; } + bool is_proxy() { return state & STATE_PROXY; } - bool is_auth() { return state & CINODE_STATE_AUTH; } + bool is_auth() { return state & STATE_AUTH; } void set_auth(bool auth); - bool is_replica() { return !is_auth(); } - int get_replica_nonce() { assert(!is_auth()); return replica_nonce; } inodeno_t ino() { return inode.ino; } uid_t get_uid() { return inode.uid; } @@ -273,8 +240,9 @@ class CInode : public LRUObject { CInode *get_parent_inode(); CInode *get_realm_root(); // import, hash, or root - CDir *get_or_open_dir(MDS *mds); + CDir *get_or_open_dir(MDCache *mdcache); CDir *set_dir(CDir *newdir); + void close_dir(); bool dir_is_auth(); @@ -287,17 +255,12 @@ class CInode : public LRUObject { // -- state -- - unsigned get_state() { return state; } - void state_clear(unsigned mask) { state &= ~mask; } - void state_set(unsigned mask) { state |= mask; } - unsigned state_test(unsigned mask) { return state & mask; } + bool is_unsafe() { return state & STATE_UNSAFE; } + bool is_dangling() { return state & STATE_DANGLING; } + bool is_unlinking() { return state & STATE_UNLINKING; } - bool is_unsafe() { return state & CINODE_STATE_UNSAFE; } - bool is_dangling() { return state & 
CINODE_STATE_DANGLING; } - bool is_unlinking() { return state & CINODE_STATE_UNLINKING; } - - void mark_unsafe() { state |= CINODE_STATE_UNSAFE; } - void mark_safe() { state &= ~CINODE_STATE_UNSAFE; } + void mark_unsafe() { state |= STATE_UNSAFE; } + void mark_safe() { state &= ~STATE_UNSAFE; } // -- state encoding -- //void encode_basic_state(bufferlist& r); @@ -313,74 +276,17 @@ class CInode : public LRUObject { // -- dirtyness -- version_t get_version() { return inode.version; } - version_t get_parent_dir_version() { return parent_dir_version; } - void float_parent_dir_version(version_t ge) { - if (parent_dir_version < ge) - parent_dir_version = ge; - } - version_t get_committing_version() { return committing_version; } - version_t get_last_committed_version() { return committed_version; } - void set_committing_version(version_t v) { committing_version = v; } - void set_committed_version() { - committed_version = committing_version; - committing_version = 0; - } - bool is_dirty() { return state & CINODE_STATE_DIRTY; } + bool is_dirty() { return state & STATE_DIRTY; } bool is_clean() { return !is_dirty(); } - void mark_dirty(); + version_t pre_dirty(); + void _mark_dirty(); + void mark_dirty(version_t projected_dirv); void mark_clean(); - // -- cached_by -- to be used ONLY when we're authoritative or cacheproxy - bool is_cached_by_anyone() { return !cached_by.empty(); } - bool is_cached_by(int mds) { return cached_by.count(mds); } - int num_cached_by() { return cached_by.size(); } - // cached_by_add returns a nonce - int cached_by_add(int mds) { - int nonce = 1; - if (is_cached_by(mds)) { // already had it? - nonce = get_cached_by_nonce(mds) + 1; // new nonce (+1) - dout(10) << *this << " issuing new nonce " << nonce << " to mds" << mds << endl; - cached_by_nonce.erase(mds); - } else { - if (cached_by.empty()) - get(CINODE_PIN_CACHED); - cached_by.insert(mds); - } - cached_by_nonce.insert(pair(mds,nonce)); // first! serial of 1. 
- return nonce; // default nonce - } - void cached_by_add(int mds, int nonce) { - if (cached_by.empty()) - get(CINODE_PIN_CACHED); - cached_by.insert(mds); - cached_by_nonce.insert(pair(mds,nonce)); - } - int get_cached_by_nonce(int mds) { - map::iterator it = cached_by_nonce.find(mds); - return it->second; - } - void cached_by_remove(int mds) { - //if (!is_cached_by(mds)) return; - assert(is_cached_by(mds)); - - cached_by.erase(mds); - cached_by_nonce.erase(mds); - if (cached_by.empty()) - put(CINODE_PIN_CACHED); - } - void cached_by_clear() { - if (cached_by.size()) - put(CINODE_PIN_CACHED); - cached_by.clear(); - cached_by_nonce.clear(); - } - set::iterator cached_by_begin() { return cached_by.begin(); } - set::iterator cached_by_end() { return cached_by.end(); } - set& get_cached_by() { return cached_by; } CInodeDiscover* replicate_to(int rep); @@ -392,12 +298,19 @@ class CInode : public LRUObject { void finish_waiting(int mask, int result = 0); + bool is_hardlock_write_wanted() { + return waiting_for(CINODE_WAIT_HARDW); + } + bool is_filelock_write_wanted() { + return waiting_for(CINODE_WAIT_FILEW); + } + // -- caps -- (new) // client caps map& get_client_caps() { return client_caps; } void add_client_cap(int client, Capability& cap) { if (client_caps.empty()) - get(CINODE_PIN_CAPS); + get(PIN_CAPS); assert(client_caps.count(client) == 0); client_caps[client] = cap; } @@ -405,7 +318,7 @@ class CInode : public LRUObject { assert(client_caps.count(client) == 1); client_caps.erase(client); if (client_caps.empty()) - put(CINODE_PIN_CAPS); + put(PIN_CAPS); } Capability* get_client_cap(int client) { if (client_caps.count(client)) @@ -415,20 +328,20 @@ class CInode : public LRUObject { /* void set_client_caps(map& cl) { if (client_caps.empty() && !cl.empty()) - get(CINODE_PIN_CAPS); + get(PIN_CAPS); client_caps.clear(); client_caps = cl; } */ void take_client_caps(map& cl) { if (!client_caps.empty()) - put(CINODE_PIN_CAPS); + put(PIN_CAPS); cl = client_caps; 
client_caps.clear(); } void merge_client_caps(map& cl, set& new_client_caps) { if (client_caps.empty() && !cl.empty()) - get(CINODE_PIN_CAPS); + get(PIN_CAPS); for (map::iterator it = cl.begin(); it != cl.end(); it++) { @@ -494,7 +407,7 @@ class CInode : public LRUObject { void replicate_relax_locks() { assert(is_auth()); - assert(!is_cached_by_anyone()); + assert(!is_replicated()); dout(10) << " relaxing locks on " << *this << endl; if (hardlock.get_state() == LOCK_LOCK && @@ -540,60 +453,53 @@ class CInode : public LRUObject { linked to an active_request, so they're automatically cleaned up when a request is finished. pin at will! */ void request_pin_get() { - if (num_request_pins == 0) get(CINODE_PIN_REQUEST); + if (num_request_pins == 0) get(PIN_REQUEST); num_request_pins++; } void request_pin_put() { num_request_pins--; - if (num_request_pins == 0) put(CINODE_PIN_REQUEST); + if (num_request_pins == 0) put(PIN_REQUEST); assert(num_request_pins >= 0); } - - bool is_pinned() { return ref > 0; } - set& get_ref_set() { return ref_set; } - void put(int by) { - cinode_pins[by]--; - if (ref == 0 || ref_set.count(by) != 1) { - dout(7) << " bad put " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 1); - assert(ref > 0); - } - ref--; - ref_set.erase(by); - if (ref == 0) - lru_unpin(); - dout(7) << " put " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl; - } - void get(int by) { - cinode_pins[by]++; - if (ref == 0) - lru_pin(); - if (ref_set.count(by)) { - dout(7) << " bad get " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl; - assert(ref_set.count(by) == 0); - } - ref++; - ref_set.insert(by); - dout(7) << " get " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl; + void bad_put(int by) { + dout(7) << " bad 
put " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; + assert(ref_set.count(by) == 1); + assert(ref > 0); } - bool is_pinned_by(int by) { - return ref_set.count(by); + void bad_get(int by) { + dout(7) << " bad get " << *this << " by " << by << " " << pin_name(by) << " was " << ref << " (" << ref_set << ")" << endl; + assert(ref_set.count(by) == 0); } + void first_get(); + void last_put(); + // -- hierarchy stuff -- +private: + void get_parent(); + void put_parent(); + +public: void set_primary_parent(CDentry *p) { + assert(parent == 0); parent = p; + get_parent(); } void remove_primary_parent(CDentry *dn) { assert(dn == parent); parent = 0; + put_parent(); } void add_remote_parent(CDentry *p) { + if (remote_parents.empty()) + get_parent(); remote_parents.insert(p); } void remove_remote_parent(CDentry *p) { remote_parents.erase(p); + if (remote_parents.empty()) + put_parent(); } int num_remote_parents() { return remote_parents.size(); @@ -685,8 +591,7 @@ class CInodeExport { int num_caps; } st; - set cached_by; - map cached_by_nonce; + map replicas; map cap_map; CLock hardlock,filelock; @@ -697,12 +602,11 @@ public: CInodeExport(CInode *in) { st.inode = in->inode; st.is_dirty = in->is_dirty(); - cached_by = in->cached_by; - cached_by_nonce = in->cached_by_nonce; + replicas = in->replicas; hardlock = in->hardlock; filelock = in->filelock; - + st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] ); st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] ); in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; @@ -725,15 +629,12 @@ public: in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; in->popularity[MDS_POP_NESTED] += st.popularity_curdom; - if (st.is_dirty) { - in->mark_dirty(); - } + if (st.is_dirty) + in->_mark_dirty(); - in->cached_by.clear(); - in->cached_by = cached_by; - in->cached_by_nonce = cached_by_nonce; - if (!cached_by.empty()) - in->get(CINODE_PIN_CACHED); + in->replicas = replicas; 
+ if (!replicas.empty()) + in->get(CInode::PIN_CACHED); in->hardlock = hardlock; in->filelock = filelock; @@ -747,8 +648,7 @@ public: bl.append((char*)&st, sizeof(st)); // cached_by + nonce - ::_encode(cached_by, bl); - ::_encode(cached_by_nonce, bl); + ::_encode(replicas, bl); hardlock.encode_state(bl); filelock.encode_state(bl); @@ -766,8 +666,7 @@ public: bl.copy(off, sizeof(st), (char*)&st); off += sizeof(st); - ::_decode(cached_by, bl, off); - ::_decode(cached_by_nonce, bl, off); + ::_decode(replicas, bl, off); hardlock.decode_state(bl, off); filelock.decode_state(bl, off); diff --git a/branches/aleung/security1/ceph/mds/ClientMap.h b/branches/aleung/security1/ceph/mds/ClientMap.h index 63f310358cae8..7cd1e496debdd 100644 --- a/branches/aleung/security1/ceph/mds/ClientMap.h +++ b/branches/aleung/security1/ceph/mds/ClientMap.h @@ -22,6 +22,17 @@ using namespace std; #include using namespace __gnu_cxx; + +/* + * this structure is used by the MDS purely so that + * it can remember client addresses (entity_inst_t) + * while processing request(s) on behalf of clients. + * as such it's only really a sort of short-term cache. + * + * it also remembers which clients mounted via this MDS, + * for the same reason (so that mounted clients can be + * contacted if necessary). + */ class ClientMap { hash_map client_inst; set client_mount; diff --git a/branches/aleung/security1/ceph/mds/Lock.h b/branches/aleung/security1/ceph/mds/Lock.h index faf648ed3b07f..0d9dabb61b669 100644 --- a/branches/aleung/security1/ceph/mds/Lock.h +++ b/branches/aleung/security1/ceph/mds/Lock.h @@ -70,38 +70,43 @@ any + statlite(mtime) // -- lock... 
hard or file +class Message; + class CLock { protected: // lock state char state; set gather_set; // auth - int nread, nwrite; + + // local state + int nread; + Message *wrlock_by; public: CLock() : - state(LOCK_LOCK), + state(LOCK_SYNC), nread(0), - nwrite(0) { + wrlock_by(0) { } // encode/decode void encode_state(bufferlist& bl) { bl.append((char*)&state, sizeof(state)); - bl.append((char*)&nread, sizeof(nread)); - bl.append((char*)&nwrite, sizeof(nwrite)); - _encode(gather_set, bl); + + //bl.append((char*)&nread, sizeof(nread)); + //bl.append((char*)&nwrite, sizeof(nwrite)); } void decode_state(bufferlist& bl, int& off) { bl.copy(off, sizeof(state), (char*)&state); off += sizeof(state); - bl.copy(off, sizeof(nread), (char*)&nread); - off += sizeof(nread); - bl.copy(off, sizeof(nwrite), (char*)&nwrite); - off += sizeof(nwrite); - _decode(gather_set, bl, off); + + //bl.copy(off, sizeof(nread), (char*)&nread); + //off += sizeof(nread); + //bl.copy(off, sizeof(nwrite), (char*)&nwrite); + //off += sizeof(nwrite); } char get_state() { return state; } @@ -142,8 +147,9 @@ class CLock { // gather set set& get_gather_set() { return gather_set; } - void init_gather(set& i) { - gather_set = i; + void init_gather(const map& i) { + for (map::const_iterator p = i.begin(); p != i.end(); ++p) + gather_set.insert(p->first); } bool is_gathering(int i) { return gather_set.count(i); @@ -160,16 +166,20 @@ class CLock { } int get_nread() { return nread; } - int get_write() { return ++nwrite; } - int put_write() { - assert(nwrite>0); - return --nwrite; + void get_write(Message *who) { + assert(wrlock_by == 0); + wrlock_by = who; + } + void put_write() { + assert(wrlock_by); + wrlock_by = 0; } - int get_nwrite() { return nwrite; } + bool is_wrlocked() { return wrlock_by ? true:false; } + Message *get_wrlocked_by() { return wrlock_by; } bool is_used() { - return (nwrite+nread)>0 ? true:false; + return (is_wrlocked() || (nread>0)) ? 
true:false; } - + // stable bool is_stable() { @@ -196,7 +206,7 @@ class CLock { bool can_write(bool auth) { if (auth) - return (state == LOCK_LOCK); + return (state == LOCK_LOCK) && !is_wrlocked(); else return false; } @@ -291,9 +301,9 @@ inline ostream& operator<<(ostream& out, CLock& l) if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set(); if (l.get_nread()) - out << " " << l.get_nread() << "r"; - if (l.get_nwrite()) - out << " " << l.get_nwrite() << "w"; + out << " r=" << l.get_nread(); + if (l.is_wrlocked()) + out << " w=" << l.get_wrlocked_by(); // rw? /* diff --git a/branches/aleung/security1/ceph/mds/Locker.cc b/branches/aleung/security1/ceph/mds/Locker.cc index d9a827bb07b3c..9c22a4d97f94f 100644 --- a/branches/aleung/security1/ceph/mds/Locker.cc +++ b/branches/aleung/security1/ceph/mds/Locker.cc @@ -27,8 +27,8 @@ #include "include/filepath.h" -#include "events/EInodeUpdate.h" -#include "events/EDirUpdate.h" +#include "events/EString.h" +#include "events/EUpdate.h" #include "events/EUnlink.h" #include "msg/Messenger.h" @@ -87,6 +87,41 @@ void Locker::dispatch(Message *m) } +void Locker::send_lock_message(CInode *in, int msg, int type) +{ + for (map::iterator it = in->replicas_begin(); + it != in->replicas_end(); + it++) { + MLock *m = new MLock(msg, mds->get_nodeid()); + m->set_ino(in->ino(), type); + mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); + } +} + + +void Locker::send_lock_message(CInode *in, int msg, int type, bufferlist& data) +{ + for (map::iterator it = in->replicas_begin(); + it != in->replicas_end(); + it++) { + MLock *m = new MLock(msg, mds->get_nodeid()); + m->set_ino(in->ino(), type); + m->set_data(data); + mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); + } +} + +void Locker::send_lock_message(CDentry *dn, int msg) +{ + for (map::iterator it = dn->replicas_begin(); + it != dn->replicas_end(); + it++) { + MLock *m = new MLock(msg, mds->get_nodeid()); + m->set_dn(dn->dir->ino(), dn->name); + 
mds->send_message_mds(m, it->first, MDS_PORT_LOCKER); + } +} + // file i/o ----------------------------------------- @@ -260,7 +295,7 @@ bool Locker::issue_caps(CInode *in) it->second.get_last_seq(), it->second.pending(), it->second.wanted()), - MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first), + mds->clientmap.get_inst(it->first), 0, MDS_PORT_LOCKER); } } @@ -399,7 +434,7 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) MClientFileCaps *r = new MClientFileCaps(in->inode, 0, 0, 0, MClientFileCaps::FILECAP_RELEASE); - mds->messenger->send_message(r, m->get_source(), m->get_source_inst(), 0, MDS_PORT_LOCKER); + mds->messenger->send_message(r, m->get_source_inst(), 0, MDS_PORT_LOCKER); } // merge in atime? @@ -428,7 +463,7 @@ void Locker::handle_client_file_caps(MClientFileCaps *m) } if (dirty) - mds->mdlog->submit_entry(new EInodeUpdate(in)); + mds->mdlog->submit_entry(new EString("cap inode update dirty fixme")); } // reevaluate, waiters @@ -601,7 +636,7 @@ bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m) // if not replicated, i can twiddle lock at will if (in->is_auth() && - !in->is_cached_by_anyone() && + !in->is_replicated() && in->hardlock.get_state() != LOCK_LOCK) in->hardlock.set_state(LOCK_LOCK); @@ -615,7 +650,7 @@ bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m) } in->auth_pin(); // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_* - in->hardlock.get_write(); + in->hardlock.get_write(m); return true; } @@ -648,17 +683,19 @@ bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m) void Locker::inode_hard_write_finish(CInode *in) { // drop ref - assert(in->hardlock.can_write(in->is_auth())); + //assert(in->hardlock.can_write(in->is_auth())); in->hardlock.put_write(); in->auth_unpin(); dout(7) << "inode_hard_write_finish on " << *in << endl; - - // drop lock? - if (in->hardlock.get_nwrite() == 0) { + // others waiting? 
+ if (in->is_hardlock_write_wanted()) { + // wake 'em up + in->take_waiting(CINODE_WAIT_HARDW, mds->finished_queue); + } else { // auto-sync if alone. if (in->is_auth() && - !in->is_cached_by_anyone() && + !in->is_replicated() && in->hardlock.get_state() != LOCK_SYNC) in->hardlock.set_state(LOCK_SYNC); @@ -679,9 +716,9 @@ void Locker::inode_hard_eval(CInode *in) in->hardlock.set_state(LOCK_LOCK); // waiters - in->hardlock.get_write(); + //in->hardlock.get_write(); in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE); - in->hardlock.put_write(); + //in->hardlock.put_write(); break; default: @@ -693,8 +730,8 @@ void Locker::inode_hard_eval(CInode *in) if (in->is_auth()) { // sync? - if (in->is_cached_by_anyone() && - in->hardlock.get_nwrite() == 0 && + if (in->is_replicated() && + in->is_hardlock_write_wanted() && in->hardlock.get_state() != LOCK_SYNC) { dout(7) << "inode_hard_eval stable, syncing " << *in << endl; inode_hard_sync(in); @@ -725,14 +762,7 @@ void Locker::inode_hard_sync(CInode *in) in->encode_hard_state(harddata); // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IHARD); - m->set_data(harddata); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } + send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IHARD, harddata); // change lock in->hardlock.set_state(LOCK_SYNC); @@ -753,17 +783,11 @@ void Locker::inode_hard_lock(CInode *in) assert(in->hardlock.get_state() == LOCK_SYNC); // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IHARD); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } + send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IHARD); // change lock in->hardlock.set_state(LOCK_GLOCKR); - in->hardlock.init_gather(in->get_cached_by()); + 
in->hardlock.init_gather(in->get_replicas()); } @@ -913,9 +937,9 @@ bool Locker::inode_file_read_start(CInode *in, MClientRequest *m) if (in->filelock.can_read(in->is_auth())) { in->filelock.get_read(); - in->filelock.get_write(); + //in->filelock.get_write(); in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); - in->filelock.put_write(); + //in->filelock.put_write(); return true; } } else { @@ -968,48 +992,52 @@ void Locker::inode_file_read_finish(CInode *in) bool Locker::inode_file_write_start(CInode *in, MClientRequest *m) { - // can write? grab ref. - if (in->filelock.can_write(in->is_auth())) { - in->filelock.get_write(); - return true; - } + // can't write? + if (!in->filelock.can_write(in->is_auth())) { - // can't write, replicated. - if (in->is_auth()) { - // auth - if (in->filelock.can_write_soon(in->is_auth())) { - // just wait - } else { - if (!in->filelock.is_stable()) { - dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl; - in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); - return false; - } - - // initiate lock - inode_file_lock(in); - - if (in->filelock.can_write(in->is_auth())) { - in->filelock.get_write(); - - in->filelock.get_read(); - in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); - in->filelock.put_read(); - return true; + // can't write. + if (in->is_auth()) { + // auth + if (!in->filelock.can_write_soon(in->is_auth())) { + if (!in->filelock.is_stable()) { + dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl; + in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); + return false; + } + + // initiate lock + inode_file_lock(in); + + // fall-thru to below. 
} + } else { + // replica + // fw to auth + int auth = in->authority(); + dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mdcache->request_forward(m, auth); + return false; + } + } + + // check again + if (in->filelock.can_write(in->is_auth())) { + // can i auth pin? + assert(in->is_auth()); + if (!in->can_auth_pin()) { + dout(7) << "inode_file_write_start waiting for authpinnable on " << *in << endl; + in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); + return false; } + in->auth_pin(); + in->filelock.get_write(m); + return true; + } else { dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl; in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in)); return false; - } else { - // replica - // fw to auth - int auth = in->authority(); - dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mdcache->request_forward(m, auth); - return false; } } @@ -1022,7 +1050,7 @@ void Locker::inode_file_write_finish(CInode *in) dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl; // drop lock? 
- if (in->filelock.get_nwrite() == 0) { + if (!in->is_filelock_write_wanted()) { in->finish_waiting(CINODE_WAIT_FILENOWR); inode_file_eval(in); } @@ -1057,10 +1085,10 @@ void Locker::inode_file_eval(CInode *in) // waiters in->filelock.get_read(); - in->filelock.get_write(); + //in->filelock.get_write(); in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); in->filelock.put_read(); - in->filelock.put_write(); + //in->filelock.put_write(); } break; @@ -1076,20 +1104,13 @@ void Locker::inode_file_eval(CInode *in) if ((issued & ~(CAP_FILE_WR)) == 0) { in->filelock.set_state(LOCK_MIXED); - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // data bufferlist softdata; in->encode_file_state(softdata); // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - m->set_data(softdata); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } + send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); } in->finish_waiting(CINODE_WAIT_FILESTABLE); @@ -1121,14 +1142,7 @@ void Locker::inode_file_eval(CInode *in) bufferlist softdata; in->encode_file_state(softdata); - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); - reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); - reply->set_data(softdata); - mds->send_message_mds(reply, *it, MDS_PORT_LOCKER); - } + send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); } // waiters @@ -1194,7 +1208,7 @@ void Locker::inode_file_eval(CInode *in) // * -> loner? if (in->filelock.get_nread() == 0 && - in->filelock.get_nwrite() == 0 && + !in->is_filelock_write_wanted() && (wanted & CAP_FILE_WR) && loner && in->filelock.get_state() != LOCK_LONER) { @@ -1204,7 +1218,7 @@ void Locker::inode_file_eval(CInode *in) // * -> mixed? 
else if (in->filelock.get_nread() == 0 && - in->filelock.get_nwrite() == 0 && + !in->is_filelock_write_wanted() && (wanted & CAP_FILE_RD) && (wanted & CAP_FILE_WR) && !(loner && in->filelock.get_state() == LOCK_LONER) && @@ -1214,10 +1228,10 @@ void Locker::inode_file_eval(CInode *in) } // * -> sync? - else if (in->filelock.get_nwrite() == 0 && + else if (!in->is_filelock_write_wanted() && !(wanted & CAP_FILE_WR) && ((wanted & CAP_FILE_RD) || - in->is_cached_by_anyone() || + in->is_replicated() || (!loner && in->filelock.get_state() == LOCK_LONER)) && in->filelock.get_state() != LOCK_SYNC) { dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl; @@ -1225,7 +1239,7 @@ void Locker::inode_file_eval(CInode *in) } // * -> lock? (if not replicated or open) - else if (!in->is_cached_by_anyone() && + else if (!in->is_replicated() && wanted == 0 && in->filelock.get_state() != LOCK_LOCK) { inode_file_lock(in); @@ -1259,20 +1273,13 @@ bool Locker::inode_file_sync(CInode *in) assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); if (in->filelock.get_state() == LOCK_LOCK) { - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // soft data bufferlist softdata; in->encode_file_state(softdata); // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - m->set_data(softdata); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } + send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE, softdata); } // change lock @@ -1292,15 +1299,9 @@ bool Locker::inode_file_sync(CInode *in) } else { // no writers, go straight to sync - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); - m->set_ino(in->ino(), 
LOCK_OTYPE_IFILE); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } + send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); } // change lock @@ -1317,15 +1318,9 @@ bool Locker::inode_file_sync(CInode *in) issue_caps(in); } else { // no writers, go straight to sync - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } + send_lock_message(in, LOCK_AC_SYNC, LOCK_OTYPE_IFILE); } // change lock @@ -1340,6 +1335,7 @@ bool Locker::inode_file_sync(CInode *in) } + void Locker::inode_file_lock(CInode *in) { dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl; @@ -1358,16 +1354,10 @@ void Locker::inode_file_lock(CInode *in) int issued = in->get_caps_issued(); if (in->filelock.get_state() == LOCK_SYNC) { - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } - in->filelock.init_gather(in->get_cached_by()); + send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); + in->filelock.init_gather(in->get_replicas()); // change lock in->filelock.set_state(LOCK_GLOCKR); @@ -1387,16 +1377,10 @@ void Locker::inode_file_lock(CInode *in) } else if (in->filelock.get_state() == LOCK_MIXED) { - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } - 
in->filelock.init_gather(in->get_cached_by()); + send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); + in->filelock.init_gather(in->get_replicas()); // change lock in->filelock.set_state(LOCK_GLOCKM); @@ -1449,16 +1433,10 @@ void Locker::inode_file_mixed(CInode *in) int issued = in->get_caps_issued(); if (in->filelock.get_state() == LOCK_SYNC) { - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } - in->filelock.init_gather(in->get_cached_by()); + send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE); + in->filelock.init_gather(in->get_replicas()); in->filelock.set_state(LOCK_GMIXEDR); issue_caps(in); @@ -1473,20 +1451,13 @@ void Locker::inode_file_mixed(CInode *in) } else if (in->filelock.get_state() == LOCK_LOCK) { - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // data bufferlist softdata; in->encode_file_state(softdata); // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - m->set_data(softdata); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } + send_lock_message(in, LOCK_AC_MIXED, LOCK_OTYPE_IFILE, softdata); } // change lock @@ -1500,15 +1471,9 @@ void Locker::inode_file_mixed(CInode *in) in->filelock.set_state(LOCK_GMIXEDL); issue_caps(in); } - else if (in->is_cached_by_anyone()) { + else if (in->is_replicated()) { // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } + send_lock_message(in, 
LOCK_AC_MIXED, LOCK_OTYPE_IFILE); in->filelock.set_state(LOCK_MIXED); issue_caps(in); } else { @@ -1538,16 +1503,10 @@ void Locker::inode_file_loner(CInode *in) assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); if (in->filelock.get_state() == LOCK_SYNC) { - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } - in->filelock.init_gather(in->get_cached_by()); + send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); + in->filelock.init_gather(in->get_replicas()); // change lock in->filelock.set_state(LOCK_GLONERR); @@ -1565,16 +1524,10 @@ void Locker::inode_file_loner(CInode *in) } else if (in->filelock.get_state() == LOCK_MIXED) { - if (in->is_cached_by_anyone()) { + if (in->is_replicated()) { // bcast to replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); - m->set_ino(in->ino(), LOCK_OTYPE_IFILE); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } - in->filelock.init_gather(in->get_cached_by()); + send_lock_message(in, LOCK_AC_LOCK, LOCK_OTYPE_IFILE); + in->filelock.init_gather(in->get_replicas()); // change lock in->filelock.set_state(LOCK_GLONERM); @@ -1720,9 +1673,9 @@ void Locker::handle_lock_inode_file(MLock *m) issue_caps(in); // waiters - in->filelock.get_write(); + //in->filelock.get_write(); in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE); - in->filelock.put_write(); + //in->filelock.put_write(); inode_file_eval(in); break; @@ -1885,11 +1838,15 @@ bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) // mine! 
dn->xlockedby = m; - if (dn->dir->is_open_by_anyone()) { + if (dn->is_replicated()) { dn->lockstate = DN_LOCK_PREXLOCK; // xlock with whom? - set who = dn->dir->get_open_by(); + set who; + for (map::iterator p = dn->replicas_begin(); + p != dn->replicas_end(); + ++p) + who.insert(p->first); dn->gather_set = who; // make path @@ -1941,14 +1898,8 @@ void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) // tell replicas? if (!quiet) { // tell even if dn is null. - if (dn->dir->is_open_by_anyone()) { - for (set::iterator it = dn->dir->open_by_begin(); - it != dn->dir->open_by_end(); - it++) { - MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); - m->set_dn(dn->dir->ino(), dn->name); - mds->send_message_mds(m, *it, MDS_PORT_LOCKER); - } + if (dn->is_replicated()) { + send_lock_message(dn, LOCK_AC_SYNC); } } diff --git a/branches/aleung/security1/ceph/mds/Locker.h b/branches/aleung/security1/ceph/mds/Locker.h index a7f5d380538aa..e3c192cbf4694 100644 --- a/branches/aleung/security1/ceph/mds/Locker.h +++ b/branches/aleung/security1/ceph/mds/Locker.h @@ -54,6 +54,10 @@ private: void dispatch(Message *m); + void send_lock_message(CInode *in, int msg, int type); + void send_lock_message(CInode *in, int msg, int type, bufferlist& data); + void send_lock_message(CDentry *dn, int msg); + // -- locks -- // high level interface public: diff --git a/branches/aleung/security1/ceph/mds/LogEvent.cc b/branches/aleung/security1/ceph/mds/LogEvent.cc index 5b15f487d77ab..4a83902c5c6c4 100644 --- a/branches/aleung/security1/ceph/mds/LogEvent.cc +++ b/branches/aleung/security1/ceph/mds/LogEvent.cc @@ -17,13 +17,16 @@ // events i know of #include "events/EString.h" -#include "events/EInodeUpdate.h" -#include "events/EDirUpdate.h" +#include "events/EImportMap.h" +#include "events/EMetaBlob.h" +#include "events/EUpdate.h" #include "events/EUnlink.h" #include "events/EAlloc.h" -#include "events/EMknod.h" -#include "events/EMkdir.h" #include "events/EPurgeFinish.h" +#include 
"events/EExportStart.h" +#include "events/EExportFinish.h" +#include "events/EImportStart.h" +#include "events/EImportFinish.h" LogEvent *LogEvent::decode(bufferlist& bl) { @@ -41,40 +44,18 @@ LogEvent *LogEvent::decode(bufferlist& bl) // create event LogEvent *le; switch (type) { - case EVENT_STRING: // string - le = new EString(); - break; - - case EVENT_INODEUPDATE: - le = new EInodeUpdate(); - break; - - case EVENT_DIRUPDATE: - le = new EDirUpdate(); - break; - - case EVENT_UNLINK: - le = new EUnlink(); - break; - - case EVENT_PURGEFINISH: - le = new EPurgeFinish(); - break; - - case EVENT_ALLOC: - le = new EAlloc(); - break; - - case EVENT_MKNOD: - le = new EMknod(); - break; - - case EVENT_MKDIR: - le = new EMkdir(); - break; - + case EVENT_STRING: le = new EString(); break; + case EVENT_IMPORTMAP: le = new EImportMap; break; + case EVENT_UPDATE: le = new EUpdate; break; + case EVENT_UNLINK: le = new EUnlink(); break; + case EVENT_PURGEFINISH: le = new EPurgeFinish(); break; + case EVENT_ALLOC: le = new EAlloc(); break; + case EVENT_EXPORTSTART: le = new EExportStart; break; + case EVENT_EXPORTFINISH: le = new EExportFinish; break; + case EVENT_IMPORTSTART: le = new EImportStart; break; + case EVENT_IMPORTFINISH: le = new EImportFinish; break; default: - dout(1) << "uh oh, unknown event type " << type << endl; + dout(1) << "uh oh, unknown log event type " << type << endl; assert(0); } diff --git a/branches/aleung/security1/ceph/mds/LogEvent.h b/branches/aleung/security1/ceph/mds/LogEvent.h index 0de268252036a..6895ed54074d4 100644 --- a/branches/aleung/security1/ceph/mds/LogEvent.h +++ b/branches/aleung/security1/ceph/mds/LogEvent.h @@ -19,6 +19,9 @@ #define EVENT_INODEUPDATE 2 #define EVENT_DIRUPDATE 3 +#define EVENT_IMPORTMAP 4 +#define EVENT_UPDATE 5 + #define EVENT_ALLOC 10 #define EVENT_MKNOD 11 #define EVENT_MKDIR 12 @@ -28,6 +31,12 @@ #define EVENT_RMDIR 21 #define EVENT_PURGEFINISH 22 +#define EVENT_EXPORTSTART 30 +#define EVENT_EXPORTFINISH 31 
+#define EVENT_IMPORTSTART 32 +#define EVENT_IMPORTFINISH 33 + + #include using namespace std; @@ -41,13 +50,17 @@ class MDS; class LogEvent { private: int _type; - off_t _end_off; + off_t _start_off,_end_off; friend class MDLog; public: - LogEvent(int t) : _type(t), _end_off(0) { } + LogEvent(int t) : _type(t), _start_off(0), _end_off(0) { } virtual ~LogEvent() { } + int get_type() { return _type; } + off_t get_start_off() { return _start_off; } + off_t get_end_off() { return _end_off; } + // encoding virtual void encode_payload(bufferlist& bl) = 0; virtual void decode_payload(bufferlist& bl, int& off) = 0; @@ -64,26 +77,22 @@ class LogEvent { /* obsolete() - is this entry committed to primary store, such that * we can expire it from the journal? */ - virtual bool can_expire(MDS *m) { + virtual bool has_expired(MDS *m) { return true; } - /* retire() - prod MDS into committing hte relevant state so that this + /* expire() - prod MDS into committing the relevant state so that this * entry can be expired from the jorunal. */ - virtual void retire(MDS *m, Context *c) { + virtual void expire(MDS *m, Context *c) { + assert(0); c->finish(0); delete c; } /*** recovery ***/ - - /* has_happened() - true if this event has already been applied. - */ - virtual bool has_happened(MDS *m) { return true; } - - /* replay() - replay given event + /* replay() - replay given event. this is idempotent. 
*/ virtual void replay(MDS *m) { assert(0); } diff --git a/branches/aleung/security1/ceph/mds/MDBalancer.cc b/branches/aleung/security1/ceph/mds/MDBalancer.cc index c1888fea3c2d2..57e79dcdf51fc 100644 --- a/branches/aleung/security1/ceph/mds/MDBalancer.cc +++ b/branches/aleung/security1/ceph/mds/MDBalancer.cc @@ -31,13 +31,14 @@ using namespace std; #include "config.h" #undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds_balancer) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".bal " +#define dout(l) if (l<=g_conf.debug_mds || l<=g_conf.debug_mds_balancer) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".bal " #define MIN_LOAD 50 // ?? #define MIN_REEXPORT 5 // will automatically reexport #define MIN_OFFLOAD 10 // point at which i stop trying, close enough + int MDBalancer::proc_message(Message *m) { switch (m->get_type()) { @@ -56,6 +57,41 @@ int MDBalancer::proc_message(Message *m) } + + +void MDBalancer::tick() +{ + static int num_bal_times = g_conf.mds_bal_max; + static utime_t first = g_clock.now(); + utime_t now = g_clock.now(); + utime_t elapsed = now; + elapsed -= first; + + // balance? + if (true && + mds->get_nodeid() == 0 && + (num_bal_times || + (g_conf.mds_bal_max_until >= 0 && + elapsed.sec() > g_conf.mds_bal_max_until)) && + mds->is_active() && + now.sec() - last_heartbeat.sec() >= g_conf.mds_bal_interval) { + last_heartbeat = now; + send_heartbeat(); + num_bal_times--; + } + + // hash? 
+ if (true && + g_conf.num_mds > 1 && + now.sec() - last_hash.sec() > g_conf.mds_bal_hash_interval) { + last_hash = now; + do_hashing(); + } +} + + + + class C_Bal_SendHeartbeat : public Context { public: MDS *mds; @@ -119,15 +155,15 @@ void MDBalancer::send_heartbeat() } - int size = mds->get_mds_map()->get_num_mds(); - for (int i = 0; iget_nodeid()) continue; + set up; + mds->get_mds_map()->get_up_mds_set(up); + for (set::iterator p = up.begin(); p != up.end(); ++p) { + if (*p == mds->get_nodeid()) continue; MHeartbeat *hb = new MHeartbeat(load, beat_epoch); hb->get_import_map() = import_map; mds->messenger->send_message(hb, - MSG_ADDR_MDS(i), mds->mdsmap->get_inst(i), - MDS_PORT_BALANCER, - MDS_PORT_BALANCER); + mds->mdsmap->get_inst(*p), + MDS_PORT_BALANCER, MDS_PORT_BALANCER); } } @@ -517,6 +553,9 @@ void MDBalancer::do_rebalance(int beat) << " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load() << endl; mds->mdcache->migrator->export_dir(*it, target); + + // hack! only do one dir. + break; } } @@ -807,71 +846,7 @@ void MDBalancer::add_import(CDir *dir) void MDBalancer::show_imports(bool external) { - int db = 20; //debug level - return; - - if (mds->mdcache->imports.empty() && - mds->mdcache->hashdirs.empty()) { - dout(db) << "no imports/exports/hashdirs" << endl; - return; - } - dout(db) << "imports/exports/hashdirs:" << endl; - - set ecopy = mds->mdcache->exports; - - set::iterator it = mds->mdcache->hashdirs.begin(); - while (1) { - if (it == mds->mdcache->hashdirs.end()) it = mds->mdcache->imports.begin(); - if (it == mds->mdcache->imports.end() ) break; - - CDir *im = *it; - - if (im->is_import()) { - dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; - assert( im->is_auth() ); - } - else if (im->is_hashed()) { - if (im->is_import()) continue; // if import AND hash, list as import. 
- dout(db) << " + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; - } - - for (set::iterator p = mds->mdcache->nested_exports[im].begin(); - p != mds->mdcache->nested_exports[im].end(); - p++) { - CDir *exp = *p; - if (exp->is_hashed()) { - //assert(0); // we don't do it this way actually - dout(db) << " - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; - assert( !exp->is_auth() ); - } else { - dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; - assert( exp->is_export() ); - assert( !exp->is_auth() ); - } - - if ( mds->mdcache->get_auth_container(exp) != im ) { - dout(1) << "uh oh, auth container is " << mds->mdcache->get_auth_container(exp) << endl; - dout(1) << "uh oh, auth container is " << *mds->mdcache->get_auth_container(exp) << endl; - assert( mds->mdcache->get_auth_container(exp) == im ); - } - - if (ecopy.count(exp) != 1) { - dout(1) << "***** nested_export " << *exp << " not in exports" << endl; - assert(0); - } - ecopy.erase(exp); - } - - it++; - } - - if (ecopy.size()) { - for (set::iterator it = ecopy.begin(); - it != ecopy.end(); - it++) - dout(1) << "***** stray item in exports: " << **it << endl; - assert(ecopy.size() == 0); - } + mds->mdcache->show_imports(); } diff --git a/branches/aleung/security1/ceph/mds/MDBalancer.h b/branches/aleung/security1/ceph/mds/MDBalancer.h index a6129045ca3f7..d84d6439dbccc 100644 --- a/branches/aleung/security1/ceph/mds/MDBalancer.h +++ b/branches/aleung/security1/ceph/mds/MDBalancer.h @@ -39,9 +39,11 @@ class CDir; class MDBalancer { protected: MDS *mds; - int beat_epoch; + utime_t last_heartbeat; + utime_t last_hash; + // todo set hash_queue; @@ -66,10 +68,9 @@ class MDBalancer { } public: - MDBalancer(MDS *m) { - mds = m; - beat_epoch = 0; - } + MDBalancer(MDS *m) : + 
mds(m), + beat_epoch(0) { } mds_load_t get_load(); @@ -78,6 +79,8 @@ class MDBalancer { void send_heartbeat(); void handle_heartbeat(MHeartbeat *m); + void tick(); + void do_hashing(); void export_empties(); diff --git a/branches/aleung/security1/ceph/mds/MDCache.cc b/branches/aleung/security1/ceph/mds/MDCache.cc index 5216476131914..eb8ad591d6a35 100644 --- a/branches/aleung/security1/ceph/mds/MDCache.cc +++ b/branches/aleung/security1/ceph/mds/MDCache.cc @@ -38,10 +38,17 @@ #include "osdc/Filer.h" +#include "events/EImportMap.h" +#include "events/EString.h" #include "events/EUnlink.h" #include "events/EPurgeFinish.h" #include "messages/MGenericMessage.h" + +#include "messages/MMDSImportMap.h" +#include "messages/MMDSCacheRejoin.h" +#include "messages/MMDSCacheRejoinAck.h" + #include "messages/MDiscover.h" #include "messages/MDiscoverReply.h" @@ -90,6 +97,7 @@ MDCache::MDCache(MDS *m) lru.lru_set_midpoint(g_conf.mds_cache_mid); did_shutdown_exports = false; + did_shutdown_log_cap = false; shutdown_commits = 0; } @@ -100,6 +108,7 @@ MDCache::~MDCache() } + void MDCache::log_stat(Logger *logger) { if (get_root()) { @@ -158,11 +167,8 @@ void MDCache::destroy_inode(CInode *in) void MDCache::add_inode(CInode *in) { // add to lru, inode map - assert(inode_map.size() == lru.lru_get_size()); - lru.lru_insert_mid(in); assert(inode_map.count(in->ino()) == 0); // should be no dup inos! inode_map[ in->ino() ] = in; - assert(inode_map.size() == lru.lru_get_size()); } void MDCache::remove_inode(CInode *o) @@ -178,11 +184,686 @@ void MDCache::remove_inode(CInode *o) dn->dir->unlink_inode(dn); // leave dentry } inode_map.erase(o->ino()); // remove from map - lru.lru_remove(o); // remove from lru +} + + +/* + * take note of where we write import_maps in the log, as we need + * to take care not to expire them until an updated map is safely flushed. 
+ */ +class C_MDS_WroteImportMap : public Context { + MDLog *mdlog; + off_t end_off; +public: + C_MDS_WroteImportMap(MDLog *ml, off_t eo) : mdlog(ml), end_off(eo) { } + void finish(int r) { + // cout << "WroteImportMap at " << end_off << endl; + if (r >= 0) + mdlog->last_import_map = end_off; + mdlog->writing_import_map = false; + } +}; + + + +void MDCache::log_import_map(Context *onsync) +{ + dout(10) << "log_import_map " << imports.size() << " imports, " + << exports.size() << " exports" << endl; + + EImportMap *le = new EImportMap; + + // include import/export inodes, + // and a spanning tree to tie it to the root of the fs + for (set::iterator p = imports.begin(); + p != imports.end(); + p++) { + CDir *im = *p; + le->imports.insert(im->ino()); + le->metablob.add_dir_context(im, true); + le->metablob.add_dir(im, false); + + if (nested_exports.count(im)) { + for (set::iterator q = nested_exports[im].begin(); + q != nested_exports[im].end(); + ++q) { + CDir *ex = *q; + le->nested_exports[im->ino()].insert(ex->ino()); + le->exports.insert(ex->ino()); + le->metablob.add_dir_context(ex); + le->metablob.add_dir(ex, false); + } + } + } + + mds->mdlog->writing_import_map = true; + mds->mdlog->submit_entry(le); + mds->mdlog->wait_for_sync(new C_MDS_WroteImportMap(mds->mdlog, mds->mdlog->get_write_pos())); + if (onsync) + mds->mdlog->wait_for_sync(onsync); +} + + + + + +// ===================== +// recovery stuff + +void MDCache::send_pending_import_maps() +{ + if (wants_import_map.empty()) + return; // nothing to send. + + // only if it's appropriate! + if (migrator->is_exporting()) { + dout(7) << "send_pending_import_maps waiting, exports still in progress" << endl; + return; // not now + } + + // ok, send them. 
+ for (set::iterator p = wants_import_map.begin(); + p != wants_import_map.end(); + p++) + send_import_map_now(*p); + wants_import_map.clear(); +} + +void MDCache::send_import_map(int who) +{ + if (migrator->is_exporting()) + send_import_map_later(who); + else + send_import_map_now(who); +} + +void MDCache::send_import_map_now(int who) +{ + dout(10) << "send_import_map to mds" << who << endl; + + MMDSImportMap *m = new MMDSImportMap; + + // known + for (set::iterator p = imports.begin(); + p != imports.end(); + p++) { + CDir *im = *p; + + if (migrator->is_importing(im->ino())) { + // ambiguous (mid-import) + m->add_ambiguous_import(im->ino(), + migrator->get_import_bounds(im->ino())); + } else { + // not ambiguous. + m->add_import(im->ino()); + + if (nested_exports.count(im)) { + for (set::iterator q = nested_exports[im].begin(); + q != nested_exports[im].end(); + ++q) { + CDir *ex = *q; + m->add_import_export(im->ino(), ex->ino()); + } + } + } + } + + // ambiguous + for (map >::iterator p = my_ambiguous_imports.begin(); + p != my_ambiguous_imports.end(); + ++p) + m->add_ambiguous_import(p->first, p->second); + + // second + mds->send_message_mds(m, who, MDS_PORT_CACHE); } +/* + * during resolve state, we share import_maps to determine who + * is authoritative for which trees. we expect to get an import_map + * from _everyone_ in the recovery_set (the mds cluster at the time of + * the first failure). 
+ */ +void MDCache::handle_import_map(MMDSImportMap *m) +{ + dout(7) << "handle_import_map from " << m->get_source() << endl; + int from = m->get_source().num(); + + // FIXME: check if we are a surviving ambiguous importer + + // update my dir_auth values + for (map >::iterator pi = m->imap.begin(); + pi != m->imap.end(); + ++pi) { + CInode *imi = get_inode(pi->first); + if (!imi) continue; + CDir *im = imi->dir; + if (!im) continue; + + im->set_dir_auth(from); + + for (set::iterator pe = pi->second.begin(); + pe != pi->second.end(); + ++pe) { + CInode *exi = get_inode(*pe); + if (!exi) continue; + CDir *ex = exi->dir; + if (!ex) continue; + + if (ex->get_dir_auth() == CDIR_AUTH_PARENT) + ex->set_dir_auth(CDIR_AUTH_UNKNOWN); + } + } + + // note ambiguous imports too + for (map >::iterator pi = m->ambiguous_imap.begin(); + pi != m->ambiguous_imap.end(); + ++pi) + mds->mdcache->other_ambiguous_imports[from][pi->first].swap( pi->second ); + + // did i get them all? + got_import_map.insert(from); + + if (got_import_map == recovery_set) { + dout(10) << "got all import maps, ready to rejoin" << endl; + disambiguate_imports(); + recalc_auth_bits(); + trim_non_auth(); + + // move to rejoin state + mds->set_want_state(MDSMap::STATE_REJOIN); + + } else { + dout(10) << "still waiting for more importmaps, got " << got_import_map + << ", need " << recovery_set << endl; + } + + delete m; +} + + +void MDCache::disambiguate_imports() +{ + dout(10) << "disambiguate_imports" << endl; + + // other nodes' ambiguous imports + for (map > >::iterator p = other_ambiguous_imports.begin(); + p != other_ambiguous_imports.end(); /* was begin(), so this loop never ran */ + ++p) { + int who = p->first; + + for (map >::iterator q = p->second.begin(); + q != p->second.end(); + ++q) { + CInode *diri = get_inode(q->first); + if (!diri) continue; + CDir *dir = diri->dir; + if (!dir) continue; + + if (dir->authority() >= CDIR_AUTH_UNKNOWN) { + dout(10) << "mds" << who << " did not import " << *dir << endl; + } else { + dout(10) << "mds" << 
who << " did import " << *dir << endl; + int was = dir->authority(); + dir->set_dir_auth(who); + + for (set::iterator r = q->second.begin(); + r != q->second.end(); + ++r) { + CInode *exi = get_inode(*r); /* look up the bound itself, not the import base (q->first) */ + if (!exi) continue; + CDir *ex = exi->dir; + if (!ex) continue; + if (ex->get_dir_auth() == CDIR_AUTH_PARENT) + ex->set_dir_auth(was); + dout(10) << " bound " << *ex << endl; + } + } + } + } + other_ambiguous_imports.clear(); + + // my ambiguous imports + while (!my_ambiguous_imports.empty()) { + map >::iterator q = my_ambiguous_imports.begin(); + + CInode *diri = get_inode(q->first); + if (!diri) { cancel_ambiguous_import(q->first); continue; } /* not in cache, so it cannot be ours: drop the entry, else this loop never terminates */ + CDir *dir = diri->dir; + if (!dir) { cancel_ambiguous_import(q->first); continue; } /* same: the entry must be erased before retrying begin() */ + + if (dir->authority() != CDIR_AUTH_UNKNOWN) { + dout(10) << "ambiguous import auth known, must not be me " << *dir << endl; + cancel_ambiguous_import(q->first); + } else { + dout(10) << "ambiguous import auth unknown, must be me " << *dir << endl; + finish_ambiguous_import(q->first); + } + } + assert(my_ambiguous_imports.empty()); + + show_imports(); +} + +void MDCache::cancel_ambiguous_import(inodeno_t dirino) +{ + assert(my_ambiguous_imports.count(dirino)); + dout(10) << "cancel_ambiguous_import " << dirino + << " bounds " << my_ambiguous_imports[dirino] + << endl; + my_ambiguous_imports.erase(dirino); +} + +void MDCache::finish_ambiguous_import(inodeno_t dirino) +{ + assert(my_ambiguous_imports.count(dirino)); + set bounds; + bounds.swap(my_ambiguous_imports[dirino]); + my_ambiguous_imports.erase(dirino); + + dout(10) << "finish_ambiguous_import " << dirino + << " bounds " << bounds + << endl; + + CInode *diri = get_inode(dirino); + assert(diri); + CDir *dir = diri->dir; + assert(dir); + + // adjust dir_auth + CDir *im = dir; + if (dir->get_inode()->authority() == mds->get_nodeid()) { + // parent is already me. adding to existing import. 
+ im = get_auth_container(dir); + if (!im) im = dir; + nested_exports[im].erase(dir); + exports.erase(dir); + dir->set_dir_auth( CDIR_AUTH_PARENT ); + dir->state_clear(CDIR_STATE_EXPORT); + dir->put(CDir::PIN_EXPORT); + } else { + // parent isn't me. new import. + imports.insert(dir); + dir->set_dir_auth( mds->get_nodeid() ); + dir->state_set(CDIR_STATE_IMPORT); + dir->get(CDir::PIN_IMPORT); + } + + dout(10) << " base " << *dir << endl; + if (dir != im) + dout(10) << " under " << *im << endl; + + // bounds (exports, before) + for (set::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CInode *bi = get_inode(*p); + assert(bi); + CDir *bd = bi->dir; + assert(bd); + + if (bd->get_dir_auth() == mds->get_nodeid()) { + // still me. was an import. + imports.erase(bd); + bd->set_dir_auth( CDIR_AUTH_PARENT ); + bd->state_clear(CDIR_STATE_IMPORT); + bd->put(CDir::PIN_IMPORT); + // move nested exports. + for (set::iterator q = nested_exports[bd].begin(); + q != nested_exports[bd].end(); + ++q) + nested_exports[im].insert(*q); + nested_exports.erase(bd); + + } else { + // not me anymore. now an export. 
+ exports.insert(bd); + nested_exports[im].insert(bd); + assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); + bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); + bd->state_set(CDIR_STATE_EXPORT); + bd->get(CDir::PIN_EXPORT); + } + + dout(10) << " bound " << *bd << endl; + } +} + +void MDCache::finish_ambiguous_export(inodeno_t dirino, set& bounds) +{ + CInode *diri = get_inode(dirino); + assert(diri); + CDir *dir = diri->dir; + assert(dir); + + dout(10) << "finish_ambiguous_export " << dirino + << " bounds " << bounds + << endl; + + // adjust dir_auth + CDir *im = get_auth_container(dir); + if (dir->get_inode()->authority() == CDIR_AUTH_UNKNOWN) { + // was an import, hose it + assert(im == dir); + assert(imports.count(dir)); + imports.erase(dir); + dir->set_dir_auth( CDIR_AUTH_PARENT ); + dir->state_clear(CDIR_STATE_IMPORT); + dir->put(CDir::PIN_IMPORT); + } else { + // i'm now an export + exports.insert(dir); + nested_exports[im].insert(dir); + dir->set_dir_auth( CDIR_AUTH_UNKNOWN ); // not me + dir->state_set(CDIR_STATE_EXPORT); + dir->get(CDir::PIN_EXPORT); + } + dout(10) << " base " << *dir << endl; + if (dir != im) + dout(10) << " under " << *im << endl; + + // bounds (there were exports, before) + for (set::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CInode *bi = get_inode(*p); + assert(bi); + CDir *bd = bi->dir; + assert(bd); + + // hose export + assert(exports.count(bd)); + exports.erase(bd); + nested_exports[im].erase(bd); + + // fix dir_auth + assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); + bd->set_dir_auth( CDIR_AUTH_PARENT ); // not me + + bd->state_clear(CDIR_STATE_EXPORT); + bd->put(CDir::PIN_EXPORT); + + dout(10) << " bound " << *bd << endl; + } + + show_imports(); +} + + + + +/* + * rejoin phase! + * we start out by sending rejoins to everyone in the recovery set. + * + * if _were_ are rejoining, send for all regions in our cache. + * if we are active|stopping, send only to nodes that are are rejoining. 
+ */
+void MDCache::send_cache_rejoins()
+{
+  dout(10) << "send_cache_rejoins " << endl;
+
+  // one message per target mds; filled in by cache_rejoin_walk below
+  map<int, MMDSCacheRejoin*> rejoins;
+
+  // if i am rejoining, send a rejoin to everyone.
+  // otherwise, just send to others who are rejoining.
+  for (set<int>::iterator p = recovery_set.begin();
+       p != recovery_set.end();
+       ++p) {
+    if (*p == mds->get_nodeid())  continue;  // nothing to myself!
+    if (mds->is_rejoin() ||
+        mds->mdsmap->is_rejoin(*p))
+      rejoins[*p] = new MMDSCacheRejoin;
+  }
+
+  // build list of dir_auth regions
+  // (collected first, walked afterwards)
+  list<CDir*> dir_auth_regions;
+  for (hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
+       p != inode_map.end();
+       ++p) {
+    if (!p->second->is_dir()) continue;
+    if (!p->second->dir) continue;
+    if (p->second->dir->get_dir_auth() == CDIR_AUTH_PARENT) continue;
+
+    int auth = p->second->dir->get_dir_auth();
+    assert(auth >= 0);
+
+    if (auth == mds->get_nodeid()) continue; // skip my own regions!
+
+    if (rejoins.count(auth) == 0)
+      continue;   // don't care about this node's regions
+
+    // add to list
+    dout(10) << " on mds" << auth << " region " << *p->second << endl;
+    dir_auth_regions.push_back(p->second->dir);
+  }
+
+  // walk the regions
+  for (list<CDir*>::iterator p = dir_auth_regions.begin();
+       p != dir_auth_regions.end();
+       ++p) {
+    CDir *dir = *p;
+    int to = dir->authority();
+    // look the message up instead of using rejoins[to]: operator[] would
+    // quietly add a null entry, which is then dereferenced and sent.
+    map<int,MMDSCacheRejoin*>::iterator r = rejoins.find(to);
+    assert(r != rejoins.end());
+    cache_rejoin_walk(dir, r->second);
+  }
+
+  // send the messages
+  assert(rejoin_ack_gather.empty());
+  for (map<int,MMDSCacheRejoin*>::iterator p = rejoins.begin();
+       p != rejoins.end();
+       ++p) {
+    mds->send_message_mds(p->second, p->first, MDS_PORT_CACHE);
+    rejoin_ack_gather.insert(p->first);   // expect an ack from each target
+  }
+
+  // nothing?
+  if (rejoins.empty()) {
+    dout(10) << "nothing to rejoin, going active" << endl;
+    mds->set_want_state(MDSMap::STATE_ACTIVE);
+  }
+}
+
+
+
+/*
+ * add dir, its dentries and their primary inodes to the rejoin message,
+ * then recurse into each nested dir whose dir_auth is CDIR_AUTH_PARENT,
+ * i.e. one that belongs to the same region.
+ */
+void MDCache::cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin)
+{
+  dout(10) << "cache_rejoin_walk " << *dir << endl;
+  rejoin->add_dir(dir->ino());
+
+  list<CDir*> nested;  // finish this dir, then do nested items
+
+  // walk dentries
+  // NOTE(review): items is taken to be a map<string,CDentry*>; confirm against CDir.h
+  for (map<string,CDentry*>::iterator p = dir->items.begin();
+       p != dir->items.end();
+       ++p) {
+    // dentry
+    rejoin->add_dentry(dir->ino(), p->first);
+
+    // inode?
+    if (p->second->is_primary() && p->second->get_inode()) {
+      CInode *in = p->second->get_inode();
+      rejoin->add_inode(in->ino(),
+                        in->get_caps_wanted());
+
+      // dir?
+      if (in->dir &&
+          in->dir->get_dir_auth() == CDIR_AUTH_PARENT)
+        nested.push_back(in->dir);
+    }
+  }
+
+  // recurse into nested dirs
+  for (list<CDir*>::iterator p = nested.begin();
+       p != nested.end();
+       ++p)
+    cache_rejoin_walk(*p, rejoin);
+}
+
+
+/*
+ * i got a rejoin.
+ *
+ *  - reply with the lockstate
+ *
+ * if i am active|stopping,
+ *  - remove source from replica list for everything not referenced here.
+ */
+void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m)
+{
+  dout(7) << "handle_cache_rejoin from " << m->get_source() << endl;
+  int from = m->get_source().num();
+
+  MMDSCacheRejoinAck *ack = new MMDSCacheRejoinAck;
+
+  if (mds->is_active() || mds->is_stopping()) {
+    dout(10) << "removing stale cache replicas" << endl;
+    // first, scour cache of replica references
+    // (anything the sender did not list is no longer replicated there)
+    for (hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
+         p != inode_map.end();
+         ++p) {
+      // inode
+      CInode *in = p->second;
+      if (in->is_replica(from) && m->inodes.count(p->first) == 0) {
+        inode_remove_replica(in, from);
+        dout(10) << " rem " << *in << endl;
+      }
+
+      // dentry
+      if (in->parent) {
+        CDentry *dn = in->parent;
+        if (dn->is_replica(from) &&
+            (m->dentries.count(dn->get_dir()->ino()) == 0 ||
+             m->dentries[dn->get_dir()->ino()].count(dn->get_name()) == 0)) {
+          dn->remove_replica(from);
+          dout(10) << " rem " << *dn << endl;
+        }
+      }
+
+      // dir
+      if (in->dir) {
+        CDir *dir = in->dir;
+        if (dir->is_replica(from) && m->dirs.count(p->first) == 0) {
+          dir->remove_replica(from);
+          dout(10) << " rem " << *dir << endl;
+        }
+      }
+    }
+  } else {
+    assert(mds->is_rejoin());
+  }
+
+  // dirs
+  for (set<inodeno_t>::iterator p = m->dirs.begin();
+       p != m->dirs.end();
+       ++p) {
+    CInode *diri = get_inode(*p);
+    assert(diri);
+    CDir *dir = diri->dir;
+    assert(dir);
+    int nonce = dir->add_replica(from);
+    dout(10) << " has " << *dir << endl;
+    ack->add_dir(*p, nonce);
+
+    // dentries
+    for (set<string>::iterator q = m->dentries[*p].begin();
+         q != m->dentries[*p].end();
+         ++q) {
+      CDentry *dn = dir->lookup(*q);
+      assert(dn);
+      int dn_nonce = dn->add_replica(from);   // own name; does not shadow the dir's nonce
+      dout(10) << " has " << *dn << endl;
+      ack->add_dentry(*p, *q, dn->get_lockstate(), dn_nonce);
+    }
+  }
+
+  // inodes
+  for (map<inodeno_t,int>::iterator p = m->inodes.begin();
+       p != m->inodes.end();
+       ++p) {
+    CInode *in = get_inode(p->first);
+    assert(in);
+    int nonce = in->add_replica(from);
+    // p->second is the caps wanted value the sender attached to this inode
+    if (p->second)
+      in->mds_caps_wanted[from] = p->second;
+    else
+      in->mds_caps_wanted.erase(from);
+    in->hardlock.gather_set.erase(from);  // just in case
+    in->filelock.gather_set.erase(from);  // just in case
+    dout(10) << " has " << *in << endl;
+    ack->add_inode(p->first,
+                   in->hardlock.get_replica_state(), in->filelock.get_replica_state(),
+                   nonce);
+  }
+
+  // send ack
+  mds->send_message_mds(ack, from, MDS_PORT_CACHE);
+
+  delete m;
+}
+
+
+/*
+ * i got a rejoin ack.
+ *
+ *  - apply the replica nonces and lock states it carries to my dirs,
+ *    dentries and inodes (all of which must be in cache; asserted)
+ *  - once rejoin_ack_gather is empty, go active
+ *
+ * NOTE(review): the dirinfo/dninfo/inodeinfo element type names below are
+ * reconstructed; confirm them against MMDSCacheRejoinAck.h.
+ */
+void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoinAck *m)
+{
+  dout(7) << "handle_cache_rejoin_ack from " << m->get_source() << endl;
+  int from = m->get_source().num();
+
+  // dirs
+  for (list<MMDSCacheRejoinAck::dirinfo>::iterator p = m->dirs.begin();
+       p != m->dirs.end();
+       ++p) {
+    CInode *diri = get_inode(p->dirino);
+    assert(diri);
+    CDir *dir = diri->dir;
+    assert(dir);
+
+    dir->set_replica_nonce(p->nonce);
+    dout(10) << " got " << *dir << endl;
+
+    // dentries
+    for (map<string,MMDSCacheRejoinAck::dninfo>::iterator q = m->dentries[p->dirino].begin();
+         q != m->dentries[p->dirino].end();
+         ++q) {
+      CDentry *dn = dir->lookup(q->first);
+      assert(dn);
+      dn->set_replica_nonce(q->second.nonce);
+      dn->set_lockstate(q->second.lock);
+      dout(10) << " got " << *dn << endl;
+    }
+  }
+
+  // inodes
+  for (list<MMDSCacheRejoinAck::inodeinfo>::iterator p = m->inodes.begin();
+       p != m->inodes.end();
+       ++p) {
+    CInode *in = get_inode(p->ino);
+    assert(in);
+    in->set_replica_nonce(p->nonce);
+    in->hardlock.set_state(p->hardlock);
+    in->filelock.set_state(p->filelock);
+    dout(10) << " got " << *in << endl;
+  }
+
+  delete m;
+
+  // done?
+  // (from was read before the delete above)
+  rejoin_ack_gather.erase(from);
+  if (rejoin_ack_gather.empty()) {
+    dout(7) << "all done, going active!" << endl;
+    show_imports();
+    show_cache();
+    mds->set_want_state(MDSMap::STATE_ACTIVE);
+  } else {
+    dout(7) << "still need rejoin_ack from " << rejoin_ack_gather << endl;
+  }
+
+}
+
+
+
+
+
+// ===============================================================================
 
 void MDCache::rename_file(CDentry *srcdn,
                           CDentry *destdn)
@@ -205,14 +886,55 @@ void MDCache::set_root(CInode *in)
 {
   assert(root == 0);
   root = in;
-  root->state_set(CINODE_STATE_ROOT);
+  root->state_set(CInode::STATE_ROOT);
 }
 
 void MDCache::add_import(CDir *dir)
 {
   imports.insert(dir);
   dir->state_set(CDIR_STATE_IMPORT);
-  dir->get(CDIR_PIN_IMPORT);
+  dir->get(CDir::PIN_IMPORT);
+}
+
+
+void MDCache::recalc_auth_bits()
+{
+  dout(7) << "recalc_auth_bits" << endl;
+
+  for (hash_map<inodeno_t,CInode*>::iterator p = inode_map.begin();
+       p != inode_map.end();
+       ++p) {
+    CInode *in = p->second;
+    if (in->authority() == mds->get_nodeid())
+      in->state_set(CInode::STATE_AUTH);
+    else {
+      in->state_clear(CInode::STATE_AUTH);
+      if (in->is_dirty())
+        in->mark_clean();
+    }
+
+    if (in->parent) {
+      if (in->parent->authority() == mds->get_nodeid())
+        in->parent->state_set(CDentry::STATE_AUTH);
+      else {
+        in->parent->state_clear(CDentry::STATE_AUTH);
+        if (in->parent->is_dirty())
+          in->parent->mark_clean();
+      }
+    }
+
+    if (in->dir) {
+      if (in->dir->authority() == mds->get_nodeid())
+        in->dir->state_set(CDIR_STATE_AUTH);
+      else {
+        in->dir->state_clear(CDIR_STATE_AUTH);
+        if (in->dir->is_dirty())
+          in->dir->mark_clean();
+      }
+    }
+  }
+  show_imports();
+  show_cache();
 }
 
 
@@ -297,109 +1019,95 @@ void MDCache::start_recovered_purges()
 
 
 
-
 bool MDCache::trim(int max)
 {
-  // empty? short cut.
- if (lru.lru_get_size() == 0) return true; - + // trim LRU if (max < 0) { max = lru.lru_get_max(); if (!max) return false; } + dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl; map expiremap; - dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl; - assert(expiremap.empty()); - while (lru.lru_get_size() > (unsigned)max) { - CInode *in = (CInode*)lru.lru_expire(); - if (!in) break; //return false; + CDentry *dn = (CDentry*)lru.lru_expire(); + if (!dn) break; + + CDir *dir = dn->get_dir(); + assert(dir); + + // notify dentry authority? + if (!dn->is_auth()) { + int auth = dn->authority(); + dout(17) << "sending expire to mds" << auth << " on " << *dn << endl; + if (expiremap.count(auth) == 0) + expiremap[auth] = new MCacheExpire(mds->get_nodeid()); + expiremap[auth]->add_dentry(dir->ino(), dn->get_name(), dn->get_replica_nonce()); + } + + // unlink the dentry + dout(15) << "trim removing " << *dn << endl; + if (!dn->is_null()) + dir->unlink_inode(dn); + dir->remove_dentry(dn); + + // adjust the dir state + CInode *diri = dir->get_inode(); + diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete! + + // reexport? + if (diri->dir->is_import() && // import + diri->dir->get_size() == 0 && // no children + !diri->is_root()) // not root + migrator->export_empty_import(diri->dir); + + if (mds->logger) mds->logger->inc("cex"); + } + + // inode expire_queue + while (!inode_expire_queue.empty()) { + CInode *in = inode_expire_queue.front(); + inode_expire_queue.pop_front(); + assert(in->get_num_ref() == 0); + + int dirauth = -2; if (in->dir) { // notify dir authority? 
- int auth = in->dir->authority(); - if (auth != mds->get_nodeid()) { - dout(17) << "sending expire to mds" << auth << " on " << *in->dir << endl; - if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid()); - expiremap[auth]->add_dir(in->ino(), in->dir->replica_nonce); + dirauth = in->dir->authority(); + if (dirauth != mds->get_nodeid()) { + dout(17) << "sending expire to mds" << dirauth << " on " << *in->dir << endl; + if (expiremap.count(dirauth) == 0) + expiremap[dirauth] = new MCacheExpire(mds->get_nodeid()); + expiremap[dirauth]->add_dir(in->ino(), in->dir->replica_nonce); } - } - // notify inode authority? - { - int auth = in->authority(); - if (auth != mds->get_nodeid()) { - assert(!in->is_auth()); - dout(17) << "sending expire to mds" << auth << " on " << *in << endl; - if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid()); - expiremap[auth]->add_inode(in->ino(), in->replica_nonce); - } else { - assert(in->is_auth()); - } + in->close_dir(); } - CInode *diri = NULL; - if (in->parent) - diri = in->parent->dir->inode; - - if (in->is_root()) { - dout(7) << "just trimmed root, cache now empty." << endl; - root = NULL; + + // notify inode authority + int auth = in->authority(); + if (auth == CDIR_AUTH_UNKNOWN) { + assert(in->ino() == 1); + assert(dirauth >= 0); + auth = dirauth; } - - - // last link? - if (in->inode.nlink == 0) { - dout(17) << "last link, removing file content " << *in << endl; // FIXME THIS IS WRONG PLACE FOR THIS! 
- mds->filer->zero(in->inode, - 0, in->inode.size, - NULL, NULL); // FIXME + if (auth != mds->get_nodeid()) { + assert(!in->is_auth()); + dout(17) << "sending expire to mds" << auth << " on " << *in << endl; + if (expiremap.count(auth) == 0) + expiremap[auth] = new MCacheExpire(mds->get_nodeid()); + expiremap[auth]->add_inode(in->ino(), in->get_replica_nonce()); + } else { + assert(in->is_auth()); } - // remove it - dout(15) << "trim removing " << *in << " " << in << endl; + dout(15) << "trim removing " << *in << endl; + if (in == root) root = 0; remove_inode(in); - delete in; - - if (diri) { - // dir incomplete! - diri->dir->state_clear(CDIR_STATE_COMPLETE); - - // reexport? - if (diri->dir->is_import() && // import - diri->dir->get_size() == 0 && // no children - !diri->is_root()) // not root - migrator->export_empty_import(diri->dir); - - } - - if (mds->logger) mds->logger->inc("cex"); } - - /* hack - if (lru.lru_get_size() == max) { - int i; - dout(1) << "lru_top " << lru.lru_ntop << "/" << lru.lru_num << endl; - CInode *cur = (CInode*)lru.lru_tophead; - i = 1; - while (cur) { - dout(1) << " top " << i++ << "/" << lru.lru_ntop << " " << cur->lru_is_expireable() << " " << *cur << endl; - cur = (CInode*)cur->lru_next; - } - - dout(1) << "lru_bot " << lru.lru_nbot << "/" << lru.lru_num << endl; - cur = (CInode*)lru.lru_bothead; - i = 1; - while (cur) { - dout(1) << " bot " << i++ << "/" << lru.lru_nbot << " " << cur->lru_is_expireable() << " " << *cur << endl; - cur = (CInode*)cur->lru_next; - } - - } - */ - // send expires for (map::iterator it = expiremap.begin(); it != expiremap.end(); @@ -412,6 +1120,57 @@ bool MDCache::trim(int max) return true; } + +void MDCache::trim_non_auth() +{ + dout(7) << "trim_non_auth" << endl; + + CDentry *first_auth = 0; + + // trim non-auth items from the lru + while (lru.lru_get_size() > 0) { + CDentry *dn = (CDentry*)lru.lru_expire(); + if (!dn) break; + + if (dn->is_auth()) { + // add back into lru (at the top) + 
lru.lru_insert_top(dn); + + if (!first_auth) { + first_auth = dn; + } else { + if (first_auth == dn) + break; + } + } else { + // non-auth. expire. + CDir *dir = dn->get_dir(); + assert(dir); + + // unlink the dentry + dout(15) << "trim_non_auth removing " << *dn << endl; + if (!dn->is_null()) + dir->unlink_inode(dn); + dir->remove_dentry(dn); + + // adjust the dir state + CInode *diri = dir->get_inode(); + diri->dir->state_clear(CDIR_STATE_COMPLETE); // dir incomplete! + } + } + + // inode expire queue + while (!inode_expire_queue.empty()) { + CInode *in = inode_expire_queue.front(); + inode_expire_queue.pop_front(); + dout(15) << "trim_non_auth removing " << *in << endl; + if (in == root) root = 0; + remove_inode(in); + } +} + + + class C_MDC_ShutdownCommit : public Context { MDCache *mdc; public: @@ -425,13 +1184,10 @@ public: class C_MDC_ShutdownCheck : public Context { MDCache *mdc; - Mutex *lock; public: - C_MDC_ShutdownCheck(MDCache *m, Mutex *l) : mdc(m), lock(l) {} + C_MDC_ShutdownCheck(MDCache *m) : mdc(m) {} void finish(int) { - lock->Lock(); mdc->shutdown_check(); - lock->Unlock(); } }; @@ -444,7 +1200,7 @@ void MDCache::shutdown_check() g_conf.debug_mds = 10; show_cache(); g_conf.debug_mds = o; - g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this, &mds->mds_lock)); + mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); // this dout(0) << "lru size now " << lru.lru_get_size() << endl; @@ -463,7 +1219,7 @@ void MDCache::shutdown_start() dout(1) << "shutdown_start" << endl; if (g_conf.mds_shutdown_check) - g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this, &mds->mds_lock)); + mds->timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this)); } @@ -472,7 +1228,7 @@ bool MDCache::shutdown_pass() { dout(7) << "shutdown_pass" << endl; //assert(mds->is_shutting_down()); - if (mds->is_stopped()) { + if (mds->is_out()) { dout(7) << " already shut down" 
<< endl; show_cache(); show_imports(); @@ -524,16 +1280,20 @@ bool MDCache::shutdown_pass() // flush anything we can from the cache trim(0); - dout(5) << "cache size now " << lru.lru_get_size() << endl; + dout(5) << "lru size now " << lru.lru_get_size() << endl; + mds->mdlog->trim(0); // (wait for) flush log? - if (g_conf.mds_log_flush_on_shutdown && - mds->mdlog->get_num_events()) { - dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() << endl; - return false; - } - + if (g_conf.mds_log_flush_on_shutdown) { + if (mds->mdlog->get_non_importmap_events()) { + dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() + << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl; + return false; + } + } + + // send all imports back to 0. if (mds->get_nodeid() != 0 && !did_shutdown_exports) { // flush what i can from the cache first.. @@ -562,41 +1322,71 @@ bool MDCache::shutdown_pass() return false; } - // filer active? - if (mds->filer->is_active()) { - dout(7) << "filer still active" << endl; - return false; - } // close root? if (mds->get_nodeid() == 0 && - lru.lru_get_size() == 1 && + lru.lru_get_size() == 0 && root && root->dir && root->dir->is_import() && - root->dir->get_ref() == 1) { // 1 is the import! + root->dir->get_num_ref() == 1) { // 1 is the import! // un-import dout(7) << "removing root import" << endl; imports.erase(root->dir); root->dir->state_clear(CDIR_STATE_IMPORT); - root->dir->put(CDIR_PIN_IMPORT); + root->dir->put(CDir::PIN_IMPORT); - if (root->is_pinned_by(CINODE_PIN_DIRTY)) { - dout(7) << "clearing root dirty flag" << endl; - root->put(CINODE_PIN_DIRTY); + if (root->is_pinned_by(CInode::PIN_DIRTY)) { + dout(7) << "clearing root inode dirty flag" << endl; + root->put(CInode::PIN_DIRTY); } trim(0); - assert(inode_map.size() == lru.lru_get_size()); } // imports? 
- if (!imports.empty()) { - dout(7) << "still have " << imports.size() << " imports" << endl; + if (!imports.empty() || migrator->is_exporting()) { + dout(7) << "still have " << imports.size() << " imports, or still exporting" << endl; show_cache(); return false; } + // cap log? + if (g_conf.mds_log_flush_on_shutdown) { + + if (imports.empty() && exports.empty()) { + // (only do this once!) + if (!mds->mdlog->is_capped()) { + dout(7) << "capping the log" << endl; + mds->mdlog->cap(); + // note that this won't flush right away, so we'll make at least one more pass + } + } + + if (mds->mdlog->get_num_events()) { + dout(7) << "waiting for log to flush (including import_map, now) .. " << mds->mdlog->get_num_events() + << " (" << mds->mdlog->get_non_importmap_events() << ")" << endl; + return false; + } + + if (!did_shutdown_log_cap) { + // flush journal header + dout(7) << "writing header for (now-empty) journal" << endl; + assert(mds->mdlog->empty()); + mds->mdlog->write_head(0); + // NOTE: filer active checker below will block us until this completes. + did_shutdown_log_cap = true; + return false; + } + } + + // filer active? + if (mds->filer->is_active()) { + dout(7) << "filer still active" << endl; + return false; + } + + // done? if (lru.lru_get_size() > 0) { dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl; @@ -614,6 +1404,27 @@ bool MDCache::shutdown_pass() +CInode *MDCache::create_root_inode() +{ + CInode *root = new CInode(this); + memset(&root->inode, 0, sizeof(inode_t)); + root->inode.ino = 1; + root->inode.hash_seed = 0; // not hashed! 
+ + // make it up (FIXME) + root->inode.mode = 0755 | INODE_MODE_DIR; + root->inode.size = 0; + root->inode.ctime = 0; + root->inode.mtime = g_clock.gettime(); + + root->inode.nlink = 1; + root->inode.layout = g_OSD_MDDirLayout; + + set_root( root ); + add_inode( root ); + + return root; +} int MDCache::open_root(Context *c) @@ -623,33 +1434,18 @@ int MDCache::open_root(Context *c) // open root inode if (whoami == 0) { // i am root inode - CInode *root = new CInode(this); - memset(&root->inode, 0, sizeof(inode_t)); - root->inode.ino = 1; - root->inode.hash_seed = 0; // not hashed! - - // make it up (FIXME) - root->inode.mode = 0755 | INODE_MODE_DIR; - root->inode.size = 0; - root->inode.ctime = 0; - root->inode.mtime = g_clock.gettime(); - - root->inode.nlink = 1; - root->inode.layout = g_OSD_MDDirLayout; - - set_root( root ); - add_inode( root ); + CInode *root = create_root_inode(); // root directory too assert(root->dir == NULL); - root->set_dir( new CDir(root, mds, true) ); + root->set_dir( new CDir(root, this, true) ); root->dir->set_dir_auth( 0 ); // me! 
root->dir->dir_rep = CDIR_REP_ALL; //NONE; // root is sort of technically an import (from a vacuum) imports.insert( root->dir ); root->dir->state_set(CDIR_STATE_IMPORT); - root->dir->get(CDIR_PIN_IMPORT); + root->dir->get(CDir::PIN_IMPORT); if (c) { c->finish(0); @@ -691,6 +1487,19 @@ int MDCache::open_root(Context *c) void MDCache::dispatch(Message *m) { switch (m->get_type()) { + + case MSG_MDS_IMPORTMAP: + handle_import_map((MMDSImportMap*)m); + break; + + case MSG_MDS_CACHEREJOIN: + handle_cache_rejoin((MMDSCacheRejoin*)m); + break; + case MSG_MDS_CACHEREJOINACK: + handle_cache_rejoin_ack((MMDSCacheRejoinAck*)m); + break; + + case MSG_MDS_DISCOVER: handle_discover((MDiscover*)m); break; @@ -826,7 +1635,7 @@ int MDCache::path_traverse(filepath& origpath, return 1; } - cur->get_or_open_dir(mds); + cur->get_or_open_dir(this); assert(cur->dir); } else { // discover dir from/via inode auth @@ -963,19 +1772,19 @@ int MDCache::path_traverse(filepath& origpath, if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == cur->dir->ino() && cur->dir->is_auth() && cur->dir->is_rep() && - cur->dir->is_open_by(req->get_source().num()) && + cur->dir->is_replica(req->get_source().num()) && dn->get_inode()->is_auth() ) { assert(req->get_source().is_mds()); int from = req->get_source().num(); - if (dn->get_inode()->is_cached_by(from)) { + if (dn->get_inode()->is_replica(from)) { dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " << req->get_source() << " dn " << *dn << endl; } else { dout(10) << "traverse: REP replicating to " << req->get_source() << " dn " << *dn << endl; MDiscoverReply *reply = new MDiscoverReply(cur->dir->ino()); - reply->add_dentry( dn->get_name(), !dn->can_read()); + reply->add_dentry( dn->replicate_to( from ) ); reply->add_inode( dn->inode->replicate_to( from ) ); mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE); } @@ -1239,7 +2048,7 @@ void MDCache::path_unpin(vector& trace, dout(11) << 
"path_unpinned " << *dn << endl; // did we completely unpin a waiter? - if (dn->lockstate == DN_LOCK_UNPINNING && !dn->is_pinned()) { + if (dn->lockstate == DN_LOCK_UNPINNING && !dn->get_num_ref()) { // return state to sync, in case the unpinner flails dn->lockstate = DN_LOCK_SYNC; @@ -1389,10 +2198,12 @@ void MDCache::request_cleanup(Message *req) if (g_conf.log_pins) { // pin - for (int i=0; ilogger2) mds->logger2->set(cinode_pin_names[i], cinode_pins[i]); } + */ /* for (map::iterator it = cdir_pins.begin(); it != cdir_pins.end(); @@ -1448,10 +2259,10 @@ public: assert(in->inode.anchored == false); in->inode.anchored = true; - in->state_clear(CINODE_STATE_ANCHORING); - in->put(CINODE_PIN_ANCHORING); + in->state_clear(CInode::STATE_ANCHORING); + in->put(CInode::PIN_ANCHORING); - in->mark_dirty(); + in->_mark_dirty(); // fixme } // trigger @@ -1464,7 +2275,7 @@ void MDCache::anchor_inode(CInode *in, Context *onfinish) assert(in->is_auth()); // already anchoring? - if (in->state_test(CINODE_STATE_ANCHORING)) { + if (in->state_test(CInode::STATE_ANCHORING)) { dout(7) << "anchor_inode already anchoring " << *in << endl; // wait @@ -1475,8 +2286,8 @@ void MDCache::anchor_inode(CInode *in, Context *onfinish) dout(7) << "anchor_inode anchoring " << *in << endl; // auth: do it - in->state_set(CINODE_STATE_ANCHORING); - in->get(CINODE_PIN_ANCHORING); + in->state_set(CInode::STATE_ANCHORING); + in->get(CInode::PIN_ANCHORING); // wait in->add_waiter(CINODE_WAIT_ANCHORED, @@ -1517,7 +2328,7 @@ void MDCache::handle_inode_link(MInodeLink *m) } in->inode.nlink++; - in->mark_dirty(); + in->_mark_dirty(); // fixme // reply dout(7) << " nlink++, now " << in->inode.nlink++ << endl; @@ -1605,7 +2416,7 @@ void MDCache::handle_discover(MDiscover *dis) } if (!cur->dir) - cur->get_or_open_dir(mds); + cur->get_or_open_dir(this); assert(cur->dir); dout(10) << "dir is " << *cur->dir << endl; @@ -1655,10 +2466,10 @@ void MDCache::handle_discover(MDiscover *dis) break; } - if (!cur->dir) 
cur->get_or_open_dir(mds); + if (!cur->dir) cur->get_or_open_dir(this); reply->add_dir( new CDirDiscover( cur->dir, - cur->dir->open_by_add( dis->get_asker() ) ) ); + cur->dir->add_replica( dis->get_asker() ) ) ); dout(7) << "added dir " << *cur->dir << endl; } if (dis->get_want().depth() == 0) break; @@ -1690,7 +2501,7 @@ void MDCache::handle_discover(MDiscover *dis) break; // don't replicate null but non-locked dentries. } - reply->add_dentry( dis->get_dentry(i), !dn->can_read() ); + reply->add_dentry( dn->replicate_to( dis->get_asker() ) ); dout(7) << "added dentry " << *dn << endl; if (!dn->inode) break; // we're done. @@ -1828,7 +2639,7 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) dout2(7) << ", now " << *cur->dir << endl; } else { // add it (_replica_) - cur->set_dir( new CDir(cur, mds, false) ); + cur->set_dir( new CDir(cur, this, false) ); m->get_dir(i).update_dir(cur->dir); dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl; @@ -1857,27 +2668,24 @@ void MDCache::handle_discover_reply(MDiscoverReply *m) if (i >= m->get_num_dentries()) break; // dentry - dout(7) << "i = " << i << " dentry is " << m->get_dentry(i) << endl; + dout(7) << "i = " << i << " dentry is " << m->get_dentry(i).get_dname() << endl; CDentry *dn = 0; if (i > 0 || m->has_base_dentry()) { - dn = cur->dir->lookup( m->get_dentry(i) ); + dn = cur->dir->lookup( m->get_dentry(i).get_dname() ); if (dn) { dout(7) << "had " << *dn << endl; + dn->replica_nonce = m->get_dentry(i).get_nonce(); // fix nonce. 
} else { - dn = cur->dir->add_dentry( m->get_dentry(i) ); - if (m->get_dentry_xlock(i)) { - dout(7) << " new dentry is xlock " << *dn << endl; - dn->lockstate = DN_LOCK_XLOCK; - dn->xlockedby = 0; - } + dn = cur->dir->add_dentry( m->get_dentry(i).get_dname(), 0, false ); + m->get_dentry(i).update_dentry(dn); dout(7) << "added " << *dn << endl; } cur->dir->take_waiting(CDIR_WAIT_DENTRY, - m->get_dentry(i), + m->get_dentry(i).get_dname(), finished); } @@ -2001,9 +2809,9 @@ void MDCache::handle_cache_expire(MCacheExpire *m) map proxymap; if (m->get_from() == source) { - dout(7) << "cache_expire from " << from << endl; + dout(7) << "cache_expire from mds" << from << endl; } else { - dout(7) << "cache_expire from " << from << " via " << source << endl; + dout(7) << "cache_expire from mds" << from << " via " << source << endl; } // inodes @@ -2014,15 +2822,15 @@ void MDCache::handle_cache_expire(MCacheExpire *m) int nonce = it->second; if (!in) { - dout(0) << "inode_expire on " << it->first << " from " << from << ", don't have it" << endl; + dout(0) << "inode expire on " << it->first << " from " << from << ", don't have it" << endl; assert(in); // i should be authority, or proxy .. and pinned } if (!in->is_auth()) { int newauth = in->authority(); dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl; assert(newauth >= 0); - if (!in->state_test(CINODE_STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl; - assert(in->state_test(CINODE_STATE_PROXY)); + if (!in->state_test(CInode::STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl; + assert(in->state_test(CInode::STATE_PROXY)); if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); proxymap[newauth]->add_inode(it->first, it->second); continue; @@ -2032,38 +2840,20 @@ void MDCache::handle_cache_expire(MCacheExpire *m) if (from == mds->get_nodeid()) { // my cache_expire, and the export_dir giving auth back to me crossed paths! // we can ignore this. 
no danger of confusion since the two parties are both me. - dout(7) << "inode_expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl; + dout(7) << "inode expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl; } - else if (nonce == in->get_cached_by_nonce(from)) { + else if (nonce == in->get_replica_nonce(from)) { // remove from our cached_by - dout(7) << "inode_expire on " << *in << " from mds" << from << " cached_by was " << in->cached_by << endl; - in->cached_by_remove(from); - in->mds_caps_wanted.erase(from); - - // note: this code calls _eval more often than it needs to! - // fix lock - if (in->hardlock.is_gathering(from)) { - in->hardlock.gather_set.erase(from); - if (in->hardlock.gather_set.size() == 0) - mds->locker->inode_hard_eval(in); - } - if (in->filelock.is_gathering(from)) { - in->filelock.gather_set.erase(from); - if (in->filelock.gather_set.size() == 0) - mds->locker->inode_file_eval(in); - } - - // alone now? - if (!in->is_cached_by_anyone()) { - mds->locker->inode_hard_eval(in); - mds->locker->inode_file_eval(in); - } + dout(7) << "inode expire on " << *in << " from mds" << from << " cached_by was " << in->get_replicas() << endl; + inode_remove_replica(in, from); } else { // this is an old nonce, ignore expire. 
- dout(7) << "inode_expire on " << *in << " from mds" << from << " with old nonce " << nonce << " (current " << in->get_cached_by_nonce(from) << "), dropping" << endl; - assert(in->get_cached_by_nonce(from) > nonce); + dout(7) << "inode expire on " << *in << " from mds" << from + << " with old nonce " << nonce << " (current " << in->get_replica_nonce(from) << "), dropping" + << endl; + assert(in->get_replica_nonce(from) > nonce); } } @@ -2072,11 +2862,12 @@ void MDCache::handle_cache_expire(MCacheExpire *m) it != m->get_dirs().end(); it++) { CInode *diri = get_inode(it->first); + assert(diri); CDir *dir = diri->dir; int nonce = it->second; if (!dir) { - dout(0) << "dir_expire on " << it->first << " from " << from << ", don't have it" << endl; + dout(0) << "dir expire on " << it->first << " from " << from << ", don't have it" << endl; assert(dir); // i should be authority, or proxy ... and pinned } if (!dir->is_auth()) { @@ -2093,17 +2884,71 @@ void MDCache::handle_cache_expire(MCacheExpire *m) // check nonce if (from == mds->get_nodeid()) { - dout(7) << "dir_expire on " << *dir << " from mds" << from << " .. ME! ignoring" << endl; + dout(7) << "dir expire on " << *dir << " from mds" << from + << " .. ME! ignoring" << endl; } - else if (nonce == dir->get_open_by_nonce(from)) { + else if (nonce == dir->get_replica_nonce(from)) { // remove from our cached_by - dout(7) << "dir_expire on " << *dir << " from mds" << from << " open_by was " << dir->open_by << endl; - dir->open_by_remove(from); + dout(7) << "dir expire on " << *dir << " from mds" << from + << " replicas was " << dir->replicas << endl; + dir->remove_replica(from); } else { // this is an old nonce, ignore expire. 
- dout(7) << "dir_expire on " << *dir << " from mds" << from << " with old nonce " << nonce << " (current " << dir->get_open_by_nonce(from) << "), dropping" << endl; - assert(dir->get_open_by_nonce(from) > nonce); + dout(7) << "dir expire on " << *dir << " from mds" << from + << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) + << "), dropping" << endl; + assert(dir->get_replica_nonce(from) > nonce); + } + } + + // dentries + for (map >::iterator pd = m->get_dentries().begin(); + pd != m->get_dentries().end(); + ++pd) { + dout(0) << "dn expires in dir " << pd->first << endl; + CInode *diri = get_inode(pd->first); + CDir *dir = diri->dir; + assert(dir); + + if (!dir->is_auth()) { + int newauth = dir->authority(); + dout(7) << "proxy dentry expires on " << *dir << " to " << newauth << endl; + if (!dir->is_proxy()) + dout(0) << "nonproxy dentry expires? " << *dir << " .. auth is " << newauth + << " .. expire is from " << from << endl; + assert(dir->is_proxy()); + assert(newauth >= 0); + assert(dir->state_test(CDIR_STATE_PROXY)); + if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); + proxymap[newauth]->add_dentries(pd->first, pd->second); + continue; + } + + for (map::iterator p = pd->second.begin(); + p != pd->second.end(); + ++p) { + int nonce = p->second; + + CDentry *dn = dir->lookup(p->first); + if (!dn) + dout(0) << "missing dentry for " << p->first << " in " << *dir << endl; + assert(dn); + + if (from == mds->get_nodeid()) { + dout(7) << "dentry_expire on " << *dn << " from mds" << from + << " .. ME! 
ignoring" << endl; + } + else if (nonce == dn->get_replica_nonce(from)) { + dout(7) << "dentry_expire on " << *dn << " from mds" << from << endl; + dn->remove_replica(from); + } + else { + dout(7) << "dentry_expire on " << *dn << " from mds" << from + << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) + << "), dropping" << endl; + assert(dn->get_replica_nonce(from) > nonce); + } } } @@ -2119,15 +2964,45 @@ void MDCache::handle_cache_expire(MCacheExpire *m) delete m; } +void MDCache::inode_remove_replica(CInode *in, int from) +{ + in->remove_replica(from); + in->mds_caps_wanted.erase(from); + + // note: this code calls _eval more often than it needs to! + // fix lock + if (in->hardlock.is_gathering(from)) { + in->hardlock.gather_set.erase(from); + if (in->hardlock.gather_set.size() == 0) + mds->locker->inode_hard_eval(in); + } + if (in->filelock.is_gathering(from)) { + in->filelock.gather_set.erase(from); + if (in->filelock.gather_set.size() == 0) + mds->locker->inode_file_eval(in); + } + + // alone now? + if (!in->is_replicated()) { + mds->locker->inode_hard_eval(in); + mds->locker->inode_file_eval(in); + } +} int MDCache::send_dir_updates(CDir *dir, bool bcast) { // this is an FYI, re: replication - set who = dir->open_by; - if (bcast) - who = mds->get_mds_map()->get_mds(); + set who; + if (bcast) { + mds->get_mds_map()->get_active_mds_set(who); + } else { + for (map::iterator p = dir->replicas_begin(); + p != dir->replicas_end(); + ++p) + who.insert(p->first); + } dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << endl; @@ -2231,17 +3106,17 @@ void MDCache::dentry_unlink(CDentry *dn, Context *c) // log it if (dn->inode) dn->inode->mark_unsafe(); // XXX ??? 
FIXME - mds->mdlog->submit_entry(new EUnlink(dir, dn, dn->inode), + mds->mdlog->submit_entry(new EString("unlink fixme fixme"),//EUnlink(dir, dn, dn->inode), NULL); // FIXME FIXME FIXME // tell replicas - if (dir->is_open_by_anyone()) { - for (set::iterator it = dir->open_by_begin(); - it != dir->open_by_end(); + if (dir->is_replicated()) { + for (map::iterator it = dir->replicas_begin(); + it != dir->replicas_end(); it++) { - dout(7) << "inode_unlink sending DentryUnlink to " << *it << endl; + dout(7) << "inode_unlink sending DentryUnlink to mds" << it->first << endl; - mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), *it, MDS_PORT_CACHE); + mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), it->first, MDS_PORT_CACHE); } // don't need ack. @@ -2280,10 +3155,10 @@ void MDCache::dentry_unlink(CDentry *dn, Context *c) // unlink locally CInode *in = dn->inode; dn->dir->unlink_inode( dn ); - dn->mark_dirty(); + dn->_mark_dirty(); // fixme // mark it dirty! - in->mark_dirty(); + in->_mark_dirty(); // fixme // update anchor to point to inode file+mds vector atrace; @@ -2300,10 +3175,10 @@ void MDCache::dentry_unlink(CDentry *dn, Context *c) // awesome, i can do it dout(7) << "remote target is local, nlink--" << endl; dn->inode->inode.nlink--; - dn->inode->mark_dirty(); + dn->inode->_mark_dirty(); // fixme - if (( dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 0) || - (!dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 1)) { + if (( dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 0) || + (!dn->inode->state_test(CInode::STATE_DANGLING) && dn->inode->inode.nlink == 1)) { dout(7) << "nlink=1+primary or 0+dangling, removing anchor" << endl; // remove anchor (async) @@ -2319,7 +3194,7 @@ void MDCache::dentry_unlink(CDentry *dn, Context *c) // unlink locally CInode *in = dn->inode; dn->dir->unlink_inode( dn ); - dn->mark_dirty(); + dn->_mark_dirty(); // fixme // add waiter 
in->add_waiter(CINODE_WAIT_UNLINK, c); @@ -2331,7 +3206,7 @@ void MDCache::dentry_unlink(CDentry *dn, Context *c) // unlink locally dn->dir->unlink_inode( dn ); - dn->mark_dirty(); + dn->_mark_dirty(); // fixme // finish! dentry_unlink_finish(dn, dir, c); @@ -2417,7 +3292,7 @@ void MDCache::handle_inode_unlink(MInodeUnlink *m) assert(in->inode.nlink > 0); in->inode.nlink--; - if (in->state_test(CINODE_STATE_DANGLING)) { + if (in->state_test(CInode::STATE_DANGLING)) { // already dangling. // last link? if (in->inode.nlink == 0) { @@ -2429,12 +3304,12 @@ void MDCache::handle_inode_unlink(MInodeUnlink *m) mds->anchorclient->destroy(in->ino(), NULL); } else { - in->mark_dirty(); + in->_mark_dirty(); // fixme } } else { // has primary link still. assert(in->inode.nlink >= 1); - in->mark_dirty(); + in->_mark_dirty(); // fixme if (in->inode.nlink == 1) { dout(7) << "nlink=1, removing anchor" << endl; @@ -2474,7 +3349,7 @@ void MDCache::handle_inode_unlink_ack(MInodeUnlinkAck *m) * Returns the directory in which authority is delegated for *dir. * This may be because a directory is an import, or because it is hashed * and we are nested underneath an inode in that dir (that hashes to us). - * Thus do not assume con->is_auth()! It is_auth() || is_hashed(). + * Thus do not assume result->is_auth()! It is_auth() || is_hashed(). 
*/ CDir *MDCache::get_auth_container(CDir *dir) { @@ -2484,13 +3359,29 @@ CDir *MDCache::get_auth_container(CDir *dir) while (true) { if (imp->is_import()) break; // import imp = imp->get_parent_dir(); - assert(imp); + if (!imp) break; // none if (imp->is_hashed()) break; // hash } return imp; } +CDir *MDCache::get_export_container(CDir *dir) +{ + CDir *ex = dir; // might be *dir + assert(!ex->is_auth()); + + // find the underlying import or hash that delegates dir away + while (true) { + if (ex->is_export()) break; // import + ex = ex->get_parent_dir(); + assert(ex); + if (ex->is_hashed()) break; // hash + } + + return ex; +} + void MDCache::find_nested_exports(CDir *dir, set& s) { @@ -2560,13 +3451,76 @@ void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set& s) void MDCache::show_imports() { - mds->balancer->show_imports(); + int db = 10; + + if (imports.empty() && + hashdirs.empty()) { + dout(db) << "show_imports: no imports/exports/hashdirs" << endl; + return; + } + dout(db) << "show_imports:" << endl; + + set ecopy = exports; + + set::iterator it = hashdirs.begin(); + while (1) { + if (it == hashdirs.end()) it = imports.begin(); + if (it == imports.end() ) break; + + CDir *im = *it; + + if (im->is_import()) { + dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; + //assert( im->is_auth() ); + } + else if (im->is_hashed()) { + if (im->is_import()) continue; // if import AND hash, list as import. 
+ dout(db) << " + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; + } + + for (set::iterator p = nested_exports[im].begin(); + p != nested_exports[im].end(); + p++) { + CDir *exp = *p; + if (exp->is_hashed()) { + //assert(0); // we don't do it this way actually + dout(db) << " - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; + //assert( !exp->is_auth() ); + } else { + dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; + assert( exp->is_export() ); + //assert( !exp->is_auth() ); + } + + if ( get_auth_container(exp) != im ) { + dout(1) << "uh oh, auth container is " << *get_auth_container(exp) << endl; + assert( get_auth_container(exp) == im ); + } + + if (ecopy.count(exp) != 1) { + dout(1) << "***** nested_export " << *exp << " not in exports" << endl; + assert(0); + } + ecopy.erase(exp); + } + + it++; + } + + if (ecopy.size()) { + for (set::iterator it = ecopy.begin(); + it != ecopy.end(); + it++) + dout(1) << "***** stray item in exports: " << **it << endl; + assert(ecopy.size() == 0); + } } void MDCache::show_cache() { dout(7) << "show_cache" << endl; + for (hash_map::iterator it = inode_map.begin(); it != inode_map.end(); it++) { diff --git a/branches/aleung/security1/ceph/mds/MDCache.h b/branches/aleung/security1/ceph/mds/MDCache.h index e62113312447f..7b8825f073726 100644 --- a/branches/aleung/security1/ceph/mds/MDCache.h +++ b/branches/aleung/security1/ceph/mds/MDCache.h @@ -39,6 +39,9 @@ class Logger; class Message; +class MMDSImportMap; +class MMDSCacheRejoin; +class MMDSCacheRejoinAck; class MDiscover; class MDiscoverReply; class MCacheExpire; @@ -80,15 +83,20 @@ namespace __gnu_cxx { } class MDCache { - protected: + public: // my master MDS *mds; + LRU lru; // dentry lru for expiring items from cache + + 
protected: // the cache CInode *root; // root inode - LRU lru; // lru for expiring items hash_map inode_map; // map of inodes by ino - + + list inode_expire_queue; // inodes to delete + + // root list waiting_for_root; @@ -98,6 +106,11 @@ class MDCache { set hashdirs; map > nested_exports; // exports nested under imports _or_ hashdirs + void adjust_export(int to, CDir *root, set& bounds); + void adjust_import(int from, CDir *root, set& bounds); + + + // active MDS requests hash_map active_requests; @@ -108,15 +121,63 @@ class MDCache { // shutdown crap int shutdown_commits; bool did_shutdown_exports; + bool did_shutdown_log_cap; friend class C_MDC_ShutdownCommit; + // recovery +protected: + // from EImportStart w/o EImportFinish during journal replay + map > my_ambiguous_imports; + // from MMDSImportMaps + map > > other_ambiguous_imports; + + set recovery_set; + set wants_import_map; // nodes i need to send my import map to + set got_import_map; // nodes i need to send my import map to (when exports finish) + set rejoin_ack_gather; // nodes i need a rejoin ack from + + void handle_import_map(MMDSImportMap *m); + void handle_cache_rejoin(MMDSCacheRejoin *m); + void handle_cache_rejoin_ack(MMDSCacheRejoinAck *m); + void disambiguate_imports(); + void cache_rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); + void send_cache_rejoin_acks(); +public: + void send_import_map(int who); + void send_import_map_now(int who); + void send_import_map_later(int who) { + wants_import_map.insert(who); + } + void send_pending_import_maps(); // maybe. 
+ void send_cache_rejoins(); + + void set_recovery_set(set& s) { + recovery_set = s; + } + + // ambiguous imports + void add_ambiguous_import(inodeno_t base, set& bounds) { + my_ambiguous_imports[base].swap(bounds); + } + void cancel_ambiguous_import(inodeno_t dirino); + void finish_ambiguous_import(inodeno_t dirino); + + void finish_ambiguous_export(inodeno_t dirino, set& bounds); + + + + + friend class CInode; friend class Locker; friend class Migrator; friend class Renamer; friend class MDBalancer; + friend class EImportMap; + public: + // subsystems Migrator *migrator; Renamer *renamer; @@ -132,13 +193,19 @@ class MDCache { CInode *get_root() { return root; } void set_root(CInode *r); + int get_num_imports() { return imports.size(); } void add_import(CDir *dir); void remove_import(CDir *dir); + void recalc_auth_bits(); + void log_import_map(Context *onsync=0); + + // cache void set_cache_size(size_t max) { lru.lru_set_max(max); } size_t get_cache_size() { return lru.lru_get_size(); } bool trim(int max = -1); // trim cache + void trim_non_auth(); // trim out trimmable non-auth items // shutdown void shutdown_start(); @@ -154,6 +221,12 @@ class MDCache { return NULL; } + + int hash_dentry(inodeno_t ino, const string& s) { + return 0; // fixme + } + + public: CInode *create_inode(); void add_inode(CInode *in); @@ -162,15 +235,23 @@ class MDCache { void remove_inode(CInode *in); void destroy_inode(CInode *in); void touch_inode(CInode *in) { - // touch parent(s) too - if (in->get_parent_dir()) touch_inode(in->get_parent_dir()->inode); + if (in->get_parent_dn()) + touch_dentry(in->get_parent_dn()); + } + void touch_dentry(CDentry *dn) { + // touch ancestors + if (dn->get_dir()->get_inode()->get_parent_dn()) + touch_dentry(dn->get_dir()->get_inode()->get_parent_dn()); - // top or mid, depending on whether i'm auth - if (in->is_auth()) - lru.lru_touch(in); + // touch me + if (dn->is_auth()) + lru.lru_touch(dn); else - lru.lru_midtouch(in); + lru.lru_midtouch(dn); } + + 
void inode_remove_replica(CInode *in, int rep); + void rename_file(CDentry *srcdn, CDentry *destdn); public: @@ -182,14 +263,15 @@ class MDCache { void start_recovered_purges(); - protected: - // private methods + public: CDir *get_auth_container(CDir *in); + CDir *get_export_container(CDir *dir); void find_nested_exports(CDir *dir, set& s); void find_nested_exports_under(CDir *import, CDir *dir, set& s); public: + CInode *create_root_inode(); int open_root(Context *c); int path_traverse(filepath& path, vector& trace, bool follow_trailing_sym, Message *req, Context *ondelay, diff --git a/branches/aleung/security1/ceph/mds/MDLog.cc b/branches/aleung/security1/ceph/mds/MDLog.cc index b272eb9a176d6..182bd4d0333e1 100644 --- a/branches/aleung/security1/ceph/mds/MDLog.cc +++ b/branches/aleung/security1/ceph/mds/MDLog.cc @@ -13,6 +13,7 @@ #include "MDLog.h" #include "MDS.h" +#include "MDCache.h" #include "LogEvent.h" #include "osdc/Journaler.h" @@ -22,8 +23,8 @@ #include "config.h" #undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " +#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " +#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " // cons/des @@ -35,10 +36,30 @@ MDLog::MDLog(MDS *m) num_events = 0; waiting_for_read = false; + last_import_map = 0; + writing_import_map = false; + seen_import_map = false; + max_events = g_conf.mds_log_max_len; + capped = false; + unflushed = 0; + journaler = 0; + logger = 0; +} + + +MDLog::~MDLog() +{ + if (journaler) { delete journaler; journaler = 0; } + if (logger) { delete logger; logger = 0; } +} + + +void MDLog::init_journaler() +{ // logger char 
name[80]; sprintf(name, "mds%d.log", mds->get_nodeid()); @@ -47,7 +68,7 @@ MDLog::MDLog(MDS *m) static bool didit = false; if (!didit) { mdlog_logtype.add_inc("add"); - mdlog_logtype.add_inc("retire"); + mdlog_logtype.add_inc("expire"); mdlog_logtype.add_inc("obs"); mdlog_logtype.add_inc("trim"); mdlog_logtype.add_set("size"); @@ -66,28 +87,25 @@ MDLog::MDLog(MDS *m) log_inode.layout.object_layout = OBJECT_LAYOUT_STARTOSD; log_inode.layout.osd = mds->get_nodeid() + 10000; // hack } - + // log streamer + if (journaler) delete journaler; journaler = new Journaler(log_inode, mds->objecter, logger); - } -MDLog::~MDLog() -{ - if (journaler) { delete journaler; journaler = 0; } - if (logger) { delete logger; logger = 0; } -} - void MDLog::reset() { + dout(5) << "reset to empty log" << endl; + init_journaler(); journaler->reset(); } void MDLog::open(Context *c) { dout(5) << "open discovering log bounds" << endl; + init_journaler(); journaler->recover(c); } @@ -97,12 +115,24 @@ void MDLog::write_head(Context *c) } +off_t MDLog::get_read_pos() +{ + return journaler->get_read_pos(); +} + +off_t MDLog::get_write_pos() +{ + return journaler->get_write_pos(); +} + + + void MDLog::submit_entry( LogEvent *le, Context *c ) { - dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl; - if (g_conf.mds_log) { + dout(5) << "submit_entry " << journaler->get_write_pos() << " : " << *le << endl; + // encode it, with event type bufferlist bl; bl.append((char*)&le->_type, sizeof(le->_type)); @@ -111,6 +141,8 @@ void MDLog::submit_entry( LogEvent *le, // journal it. journaler->append_entry(bl); + assert(!capped); + delete le; num_events++; @@ -125,6 +157,14 @@ void MDLog::submit_entry( LogEvent *le, else unflushed++; + // should we log a new import_map? + // FIXME: should this go elsewhere? 
+ if (last_import_map && !writing_import_map && + journaler->get_write_pos() - last_import_map >= g_conf.mds_log_import_map_interval) { + // log import map + mds->mdcache->log_import_map(); + } + } else { // hack: log is disabled. if (c) { @@ -196,12 +236,12 @@ void MDLog::_did_read() void MDLog::_trimmed(LogEvent *le) { - dout(7) << " trimmed " << *le << endl; - - assert(le->can_expire(mds)); + dout(7) << "trimmed : " << le->get_start_off() << " : " << *le << endl; + assert(le->has_expired(mds)); if (trimming.begin()->first == le->_end_off) { - // front! we can expire the log a bit + // we trimmed off the front! + // we can expire the log a bit. journaler->set_expire_pos(le->_end_off); } @@ -223,6 +263,8 @@ void MDLog::trim(Context *c) trim_waiters.push_back(c); // trim! + dout(10) << "trim " << num_events << " events / " << max_events << " max" << endl; + while (num_events > max_events) { off_t gap = journaler->get_write_pos() - journaler->get_read_pos(); @@ -237,26 +279,28 @@ void MDLog::trim(Context *c) } bufferlist bl; + off_t so = journaler->get_read_pos(); if (journaler->try_read_entry(bl)) { // decode logevent LogEvent *le = LogEvent::decode(bl); + le->_start_off = so; le->_end_off = journaler->get_read_pos(); num_events--; // we just read an event. - if (le->can_expire(mds) == true) { + if (le->has_expired(mds)) { // obsolete - dout(7) << "trim obsolete: " << *le << endl; + dout(7) << "trim obsolete : " << le->get_start_off() << " : " << *le << endl; delete le; logger->inc("obs"); } else { assert ((int)trimming.size() < g_conf.mds_log_max_trimming); // trim! 
- dout(7) << "trim trimming: " << *le << endl; + dout(7) << "trim expiring : " << le->get_start_off() << " : " << *le << endl; trimming[le->_end_off] = le; - le->retire(mds, new C_MDL_Trimmed(this, le)); - logger->inc("retire"); + le->expire(mds, new C_MDL_Trimmed(this, le)); + logger->inc("expire"); logger->set("trim", trimming.size()); } logger->set("read", journaler->get_read_pos()); @@ -283,6 +327,17 @@ void MDLog::trim(Context *c) std::list finished; finished.swap(trim_waiters); finish_contexts(finished, 0); + + // hmm, are we at the end? + /* + if (journaler->get_read_pos() == journaler->get_write_pos() && + trimming.size() == import_map_expire_waiters.size()) { + dout(5) << "trim log is empty, allowing import_map to expire" << endl; + list ls; + ls.swap(import_map_expire_waiters); + finish_contexts(ls); + } + */ } @@ -338,13 +393,18 @@ void MDLog::_replay() LogEvent *le = LogEvent::decode(bl); num_events++; - if (le->has_happened(mds)) { + // have we seen an import map yet? + if (!seen_import_map && + le->get_type() != EVENT_IMPORTMAP) { dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " : " << *le << " : already happened" << endl; + << " -- waiting for import_map. 
(skipping " << *le << ")" << endl; } else { dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() - << " : " << *le << " : applying" << endl; + << " : " << *le << endl; le->replay(mds); + + if (le->get_type() == EVENT_IMPORTMAP) + seen_import_map = true; } delete le; } diff --git a/branches/aleung/security1/ceph/mds/MDLog.h b/branches/aleung/security1/ceph/mds/MDLog.h index 37329a164e781..0d0248bde391d 100644 --- a/branches/aleung/security1/ceph/mds/MDLog.h +++ b/branches/aleung/security1/ceph/mds/MDLog.h @@ -48,12 +48,12 @@ class MDLog { int unflushed; + bool capped; + inode_t log_inode; Journaler *journaler; - - //hash_map trimming; // events currently being trimmed - map trimming; + map trimming; std::list trim_waiters; // contexts waiting for trim bool trim_reading; @@ -64,13 +64,49 @@ class MDLog { list waitfor_replay; + // importmaps + off_t last_import_map; // offsets of last committed importmap. constrains trimming. + list import_map_expire_waiters; + bool writing_import_map; // one is being written now + bool seen_import_map; // for recovery + + friend class EImportMap; + friend class C_MDS_WroteImportMap; + friend class MDCache; + + void init_journaler(); + + + public: + // replay state + map > pending_exports; + + + public: MDLog(MDS *m); ~MDLog(); - + + + void set_max_events(size_t max) { max_events = max; } size_t get_max_events() { return max_events; } size_t get_num_events() { return num_events + trimming.size(); } + size_t get_non_importmap_events() { return num_events + trimming.size() - import_map_expire_waiters.size(); } + + off_t get_read_pos(); + off_t get_write_pos(); + bool empty() { + return get_read_pos() == get_write_pos(); + } + + bool is_capped() { return capped; } + void cap() { + capped = true; + list ls; + ls.swap(import_map_expire_waiters); + finish_contexts(ls); + } void submit_entry( LogEvent *e, Context *c = 0 ); void wait_for_sync( Context *c ); diff --git a/branches/aleung/security1/ceph/mds/MDS.cc 
b/branches/aleung/security1/ceph/mds/MDS.cc index 1649bc3642f2b..89923c03c2f27 100644 --- a/branches/aleung/security1/ceph/mds/MDS.cc +++ b/branches/aleung/security1/ceph/mds/MDS.cc @@ -44,7 +44,7 @@ #include "common/Timer.h" #include "messages/MMDSMap.h" -#include "messages/MMDSBoot.h" +#include "messages/MMDSBeacon.h" #include "messages/MPing.h" #include "messages/MPingAck.h" @@ -66,7 +66,7 @@ LogType mds_logtype, mds_cache_logtype; // cons/des -MDS::MDS(int whoami, Messenger *m, MonMap *mm) { +MDS::MDS(int whoami, Messenger *m, MonMap *mm) : timer(mds_lock) { this->whoami = whoami; monmap = mm; @@ -94,13 +94,18 @@ MDS::MDS(int whoami, Messenger *m, MonMap *mm) { // init keys myPrivKey = esignPrivKey("crypto/esig1536.dat"); myPubKey = esignPubKey(myPrivKey); + + // beacon + beacon_last_seq = 0; + beacon_sender = 0; + beacon_killer = 0; + // tick + tick_event = 0; req_rate = 0; - state = STATE_BOOTING; - - last_balancer_hash = last_balancer_heartbeat = g_clock.recent_now(); + want_state = state = MDSMap::STATE_DNE; logger = logger2 = 0; @@ -129,7 +134,7 @@ MDS::~MDS() { } -void MDS::reopen_log() +void MDS::reopen_logger() { // flush+close old log if (logger) { @@ -195,62 +200,389 @@ void MDS::reopen_log() void MDS::send_message_mds(Message *m, int mds, int port, int fromport) { + // send mdsmap first? + if (peer_mdsmap_epoch[mds] < mdsmap->get_epoch()) { + messenger->send_message(new MMDSMap(mdsmap), + mdsmap->get_inst(mds)); + peer_mdsmap_epoch[mds] = mdsmap->get_epoch(); + } + + // send message if (port && !fromport) fromport = port; - messenger->send_message(m, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), port, fromport); + messenger->send_message(m, mdsmap->get_inst(mds), port, fromport); } -int MDS::init() +class C_MDS_Tick : public Context { + MDS *mds; +public: + C_MDS_Tick(MDS *m) : mds(m) {} + void finish(int r) { + mds->tick(); + } +}; + + + +int MDS::init(bool standby) { + mds_lock.Lock(); + // generate my key pair + // .. 
+ + if (standby) + want_state = MDSMap::STATE_STANDBY; + else + want_state = MDSMap::STATE_STARTING; - // request osd map - dout(5) << "requesting mds and osd maps from mon" << endl; - int mon = monmap->pick_mon(); - messenger->send_message(new MMDSBoot, MSG_ADDR_MON(mon), monmap->get_inst(mon)); + // starting beacon. this will induce an MDSMap from the monitor + beacon_start(); + + // schedule tick + reset_tick(); + + mds_lock.Unlock(); return 0; } +void MDS::reset_tick() +{ + // cancel old + if (tick_event) timer.cancel_event(tick_event); -void MDS::handle_mds_map(MMDSMap *m) + // schedule + tick_event = new C_MDS_Tick(this); + timer.add_event_after(g_conf.mon_tick_interval, tick_event); +} + +void MDS::tick() +{ + // reschedule + reset_tick(); + + // log + mds_load_t load = balancer->get_load(); + + if (logger) { + req_rate = logger->get("req"); + + logger->set("l", (int)load.mds_load()); + logger->set("q", messenger->get_dispatch_queue_len()); + logger->set("buf", buffer_total_alloc); + + mdcache->log_stat(logger); + } + + // booted? 
+ if (is_active()) { + + // balancer + balancer->tick(); + + // HACK to test hashing stuff + if (false) { + /* + static map didhash; + if (elapsed.sec() > 15 && !didhash[whoami]) { + CInode *in = mdcache->get_inode(100000010); + if (in && in->dir) { + if (in->dir->is_auth()) + mdcache->migrator->hash_dir(in->dir); + didhash[whoami] = 1; + } + } + if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) { + CInode *in = mdcache->get_inode(100000010); + if (in && in->dir) { + if (in->dir->is_auth() && in->dir->is_hashed()) + mdcache->migrator->unhash_dir(in->dir); + didhash[whoami] = 2; + } + } + */ + } + } +} + + + + +// ----------------------- +// beacons + +void MDS::beacon_start() +{ + beacon_send(); // send first beacon + + //reset_beacon_killer(); // schedule killer +} + + +class C_MDS_BeaconSender : public Context { + MDS *mds; +public: + C_MDS_BeaconSender(MDS *m) : mds(m) {} + void finish(int r) { + mds->beacon_send(); + } +}; + +void MDS::beacon_send() { - map::reverse_iterator p = m->maps.rbegin(); + ++beacon_last_seq; + dout(10) << "beacon_send " << MDSMap::get_state_name(want_state) + << " seq " << beacon_last_seq + << " (currently " << MDSMap::get_state_name(state) << ")" + << endl; + + beacon_seq_stamp[beacon_last_seq] = g_clock.now(); + + int mon = monmap->pick_mon(); + messenger->send_message(new MMDSBeacon(want_state, beacon_last_seq), + monmap->get_inst(mon)); + + // schedule next sender + if (beacon_sender) timer.cancel_event(beacon_sender); + beacon_sender = new C_MDS_BeaconSender(this); + timer.add_event_after(g_conf.mds_beacon_interval, beacon_sender); +} - dout(1) << "handle_mds_map epoch " << p->first << endl; - mdsmap->decode(p->second); +void MDS::handle_mds_beacon(MMDSBeacon *m) +{ + dout(10) << "handle_mds_beacon " << MDSMap::get_state_name(m->get_state()) + << " seq " << m->get_seq() << endl; + version_t seq = m->get_seq(); + + // update lab + if (beacon_seq_stamp.count(seq)) { + assert(beacon_seq_stamp[seq] > beacon_last_acked_stamp); + 
beacon_last_acked_stamp = beacon_seq_stamp[seq]; + + // clean up seq_stamp map + while (!beacon_seq_stamp.empty() && + beacon_seq_stamp.begin()->first <= seq) + beacon_seq_stamp.erase(beacon_seq_stamp.begin()); + + reset_beacon_killer(); + } delete m; +} + +class C_MDS_BeaconKiller : public Context { + MDS *mds; + utime_t lab; +public: + C_MDS_BeaconKiller(MDS *m, utime_t l) : mds(m), lab(l) {} + void finish(int r) { + mds->beacon_kill(lab); + } +}; + +void MDS::reset_beacon_killer() +{ + utime_t when = beacon_last_acked_stamp; + when += g_conf.mds_beacon_grace; + + dout(15) << "reset_beacon_killer last_acked_stamp at " << beacon_last_acked_stamp + << ", will die at " << when << endl; + if (beacon_killer) timer.cancel_event(beacon_killer); + + beacon_killer = new C_MDS_BeaconKiller(this, beacon_last_acked_stamp); + timer.add_event_at(when, beacon_killer); +} + +void MDS::beacon_kill(utime_t lab) +{ + if (lab == beacon_last_acked_stamp) { + dout(0) << "beacon_kill last_acked_stamp " << lab + << ", killing myself." + << endl; + exit(0); + } else { + dout(20) << "beacon_kill last_acked_stamp " << beacon_last_acked_stamp + << " != my " << lab + << ", doing nothing." + << endl; + } +} + + + +void MDS::handle_mds_map(MMDSMap *m) +{ + version_t epoch = m->get_epoch(); + dout(1) << "handle_mds_map epoch " << epoch << " from " << m->get_source() << endl; + + // note source's map version + if (m->get_source().is_mds() && + peer_mdsmap_epoch[m->get_source().num()] < epoch) { + dout(15) << " peer " << m->get_source() + << " has mdsmap epoch >= " << epoch + << endl; + peer_mdsmap_epoch[m->get_source().num()] = epoch; + } + + // is it new? 
+ if (epoch <= mdsmap->get_epoch()) { + dout(1) << " old map epoch " << epoch << " <= " << mdsmap->get_epoch() + << ", discarding" << endl; + delete m; + return; + } + + // note some old state + int oldwhoami = whoami; + int oldstate = state; + set oldresolve; + mdsmap->get_mds_set(oldresolve, MDSMap::STATE_RESOLVE); + bool wasrejoining = mdsmap->is_rejoining(); + set oldfailed; + mdsmap->get_mds_set(oldfailed, MDSMap::STATE_FAILED); + + // decode and process + mdsmap->decode(m->get_encoded()); + // see who i am - int w = mdsmap->get_inst_rank(messenger->get_myinst()); - if (w != whoami) { - whoami = w; - messenger->reset_myaddr(MSG_ADDR_MDS(w)); - reopen_log(); + whoami = mdsmap->get_inst_rank(messenger->get_myaddr()); + if (oldwhoami != whoami) { + // update messenger. + messenger->reset_myname(MSG_ADDR_MDS(whoami)); + + reopen_logger(); + dout(1) << "handle_mds_map i am now mds" << whoami + << " incarnation " << mdsmap->get_inc(whoami) + << endl; + + // do i need an osdmap? + if (oldwhoami < 0) { + // we need an osdmap too. + int mon = monmap->pick_mon(); + messenger->send_message(new MOSDGetMap(0), + monmap->get_inst(mon)); + } } - dout(1) << "map says i am " << w << endl; - if (is_booting()) { - // we need an osdmap too. - int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(0), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); + // tell objecter my incarnation + if (objecter->get_client_incarnation() < 0 && + mdsmap->have_inst(whoami)) { + assert(mdsmap->get_inc(whoami) > 0); + objecter->set_client_incarnation(mdsmap->get_inc(whoami)); } + + // update my state + state = mdsmap->get_state(whoami); + + // did it change? 
+ if (oldstate != state) { + if (state == want_state) { + dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) << endl; + } else { + dout(1) << "handle_mds_map new state " << mdsmap->get_state_name(state) + << ", although i wanted " << mdsmap->get_state_name(want_state) + << endl; + want_state = state; + } + + // now active? + if (is_active()) { + dout(1) << "now active" << endl; + finish_contexts(waitfor_active); // kick waiters + } + + else if (is_replay()) { + // initialize gather sets + set rs; + mdsmap->get_recovery_mds_set(rs); + rs.erase(whoami); + dout(1) << "now replay. my recovery peers are " << rs << endl; + mdcache->set_recovery_set(rs); + } + + // now stopping? + else if (is_stopping()) { + assert(oldstate == MDSMap::STATE_ACTIVE); + dout(1) << "now stopping" << endl; + + mdcache->shutdown_start(); + + // save anchor table + if (mdsmap->get_anchortable() == whoami) + anchormgr->save(0); // FIXME? or detect completion via filer? + + if (idalloc) + idalloc->save(0); // FIXME? or detect completion via filer? + + // flush log + mdlog->set_max_events(0); + mdlog->trim(NULL); + } + + // now standby? + else if (is_stopped()) { + assert(oldstate == MDSMap::STATE_STOPPING); + dout(1) << "now stopped, sending down:out and exiting" << endl; + shutdown_final(); + } + } + + + // is anyone resolving? + if (is_resolve() || is_rejoin() || is_active() || is_stopping()) { + set resolve; + mdsmap->get_mds_set(resolve, MDSMap::STATE_RESOLVE); + if (oldresolve != resolve) + dout(10) << "resolve set is " << resolve << ", was " << oldresolve << endl; + for (set::iterator p = resolve.begin(); p != resolve.end(); ++p) { + if (*p == whoami) continue; + if (oldresolve.count(*p) == 0 || // if other guy newly resolve, or + oldstate == MDSMap::STATE_REPLAY) // if i'm newly resolve, + mdcache->send_import_map(*p); // share my import map (now or later) + } + } + + // is everybody finally rejoining? 
+ if (is_rejoin() || is_active() || is_stopping()) { + if (!wasrejoining && mdsmap->is_rejoining()) { + mdcache->send_cache_rejoins(); + } + } + + // did anyone go down? + if (is_active() || is_stopping()) { + set failed; + mdsmap->get_mds_set(failed, MDSMap::STATE_FAILED); + for (set::iterator p = failed.begin(); p != failed.end(); ++p) { + // newly so? + if (oldfailed.count(*p)) continue; + + mdcache->migrator->handle_mds_failure(*p); + } + } + + delete m; } void MDS::handle_osd_map(MOSDMap *m) { + version_t had = osdmap->get_epoch(); + // process locally objecter->handle_osd_map(m); - - if (is_booting()) { - // we got our maps. mkfs for recovery? - if (g_conf.mkfs) - boot_mkfs(); + + if (had == 0) { + if (is_creating()) + boot_create(); // new tables, journal + else if (is_starting()) + boot_start(); // old tables, empty journal + else if (is_replay()) + boot_replay(); // replay, join else - boot_recover(); - } + assert(is_standby()); + } // pass on to clients for (set::iterator it = clientmap.get_mount_set().begin(); @@ -259,26 +591,26 @@ void MDS::handle_osd_map(MOSDMap *m) MOSDMap *n = new MOSDMap; n->maps = m->maps; n->incremental_maps = m->incremental_maps; - messenger->send_message(n, MSG_ADDR_CLIENT(*it), clientmap.get_inst(*it)); + messenger->send_message(n, clientmap.get_inst(*it)); } } -class C_MDS_MkfsFinish : public Context { +class C_MDS_BootFinish : public Context { MDS *mds; public: - C_MDS_MkfsFinish(MDS *m) : mds(m) {} - void finish(int r) { mds->boot_mkfs_finish(); } + C_MDS_BootFinish(MDS *m) : mds(m) {} + void finish(int r) { mds->boot_finish(); } }; -void MDS::boot_mkfs() +void MDS::boot_create() { - dout(3) << "boot_mkfs" << endl; + dout(3) << "boot_create" << endl; + + C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this)); - C_Gather *fin = new C_Gather(new C_MDS_MkfsFinish(this)); - if (whoami == 0) { - dout(3) << "boot_mkfs - creating root inode and dir" << endl; + dout(3) << "boot_create since i am also mds0, creating root inode and 
dir" << endl; // create root inode. mdcache->open_root(0); @@ -288,34 +620,68 @@ void MDS::boot_mkfs() // force empty root dir CDir *dir = root->dir; dir->mark_complete(); - dir->mark_dirty(); - + dir->mark_dirty(dir->pre_dirty()); + // save it mdstore->commit_dir(dir, fin->new_sub()); } - + // start with a fresh journal - dout(10) << "boot_mkfs creating fresh journal" << endl; + dout(10) << "boot_create creating fresh journal" << endl; mdlog->reset(); mdlog->write_head(fin->new_sub()); + + // write our first importmap + mdcache->log_import_map(fin->new_sub()); // fixme: fake out idalloc (reset, pretend loaded) - dout(10) << "boot_mkfs creating fresh idalloc table" << endl; + dout(10) << "boot_create creating fresh idalloc table" << endl; idalloc->reset(); idalloc->save(fin->new_sub()); // fixme: fake out anchortable if (mdsmap->get_anchortable() == whoami) { - dout(10) << "boot_mkfs creating fresh anchortable" << endl; + dout(10) << "boot_create creating fresh anchortable" << endl; anchormgr->reset(); anchormgr->save(fin->new_sub()); } } -void MDS::boot_mkfs_finish() +void MDS::boot_start() +{ + dout(2) << "boot_start" << endl; + + C_Gather *fin = new C_Gather(new C_MDS_BootFinish(this)); + + dout(2) << "boot_start opening idalloc" << endl; + idalloc->load(fin->new_sub()); + + if (mdsmap->get_anchortable() == whoami) { + dout(2) << "boot_start opening anchor table" << endl; + anchormgr->load(fin->new_sub()); + } else { + dout(2) << "boot_start i have no anchor table" << endl; + } + + dout(2) << "boot_start opening mds log" << endl; + mdlog->open(fin->new_sub()); + + if (mdsmap->get_root() == whoami) { + dout(2) << "boot_start opening root directory" << endl; + mdcache->open_root(fin->new_sub()); + } +} + +void MDS::boot_finish() { - dout(3) << "boot_mkfs_finish" << endl; - mark_active(); + dout(3) << "boot_finish" << endl; + + if (is_starting()) { + // make sure mdslog is empty + assert(mdlog->get_read_pos() == mdlog->get_write_pos()); + } + + 
set_want_state(MDSMap::STATE_ACTIVE); } @@ -324,93 +690,95 @@ class C_MDS_BootRecover : public Context { int nextstep; public: C_MDS_BootRecover(MDS *m, int n) : mds(m), nextstep(n) {} - void finish(int r) { mds->boot_recover(nextstep); } + void finish(int r) { mds->boot_replay(nextstep); } }; -void MDS::boot_recover(int step) +void MDS::boot_replay(int step) { - if (is_booting()) - state = STATE_RECOVERING; - switch (step) { case 0: - if (whoami == 0) { - dout(2) << "boot_recover " << step << ": creating root inode" << endl; - mdcache->open_root(0); - step = 1; - // fall-thru - } else { - // FIXME - assert(0); - } + step = 1; // fall-thru. case 1: - dout(2) << "boot_recover " << step << ": opening idalloc" << endl; + dout(2) << "boot_replay " << step << ": opening idalloc" << endl; idalloc->load(new C_MDS_BootRecover(this, 2)); break; case 2: if (mdsmap->get_anchortable() == whoami) { - dout(2) << "boot_recover " << step << ": opening anchor table" << endl; + dout(2) << "boot_replay " << step << ": opening anchor table" << endl; anchormgr->load(new C_MDS_BootRecover(this, 3)); break; - } else { - dout(2) << "boot_recover " << step << ": i have no anchor table" << endl; - step++; } - // fall-thru + dout(2) << "boot_replay " << step << ": i have no anchor table" << endl; + step++; // fall-thru case 3: - dout(2) << "boot_recover " << step << ": opening mds log" << endl; + dout(2) << "boot_replay " << step << ": opening mds log" << endl; mdlog->open(new C_MDS_BootRecover(this, 4)); break; case 4: - dout(2) << "boot_recover " << step << ": replaying mds log" << endl; + dout(2) << "boot_replay " << step << ": replaying mds log" << endl; mdlog->replay(new C_MDS_BootRecover(this, 5)); break; case 5: - dout(2) << "boot_recover " << step << ": restarting any recovered purges" << endl; + dout(2) << "boot_replay " << step << ": restarting any recovered purges" << endl; mdcache->start_recovered_purges(); - step++; - // fall-thru - + + step++; // fall-thru + case 6: - dout(2) 
<< "boot_recover " << step << ": done." << endl; - mark_active(); + // done with replay! + if (mdsmap->get_num_mds(MDSMap::STATE_ACTIVE) == 0 && + mdsmap->get_num_mds(MDSMap::STATE_STOPPING) == 0 && + mdsmap->get_num_mds(MDSMap::STATE_RESOLVE) == 0 && + mdsmap->get_num_mds(MDSMap::STATE_REJOIN) == 0 && + mdsmap->get_num_mds(MDSMap::STATE_REPLAY) == 1 && // me + mdsmap->get_num_mds(MDSMap::STATE_FAILED) == 0) { + dout(2) << "boot_replay " << step << ": i am alone, moving to state active" << endl; + set_want_state(MDSMap::STATE_ACTIVE); + } else { + dout(2) << "boot_replay " << step << ": i am not alone, moving to state resolve" << endl; + set_want_state(MDSMap::STATE_RESOLVE); + } + break; + } } - -void MDS::mark_active() +void MDS::set_want_state(int s) { - dout(3) << "mark_active" << endl; - state = STATE_ACTIVE; - finish_contexts(waitfor_active); // kick waiters + dout(3) << "set_want_state " << MDSMap::get_state_name(s) << endl; + want_state = s; + beacon_send(); } - int MDS::shutdown_start() { dout(1) << "shutdown_start" << endl; derr(0) << "mds shutdown start" << endl; - for (set::iterator p = mdsmap->get_mds().begin(); - p != mdsmap->get_mds().end(); + // tell everyone to stop. 
+ set active; + mdsmap->get_active_mds_set(active); + for (set::iterator p = active.begin(); + p != active.end(); p++) { - dout(1) << "sending MShutdownStart to mds" << *p << endl; - send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART), - *p, MDS_PORT_MAIN); + if (mdsmap->is_up(*p)) { + dout(1) << "sending MShutdownStart to mds" << *p << endl; + send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART), + *p, MDS_PORT_MAIN); + } } - if (idalloc) idalloc->shutdown(); - - handle_shutdown_start(NULL); + // go + set_want_state(MDSMap::STATE_STOPPING); return 0; } @@ -420,53 +788,51 @@ void MDS::handle_shutdown_start(Message *m) dout(1) << " handle_shutdown_start" << endl; // set flag - state = STATE_STOPPING; - - mdcache->shutdown_start(); - - // save anchor table - if (mdsmap->get_anchortable() == whoami) - anchormgr->save(0); // FIXME FIXME + set_want_state(MDSMap::STATE_STOPPING); - // flush log - mdlog->set_max_events(0); - mdlog->trim(NULL); - - if (m) delete m; - - //g_conf.debug_mds = 10; + delete m; } int MDS::shutdown_final() { - dout(1) << "shutdown" << endl; - - state = STATE_STOPPED; + dout(1) << "shutdown_final" << endl; + + // send final down:out beacon (it doesn't matter if this arrives) + set_want_state(MDSMap::STATE_OUT); + + // stop timers + if (beacon_killer) { + timer.cancel_event(beacon_killer); + beacon_killer = 0; + } + if (beacon_sender) { + timer.cancel_event(beacon_sender); + beacon_sender = 0; + } + if (tick_event) { + timer.cancel_event(tick_event); + tick_event = 0; + } + timer.cancel_all(); + timer.join(); // shut down cache mdcache->shutdown(); - - // tell monitor - messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - MSG_ADDR_MON(0), monmap->get_inst(0)); - + // shut down messenger messenger->shutdown(); - + return 0; } + void MDS::dispatch(Message *m) { - // make sure we advacne the clock - g_clock.now(); - - // process mds_lock.Lock(); my_dispatch(m); mds_lock.Unlock(); @@ -476,6 +842,24 @@ void MDS::dispatch(Message 
*m) void MDS::my_dispatch(Message *m) { + // from bad mds? + if (m->get_source().is_mds()) { + int from = m->get_source().num(); + if (!mdsmap->have_inst(from) || + mdsmap->get_inst(from) != m->get_source_inst()) { + // bogus mds? + if (m->get_type() != MSG_MDS_MAP) { + dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() + << ", dropping" << endl; + delete m; + return; + } else { + dout(5) << "got " << *m << " from old/bad/imposter mds " << m->get_source() + << ", but it's an mdsmap, looking at it" << endl; + } + } + } + switch (m->get_dest_port()) { @@ -519,17 +903,6 @@ void MDS::my_dispatch(Message *m) // HACK FOR NOW - /* - static bool did_heartbeat_hack = false; - if (!shutting_down && !shut_down && - false && - !did_heartbeat_hack) { - osdmonitor->initiate_heartbeat(); - did_heartbeat_hack = true; - } - */ - - if (is_active()) { // flush log to disk after every op. for now. mdlog->flush(); @@ -549,7 +922,7 @@ void MDS::my_dispatch(Message *m) - // hash root? + // hack: force hash root? if (false && mdcache->get_root() && mdcache->get_root()->dir && @@ -560,77 +933,8 @@ void MDS::my_dispatch(Message *m) } - // periodic crap (1-second resolution) - static utime_t last_log = g_clock.recent_now(); - utime_t now = g_clock.recent_now(); - if (is_active() && - last_log.sec() != now.sec()) { - - // log - last_log = now; - mds_load_t load = balancer->get_load(); - - if (logger) { - req_rate = logger->get("req"); - - logger->set("l", (int)load.mds_load()); - logger->set("q", messenger->get_dispatch_queue_len()); - logger->set("buf", buffer_total_alloc); - - mdcache->log_stat(logger); - } - // balance? 
- static int num_bal_times = g_conf.mds_bal_max; - static utime_t first = g_clock.recent_now(); - utime_t elapsed = now; - elapsed -= first; - if (true && - whoami == 0 && - (num_bal_times || (g_conf.mds_bal_max_until >= 0 && elapsed.sec() > g_conf.mds_bal_max_until)) && - !is_stopping() && !is_stopped() && - now.sec() - last_balancer_heartbeat.sec() >= g_conf.mds_bal_interval) { - last_balancer_heartbeat = now; - balancer->send_heartbeat(); - num_bal_times--; - } - - // hash? - if (true && - g_conf.num_mds > 1 && - now.sec() - last_balancer_hash.sec() > g_conf.mds_bal_hash_interval) { - last_balancer_hash = now; - balancer->do_hashing(); - } - - - - // HACK to test hashing stuff - if (false) { - static map didhash; - if (elapsed.sec() > 15 && !didhash[whoami]) { - CInode *in = mdcache->get_inode(100000010); - if (in && in->dir) { - if (in->dir->is_auth()) - mdcache->migrator->hash_dir(in->dir); - didhash[whoami] = 1; - } - } - if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) { - CInode *in = mdcache->get_inode(100000010); - if (in && in->dir) { - if (in->dir->is_auth() && in->dir->is_hashed()) - mdcache->migrator->unhash_dir(in->dir); - didhash[whoami] = 2; - } - } - } - - - - } - // HACK to force export to test foreign renames if (false && whoami == 0) { static bool didit = false; @@ -638,7 +942,7 @@ void MDS::my_dispatch(Message *m) // 7 to 1 CInode *in = mdcache->get_inode(1001); if (in && in->is_dir() && !didit) { - CDir *dir = in->get_or_open_dir(this); + CDir *dir = in->get_or_open_dir(mdcache); if (dir->is_auth()) { dout(1) << "FORCING EXPORT" << endl; mdcache->migrator->export_dir(dir,1); @@ -652,8 +956,10 @@ void MDS::my_dispatch(Message *m) // shut down? if (is_stopping()) { if (mdcache->shutdown_pass()) { - dout(7) << "shutdown_pass=true, finished w/ shutdown" << endl; - shutdown_final(); + dout(7) << "shutdown_pass=true, finished w/ shutdown, moving to up:stopped" << endl; + + // tell monitor we shut down cleanly. 
+ set_want_state(MDSMap::STATE_STOPPED); } } @@ -682,15 +988,20 @@ void MDS::proc_message(Message *m) handle_mds_map((MMDSMap*)m); return; + case MSG_MDS_BEACON: + handle_mds_beacon((MMDSBeacon*)m); + return; + case MSG_MDS_SHUTDOWNSTART: // mds0 -> mds1+ handle_shutdown_start(m); return; - - case MSG_PING: handle_ping((MPing*)m); return; + + default: + assert(0); } } @@ -705,7 +1016,7 @@ void MDS::handle_ping(MPing *m) dout(10) << " received ping from " << m->get_source() << " with seq " << m->seq << endl; messenger->send_message(new MPingAck(m), - m->get_source(), m->get_source_inst()); + m->get_source_inst()); delete m; } diff --git a/branches/aleung/security1/ceph/mds/MDS.h b/branches/aleung/security1/ceph/mds/MDS.h index 4de10ea877914..0db713f49e585 100644 --- a/branches/aleung/security1/ceph/mds/MDS.h +++ b/branches/aleung/security1/ceph/mds/MDS.h @@ -26,14 +26,19 @@ using namespace std; #include using namespace __gnu_cxx; +#include "mdstypes.h" + #include "msg/Dispatcher.h" #include "include/types.h" #include "include/Context.h" #include "common/DecayCounter.h" #include "common/Logger.h" #include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Timer.h" #include "mon/MonMap.h" +#include "MDSMap.h" #include "ClientMap.h" @@ -71,7 +76,6 @@ using namespace CryptoLib; class filepath; -class MDSMap; class OSDMap; class Objecter; class Filer; @@ -98,13 +102,15 @@ class MClientReply; class MHashReaddir; class MHashReaddirReply; - +class MMDSBeacon; class MDS : public Dispatcher { public: Mutex mds_lock; + SafeTimer timer; + protected: int whoami; @@ -134,17 +140,10 @@ class MDS : public Dispatcher { Logger *logger, *logger2; - protected: // -- MDS state -- - static const int STATE_BOOTING = 1; // fetching mds and osd maps - static const int STATE_MKFS = 2; // creating a file system - static const int STATE_RECOVERING = 3; // recovering mds log - static const int STATE_ACTIVE = 4; // up and active! 
- static const int STATE_STOPPING = 5; - static const int STATE_STOPPED = 6; - - int state; + int state; // my confirmed state + int want_state; // the state i want list waitfor_active; // mds pub/priv keys @@ -152,15 +151,24 @@ class MDS : public Dispatcher { esignPub myPubKey; public: + map peer_mdsmap_epoch; + void queue_waitfor_active(Context *c) { waitfor_active.push_back(c); } - bool is_booting() { return state == STATE_BOOTING; } - bool is_recovering() { return state == STATE_RECOVERING; } - bool is_active() { return state == STATE_ACTIVE; } - bool is_stopping() { return state == STATE_STOPPING; } - bool is_stopped() { return state == STATE_STOPPED; } + bool is_dne() { return state == MDSMap::STATE_DNE; } + bool is_out() { return state == MDSMap::STATE_OUT; } + bool is_failed() { return state == MDSMap::STATE_FAILED; } + bool is_creating() { return state == MDSMap::STATE_CREATING; } + bool is_starting() { return state == MDSMap::STATE_STARTING; } + bool is_standby() { return state == MDSMap::STATE_STANDBY; } + bool is_replay() { return state == MDSMap::STATE_REPLAY; } + bool is_resolve() { return state == MDSMap::STATE_RESOLVE; } + bool is_rejoin() { return state == MDSMap::STATE_REJOIN; } + bool is_active() { return state == MDSMap::STATE_ACTIVE; } + bool is_stopping() { return state == MDSMap::STATE_STOPPING; } + bool is_stopped() { return state == MDSMap::STATE_STOPPED; } - void mark_active(); + void set_want_state(int s); // -- waiters -- @@ -173,7 +181,18 @@ public: finished_queue.splice( finished_queue.end(), ls ); } + // -- keepalive beacon -- + version_t beacon_last_seq; // last seq sent to monitor + map beacon_seq_stamp; // seq # -> time sent + utime_t beacon_last_acked_stamp; // last time we sent a beacon that got acked + Context *beacon_sender; + Context *beacon_killer; // next scheduled time of death + + // tick and other timer fun + Context *tick_event; + void reset_tick(); + // shutdown crap int req_rate; @@ -187,12 +206,7 @@ public: friend 
class MDStore; - - public: - - protected: - utime_t last_balancer_heartbeat, last_balancer_hash; - + public: MDS(int whoami, Messenger *m, MonMap *mm); ~MDS(); @@ -205,12 +219,13 @@ public: void send_message_mds(Message *m, int mds, int port=0, int fromport=0); // start up, shutdown - int init(); - void reopen_log(); + int init(bool standby=false); + void reopen_logger(); - void boot_mkfs(); - void boot_mkfs_finish(); - void boot_recover(int step=0); + void boot_create(); // i am new mds. + void boot_start(); // i am old but empty (was down:out) mds. + void boot_replay(int step=0); // i am recovering existing (down:failed) mds. + void boot_finish(); int shutdown_start(); int shutdown_final(); @@ -221,7 +236,14 @@ public: int hash_dentry(inodeno_t ino, const string& s) { return 0; // fixme } + + void tick(); + void beacon_start(); + void beacon_send(); + void beacon_kill(utime_t lab); + void handle_mds_beacon(MMDSBeacon *m); + void reset_beacon_killer(); // messages void proc_message(Message *m); diff --git a/branches/aleung/security1/ceph/mds/MDSMap.h b/branches/aleung/security1/ceph/mds/MDSMap.h index 051cd5cf3bdcd..e7de34313e074 100644 --- a/branches/aleung/security1/ceph/mds/MDSMap.h +++ b/branches/aleung/security1/ceph/mds/MDSMap.h @@ -29,20 +29,60 @@ using namespace CryptoLib; using namespace std; class MDSMap { + public: + // mds states + static const int STATE_DNE = 0; // down, never existed. + static const int STATE_OUT = 1; // down, once existed, but no imports, empty log. + static const int STATE_FAILED = 2; // down, holds (er, held) metadata; needs to be recovered. + + static const int STATE_STANDBY = 3; // up, but inactive. waiting for assignment by monitor. + static const int STATE_CREATING = 4; // up, creating MDS instance (new journal, idalloc..) + static const int STATE_STARTING = 5; // up, starting prior out MDS instance. 
+ static const int STATE_REPLAY = 6; // up, scanning journal, recoverying any shared state + static const int STATE_RESOLVE = 7; // up, disambiguating partial distributed operations (import/export, ...rename?) + static const int STATE_REJOIN = 8; // up, replayed journal, rejoining distributed cache + static const int STATE_ACTIVE = 9; // up, active + static const int STATE_STOPPING = 10; // up, exporting metadata (-> standby or out) + static const int STATE_STOPPED = 11; // up, finished stopping. like standby, but not avail to takeover. + + static const char *get_state_name(int s) { + switch (s) { + // down + case STATE_DNE: return "down:dne"; + case STATE_OUT: return "down:out"; + case STATE_FAILED: return "down:failed"; + // up + case STATE_STANDBY: return "up:standby"; + case STATE_CREATING: return "up:creating"; + case STATE_STARTING: return "up:starting"; + case STATE_REPLAY: return "up:replay"; + case STATE_RESOLVE: return "up:resolve"; + case STATE_REJOIN: return "up:rejoin"; + case STATE_ACTIVE: return "up:active"; + case STATE_STOPPING: return "up:stopping"; + case STATE_STOPPED: return "up:stopped"; + default: assert(0); + } + return 0; + } + protected: epoch_t epoch; utime_t ctime; - int anchortable; + int anchortable; // which MDS has anchortable (fixme someday) + int root; // which MDS has root directory - set all_mds; - set down_mds; - map mds_inst; + set mds_created; // which mds ids have initialized journals and id tables. 
+ map mds_state; // MDS state + map mds_state_seq; + map mds_inst; // up instances + map mds_inc; // incarnation count (monotonically increases) friend class MDSMonitor; public: - MDSMap() : epoch(0), anchortable(0) {} + MDSMap() : epoch(0), anchortable(0), root(0) {} epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } @@ -50,16 +90,126 @@ class MDSMap { const utime_t& get_ctime() const { return ctime; } int get_anchortable() const { return anchortable; } + int get_root() const { return root; } + + // counts + int get_num_mds() const { return mds_state.size(); } + int get_num_mds(int state) { + int n = 0; + for (map::const_iterator p = mds_state.begin(); + p != mds_state.end(); + p++) + if (p->second == state) ++n; + return n; + } + int get_num_up_mds() { + int n = 0; + for (map::const_iterator p = mds_state.begin(); + p != mds_state.end(); + p++) + if (is_up(p->first)) ++n; + return n; + } + int get_num_up_or_failed_mds() { + int n = 0; + for (map::const_iterator p = mds_state.begin(); + p != mds_state.end(); + p++) + if (is_up(p->first) || is_failed(p->first)) + ++n; + return n; + } + + // sets + void get_mds_set(set& s) { + s.clear(); + for (map::const_iterator p = mds_state.begin(); + p != mds_state.end(); + p++) + s.insert(p->first); + } + void get_up_mds_set(set& s) { + s.clear(); + for (map::const_iterator p = mds_state.begin(); + p != mds_state.end(); + p++) + if (is_up(p->first)) + s.insert(p->first); + } + void get_mds_set(set& s, int state) { + s.clear(); + for (map::const_iterator p = mds_state.begin(); + p != mds_state.end(); + p++) + if (p->second == state) + s.insert(p->first); + } + void get_active_mds_set(set& s) { + get_mds_set(s, MDSMap::STATE_ACTIVE); + } + void get_failed_mds_set(set& s) { + get_mds_set(s, MDSMap::STATE_FAILED); + } + void get_recovery_mds_set(set& s) { + s.clear(); + for (map::const_iterator p = mds_state.begin(); + p != mds_state.end(); + p++) + if (is_failed(p->first) || + is_replay(p->first) || 
is_resolve(p->first) || is_rejoin(p->first) || + is_active(p->first) || is_stopping(p->first)) + s.insert(p->first); + } + + + // mds states + bool is_down(int m) { return is_dne(m) || is_out(m) || is_failed(m); } + bool is_up(int m) { return !is_down(m); } + + bool is_dne(int m) { return mds_state.count(m) == 0 || mds_state[m] == STATE_DNE; } + bool is_out(int m) { return mds_state.count(m) && mds_state[m] == STATE_OUT; } + bool is_failed(int m) { return mds_state.count(m) && mds_state[m] == STATE_FAILED; } - int get_num_mds() const { return all_mds.size(); } - int get_num_up_mds() const { return all_mds.size() - down_mds.size(); } + bool is_standby(int m) { return mds_state.count(m) && mds_state[m] == STATE_STANDBY; } + bool is_creating(int m) { return mds_state.count(m) && mds_state[m] == STATE_CREATING; } + bool is_starting(int m) { return mds_state.count(m) && mds_state[m] == STATE_STARTING; } + bool is_replay(int m) { return mds_state.count(m) && mds_state[m] == STATE_REPLAY; } + bool is_resolve(int m) { return mds_state.count(m) && mds_state[m] == STATE_RESOLVE; } + bool is_rejoin(int m) { return mds_state.count(m) && mds_state[m] == STATE_REJOIN; } + bool is_active(int m) { return mds_state.count(m) && mds_state[m] == STATE_ACTIVE; } + bool is_stopping(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPING; } + bool is_stopped(int m) { return mds_state.count(m) && mds_state[m] == STATE_STOPPED; } + + bool has_created(int m) { return mds_created.count(m); } + + // cluster states + bool is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set. 
+ return get_num_mds(STATE_REPLAY) + + get_num_mds(STATE_RESOLVE) + + get_num_mds(STATE_REJOIN) + + get_num_mds(STATE_FAILED); + } + /*bool is_resolving() { // nodes are resolving distributed ops + return get_num_mds(STATE_RESOLVE); + }*/ + bool is_rejoining() { + // nodes are rejoining cache state + return get_num_mds(STATE_REJOIN) > 0 && + get_num_mds(STATE_RESOLVE) == 0 && + get_num_mds(STATE_REPLAY) == 0 && + get_num_mds(STATE_FAILED) == 0; + } - const set& get_mds() const { return all_mds; } - const set& get_down_mds() const { return down_mds; } - bool is_down(int m) const { return down_mds.count(m); } - bool is_up(int m) const { return !is_down(m); } + int get_state(int m) { + if (mds_state.count(m)) return mds_state[m]; + return STATE_OUT; + } + // inst + bool have_inst(int m) { + return mds_inst.count(m); + } const entity_inst_t& get_inst(int m) { assert(mds_inst.count(m)); return mds_inst[m]; @@ -72,25 +222,47 @@ class MDSMap { return false; } - int get_inst_rank(const entity_inst_t& inst) { + int get_inst_rank(const entity_addr_t& addr) { for (map::iterator p = mds_inst.begin(); p != mds_inst.end(); ++p) { - if (p->second == inst) return p->first; + if (p->second.addr == addr) return p->first; } + /*else + for (map::iterator p = mds_inst.begin(); + p != mds_inst.end(); + ++p) { + if (memcmp(&p->second.addr,&inst.addr, sizeof(inst.addr)) == 0) return p->first; + } + */ + return -1; } + int get_inc(int m) { + assert(mds_inc.count(m)); + return mds_inc[m]; + } + + + void remove_mds(int m) { + mds_inst.erase(m); + mds_state.erase(m); + mds_state_seq.erase(m); + } + // serialize, unserialize void encode(bufferlist& blist) { blist.append((char*)&epoch, sizeof(epoch)); blist.append((char*)&ctime, sizeof(ctime)); blist.append((char*)&anchortable, sizeof(anchortable)); + blist.append((char*)&root, sizeof(root)); - _encode(all_mds, blist); - _encode(down_mds, blist); - _encode(mds_inst, blist); + ::_encode(mds_state, blist); + ::_encode(mds_state_seq, blist); + 
::_encode(mds_inst, blist); + ::_encode(mds_inc, blist); } void decode(bufferlist& blist) { @@ -101,10 +273,13 @@ class MDSMap { off += sizeof(ctime); blist.copy(off, sizeof(anchortable), (char*)&anchortable); off += sizeof(anchortable); + blist.copy(off, sizeof(root), (char*)&root); + off += sizeof(root); - _decode(all_mds, blist, off); - _decode(down_mds, blist, off); - _decode(mds_inst, blist, off); + ::_decode(mds_state, blist, off); + ::_decode(mds_state_seq, blist, off); + ::_decode(mds_inst, blist, off); + ::_decode(mds_inc, blist, off); } diff --git a/branches/aleung/security1/ceph/mds/MDStore.cc b/branches/aleung/security1/ceph/mds/MDStore.cc index 433d631dfa5ca..13aa270a2ee6c 100644 --- a/branches/aleung/security1/ceph/mds/MDStore.cc +++ b/branches/aleung/security1/ceph/mds/MDStore.cc @@ -228,7 +228,7 @@ void MDStore::fetch_dir_hash_2( bufferlist& bl, } // make sure we have a CDir - CDir *dir = idir->get_or_open_dir(mds); + CDir *dir = idir->get_or_open_dir(mds->mdcache); // do it dout(7) << "fetch_dir_hash_2 hashcode " << hashcode << " dir " << *dir << endl; @@ -281,7 +281,7 @@ void MDStore::fetch_dir_hash_2( bufferlist& bl, // what to do? if (hashcode >= 0) { - int dentryhashcode = mds->hash_dentry( dir->ino(), dname ); + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname ); assert(dentryhashcode == hashcode); } @@ -322,7 +322,7 @@ void MDStore::fetch_dir_hash_2( bufferlist& bl, // what to do? if (hashcode >= 0) { - int dentryhashcode = mds->hash_dentry( dir->ino(), dname ); + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dname ); assert(dentryhashcode == hashcode); } @@ -335,10 +335,11 @@ void MDStore::fetch_dir_hash_2( bufferlist& bl, dout(12) << "readdir had dentry " << dname << endl; // under water? 
- if (dn->get_inode()->get_parent_dir_version() <= got_version) { + if (dn->get_version() <= got_version) { + assert(dn->get_inode()->get_version() <= got_version); dout(10) << "readdir had underwater dentry " << dname << " and inode, marking clean" << endl; - dn->get_inode()->mark_clean(); dn->mark_clean(); + dn->get_inode()->mark_clean(); } } continue; @@ -412,12 +413,16 @@ public: CInode *in = mds->mdcache->get_inode(ino); assert(in && in->dir); if (in && in->dir && in->dir->is_auth()) { - dout(7) << "CommitDirVerify: current version = " << in->dir->get_version() << endl; - dout(7) << "CommitDirVerify: last committed = " << in->dir->get_last_committed_version() << endl; - dout(7) << "CommitDirVerify: required = " << version << endl; + dout(7) << "CommitDirVerify: current = " << in->dir->get_version() + << ", last committed = " << in->dir->get_last_committed_version() + << ", required = " << version << endl; if (in->dir->get_last_committed_version() >= version) { dout(7) << "my required version is safe, done." << endl; + if (c) { + c->finish(0); + delete c; + } } else { dout(7) << "my required version is still not safe, committing again." << endl; @@ -425,13 +430,15 @@ public: mds->mdstore->commit_dir(in->dir, version, c); - return; } + return; } - } - + } + // must have exported ors omethign! dout(7) << "can't retry commit dir on " << ino << ", must have exported?" << endl; + + // finish. 
if (c) { c->finish(-1); delete c; @@ -613,16 +620,10 @@ void MDStore::commit_dir_slice( CDir *dir, CDentry *dn = it->second; if (hashcode >= 0) { - int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); if (dentryhashcode != hashcode) continue; } - // put dentry in this version - if (dn->is_dirty()) { - dn->float_parent_dir_version( dir->get_version() ); - dout(12) << " dirty dn " << *dn << " now " << dn->get_parent_dir_version() << endl; - } - if (dn->is_null()) continue; // skipping negative entry // primary or remote? @@ -653,18 +654,6 @@ void MDStore::commit_dir_slice( CDir *dir, dout(18) << " inlcuding symlink ptr " << in->symlink << endl; dirdata.append( (char*) in->symlink.c_str(), in->symlink.length() + 1); } - - // put inode in this dir version - if (in->is_dirty()) { - in->float_parent_dir_version( dir->get_version() ); - dout(12) << " dirty inode " << *in << " now " << in->get_parent_dir_version() << endl; - - in->set_committing_version( in->get_version() ); - assert(in->get_last_committed_version() < in->get_committing_version()); - } else { - assert(in->get_committing_version() == in->get_version()); - } - } num++; @@ -707,62 +696,39 @@ void MDStore::commit_dir_slice_2( int result, it++; if (hashcode >= 0) { - int dentryhashcode = mds->hash_dentry( dir->ino(), dn->get_name() ); + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), dn->get_name() ); if (dentryhashcode != hashcode) continue; } // dentry - if (committed_version > dn->get_parent_dir_version()) { - dout(15) << " dir " << committed_version << " > dn " << dn->get_parent_dir_version() << " still clean " << *dn << endl; - assert(!dn->is_dirty()); - } - else if (dn->get_parent_dir_version() == committed_version) { - dout(15) << " dir " << committed_version << " == dn " << dn->get_parent_dir_version() << " now clean " << *dn << endl; - if (dn->is_dirty()) - dn->mark_clean(); // might not but could be dirty 
- - // remove, if it's null and unlocked - if (dn->is_null() && dn->is_sync()) { - dout(15) << " removing clean and null " << *dn << endl; - null_clean.push_back(dn); - continue; - } + if (committed_version >= dn->get_version()) { + if (dn->is_dirty()) { + dout(15) << " dir " << committed_version << " >= dn " << dn->get_version() << " now clean " << *dn << endl; + dn->mark_clean(); + } } else { - dout(15) << " dir " << committed_version << " < dn " << dn->get_parent_dir_version() << " still dirty " << *dn << endl; - assert(committed_version < dn->get_parent_dir_version()); - //assert(dn->is_dirty() || !dn->is_sync()); // -OR- we did a fetch_dir in order to do a newer commit... + dout(15) << " dir " << committed_version << " < dn " << dn->get_version() << " still dirty " << *dn << endl; } // only do primary... - if (!dn->is_primary()) continue; + if (!dn->is_primary()) + continue; CInode *in = dn->get_inode(); assert(in); assert(in->is_auth()); - if (in->get_committing_version()) - in->set_committed_version(); - - if (committed_version > in->get_parent_dir_version()) { - dout(15) << " dir " << committed_version << " > inode " << in->get_parent_dir_version() << " still clean " << *(in) << endl; - assert(!in->is_dirty()); - } - else if (in->get_parent_dir_version() == committed_version) { - dout(15) << " dir " << committed_version << " == inode " << in->get_parent_dir_version() << " now clean " << *(in) << endl; - in->mark_clean(); // might not but could be dirty + if (committed_version >= in->get_version()) { + if (in->is_dirty()) { + dout(15) << " dir " << committed_version << " >= inode " << in->get_version() << " now clean " << *in << endl; + in->mark_clean(); + } } else { - dout(15) << " dir " << committed_version << " < inode " << in->get_parent_dir_version() << " still dirty " << *(in) << endl; - assert(committed_version < in->get_parent_dir_version()); - //assert(in->is_dirty()); // -OR- we did a fetch_dir in order to do a newer commit... 
+ dout(15) << " dir " << committed_version << " < inode " << in->get_version() << " still dirty " << *in << endl; + assert(in->is_dirty()); } } - // remove null clean dentries - for (list::iterator it = null_clean.begin(); - it != null_clean.end(); - it++) - dir->remove_dentry(*it); - // unpin dir->auth_unpin(); diff --git a/branches/aleung/security1/ceph/mds/Migrator.cc b/branches/aleung/security1/ceph/mds/Migrator.cc index 82c7970e072c7..5d14bfbee4283 100644 --- a/branches/aleung/security1/ceph/mds/Migrator.cc +++ b/branches/aleung/security1/ceph/mds/Migrator.cc @@ -19,6 +19,8 @@ #include "CDentry.h" #include "Migrator.h" #include "Locker.h" +#include "MDStore.h" +#include "Migrator.h" #include "MDBalancer.h" #include "MDLog.h" @@ -26,8 +28,11 @@ #include "include/filepath.h" -#include "events/EInodeUpdate.h" -#include "events/EDirUpdate.h" +#include "events/EString.h" +#include "events/EExportStart.h" +#include "events/EExportFinish.h" +#include "events/EImportStart.h" +#include "events/EImportFinish.h" #include "msg/Messenger.h" @@ -59,6 +64,11 @@ #include "messages/MUnhashDirNotifyAck.h" +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".migrator " + + void Migrator::dispatch(Message *m) { @@ -199,8 +209,150 @@ void Migrator::export_empty_import(CDir *dir) } + + +// ========================================================== +// mds failure handling + +void Migrator::handle_mds_failure(int who) +{ + dout(5) << "handle_mds_failure mds" << who << endl; + + // check my exports + map::iterator p = export_state.begin(); + while (p != export_state.end()) { + map::iterator next = p; + next++; + CDir *dir = p->first; + + if (export_peer[dir] == who) { + // the guy i'm exporting to failed. + // clean up. 
+ dout(10) << "cleaning up export state " << p->second << " of " << *dir << endl; + + switch (p->second) { + case EXPORT_DISCOVERING: + dout(10) << "state discovering : canceling freeze and removing auth_pin" << endl; + dir->unfreeze_tree(); // cancel the freeze + dir->auth_unpin(); // remove the auth_pin (that was holding up the freeze) + break; + + case EXPORT_FREEZING: + dout(10) << "state freezing : canceling freeze" << endl; + dir->unfreeze_tree(); // cancel the freeze + break; + + case EXPORT_LOGGINGSTART: + case EXPORT_PREPPING: + dout(10) << "state loggingstart|prepping : logging EExportFinish(false)" << endl; + mds->mdlog->submit_entry(new EExportFinish(dir,false)); + // logger will unfreeze. + break; + + case EXPORT_EXPORTING: + dout(10) << "state exporting : logging EExportFinish(false), reversing, and unfreezing" << endl; + mds->mdlog->submit_entry(new EExportFinish(dir,false)); + reverse_export(dir); + dir->unfreeze_tree(); + break; + + case EXPORT_LOGGINGFINISH: + dout(10) << "state loggingfinish : doing nothing, we were successful." << endl; + break; + + default: + assert(0); + } + + export_state.erase(dir); + export_peer.erase(dir); + + // unpin the path + vector trace; + cache->make_trace(trace, dir->inode); + cache->path_unpin(trace, 0); + + // wake up any waiters + mds->queue_finished(export_finish_waiters[dir]); + export_finish_waiters.erase(dir); + + // send pending import_maps? + mds->mdcache->send_pending_import_maps(); + + mds->mdcache->show_imports(); + mds->mdcache->show_cache(); + } else { + // third party failed. potential peripheral damage? + if (p->second == EXPORT_EXPORTING) { + // yeah, i'm waiting for acks, let's fake theirs. 
+ if (export_notify_ack_waiting[dir].count(who)) { + dout(10) << "faking export_dir_notify_ack from mds" << who + << " on " << *dir << " to mds" << export_peer[dir] + << endl; + export_notify_ack_waiting[dir].erase(who); + if (export_notify_ack_waiting[dir].empty()) + export_dir_acked(dir); + } + } + } + + // next! + p = next; + } + + + // check my imports + map::iterator q = import_state.begin(); + while (q != import_state.end()) { + map::iterator next = q; + next++; + inodeno_t dirino = q->first; + CInode *diri = mds->mdcache->get_inode(dirino); + CDir *dir = 0; + if (diri) + dir = diri->dir; + + if (import_peer[dirino] == who) { + switch (import_peer[dirino]) { + case IMPORT_DISCOVERED: + + break; + + case IMPORT_PREPPING: + + break; + + case IMPORT_PREPPED: + + break; + + case IMPORT_LOGGINGSTART: + + break; + + case IMPORT_ACKING: + // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate + // ... + break; + + case IMPORT_LOGGINGFINISH: + // do nothing, exporter is no longer involved. + break; + } + } + + // next! + q = next; + } +} + + + + + + // ========================================================== -// IMPORT/EXPORT +// EXPORT class C_MDC_ExportFreeze : public Context { @@ -212,7 +364,8 @@ public: C_MDC_ExportFreeze(Migrator *m, CDir *e, int d) : mig(m), ex(e), dest(d) {} virtual void finish(int r) { - mig->export_dir_frozen(ex, dest); + if (r >= 0) + mig->export_dir_frozen(ex, dest); } }; @@ -223,12 +376,17 @@ public: * will fail if the directory is freezing, frozen, unpinnable, or root. 
*/ void Migrator::export_dir(CDir *dir, - int dest) + int dest) { dout(7) << "export_dir " << *dir << " to " << dest << endl; assert(dest != mds->get_nodeid()); assert(!dir->is_hashed()); + if (mds->mdsmap->is_degraded()) { + dout(7) << "cluster degraded, no exports for now" << endl; + return; + } + if (dir->inode->is_root()) { dout(7) << "i won't export root" << endl; assert(0); @@ -255,16 +413,17 @@ void Migrator::export_dir(CDir *dir, } // ok, let's go. + assert(export_state.count(dir) == 0); + export_state[dir] = EXPORT_DISCOVERING; + export_peer[dir] = dest; // send ExportDirDiscover (ask target) - export_gather[dir].insert(dest); mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR); dir->auth_pin(); // pin dir, to hang up our freeze (unpin on prep ack) // take away the popularity we're sending. FIXME: do this later? mds->balancer->subtract_export(dir); - // freeze the subtree dir->freeze_tree(new C_MDC_ExportFreeze(this, dir, dest)); } @@ -281,29 +440,40 @@ void Migrator::handle_export_dir_discover_ack(MExportDirDiscoverAck *m) CDir *dir = in->dir; assert(dir); - int from = m->get_source().num(); - assert(export_gather[dir].count(from)); - export_gather[dir].erase(from); + dout(7) << "export_dir_discover_ack from " << m->get_source() + << " on " << *dir << ", releasing auth_pin" << endl; - if (export_gather[dir].empty()) { - dout(7) << "export_dir_discover_ack " << *dir << ", releasing auth_pin" << endl; - dir->auth_unpin(); // unpin to allow freeze to complete - } else { - dout(7) << "export_dir_discover_ack " << *dir << ", still waiting for " << export_gather[dir] << endl; - } + export_state[dir] = EXPORT_FREEZING; + + dir->auth_unpin(); // unpin to allow freeze to complete delete m; // done } +class C_MDC_ExportStartLogged : public Context { + Migrator *mig; + CDir *ex; // dir i'm exporting + int dest; + MExportDirPrep *prep; + +public: + C_MDC_ExportStartLogged(Migrator *m, CDir *e, int d, MExportDirPrep *p) : + mig(m), 
ex(e), dest(d), prep(p) {} + virtual void finish(int r) { + mig->export_dir_frozen_logged(ex, prep, dest); + } +}; void Migrator::export_dir_frozen(CDir *dir, int dest) { // subtree is now frozen! dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl; + export_state[dir] = EXPORT_LOGGINGSTART; show_imports(); + EExportStart *le = new EExportStart(dir, dest); MExportDirPrep *prep = new MExportDirPrep(dir->inode); // include spanning tree for all nested exports. @@ -311,9 +481,10 @@ void Migrator::export_dir_frozen(CDir *dir, // dir_auth updates on any nested exports are properly absorbed. set inodes_added; - + // include base dir - prep->add_dir( new CDirDiscover(dir, dir->open_by_add(dest)) ); + prep->add_dir( new CDirDiscover(dir, dir->add_replica(dest)) ); + le->metablob.add_dir( dir, false ); // also include traces to all nested exports. set my_nested; @@ -326,6 +497,9 @@ void Migrator::export_dir_frozen(CDir *dir, dout(7) << " including nested export " << *exp << " in prep" << endl; prep->add_export( exp->ino() ); + le->get_bounds().insert(exp->ino()); + le->metablob.add_dir_context( exp ); + le->metablob.add_dir( exp, false ); /* first assemble each trace, in trace order, and put in message */ list inode_trace; @@ -347,7 +521,7 @@ void Migrator::export_dir_frozen(CDir *dir, // include dir? note: this'll include everything except the nested exports themselves, // since someone else is obviously auth. if (cur->is_auth()) { - prep->add_dir( new CDirDiscover(cur, cur->open_by_add(dest)) ); // yay! + prep->add_dir( new CDirDiscover(cur, cur->add_replica(dest)) ); // yay! dout(7) << " added " << *cur << endl; } @@ -359,14 +533,32 @@ void Migrator::export_dir_frozen(CDir *dir, it++) { CInode *in = *it; dout(7) << " added " << *in << endl; - prep->add_inode( in->parent->dir->ino(), - in->parent->name, + prep->add_inode( in->parent->get_dir()->ino(), + in->parent->get_name(), in->replicate_to(dest) ); } } - // send it! 
+ // log our intentions + dout(7) << " logging EExportStart" << endl; + mds->mdlog->submit_entry(le, new C_MDC_ExportStartLogged(this, dir, dest, prep)); +} + +void Migrator::export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest) +{ + dout(7) << "export_dir_frozen_logged " << *dir << endl; + + if (export_state.count(dir) == 0 || + export_state[dir] != EXPORT_LOGGINGSTART) { + // export must have aborted. + dout(7) << "export must have aborted, unfreezing and deleting me old prep message" << endl; + delete prep; + dir->unfreeze_tree(); // cancel the freeze + return; + } + + export_state[dir] = EXPORT_PREPPING; mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR); } @@ -379,7 +571,16 @@ void Migrator::handle_export_dir_prep_ack(MExportDirPrepAck *m) dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl; + if (export_state.count(dir) == 0 || + export_state[dir] != EXPORT_PREPPING) { + // export must have aborted. + dout(7) << "export must have aborted, unfreezing" << endl; + dir->unfreeze_tree(); + return; + } + // start export. 
+ export_state[dir] = EXPORT_EXPORTING; export_dir_go(dir, m->get_source().num()); // done @@ -388,16 +589,14 @@ void Migrator::handle_export_dir_prep_ack(MExportDirPrepAck *m) void Migrator::export_dir_go(CDir *dir, - int dest) + int dest) { dout(7) << "export_dir_go " << *dir << " to " << dest << endl; show_imports(); - - // build export message - MExportDir *req = new MExportDir(dir->inode); // include pop - + assert(export_bounds.count(dir) == 0); + assert(export_data.count(dir) == 0); // update imports/exports CDir *containing_import = cache->get_auth_container(dir); @@ -407,7 +606,7 @@ void Migrator::export_dir_go(CDir *dir, assert(dir->is_import()); cache->imports.erase(dir); dir->state_clear(CDIR_STATE_IMPORT); - dir->put(CDIR_PIN_IMPORT); // unpin, no longer an import + dir->put(CDir::PIN_IMPORT); // unpin, no longer an import // discard nested exports (that we're handing off for (set::iterator p = cache->nested_exports[dir].begin(); @@ -416,7 +615,7 @@ void Migrator::export_dir_go(CDir *dir, p++; // add to export message - req->add_export(nested); + export_bounds[dir].insert(nested); // nested beneath our new export *in; remove! 
dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl; @@ -430,7 +629,7 @@ void Migrator::export_dir_go(CDir *dir, cache->nested_exports[containing_import].insert(dir); dir->state_set(CDIR_STATE_EXPORT); - dir->get(CDIR_PIN_EXPORT); // i must keep it pinned + dir->get(CDir::PIN_EXPORT); // i must keep it pinned // discard nested exports (that we're handing off) for (set::iterator p = cache->nested_exports[containing_import].begin(); @@ -452,7 +651,7 @@ void Migrator::export_dir_go(CDir *dir, // exports.erase(nested); _walk does this // add to msg - req->add_export(nested); + export_bounds[dir].insert(nested); } else { dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl; assert(cache->get_auth_container(containing_export) != containing_import); @@ -466,32 +665,45 @@ void Migrator::export_dir_go(CDir *dir, else dir->set_dir_auth( dest ); + // make list of nodes i expect an export_dir_notify_ack from // (everyone w/ this dir open, but me!) 
assert(export_notify_ack_waiting[dir].empty()); - for (set::iterator it = dir->open_by.begin(); - it != dir->open_by.end(); + for (map::iterator it = dir->replicas_begin(); + it != dir->replicas_end(); it++) { - if (*it == mds->get_nodeid()) continue; - export_notify_ack_waiting[dir].insert( *it ); + if (it->first == mds->get_nodeid()) continue; + export_notify_ack_waiting[dir].insert( it->first ); // send warning to all but dest - if (*it != dest) { - dout(10) << " sending export_dir_warning to mds" << *it << endl; - mds->send_message_mds(new MExportDirWarning( dir->ino() ), *it, MDS_PORT_MIGRATOR); + if (it->first != dest) { + dout(10) << " sending export_dir_warning to mds" << it->first << endl; + mds->send_message_mds(new MExportDirWarning( dir->ino() ), it->first, MDS_PORT_MIGRATOR); } } assert(export_notify_ack_waiting[dir].count( dest )); // fill export message with cache data - C_Contexts *fin = new C_Contexts; - int num_exported_inodes = export_dir_walk( req, + C_Contexts *fin = new C_Contexts; // collect all the waiters + int num_exported_inodes = encode_export_dir( export_data[dir], fin, dir, // base dir, // recur start point dest ); // send the export data! + MExportDir *req = new MExportDir(dir->ino()); + + // export state + req->set_dirstate( export_data[dir] ); + + // add bounds + for (set::iterator p = export_bounds[dir].begin(); + p != export_bounds[dir].end(); + ++p) + req->add_export((*p)->ino()); + + //s end mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); // queue up the finisher @@ -513,8 +725,6 @@ void Migrator::export_dir_go(CDir *dir, */ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth) { - in->inode.version++; // so local log entries are ignored, etc. (FIXME ??) - // tell (all) clients about migrating caps.. 
mark STALE for (map::iterator it = in->client_caps.begin(); it != in->client_caps.end(); @@ -525,16 +735,16 @@ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_au it->second.pending(), it->second.wanted(), MClientFileCaps::FILECAP_STALE); - mds->messenger->send_message(m, MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first), + mds->messenger->send_message(m, mds->clientmap.get_inst(it->first), 0, MDS_PORT_CACHE); } // relax locks? - if (!in->is_cached_by_anyone()) + if (!in->is_replicated()) in->replicate_relax_locks(); // add inode - assert(in->cached_by.count(mds->get_nodeid()) == 0); + assert(!in->is_replica(mds->get_nodeid())); CInodeExport istate( in ); istate._encode( enc_state ); @@ -544,7 +754,7 @@ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_au if (in->is_dirty()) in->mark_clean(); // clear/unpin cached_by (we're no longer the authority) - in->cached_by_clear(); + in->clear_replicas(); // twiddle lock states for auth -> replica transition // hard @@ -580,20 +790,23 @@ void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_au // *** other state too? // move to end of LRU so we drop out of cache quickly! 
- cache->lru.lru_bottouch(in); + if (in->get_parent_dn()) + cache->lru.lru_bottouch(in->get_parent_dn()); } -int Migrator::export_dir_walk(MExportDir *req, - C_Contexts *fin, - CDir *basedir, - CDir *dir, - int newauth) +int Migrator::encode_export_dir(list& dirstatelist, + C_Contexts *fin, + CDir *basedir, + CDir *dir, + int newauth) { int num_exported = 0; dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl; + assert(dir->get_projected_version() == dir->get_version()); + // dir bufferlist enc_dir; @@ -601,7 +814,7 @@ int Migrator::export_dir_walk(MExportDir *req, dstate._encode( enc_dir ); // release open_by - dir->open_by_clear(); + dir->clear_replicas(); // mark assert(dir->is_auth()); @@ -610,7 +823,7 @@ int Migrator::export_dir_walk(MExportDir *req, // proxy dir->state_set(CDIR_STATE_PROXY); - dir->get(CDIR_PIN_PROXY); + dir->get(CDir::PIN_PROXY); export_proxy_dirinos[basedir].push_back(dir->ino()); list subdirs; @@ -637,7 +850,7 @@ int Migrator::export_dir_walk(MExportDir *req, CDir_map_t::iterator it; for (it = dir->begin(); it != dir->end(); it++) { CDentry *dn = it->second; - CInode *in = dn->inode; + CInode *in = dn->get_inode(); num_exported++; @@ -649,7 +862,10 @@ int Migrator::export_dir_walk(MExportDir *req, enc_dir.append("D", 1); // dirty else enc_dir.append("C", 1); // clean - + + version_t dnv = dn->get_version(); + enc_dir.append((char*)&dnv, sizeof(dnv)); + // null dentry? if (dn->is_null()) { enc_dir.append("N", 1); // null dentry @@ -687,7 +903,7 @@ int Migrator::export_dir_walk(MExportDir *req, cache->exports.erase(in->dir); // discard nested export (nested_exports updated above) in->dir->state_clear(CDIR_STATE_EXPORT); - in->dir->put(CDIR_PIN_EXPORT); + in->dir->put(CDir::PIN_EXPORT); // simplify dir_auth? 
if (in->dir->get_dir_auth() == newauth) @@ -697,8 +913,8 @@ int Migrator::export_dir_walk(MExportDir *req, // add to proxy export_proxy_inos[basedir].push_back(in->ino()); - in->state_set(CINODE_STATE_PROXY); - in->get(CINODE_PIN_PROXY); + in->state_set(CInode::STATE_PROXY); + in->get(CInode::PIN_PROXY); // waiters list waiters; @@ -707,16 +923,30 @@ int Migrator::export_dir_walk(MExportDir *req, } } - req->add_dir( enc_dir ); + // add to dirstatelist + bufferlist bl; + dirstatelist.push_back( bl ); + dirstatelist.back().claim( enc_dir ); // subdirs for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) - num_exported += export_dir_walk(req, fin, basedir, *it, newauth); + num_exported += encode_export_dir(dirstatelist, fin, basedir, *it, newauth); return num_exported; } +class C_MDS_ExportFinishLogged : public Context { + Migrator *migrator; + CDir *dir; +public: + C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : migrator(m), dir(d) {} + void finish(int r) { + migrator->export_dir_finish(dir); + } +}; + + /* * i should get an export_dir_notify_ack from every mds that had me open, including the new auth (an ack) */ @@ -732,107 +962,233 @@ void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m) assert(export_notify_ack_waiting[dir].count(from)); export_notify_ack_waiting[dir].erase(from); + dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from + << ", still need (" << export_notify_ack_waiting[dir] << ")" << endl; + // done? - if (!export_notify_ack_waiting[dir].empty()) { - dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from - << ", still waiting for " << export_notify_ack_waiting[dir] << endl; - + if (export_notify_ack_waiting[dir].empty()) { + export_dir_acked(dir); } else { dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from - << ", last one!" << endl; + << ", still waiting for " << export_notify_ack_waiting[dir] << endl; + } + + delete m; +} - // ok, we're finished! 
- export_notify_ack_waiting.erase(dir); - // finish export (unfreeze, trigger finish context, etc.) - export_dir_finish(dir); - // unpin proxies - // inodes - for (list::iterator it = export_proxy_inos[dir].begin(); - it != export_proxy_inos[dir].end(); - it++) { - CInode *in = cache->get_inode(*it); - in->put(CINODE_PIN_PROXY); - assert(in->state_test(CINODE_STATE_PROXY)); - in->state_clear(CINODE_STATE_PROXY); - } - export_proxy_inos.erase(dir); +/* + * this happens if hte dest failes after i send teh export data but before it is acked + * that is, we don't know they safely received and logged it, so we reverse our changes + * and go on. + */ +void Migrator::reverse_export(CDir *dir) +{ + dout(7) << "reverse_export " << *dir << endl; + + assert(export_state[dir] == EXPORT_EXPORTING); + assert(export_bounds.count(dir)); + assert(export_data.count(dir)); + + // re-import it. + set bounds; + bounds.swap(export_bounds[dir]); + export_bounds.erase(dir); + + // -- adjust dir_auth -- + // base + CDir *im = dir; + if (dir->get_inode()->authority() == mds->get_nodeid()) { + // parent is already me. was export, adding back to existing import. + im = mds->mdcache->get_auth_container(dir); + assert(im); + mds->mdcache->nested_exports[im].erase(dir); + mds->mdcache->exports.erase(dir); + dir->set_dir_auth( CDIR_AUTH_PARENT ); + dir->state_clear(CDIR_STATE_EXPORT); + dir->put(CDir::PIN_EXPORT); + } else { + // parent isn't me. new import. 
+ mds->mdcache->imports.insert(dir); + dir->set_dir_auth( mds->get_nodeid() ); + dir->state_set(CDIR_STATE_IMPORT); + dir->get(CDir::PIN_IMPORT); + } - // dirs - for (list::iterator it = export_proxy_dirinos[dir].begin(); - it != export_proxy_dirinos[dir].end(); - it++) { - CDir *dir = cache->get_inode(*it)->dir; - dir->put(CDIR_PIN_PROXY); - assert(dir->state_test(CDIR_STATE_PROXY)); - dir->state_clear(CDIR_STATE_PROXY); - - // hose neg dentries, too, since we're no longer auth - CDir_map_t::iterator it; - for (it = dir->begin(); it != dir->end(); ) { - CDentry *dn = it->second; - it++; - if (dn->is_null()) { - assert(dn->is_sync()); - dir->remove_dentry(dn); - } else { - //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl; - if (dn->is_dirty()) - dn->mark_clean(); - } - } + dout(10) << " base " << *dir << endl; + if (dir != im) + dout(10) << " under " << *im << endl; + + // bounds + for (set::iterator p = bounds.begin(); + p != bounds.end(); + ++p) { + CDir *bd = *p; + + if (bd->get_dir_auth() == mds->get_nodeid()) { + // still me. was an import. + mds->mdcache->imports.erase(bd); + bd->set_dir_auth( CDIR_AUTH_PARENT ); + bd->state_clear(CDIR_STATE_IMPORT); + bd->put(CDir::PIN_IMPORT); + // move nested exports. + for (set::iterator q = mds->mdcache->nested_exports[bd].begin(); + q != mds->mdcache->nested_exports[bd].end(); + ++q) + mds->mdcache->nested_exports[im].insert(*q); + mds->mdcache->nested_exports.erase(bd); + } else { + // not me anymore. now an export. 
+ mds->mdcache->exports.insert(bd); + mds->mdcache->nested_exports[im].insert(bd); + assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); + bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); + bd->state_set(CDIR_STATE_EXPORT); + bd->get(CDir::PIN_EXPORT); } - export_proxy_dirinos.erase(dir); + + dout(10) << " bound " << *bd << endl; + } + + // reimport the dirs + list imported_subdirs; + int num_imported_inodes = 0; + + for (list::iterator p = export_data[dir].begin(); + p != export_data[dir].end(); + ++p) { + num_imported_inodes += + decode_import_dir(*p, + export_peer[dir], + dir, // import root + imported_subdirs, + 0); } - delete m; + // remove proxy bits + clear_export_proxy_pins(dir); + + // some clean up + export_data.erase(dir); + export_bounds.erase(dir); + export_notify_ack_waiting.erase(dir); } +void Migrator::export_dir_acked(CDir *dir) +{ + dout(7) << "export_dir_acked " << *dir << endl; + export_notify_ack_waiting.erase(dir); + + export_state[dir] = EXPORT_LOGGINGFINISH; + export_data.erase(dir); + export_bounds.erase(dir); + + // log export completion, then finish (unfreeze, trigger finish context, etc.) + mds->mdlog->submit_entry(new EExportFinish(dir, true), + new C_MDS_ExportFinishLogged(this, dir)); +} + + /* * once i get all teh notify_acks i can finish */ void Migrator::export_dir_finish(CDir *dir) { - // exported! + dout(7) << "export_dir_finish " << *dir << endl; - - // FIXME log it - - // send finish to new auth - mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); - + if (export_state.count(dir)) { + // send finish/commit to new auth + mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); + + // remove from exporting list + export_state.erase(dir); + export_peer.erase(dir); + } else { + dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." 
<< endl; + } + // unfreeze - dout(7) << "export_dir_finish " << *dir << ", unfreezing" << endl; + dout(7) << "export_dir_finish unfreezing" << endl; dir->unfreeze_tree(); - + // unpin path dout(7) << "export_dir_finish unpinning path" << endl; vector trace; cache->make_trace(trace, dir->inode); cache->path_unpin(trace, 0); + // unpin proxies + clear_export_proxy_pins(dir); + + // queue finishers + mds->queue_finished(export_finish_waiters[dir]); + export_finish_waiters.erase(dir); // stats if (mds->logger) mds->logger->set("nex", cache->exports.size()); show_imports(); -} - + // send pending import_maps? + mds->mdcache->send_pending_import_maps(); +} +void Migrator::clear_export_proxy_pins(CDir *dir) +{ + dout(10) << "clear_export_proxy_pins " << *dir << endl; + // inodes + for (list::iterator it = export_proxy_inos[dir].begin(); + it != export_proxy_inos[dir].end(); + it++) { + CInode *in = cache->get_inode(*it); + dout(15) << " " << *in << endl; + in->put(CInode::PIN_PROXY); + assert(in->state_test(CInode::STATE_PROXY)); + in->state_clear(CInode::STATE_PROXY); + } + export_proxy_inos.erase(dir); + + // dirs + for (list::iterator it = export_proxy_dirinos[dir].begin(); + it != export_proxy_dirinos[dir].end(); + it++) { + CDir *dir = cache->get_inode(*it)->dir; + dout(15) << " " << *dir << endl; + dir->put(CDir::PIN_PROXY); + assert(dir->state_test(CDIR_STATE_PROXY)); + dir->state_clear(CDIR_STATE_PROXY); + + // hose neg dentries, too, since we're no longer auth + CDir_map_t::iterator it; + for (it = dir->begin(); it != dir->end(); ) { + CDentry *dn = it->second; + it++; + if (dn->is_null()) { + assert(dn->is_sync()); + dir->remove_dentry(dn); + } else { + //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl; + if (dn->is_dirty()) + dn->mark_clean(); + } + } + } + export_proxy_dirinos.erase(dir); +} +// ========================================================== +// IMPORT -// IMPORTS class C_MDC_ExportDirDiscover : public Context { Migrator 
*mig; @@ -891,10 +1247,14 @@ void Migrator::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, i } // pin inode in the cache (for now) - in->get(CINODE_PIN_IMPORTING); + in->get(CInode::PIN_IMPORTING); // pin auth too, until the import completes. in->auth_pin(); + + import_state[in->ino()] = IMPORT_DISCOVERED; + import_peer[in->ino()] = m->get_source().num(); + // reply dout(7) << " sending export_dir_discover_ack on " << *in << endl; @@ -925,7 +1285,7 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) assert(!m->did_assim()); // open dir i'm importing. - diri->set_dir( new CDir(diri, mds, false) ); + diri->set_dir( new CDir(diri, mds->mdcache, false) ); dir = diri->dir; m->get_dir(diri->ino())->update_dir(dir); @@ -943,12 +1303,15 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) m->mark_assim(); // only do this the first time! // move pin to dir - diri->put(CINODE_PIN_IMPORTING); - dir->get(CDIR_PIN_IMPORTING); + diri->put(CInode::PIN_IMPORTING); + dir->get(CDir::PIN_IMPORTING); // auth pin too dir->auth_pin(); diri->auth_unpin(); + + // change import state + import_state[diri->ino()] = IMPORT_PREPPING; // assimilate traces to exports for (list::iterator it = m->get_inodes().begin(); @@ -980,7 +1343,7 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) m->get_dir(in->ino())->update_dir(in->dir); dout(7) << " updated " << *in->dir << endl; } else { - in->set_dir( new CDir(in, mds, false) ); + in->set_dir( new CDir(in, mds->mdcache, false) ); m->get_dir(in->ino())->update_dir(in->dir); dout(7) << " added " << *in->dir << endl; in->take_waiting(CINODE_WAIT_DIR, finished); @@ -996,14 +1359,17 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) CInode *in = cache->get_inode(*it); assert(in); + // note bound. + import_bounds[dir->ino()].insert(*it); + if (!in->dir) { dout(7) << " opening nested export on " << *in << endl; cache->open_remote_dir(in, new C_MDS_RetryMessage(mds, m)); // pin it! 
- in->get(CINODE_PIN_OPENINGDIR); - in->state_set(CINODE_STATE_OPENINGDIR); + in->get(CInode::PIN_OPENINGDIR); + in->state_set(CInode::STATE_OPENINGDIR); } } } else { @@ -1023,12 +1389,12 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) if (in->dir) { if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { dout(7) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDIR_PIN_IMPORTINGEXPORT); + in->dir->get(CDir::PIN_IMPORTINGEXPORT); in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); - if (in->state_test(CINODE_STATE_OPENINGDIR)) { - in->put(CINODE_PIN_OPENINGDIR); - in->state_clear(CINODE_STATE_OPENINGDIR); + if (in->state_test(CInode::STATE_OPENINGDIR)) { + in->put(CInode::PIN_OPENINGDIR); + in->state_clear(CInode::STATE_OPENINGDIR); } } else { dout(7) << " already pinned nested export " << *in << endl; @@ -1045,7 +1411,10 @@ void Migrator::handle_export_dir_prep(MExportDirPrep *m) dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl; mds->send_message_mds(new MExportDirPrepAck(dir->ino()), m->get_source().num(), MDS_PORT_MIGRATOR); - + + // note new state + import_state[diri->ino()] = IMPORT_PREPPED; + // done delete m; } @@ -1084,7 +1453,23 @@ public: }; */ - +class C_MDS_ImportDirLoggedStart : public Context { + Migrator *migrator; + CDir *dir; + int from; + list imported_subdirs; + list exports; +public: + C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, int f, + list& is, list& e) : + migrator(m), dir(d), from(f) { + imported_subdirs.swap(is); + exports.swap(e); + } + void finish(int r) { + migrator->import_dir_logged_start(dir, from, imported_subdirs, exports); + } +}; void Migrator::handle_export_dir(MExportDir *m) { @@ -1094,111 +1479,101 @@ void Migrator::handle_export_dir(MExportDir *m) assert(dir); int oldauth = m->get_source().num(); - dout(7) << "handle_export_dir, import " << *dir << " from " << oldauth << endl; + dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << endl; 
assert(dir->is_auth() == false); - - show_imports(); + // start the journal entry + EImportStart *le = new EImportStart(dir->ino(), m->get_exports()); + le->metablob.add_dir_context(dir); + // note new authority (locally) - if (dir->inode->is_auth()) - dir->set_dir_auth( CDIR_AUTH_PARENT ); - else - dir->set_dir_auth( mds->get_nodeid() ); - dout(10) << " set dir_auth to " << dir->get_dir_auth() << endl; - - // update imports/exports - CDir *containing_import; - if (cache->exports.count(dir)) { - // reimporting - dout(7) << " i'm reimporting " << *dir << endl; - cache->exports.erase(dir); - + CDir *im = dir; + if (dir->inode->is_auth()) { + // parent is already me. was export, adding back to existing import. + im = mds->mdcache->get_auth_container(dir); + assert(im); + mds->mdcache->nested_exports[im].erase(dir); + mds->mdcache->exports.erase(dir); + dir->set_dir_auth( CDIR_AUTH_PARENT ); dir->state_clear(CDIR_STATE_EXPORT); - dir->put(CDIR_PIN_EXPORT); // unpin, no longer an export - - containing_import = cache->get_auth_container(dir); - dout(7) << " it is nested under import " << *containing_import << endl; - cache->nested_exports[containing_import].erase(dir); + dir->put(CDir::PIN_EXPORT); } else { - // new import - cache->imports.insert(dir); + // parent isn't me. new import. + mds->mdcache->imports.insert(dir); + dir->set_dir_auth( mds->get_nodeid() ); dir->state_set(CDIR_STATE_IMPORT); - dir->get(CDIR_PIN_IMPORT); // must keep it pinned - - containing_import = dir; // imported exports nested under *in - - dout(7) << " new import at " << *dir << endl; + dir->get(CDir::PIN_IMPORT); } - // take out my temp pin - dir->put(CDIR_PIN_IMPORTING); + dir->put(CDir::PIN_IMPORTING); + + // mark import point frozen + // (note: this is a manual freeze.. hack hack hack!) 
+ dir->get_inode()->auth_pin(); + dir->state_set(CDIR_STATE_FROZENTREE); - // add any inherited exports + dout(10) << " base " << *dir << endl; + if (dir != im) + dout(10) << " under " << *im << endl; + + // bounds for (list::iterator it = m->get_exports().begin(); it != m->get_exports().end(); it++) { - CInode *exi = cache->get_inode(*it); - assert(exi && exi->dir); - CDir *ex = exi->dir; + CInode *bdi = cache->get_inode(*it); + CDir *bd = bdi->dir; + + if (bd->get_dir_auth() == mds->get_nodeid()) { + // still me. was an import. + assert(bd->is_import()); + mds->mdcache->imports.erase(bd); + bd->set_dir_auth( CDIR_AUTH_PARENT ); + bd->state_clear(CDIR_STATE_IMPORT); + bd->put(CDir::PIN_IMPORT); + // move nested exports. + for (set::iterator q = mds->mdcache->nested_exports[bd].begin(); + q != mds->mdcache->nested_exports[bd].end(); + ++q) + mds->mdcache->nested_exports[im].insert(*q); + mds->mdcache->nested_exports.erase(bd); + } else { + // not me anymore. now an export. + mds->mdcache->exports.insert(bd); + mds->mdcache->nested_exports[im].insert(bd); + assert(bd->get_dir_auth() != CDIR_AUTH_PARENT); + bd->set_dir_auth( CDIR_AUTH_UNKNOWN ); + bd->state_set(CDIR_STATE_EXPORT); + bd->get(CDir::PIN_EXPORT); + } - dout(15) << " nested export " << *ex << endl; + // mark export point frozenleaf + bd->get(CDir::PIN_FREEZELEAF); + bd->state_set(CDIR_STATE_FROZENTREELEAF); + assert(import_bounds[dir->ino()].count(*it)); // we took note during prep stage // remove our pin - ex->put(CDIR_PIN_IMPORTINGEXPORT); - ex->state_clear(CDIR_STATE_IMPORTINGEXPORT); - - - // add... 
- if (ex->is_import()) { - dout(7) << " importing my import " << *ex << endl; - cache->imports.erase(ex); - ex->state_clear(CDIR_STATE_IMPORT); - - if (mds->logger) mds->logger->inc("imex"); + bd->put(CDir::PIN_IMPORTINGEXPORT); + bd->state_clear(CDIR_STATE_IMPORTINGEXPORT); - // move nested exports under containing_import - for (set::iterator it = cache->nested_exports[ex].begin(); - it != cache->nested_exports[ex].end(); - it++) { - dout(7) << " moving nested export " << **it << " under " << *containing_import << endl; - cache->nested_exports[containing_import].insert(*it); - } - cache->nested_exports.erase(ex); // de-list under old import - - ex->set_dir_auth( CDIR_AUTH_PARENT ); - ex->put(CDIR_PIN_IMPORT); // imports are pinned, no longer import - - } else { - dout(7) << " importing export " << *ex << endl; - - // add it - ex->state_set(CDIR_STATE_EXPORT); - ex->get(CDIR_PIN_EXPORT); // all exports are pinned - cache->exports.insert(ex); - cache->nested_exports[containing_import].insert(ex); - if (mds->logger) mds->logger->inc("imex"); - } - + dout(10) << " bound " << *bd << endl; } - - + // add this crap to my cache list imported_subdirs; - bufferlist dir_state; - dir_state.claim( m->get_state() ); - int off = 0; int num_imported_inodes = 0; - for (int i = 0; i < m->get_ndirs(); i++) { + for (list::iterator p = m->get_dirstate().begin(); + p != m->get_dirstate().end(); + ++p) { num_imported_inodes += - import_dir_block(dir_state, - off, + decode_import_dir(*p, oldauth, dir, // import root - imported_subdirs); + imported_subdirs, + le); } dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl; dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl; @@ -1207,40 +1582,15 @@ void Migrator::handle_export_dir(MExportDir *m) // adjust popularity mds->balancer->add_import(dir); - // send notify's etc. 
- dout(7) << "sending notifyack for " << *dir << " to old auth " << m->get_source().num() << endl; - mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()), - m->get_source().num(), MDS_PORT_MIGRATOR); + dout(7) << "handle_export_dir did " << *dir << endl; - dout(7) << "sending notify to others" << endl; - for (set::iterator it = dir->open_by.begin(); - it != dir->open_by.end(); - it++) { - assert( *it != mds->get_nodeid() ); - if ( *it == m->get_source().num() ) continue; // not to old auth. - - MExportDirNotify *notify = new MExportDirNotify(dir->ino(), m->get_source().num(), mds->get_nodeid()); - notify->copy_exports(m->get_exports()); - - if (g_conf.mds_verify_export_dirauth) - notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG) - - mds->send_message_mds(notify, *it, MDS_PORT_MIGRATOR); - } - - // done - delete m; - - show_imports(); - - - // is it empty? - if (dir->get_size() == 0 && - !dir->inode->is_auth()) { - // reexport! - export_empty_import(dir); - } + // log it + mds->mdlog->submit_entry(le, + new C_MDS_ImportDirLoggedStart(this, dir, m->get_source().num(), + imported_subdirs, m->get_exports())); + // note state + import_state[dir->ino()] = IMPORT_LOGGINGSTART; // some stats if (mds->logger) { @@ -1249,24 +1599,53 @@ void Migrator::handle_export_dir(MExportDir *m) mds->logger->set("nim", cache->imports.size()); } + delete m; +} - // FIXME LOG IT - /* - stupid hashing crap, FIXME +void Migrator::import_dir_logged_start(CDir *dir, int from, + list &imported_subdirs, + list &exports) +{ + dout(7) << "import_dir_logged " << *dir << endl; - // wait for replicas in hashed dirs? - if (import_hashed_replicate_waiting.count(m->get_ino())) { - // it'll happen later!, when i get my inodegetreplicaack's back - } else { - // finish now - //not anymoreimport_dir_finish(dir); - } - */ + // note state + import_state[dir->ino()] = IMPORT_ACKING; + + // send notify's etc. 
+ dout(7) << "sending notifyack for " << *dir << " to old auth mds" << from << endl; + mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()), + from, MDS_PORT_MIGRATOR); + + dout(7) << "sending notify to others" << endl; + for (map::iterator it = dir->replicas_begin(); + it != dir->replicas_end(); + it++) { + assert( it->first != mds->get_nodeid() ); + if ( it->first == from ) continue; // not to old auth. + + MExportDirNotify *notify = new MExportDirNotify(dir->ino(), from, mds->get_nodeid()); + notify->copy_exports(exports); + if (g_conf.mds_verify_export_dirauth) + notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG) + + mds->send_message_mds(notify, it->first, MDS_PORT_MIGRATOR); + } + + show_imports(); } +class C_MDS_ImportDirLoggedFinish : public Context { + Migrator *migrator; + CDir *dir; +public: + C_MDS_ImportDirLoggedFinish(Migrator *m, CDir *d) : migrator(m), dir(d) { } + void finish(int r) { + migrator->import_dir_logged_finish(dir); + } +}; void Migrator::handle_export_dir_finish(MExportDirFinish *m) { @@ -1274,24 +1653,60 @@ void Migrator::handle_export_dir_finish(MExportDirFinish *m) CDir *dir = diri->dir; assert(dir); - dout(7) << "handle_export_dir_finish on " << *dir << endl; + dout(7) << "handle_export_dir_finish logging import_finish on " << *dir << endl; assert(dir->is_auth()); - dout(5) << "done with import of " << *dir << endl; - show_imports(); - if (mds->logger) { - mds->logger->set("nex", cache->exports.size()); - mds->logger->set("nim", cache->imports.size()); - } + // note state + import_state[dir->ino()] = IMPORT_LOGGINGFINISH; + + // log + mds->mdlog->submit_entry(new EImportFinish(dir, true), + new C_MDS_ImportDirLoggedFinish(this,dir)); + delete m; +} + +void Migrator::import_dir_logged_finish(CDir *dir) +{ + dout(7) << "import_dir_logged_finish " << *dir << endl; // un auth pin (other exports can now proceed) dir->auth_unpin(); + // unfreeze! 
+ for (set::iterator p = import_bounds[dir->ino()].begin(); + p != import_bounds[dir->ino()].end(); + ++p) { + CInode *diri = mds->mdcache->get_inode(*p); + CDir *dir = diri->dir; + assert(dir->state_test(CDIR_STATE_FROZENTREELEAF)); + dir->put(CDir::PIN_FREEZELEAF); + dir->state_clear(CDIR_STATE_FROZENTREELEAF); + } + + dir->unfreeze_tree(); + + // clear import state (we're done!) + import_state.erase(dir->ino()); + import_peer.erase(dir->ino()); + import_bounds.erase(dir->ino()); + // ok now finish contexts dout(5) << "finishing any waiters on imported data" << endl; dir->finish_waiting(CDIR_WAIT_IMPORTED); - delete m; + // log it + if (mds->logger) { + mds->logger->set("nex", cache->exports.size()); + mds->logger->set("nim", cache->imports.size()); + } + show_imports(); + + // is it empty? + if (dir->get_size() == 0 && + !dir->inode->is_auth()) { + // reexport! + export_empty_import(dir); + } } @@ -1310,16 +1725,15 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol in->set_auth(true); } - // link before state + // state after link -- or not! -sage + set merged_client_caps; + istate.update_inode(in, merged_client_caps); + + // link before state -- or not! -sage if (dn->inode != in) { assert(!dn->inode); dn->dir->link_inode(dn, in); } - - // state after link - set merged_client_caps; - istate.update_inode(in, merged_client_caps); - // add inode? 
if (added) { @@ -1330,11 +1744,11 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol } - // cached_by - assert(!in->is_cached_by(oldauth)); - in->cached_by_add( oldauth, CINODE_EXPORT_NONCE ); - if (in->is_cached_by(mds->get_nodeid())) - in->cached_by_remove(mds->get_nodeid()); + // adjust replica list + //assert(!in->is_replica(oldauth)); // not true on failed export + in->add_replica( oldauth, CINODE_EXPORT_NONCE ); + if (in->is_replica(mds->get_nodeid())) + in->remove_replica(mds->get_nodeid()); // twiddle locks // hard @@ -1356,7 +1770,7 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol MClientFileCaps::FILECAP_REAP); caps->set_mds( oldauth ); // reap from whom? mds->messenger->send_message(caps, - MSG_ADDR_CLIENT(*it), mds->clientmap.get_inst(*it), + mds->clientmap.get_inst(*it), 0, MDS_PORT_CACHE); } @@ -1368,31 +1782,27 @@ void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int ol if (in->filelock.gather_set.empty()) // necessary but not suffient... 
mds->locker->inode_file_eval(in); } - - // other - if (in->is_dirty()) { - dout(10) << "logging dirty import " << *in << endl; - mds->mdlog->submit_entry(new EInodeUpdate(in)); - } } -int Migrator::import_dir_block(bufferlist& bl, - int& off, - int oldauth, - CDir *import_root, - list& imported_subdirs) +int Migrator::decode_import_dir(bufferlist& bl, + int oldauth, + CDir *import_root, + list& imported_subdirs, + EImportStart *le) { + int off = 0; + // set up dir CDirExport dstate; off = dstate._decode(bl, off); - + CInode *diri = cache->get_inode(dstate.get_ino()); assert(diri); - CDir *dir = diri->get_or_open_dir(mds); + CDir *dir = diri->get_or_open_dir(mds->mdcache); assert(dir); - - dout(7) << " import_dir_block " << *dir << " have " << dir->nitems << " items, importing " << dstate.get_nden() << " dentries" << endl; + + dout(7) << "decode_import_dir " << *dir << endl; // add to list if (dir != import_root) @@ -1400,23 +1810,26 @@ int Migrator::import_dir_block(bufferlist& bl, // assimilate state dstate.update_dir( dir ); - if (diri->is_auth()) - dir->set_dir_auth( CDIR_AUTH_PARENT ); // update_dir may hose dir_auth // mark (may already be marked from get_or_open_dir() above) if (!dir->is_auth()) dir->state_set(CDIR_STATE_AUTH); - // open_by - assert(!dir->is_open_by(oldauth)); - dir->open_by_add(oldauth); - if (dir->is_open_by(mds->get_nodeid())) - dir->open_by_remove(mds->get_nodeid()); + // adjust replica list + //assert(!dir->is_replica(oldauth)); // not true on failed export + dir->add_replica(oldauth); + if (dir->is_replica(mds->get_nodeid())) + dir->remove_replica(mds->get_nodeid()); + + // add to journal entry + if (le) + le->metablob.add_dir(dir, true); // Hmm: false would be okay in some cases + + int num_imported = 0; if (dir->is_hashed()) { // do nothing; dir is hashed - return 0; } else { // take all waiters on this dir // NOTE: a pass of imported data is guaranteed to get all of my waiters because @@ -1432,7 +1845,6 @@ int 
Migrator::import_dir_block(bufferlist& bl, dout(15) << "doing contents" << endl; // contents - int num_imported = 0; long nden = dstate.get_nden(); for (; nden>0; nden--) { @@ -1447,6 +1859,10 @@ int Migrator::import_dir_block(bufferlist& bl, char dirty; bl.copy(off, 1, &dirty); off++; + + version_t dnv; + bl.copy(off, sizeof(dnv), (char*)&dnv); + off += sizeof(dnv); char icode; bl.copy(off, 1, &icode); @@ -1455,9 +1871,14 @@ int Migrator::import_dir_block(bufferlist& bl, CDentry *dn = dir->lookup(dname); if (!dn) dn = dir->add_dentry(dname); // null + + // mark dentry dirty? + if (dirty == 'D') + dn->_mark_dirty(); - // mark dn dirty _after_ we link the inode (scroll down) - + dn->set_version( dnv ); + dn->set_projected_version( dnv ); + if (icode == 'N') { // null dentry assert(dn->is_null()); @@ -1475,17 +1896,16 @@ int Migrator::import_dir_block(bufferlist& bl, // inode decode_import_inode(dn, bl, off, oldauth); } - - // mark dentry dirty? (only _after_ we link the inode!) - if (dirty == 'D') dn->mark_dirty(); - - } - if (dir->is_dirty()) - mds->mdlog->submit_entry(new EDirUpdate(dir)); + // add dentry to journal entry + if (le) + le->metablob.add_dentry(dn, true); // Hmm: might we do dn->is_dirty() here instead? + } - return num_imported; } + + dout(7) << "decode_import_dir done " << *dir << endl; + return num_imported; } @@ -1653,7 +2073,7 @@ void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int ol // fix up subdir export? 
if (dn->inode->dir) { assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)); - dn->inode->dir->put(CDIR_PIN_IMPORTINGEXPORT); + dn->inode->dir->put(CDir::PIN_IMPORTINGEXPORT); dn->inode->dir->state_clear(CDIR_STATE_IMPORTINGEXPORT); if (dn->inode->dir->is_auth()) { @@ -1662,7 +2082,7 @@ void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int ol dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl; dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); cache->imports.erase(dn->inode->dir); - dn->inode->dir->put(CDIR_PIN_IMPORT); + dn->inode->dir->put(CDir::PIN_IMPORT); dn->inode->dir->state_clear(CDIR_STATE_IMPORT); // move nested under hashdir @@ -1678,7 +2098,7 @@ void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int ol else { // not mine. make it an export. dout(7) << "making subdir into export " << *dn->inode->dir << endl; - dn->inode->dir->get(CDIR_PIN_EXPORT); + dn->inode->dir->get(CDir::PIN_EXPORT); dn->inode->dir->state_set(CDIR_STATE_EXPORT); cache->exports.insert(dn->inode->dir); cache->nested_exports[dir].insert(dn->inode->dir); @@ -1691,7 +2111,7 @@ void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int ol } // mark dentry dirty? (only _after_ we link the inode!) - dn->mark_dirty(); + dn->_mark_dirty(); // fixme } } @@ -1708,7 +2128,7 @@ void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int ol - remember simple rule: dir auth follows inode, unless dir_auth is explicit. - - export_dir_walk and import_dir_block take care with dir_auth: (for import/export) + - export_dir_walk and decode_import_dir take care with dir_auth: (for import/export) - on export, -1 is changed to mds->get_nodeid() - on import, nothing special, actually. 
@@ -1781,7 +2201,7 @@ void Migrator::hash_dir(CDir *dir) // ok, go dir->state_set(CDIR_STATE_HASHING); - dir->get(CDIR_PIN_HASHING); + dir->get(CDir::PIN_HASHING); assert(dir->hashed_subset.empty()); // discover on all mds @@ -1850,7 +2270,7 @@ void Migrator::hash_dir_complete(CDir *dir) it != dir->end(); it++) { CInode *in = it->second->inode; - in->mark_dirty(); + in->_mark_dirty(); // fixme } if (dir->is_frozen_dir()) @@ -1889,7 +2309,7 @@ void Migrator::hash_dir_frozen(CDir *dir) if (!in->is_dir()) continue; if (!in->dir) continue; - int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); if (dentryhashcode == mds->get_nodeid()) continue; // msg? @@ -1984,7 +2404,7 @@ void Migrator::hash_dir_go(CDir *dir) CDentry *dn = it->second; CInode *in = dn->inode; - int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); if (dentryhashcode == mds->get_nodeid()) { continue; // still mine! } @@ -2024,8 +2444,8 @@ void Migrator::hash_dir_go(CDir *dir) // add to proxy hash_proxy_inos[dir].push_back(in); - in->state_set(CINODE_STATE_PROXY); - in->get(CINODE_PIN_PROXY); + in->state_set(CInode::STATE_PROXY); + in->get(CInode::PIN_PROXY); // fix up subdirs if (in->dir) { @@ -2034,7 +2454,7 @@ void Migrator::hash_dir_go(CDir *dir) dout(7) << "making subdir into import " << *in->dir << endl; in->dir->set_dir_auth( mds->get_nodeid() ); cache->imports.insert(in->dir); - in->dir->get(CDIR_PIN_IMPORT); + in->dir->get(CDir::PIN_IMPORT); in->dir->state_set(CDIR_STATE_IMPORT); // fix nested bits @@ -2053,7 +2473,7 @@ void Migrator::hash_dir_go(CDir *dir) // not mine. 
dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl; assert(in->dir->is_export()); - in->dir->put(CDIR_PIN_EXPORT); + in->dir->put(CDir::PIN_EXPORT); in->dir->state_clear(CDIR_STATE_EXPORT); cache->exports.erase(in->dir); cache->nested_exports[containing_import].erase(in->dir); @@ -2072,15 +2492,15 @@ void Migrator::hash_dir_go(CDir *dir) // dir state dir->state_set(CDIR_STATE_HASHED); - dir->get(CDIR_PIN_HASHED); + dir->get(CDir::PIN_HASHED); cache->hashdirs.insert(dir); - dir->mark_dirty(); - mds->mdlog->submit_entry(new EDirUpdate(dir)); + dir->mark_dirty(dir->pre_dirty()); // fixme + mds->mdlog->submit_entry(new EString("dirty dir fixme")); // inode state if (dir->inode->is_auth()) { - dir->inode->mark_dirty(); - mds->mdlog->submit_entry(new EInodeUpdate(dir->inode)); + dir->inode->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("hash dirty fixme")); } // fix up nested_exports? @@ -2168,7 +2588,7 @@ void Migrator::hash_dir_finish(CDir *dir) // dir state hash_gather.erase(dir); dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDIR_PIN_HASHING); + dir->put(CDir::PIN_HASHING); dir->hashed_subset.clear(); // unproxy inodes @@ -2177,9 +2597,9 @@ void Migrator::hash_dir_finish(CDir *dir) it != hash_proxy_inos[dir].end(); it++) { CInode *in = *it; - assert(in->state_test(CINODE_STATE_PROXY)); - in->state_clear(CINODE_STATE_PROXY); - in->put(CINODE_PIN_PROXY); + assert(in->state_test(CInode::STATE_PROXY)); + in->state_clear(CInode::STATE_PROXY); + in->put(CInode::PIN_PROXY); } hash_proxy_inos.erase(dir); @@ -2260,7 +2680,7 @@ void Migrator::handle_hash_dir_notify(MHashDirNotify *m) if (!in) continue; if (!in->dir) continue; - int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); if (dentryhashcode != from) continue; // we'll import these in a minute if (in->dir->authority() != dentryhashcode) @@ -2279,7 +2699,7 @@ void 
Migrator::handle_hash_dir_notify(MHashDirNotify *m) hash_gather.erase(dir); dir->state_clear(CDIR_STATE_HASHING); - dir->put(CDIR_PIN_HASHING); + dir->put(CDir::PIN_HASHING); dir->hashed_subset.clear(); } else { dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; @@ -2359,14 +2779,14 @@ void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r // pin dir, set hashing flag dir->state_set(CDIR_STATE_HASHING); - dir->get(CDIR_PIN_HASHING); + dir->get(CDir::PIN_HASHING); assert(dir->hashed_subset.empty()); // inode state dir->inode->inode.hash_seed = 1;// dir->ino(); if (dir->inode->is_auth()) { - dir->inode->mark_dirty(); - mds->mdlog->submit_entry(new EInodeUpdate(dir->inode)); + dir->inode->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("hash dirty fixme")); } // get gather set ready for notifies @@ -2439,7 +2859,7 @@ void Migrator::handle_hash_dir_prep(MHashDirPrep *m) if (in->dir) { if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { dout(5) << " pinning nested export " << *in->dir << endl; - in->dir->get(CDIR_PIN_IMPORTINGEXPORT); + in->dir->get(CDir::PIN_IMPORTINGEXPORT); in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); } else { dout(5) << " already pinned nested export " << *in << endl; @@ -2486,14 +2906,14 @@ void Migrator::handle_hash_dir(MHashDir *m) // dir state dir->state_set(CDIR_STATE_HASHED); - dir->get(CDIR_PIN_HASHED); + dir->get(CDir::PIN_HASHED); cache->hashdirs.insert(dir); dir->hashed_subset.insert(mds->get_nodeid()); // dir is complete dir->mark_complete(); - dir->mark_dirty(); - mds->mdlog->submit_entry(new EDirUpdate(dir)); + dir->mark_dirty(dir->pre_dirty()); // fixme + mds->mdlog->submit_entry(new EString("dirty dir fixme")); // commit mds->mdstore->commit_dir(dir, 0); @@ -2670,7 +3090,7 @@ void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m) if (in->dir) { if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { dout(5) << " pinning nested export " << *in->dir << endl; 
- in->dir->get(CDIR_PIN_IMPORTINGEXPORT); + in->dir->get(CDir::PIN_IMPORTINGEXPORT); in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); } else { dout(5) << " already pinned nested export " << *in << endl; @@ -2769,20 +3189,20 @@ void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m) // dir state //dir->state_clear(CDIR_STATE_UNHASHING); //later dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDIR_PIN_HASHED); + dir->put(CDir::PIN_HASHED); cache->hashdirs.erase(dir); // commit! assert(dir->is_complete()); //dir->mark_complete(); - dir->mark_dirty(); + dir->mark_dirty(dir->pre_dirty()); // fixme mds->mdstore->commit_dir(dir, 0); // inode state dir->inode->inode.hash_seed = 0; if (dir->inode->is_auth()) { - dir->inode->mark_dirty(); - mds->mdlog->submit_entry(new EInodeUpdate(dir->inode)); + dir->inode->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("hash inode dirty fixme")); } // notify @@ -2870,8 +3290,8 @@ void Migrator::unhash_dir_complete(CDir *dir) it++) { CInode *in = it->second->inode; if (in->is_auth()) { - in->mark_dirty(); - mds->mdlog->submit_entry(new EInodeUpdate(in)); + in->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("unhash dirty fixme")); } } @@ -2974,7 +3394,7 @@ void Migrator::unhash_dir_prep_finish(CDir *dir) if (!in->is_dir()) continue; if (!in->dir) continue; - int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); if (dentryhashcode != mds->get_nodeid()) continue; // msg? @@ -3020,7 +3440,7 @@ void Migrator::handle_unhash_dir(MUnhashDir *m) CDentry *dn = it->second; CInode *in = dn->inode; - int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + int dentryhashcode = mds->mdcache->hash_dentry( dir->ino(), it->first ); if (dentryhashcode != mds->get_nodeid()) { // not mine! // twiddle dir_auth? 
@@ -3064,8 +3484,8 @@ void Migrator::handle_unhash_dir(MUnhashDir *m) dn->mark_clean(); // proxy - in->state_set(CINODE_STATE_PROXY); - in->get(CINODE_PIN_PROXY); + in->state_set(CInode::STATE_PROXY); + in->get(CInode::PIN_PROXY); hash_proxy_inos[dir].push_back(in); if (in->dir) { @@ -3074,14 +3494,14 @@ void Migrator::handle_unhash_dir(MUnhashDir *m) dout(7) << "making subdir into import " << *in->dir << endl; in->dir->set_dir_auth( mds->get_nodeid() ); cache->imports.insert(in->dir); - in->dir->get(CDIR_PIN_IMPORT); + in->dir->get(CDir::PIN_IMPORT); in->dir->state_set(CDIR_STATE_IMPORT); } else { // not mine. dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl; assert(in->dir->is_export()); - in->dir->put(CDIR_PIN_EXPORT); + in->dir->put(CDir::PIN_EXPORT); in->dir->state_clear(CDIR_STATE_EXPORT); cache->exports.erase(in->dir); cache->nested_exports[dir].erase(in->dir); @@ -3101,19 +3521,19 @@ void Migrator::handle_unhash_dir(MUnhashDir *m) // dir state //dir->state_clear(CDIR_STATE_UNHASHING); // later dir->state_clear(CDIR_STATE_HASHED); - dir->put(CDIR_PIN_HASHED); + dir->put(CDir::PIN_HASHED); cache->hashdirs.erase(dir); dir->mark_clean(); // inode state dir->inode->inode.hash_seed = 0; if (dir->inode->is_auth()) { - dir->inode->mark_dirty(); - mds->mdlog->submit_entry(new EInodeUpdate(dir->inode)); + dir->inode->_mark_dirty(); // fixme + mds->mdlog->submit_entry(new EString("unhash inode dirty fixme")); } // init gather set - hash_gather[dir] = mds->get_mds_map()->get_mds(); + mds->get_mds_map()->get_active_mds_set( hash_gather[dir] ); hash_gather[dir].erase(mds->get_nodeid()); // send unhash message @@ -3173,9 +3593,9 @@ void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m) it != hash_proxy_inos[dir].end(); it++) { CInode *in = *it; - assert(in->state_test(CINODE_STATE_PROXY)); - in->state_clear(CINODE_STATE_PROXY); - in->put(CINODE_PIN_PROXY); + assert(in->state_test(CInode::STATE_PROXY)); + 
in->state_clear(CInode::STATE_PROXY); + in->put(CInode::PIN_PROXY); } // unfreeze diff --git a/branches/aleung/security1/ceph/mds/Migrator.h b/branches/aleung/security1/ceph/mds/Migrator.h index eac7d2046690b..dd2886008d163 100644 --- a/branches/aleung/security1/ceph/mds/Migrator.h +++ b/branches/aleung/security1/ceph/mds/Migrator.h @@ -29,12 +29,12 @@ class CDir; class CInode; class CDentry; +class MExportDir; class MExportDirDiscover; class MExportDirDiscoverAck; class MExportDirPrep; class MExportDirPrepAck; class MExportDirWarning; -class MExportDir; class MExportDirNotify; class MExportDirNotifyAck; class MExportDirFinish; @@ -54,31 +54,87 @@ class MUnhashDirAck; class MUnhashDirNotify; class MUnhashDirNotifyAck; +class EImportStart; + class Migrator { private: MDS *mds; MDCache *cache; + // -- exports -- + // export stages. used to clean up intelligently if there's a failure. + const static int EXPORT_DISCOVERING = 1; // dest is disovering export dir + const static int EXPORT_FREEZING = 2; // we're freezing the dir tree + const static int EXPORT_LOGGINGSTART = 3; // we're logging EExportStart + const static int EXPORT_PREPPING = 4; // sending dest spanning tree to export bounds + const static int EXPORT_EXPORTING = 5; // sent actual export, waiting for acks + const static int EXPORT_LOGGINGFINISH = 6; // logging EExportFinish + // export fun - map > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from + map export_state; + map export_peer; + map > export_bounds; + map > export_data; // only during EXPORTING state + map > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from map > export_proxy_inos; map > export_proxy_dirinos; + map > export_finish_waiters; + set stray_export_warnings; // notifies i haven't seen map stray_export_notifies; - // hashing madness + + // -- imports -- + const static int IMPORT_DISCOVERED = 1; // waiting for prep + const static int IMPORT_PREPPING = 2; // opening dirs on bounds 
+ const static int IMPORT_PREPPED = 3; // opened bounds, waiting for import + const static int IMPORT_LOGGINGSTART = 4; // got import, logging EImportStart + const static int IMPORT_ACKING = 5; // logged, sent acks + const static int IMPORT_LOGGINGFINISH = 6; + + map import_state; + map import_peer; + map > import_bounds; + + + // -- hashing madness -- multimap unhash_waiting; // nodes i am waiting for UnhashDirAck's from multimap import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir // maps frozen_dir_ino's to waiting-for-discover ino's. multimap import_hashed_frozen_waiting; // dirs i froze (for the above) - + + + public: // -- cons -- Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {} void dispatch(Message*); + + // -- status -- + int is_exporting(CDir *dir) { + if (export_state.count(dir)) return export_state[dir]; + return 0; + } + bool is_exporting() { return !export_state.empty(); } + int is_importing(inodeno_t dirino) { + if (import_state.count(dirino)) return import_state[dirino]; + return 0; + } + bool is_importing() { return !import_state.empty(); } + const set& get_import_bounds(inodeno_t base) { + assert(import_bounds.count(base)); + return import_bounds[base]; + } + + + // -- misc -- + void handle_mds_failure(int who); + void show_imports(); + + // -- import/export -- // exporter public: @@ -89,48 +145,58 @@ public: void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth); void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth); + void add_export_finish_waiter(CDir *dir, Context *c) { + export_finish_waiters[dir].push_back(c); + } + void clear_export_proxy_pins(CDir *dir); + protected: - map< CDir*, set > export_gather; void handle_export_dir_discover_ack(MExportDirDiscoverAck *m); void export_dir_frozen(CDir *dir, int dest); + void export_dir_frozen_logged(CDir *dir, MExportDirPrep *prep, int dest); void handle_export_dir_prep_ack(MExportDirPrepAck *m); void 
export_dir_go(CDir *dir, int dest); - int export_dir_walk(MExportDir *req, + int encode_export_dir(list& dirstatelist, class C_Contexts *fin, CDir *basedir, CDir *dir, int newauth); - void export_dir_finish(CDir *dir); void handle_export_dir_notify_ack(MExportDirNotifyAck *m); - - - friend class C_MDC_ExportFreeze; + void reverse_export(CDir *dir); + void export_dir_acked(CDir *dir); + void export_dir_finish(CDir *dir); + friend class C_MDC_ExportFreeze; + friend class C_MDC_ExportStartLogged; + friend class C_MDS_ExportFinishLogged; // importer void handle_export_dir_discover(MExportDirDiscover *m); void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r); void handle_export_dir_prep(MExportDirPrep *m); void handle_export_dir(MExportDir *m); - void import_dir_finish(CDir *dir); + void import_dir_logged_start(CDir *dir, int from, + list &imported_subdirs, + list &exports); + void import_dir_logged_finish(CDir *dir); void handle_export_dir_finish(MExportDirFinish *m); - int import_dir_block(bufferlist& bl, - int& off, - int oldauth, - CDir *import_root, - list& imported_subdirs); + int decode_import_dir(bufferlist& bl, + int oldauth, + CDir *import_root, + list& imported_subdirs, + EImportStart *le); void got_hashed_replica(CDir *import, inodeno_t dir_ino, inodeno_t replica_ino); - friend class C_MDC_ExportDirDiscover; + friend class C_MDS_ImportDirLoggedStart; + friend class C_MDS_ImportDirLoggedFinish; // bystander void handle_export_dir_warning(MExportDirWarning *m); void handle_export_dir_notify(MExportDirNotify *m); - void show_imports(); // -- hashed directories -- diff --git a/branches/aleung/security1/ceph/mds/OSDMonitor.cc b/branches/aleung/security1/ceph/mds/OSDMonitor.cc deleted file mode 100644 index 0c7cadbce3a6d..0000000000000 --- a/branches/aleung/security1/ceph/mds/OSDMonitor.cc +++ /dev/null @@ -1,523 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * 
- * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#include "OSDMonitor.h" - -#include "osd/OSDMap.h" - -#include "msg/Message.h" -#include "msg/Messenger.h" - -#include "messages/MPing.h" -#include "messages/MPingAck.h" -#include "messages/MOSDFailure.h" -#include "messages/MOSDMap.h" -#include "messages/MOSDGetMap.h" -#include "messages/MOSDBoot.h" -#include "messages/MOSDIn.h" -#include "messages/MOSDOut.h" - -#include "common/Timer.h" -#include "common/Clock.h" - -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " " - - -class C_OM_PingTick : public Context { -public: - Messenger *msgr; - C_OM_PingTick(Messenger *m) : msgr(m) {} - void finish(int r) { - msgr->send_message(new MPing, MSG_ADDR_MON(0)); - } -}; - -class C_OM_Faker : public Context { -public: - OSDMonitor *om; - C_OM_Faker(OSDMonitor *m) { - this->om = m; - } - void finish(int r) { - om->fake_reorg(); - } -}; - -class C_OM_FakeOSDFailure : public Context { - OSDMonitor *mon; - int osd; - bool down; -public: - C_OM_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {} - void finish(int r) { - mon->fake_osd_failure(osd,down); - } -}; - - - -void OSDMonitor::fake_osdmap_update() -{ - dout(1) << "fake_osdmap_update" << endl; - accept_pending(); - - // tell a random osd - send_incremental_map(osdmap->get_epoch()-1, // ick! 
FIXME - MSG_ADDR_OSD(rand() % g_conf.num_osd)); -} - - -void OSDMonitor::fake_reorg() -{ - int r = rand() % g_conf.num_osd; - - if (osdmap->is_out(r)) { - dout(1) << "fake_reorg marking osd" << r << " in" << endl; - pending.new_in.push_back(r); - } else { - dout(1) << "fake_reorg marking osd" << r << " out" << endl; - pending.new_out.push_back(r); - } - - accept_pending(); - - // tell him! - send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_OSD(r)); -} - - -void OSDMonitor::init() -{ - dout(1) << "init" << endl; - - - // - osdmap = new OSDMap(); - osdmap->set_pg_bits(g_conf.osd_pg_bits); - - // start at epoch 0 until all osds boot - //osdmap->inc_epoch(); // = 1 - //assert(osdmap->get_epoch() == 1); - - - //if (g_conf.mkfs) osdmap->set_mkfs(); - - Bucket *b = new UniformBucket(1, 0); - int root = osdmap->crush.add_bucket(b); - for (int i=0; iosds.insert(i); - b->add_item(i, 1); - } - - for (int i=1; i<5; i++) { - osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); - osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0)); - osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); - } - - if (g_conf.mds_local_osd) { - // add mds osds, but don't put them in the crush mapping func - for (int i=0; iosds.insert(i+10000); - } - - // - - - - if (whoami == 0 && - g_conf.num_osd > 4 && - g_conf.fake_osdmap_expand) { - dout(1) << "scheduling OSD map reorg at " << g_conf.fake_osdmap_expand << endl; - g_timer.add_event_after(g_conf.fake_osdmap_expand, - new C_OM_Faker(this)); - } - - if (whoami == 0) { - // fake osd failures - for (map::iterator i = g_fake_osd_down.begin(); - i != g_fake_osd_down.end(); - i++) { - dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl; - g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 1)); - } - for (map::iterator i = g_fake_osd_out.begin(); - i != g_fake_osd_out.end(); - i++) { - dout(0) << "will fake osd" << i->first << " OUT after " << 
i->second << endl; - g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 0)); - } - } - - - // i'm ready! - messenger->set_dispatcher(this); - - // start ticker - g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger)); -} - - -void OSDMonitor::dispatch(Message *m) -{ - switch (m->get_type()) { - case MSG_OSD_FAILURE: - handle_osd_failure((MOSDFailure*)m); - break; - - case MSG_PING_ACK: - handle_ping_ack((MPingAck*)m); - break; - - case MSG_OSD_GETMAP: - handle_osd_getmap((MOSDGetMap*)m); - return; - - case MSG_OSD_BOOT: - handle_osd_boot((MOSDBoot*)m); - return; - - case MSG_OSD_IN: - handle_osd_in((MOSDIn*)m); - break; - case MSG_OSD_OUT: - handle_osd_out((MOSDOut*)m); - break; - - case MSG_SHUTDOWN: - handle_shutdown(m); - return; - - case MSG_PING: - tick(); - delete m; - return; - - default: - dout(0) << "unknown message " << *m << endl; - assert(0); - } -} - - -void OSDMonitor::handle_shutdown(Message *m) -{ - dout(1) << "shutdown from " << m->get_source() << endl; - messenger->shutdown(); - delete messenger; - delete m; -} - -void OSDMonitor::handle_ping_ack(MPingAck *m) -{ - // ... - - delete m; -} - -void OSDMonitor::handle_osd_failure(MOSDFailure *m) -{ - dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl; - - // FIXME? - - // take their word for it - int from = m->get_failed().num(); - if (osdmap->is_up(from) && - (osdmap->osd_inst.count(from) == 0 || - osdmap->osd_inst[from] == m->get_inst())) { - pending.new_down[from] = m->get_inst(); - - if (osdmap->is_in(from)) - pending_out[from] = g_clock.now(); - - //awaiting_maps[pending.epoch][m->get_source()] = - - accept_pending(); - bcast_latest_osd_map_mds(); - //bcast_latest_osd_map_osd(); // FIXME: which osds can i tell? 
- } - - send_incremental_map(m->get_epoch(), m->get_source()); - - delete m; -} - - - -void OSDMonitor::fake_osd_failure(int osd, bool down) -{ - if (down) { - dout(1) << "fake_osd_failure DOWN osd" << osd << endl; - pending.new_down[osd] = osdmap->osd_inst[osd]; - } else { - dout(1) << "fake_osd_failure OUT osd" << osd << endl; - pending.new_out.push_back(osd); - } - accept_pending(); - bcast_latest_osd_map_osd(); - bcast_latest_osd_map_mds(); -} - - -void OSDMonitor::handle_osd_boot(MOSDBoot *m) -{ - dout(7) << "osd_boot from " << m->get_source() << endl; - assert(m->get_source().is_osd()); - int from = m->get_source().num(); - - if (osdmap->get_epoch() == 0) { - // waiting for boot! - osdmap->osd_inst[from] = m->get_source_inst(); - - if (osdmap->osd_inst.size() == osdmap->osds.size()) { - dout(-7) << "osd_boot all osds booted." << endl; - osdmap->inc_epoch(); - osdmap->encode(maps[osdmap->get_epoch()]); // 1 - pending.epoch = osdmap->get_epoch()+1; // 2 - - send_map(); - bcast_latest_osd_map_osd(); - bcast_latest_osd_map_mds(); - } else { - dout(7) << "osd_boot waiting for " - << (osdmap->osds.size() - osdmap->osd_inst.size()) - << " osds to boot" << endl; - } - return; - } - - // already up? mark down first? - if (osdmap->is_up(from)) { - assert(m->get_source_inst() > osdmap->osd_inst[from]); // this better be newer! - pending.new_down[from] = osdmap->osd_inst[from]; - accept_pending(); - } - - // mark up. - pending_out.erase(from); - assert(osdmap->is_down(from)); - pending.new_up[from] = m->get_source_inst(); - - // mark in? 
- if (osdmap->out_osds.count(from)) - pending.new_in.push_back(from); - - accept_pending(); - - // the booting osd will spread word - send_incremental_map(m->sb.current_epoch, m->get_source()); - delete m; - - // tell mds - bcast_latest_osd_map_mds(); -} - -void OSDMonitor::handle_osd_in(MOSDIn *m) -{ - dout(7) << "osd_in from " << m->get_source() << endl; - int from = m->get_source().num(); - if (osdmap->is_out(from)) { - pending.new_in.push_back(from); - accept_pending(); - send_incremental_map(m->map_epoch, m->get_source()); - } -} - -void OSDMonitor::handle_osd_out(MOSDOut *m) -{ - dout(7) << "osd_out from " << m->get_source() << endl; - int from = m->get_source().num(); - if (osdmap->is_in(from)) { - pending.new_out.push_back(from); - accept_pending(); - send_incremental_map(m->map_epoch, m->get_source()); - } -} - - -void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) -{ - dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl; - - if (osdmap->get_epoch() == 0) { - awaiting_map[1][m->get_source()] = m->get_since(); - } else { - if (m->get_since()) - send_incremental_map(m->get_since(), m->get_source()); - else - send_full_map(m->get_source()); - } - delete m; -} - - - -void OSDMonitor::accept_pending() -{ - dout(-10) << "accept_pending " << osdmap->get_epoch() << " -> " << pending.epoch << endl; - - // accept pending into a new map! - pending.encode( inc_maps[ pending.epoch ] ); - - // advance! 
- osdmap->apply_incremental(pending); - - - // tell me about it - for (map::iterator i = pending.new_up.begin(); - i != pending.new_up.end(); - i++) { - dout(0) << "osd" << i->first << " UP " << i->second << endl; - derr(0) << "osd" << i->first << " UP " << i->second << endl; - messenger->mark_up(MSG_ADDR_OSD(i->first), i->second); - } - for (map::iterator i = pending.new_down.begin(); - i != pending.new_down.end(); - i++) { - dout(0) << "osd" << i->first << " DOWN " << i->second << endl; - derr(0) << "osd" << i->first << " DOWN " << i->second << endl; - messenger->mark_down(MSG_ADDR_OSD(i->first), i->second); - } - for (list::iterator i = pending.new_in.begin(); - i != pending.new_in.end(); - i++) { - dout(0) << "osd" << *i << " IN" << endl; - derr(0) << "osd" << *i << " IN" << endl; - } - for (list::iterator i = pending.new_out.begin(); - i != pending.new_out.end(); - i++) { - dout(0) << "osd" << *i << " OUT" << endl; - derr(0) << "osd" << *i << " OUT" << endl; - } - - // clear new pending - OSDMap::Incremental next(osdmap->get_epoch() + 1); - pending = next; -} - -void OSDMonitor::send_map() -{ - dout(10) << "send_map " << osdmap->get_epoch() << endl; - - map s; - s.swap( awaiting_map[osdmap->get_epoch()] ); - awaiting_map.erase(osdmap->get_epoch()); - - for (map::iterator i = s.begin(); - i != s.end(); - i++) - send_incremental_map(i->second, i->first); -} - - -void OSDMonitor::send_full_map(msg_addr_t who) -{ - messenger->send_message(new MOSDMap(osdmap), who); -} - -void OSDMonitor::send_incremental_map(epoch_t since, msg_addr_t dest) -{ - dout(-10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() - << " to " << dest << endl; - - MOSDMap *m = new MOSDMap; - - for (epoch_t e = osdmap->get_epoch(); - e > since; - e--) { - bufferlist bl; - if (inc_maps.count(e)) { - dout(-10) << "send_incremental_map inc " << e << endl; - m->incremental_maps[e] = inc_maps[e]; - } else if (maps.count(e)) { - dout(-10) << "send_incremental_map full " << e << 
endl; - m->maps[e] = maps[e]; - //if (!full) break; - } - else { - assert(0); // we should have all maps. - } - } - - messenger->send_message(m, dest); -} - - - -void OSDMonitor::bcast_latest_osd_map_mds() -{ - epoch_t e = osdmap->get_epoch(); - dout(1) << "bcast_latest_osd_map_mds epoch " << e << endl; - - // tell mds - for (int i=0; iget_epoch()-1, MSG_ADDR_MDS(i)); - } -} - -void OSDMonitor::bcast_latest_osd_map_osd() -{ - epoch_t e = osdmap->get_epoch(); - dout(1) << "bcast_latest_osd_map_osd epoch " << e << endl; - - // tell osds - set osds; - osdmap->get_all_osds(osds); - for (set::iterator it = osds.begin(); - it != osds.end(); - it++) { - if (osdmap->is_down(*it)) continue; - - send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_OSD(*it)); - } -} - - - -void OSDMonitor::tick() -{ - dout(10) << "tick" << endl; - - // mark down osds out? - utime_t now = g_clock.now(); - list mark_out; - for (map::iterator i = pending_out.begin(); - i != pending_out.end(); - i++) { - utime_t down = now; - down -= i->second; - - if (down.sec() >= g_conf.mon_osd_down_out_interval) { - dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << endl; - mark_out.push_back(i->first); - } - } - for (list::iterator i = mark_out.begin(); - i != mark_out.end(); - i++) { - pending_out.erase(*i); - pending.new_out.push_back( *i ); - accept_pending(); - } - - // next! 
- g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger)); -} diff --git a/branches/aleung/security1/ceph/mds/OSDMonitor.h b/branches/aleung/security1/ceph/mds/OSDMonitor.h deleted file mode 100644 index cd8babc054225..0000000000000 --- a/branches/aleung/security1/ceph/mds/OSDMonitor.h +++ /dev/null @@ -1,85 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef __OSDMONITOR_H -#define __OSDMONITOR_H - -#include - -#include -#include -using namespace std; - -#include "include/types.h" -#include "msg/Messenger.h" - -#include "osd/OSDMap.h" - -class OSDMonitor : public Dispatcher { - // me - int whoami; - Messenger *messenger; - - // maps - OSDMap *osdmap; - map maps; - map inc_maps; - - OSDMap::Incremental pending; - - map > awaiting_map; - - // osd down -> out - map pending_out; - - - void tick(); // check state, take actions - - // maps - void accept_pending(); // accept pending, new map. - void send_map(); // send current map to waiters. 
- void send_full_map(msg_addr_t dest); - void send_incremental_map(epoch_t since, msg_addr_t dest); - void bcast_latest_osd_map_mds(); - void bcast_latest_osd_map_osd(); - - - public: - OSDMonitor(int w, Messenger *m) : - whoami(w), - messenger(m), - osdmap(0) { - } - - void init(); - - void dispatch(Message *m); - void handle_shutdown(Message *m); - - void handle_osd_boot(class MOSDBoot *m); - void handle_osd_in(class MOSDIn *m); - void handle_osd_out(class MOSDOut *m); - void handle_osd_failure(class MOSDFailure *m); - void handle_osd_getmap(class MOSDGetMap *m); - - void handle_ping_ack(class MPingAck *m); - - // hack - void fake_osd_failure(int osd, bool down); - void fake_osdmap_update(); - void fake_reorg(); - -}; - -#endif diff --git a/branches/aleung/security1/ceph/mds/Renamer.cc b/branches/aleung/security1/ceph/mds/Renamer.cc index db7a4f59a1378..cf7d79170f479 100644 --- a/branches/aleung/security1/ceph/mds/Renamer.cc +++ b/branches/aleung/security1/ceph/mds/Renamer.cc @@ -27,8 +27,7 @@ #include "msg/Message.h" #include "msg/Messenger.h" -#include "events/EInodeUpdate.h" -#include "events/EDirUpdate.h" +#include "events/EString.h" #include "events/EUnlink.h" #include "messages/MRenameWarning.h" @@ -112,7 +111,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, // not import anymore! 
cache->imports.erase(in->dir); in->dir->state_clear(CDIR_STATE_IMPORT); - in->dir->put(CDIR_PIN_IMPORT); + in->dir->put(CDir::PIN_IMPORT); in->dir->set_dir_auth( CDIR_AUTH_PARENT ); dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; @@ -163,7 +162,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, // i am now an import cache->imports.insert(in->dir); in->dir->state_set(CDIR_STATE_IMPORT); - in->dir->get(CDIR_PIN_IMPORT); + in->dir->get(CDir::PIN_IMPORT); in->dir->set_dir_auth( mds->get_nodeid() ); dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; @@ -212,7 +211,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, // now export cache->exports.insert(in->dir); in->dir->state_set(CDIR_STATE_EXPORT); - in->dir->get(CDIR_PIN_EXPORT); + in->dir->get(CDir::PIN_EXPORT); assert(dir_auth >= 0); // better be defined in->dir->set_dir_auth( dir_auth ); @@ -253,7 +252,7 @@ void Renamer::fix_renamed_dir(CDir *srcdir, // remove from export list cache->exports.erase(in->dir); in->dir->state_clear(CDIR_STATE_EXPORT); - in->dir->put(CDIR_PIN_EXPORT); + in->dir->put(CDir::PIN_EXPORT); CDir *oldcon = cache->get_auth_container(srcdir); assert(oldcon); @@ -416,17 +415,21 @@ void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change // mark dentries dirty - srcdn->mark_dirty(); - destdn->mark_dirty(); - in->mark_dirty(); + srcdn->_mark_dirty(); // fixme + destdn->_mark_dirty(); // fixme + in->_mark_dirty(); // fixme // local, restrict notify to ppl with open dirs - set notify = srcdir->get_open_by(); - for (set::iterator it = destdir->open_by_begin(); - it != destdir->open_by_end(); + set notify; + for (map::iterator it = srcdir->replicas_begin(); + it != srcdir->replicas_end(); + ++it) + notify.insert(it->first); + for (map::iterator it = destdir->replicas_begin(); + it != destdir->replicas_end(); it++) - if (notify.count(*it) == 0) notify.insert(*it); + if 
(notify.count(it->first) == 0) notify.insert(it->first); if (notify.size()) { // warn + notify @@ -554,11 +557,11 @@ void Renamer::file_rename_foreign_src(CDentry *srcdn, if (in->is_dir() && in->dir) fix_renamed_dir(srcdir, in, destdir, true); // auth changed - srcdn->mark_dirty(); + srcdn->_mark_dirty(); // fixme // proxy! - in->state_set(CINODE_STATE_PROXY); - in->get(CINODE_PIN_PROXY); + in->state_set(CInode::STATE_PROXY); + in->get(CInode::PIN_PROXY); // generate notify list (everybody but src|dst) and send warnings set notify; @@ -614,10 +617,10 @@ void Renamer::file_rename_ack(CInode *in, int initiator) // we got all our MNotifyAck's. // was i proxy (if not, it's cuz this was a local rename) - if (in->state_test(CINODE_STATE_PROXY)) { + if (in->state_test(CInode::STATE_PROXY)) { dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl; - in->state_clear(CINODE_STATE_PROXY); - in->put(CINODE_PIN_PROXY); + in->state_clear(CInode::STATE_PROXY); + in->put(CInode::PIN_PROXY); } // done! @@ -671,7 +674,7 @@ void Renamer::handle_rename_prep(MRenamePrep *m) } // pin - srcin->get(CINODE_PIN_RENAMESRC); + srcin->get(CInode::PIN_RENAMESRC); // send rename request MRenameReq *req = new MRenameReq(m->get_initiator(), // i'm the initiator @@ -737,11 +740,11 @@ void Renamer::handle_rename(MRename *m) } // mark dirty - destdn->mark_dirty(); - in->mark_dirty(); + destdn->_mark_dirty(); // fixme + in->_mark_dirty(); // fixme // unpin - in->put(CINODE_PIN_RENAMESRC); + in->put(CInode::PIN_RENAMESRC); // ok, send notifies. 
set notify; diff --git a/branches/aleung/security1/ceph/mds/Server.cc b/branches/aleung/security1/ceph/mds/Server.cc index 62d2edc1b1d18..1f3dddf128f37 100644 --- a/branches/aleung/security1/ceph/mds/Server.cc +++ b/branches/aleung/security1/ceph/mds/Server.cc @@ -19,6 +19,7 @@ #include "Migrator.h" #include "MDBalancer.h" #include "Renamer.h" +#include "MDStore.h" #include "msg/Messenger.h" @@ -33,10 +34,8 @@ #include "messages/MInodeLink.h" -#include "events/EInodeUpdate.h" -#include "events/EDirUpdate.h" -#include "events/EMknod.h" -#include "events/EMkdir.h" +#include "events/EString.h" +#include "events/EUpdate.h" #include "include/filepath.h" #include "common/Timer.h" @@ -107,7 +106,7 @@ void Server::handle_client_mount(MClientMount *m) // ack messenger->send_message(new MClientMountAck(m, mds->mdsmap, mds->osdmap), - m->get_source(), m->get_source_inst()); + m->get_source_inst()); delete m; } @@ -120,14 +119,14 @@ void Server::handle_client_unmount(Message *m) mds->clientmap.rem_mount(n); - if (mds->clientmap.get_mount_set().empty()) { + if (g_conf.mds_shutdown_on_last_unmount && + mds->clientmap.get_mount_set().empty()) { dout(3) << "all clients done, initiating shutdown" << endl; mds->shutdown_start(); } // ack by sending back to client - entity_inst_t srcinst = m->get_source_inst(); // make a copy! 
- messenger->send_message(m, m->get_source(), srcinst); + messenger->send_message(m, m->get_source_inst()); } @@ -194,7 +193,7 @@ void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tra // send reply messenger->send_message(reply, - MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst()); + req->get_client_inst()); // discard request mdcache->request_finish(req); @@ -204,9 +203,27 @@ void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tra } +void Server::submit_update(MClientRequest *req, + CInode *wrlockedi, + LogEvent *event, + Context *oncommit) +{ + // log + mdlog->submit_entry(event); + + // pin + mdcache->request_pin_inode(req, wrlockedi); + + // wait + mdlog->wait_for_sync(oncommit); +} + + /* * commit event(s) to the metadata journal, then reply. * or, be sloppy and do it concurrently (see g_conf.mds_log_before_reply) + * + * NOTE: this is old and bad (write-behind!) */ void Server::commit_request(MClientRequest *req, MClientReply *reply, @@ -337,7 +354,7 @@ void Server::handle_client_request(MClientRequest *req) // send error messenger->send_message(new MClientReply(req, r), - MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst()); + req->get_client_inst()); // // is this a special debug command? @@ -493,13 +510,35 @@ void Server::dispatch_request(Message *m, CInode *ref) } +// FIXME: this probably should go somewhere else. + +bool Server::try_open_dir(CInode *in, MClientRequest *req) +{ + if (!in->dir && in->is_frozen_dir()) { + // doh! 
+ dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl; + assert(in->get_parent_dir()); + in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryRequest(mds, req, in)); + return false; + } + + in->get_or_open_dir(mds->mdcache); + return true; +} + + + +// =============================================================================== // STAT void Server::handle_client_stat(MClientRequest *req, CInode *ref) { + // FIXME: this is really not the way to handle the statlite mask. + // do I need file info? int mask = req->get_iarg(); if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) { @@ -514,133 +553,216 @@ void Server::handle_client_stat(MClientRequest *req, mds->balancer->hit_inode(ref, META_POP_IRD); // reply - dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl; + //dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl; MClientReply *reply = new MClientReply(req); - reply_request(req, reply, ref); } + +// =============================================================================== // INODE UPDATES + +/* + * finisher: do a inode_file_write_finish and reply. 
+ */ +class C_MDS_utime_finish : public Context { + MDS *mds; + MClientRequest *req; + CInode *in; + version_t pv; + time_t mtime, atime; +public: + C_MDS_utime_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, time_t mt, time_t at) : + mds(m), req(r), in(i), + pv(pdv), + mtime(mt), atime(at) { } + void finish(int r) { + assert(r == 0); + + // apply + in->inode.mtime = mtime; + in->inode.atime = atime; + in->mark_dirty(pv); + + // unlock + mds->locker->inode_file_write_finish(in); + + // reply + MClientReply *reply = new MClientReply(req, 0); + reply->set_result(0); + mds->server->reply_request(req, reply, in); + } +}; + + // utime void Server::handle_client_utime(MClientRequest *req, - CInode *cur) + CInode *cur) { // write if (!mds->locker->inode_file_write_start(cur, req)) return; // fw or (wait for) sync - // do update - cur->inode.mtime = req->get_targ(); - cur->inode.atime = req->get_targ2(); - if (cur->is_auth()) - cur->mark_dirty(); - - mds->locker->inode_file_write_finish(cur); - mds->balancer->hit_inode(cur, META_POP_IWR); - // init reply - MClientReply *reply = new MClientReply(req, 0); - reply->set_result(0); - - // commit - commit_request(req, reply, cur, - new EInodeUpdate(cur)); + // prepare + version_t pdv = cur->pre_dirty(); + time_t mtime = req->get_targ(); + time_t atime = req->get_targ2(); + C_MDS_utime_finish *fin = new C_MDS_utime_finish(mds, req, cur, pdv, + mtime, atime); + + // log + wait + EUpdate *le = new EUpdate("utime"); + le->metablob.add_dir_context(cur->get_parent_dir()); + inode_t *pi = le->metablob.add_dentry(cur->parent, true); + pi->mtime = mtime; + pi->atime = atime; // journal the requested atime (was mtime) so the log matches what C_MDS_utime_finish applies + pi->version = pdv; + + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); } - -// HARD +// -------------- + +/* + * finisher: do a inode_hard_write_finish and reply.
+ */ +class C_MDS_chmod_finish : public Context { + MDS *mds; + MClientRequest *req; + CInode *in; + version_t pv; + int mode; +public: + C_MDS_chmod_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int mo) : + mds(m), req(r), in(i), pv(pdv), mode(mo) { } + void finish(int r) { + assert(r == 0); + + // apply + in->inode.mode &= ~04777; + in->inode.mode |= (mode & 04777); + in->mark_dirty(pv); + + // unlock + mds->locker->inode_hard_write_finish(in); + + // reply + MClientReply *reply = new MClientReply(req, 0); + reply->set_result(0); + mds->server->reply_request(req, reply, in); + } +}; + // chmod void Server::handle_client_chmod(MClientRequest *req, - CInode *cur) + CInode *cur) { // write if (!mds->locker->inode_hard_write_start(cur, req)) return; // fw or (wait for) lock - - // check permissions - - // do update + mds->balancer->hit_inode(cur, META_POP_IWR); + + // prepare + version_t pdv = cur->pre_dirty(); int mode = req->get_iarg(); - cur->inode.mode &= ~04777; - cur->inode.mode |= (mode & 04777); - cur->mark_dirty(); + C_MDS_chmod_finish *fin = new C_MDS_chmod_finish(mds, req, cur, pdv, + mode); + + // log + wait + EUpdate *le = new EUpdate("chmod"); + le->metablob.add_dir_context(cur->get_parent_dir()); + inode_t *pi = le->metablob.add_dentry(cur->parent, true); + pi->mode = (pi->mode & ~04777) | (mode & 04777); // journal the same masked mode the finisher applies, keeping the type bits (assumes pi starts as a copy of the inode, as chown's conditional uid/gid stores imply -- TODO confirm) + pi->version = pdv; + + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); +} - mds->locker->inode_hard_write_finish(cur); - mds->balancer->hit_inode(cur, META_POP_IWR); +// chown - // start reply - MClientReply *reply = new MClientReply(req, 0); +class C_MDS_chown_finish : public Context { + MDS *mds; + MClientRequest *req; + CInode *in; + version_t pv; + int uid, gid; +public: + C_MDS_chown_finish(MDS *m, MClientRequest *r, CInode *i, version_t pdv, int u, int g) : + mds(m), req(r), in(i), pv(pdv), uid(u), gid(g) { } + void finish(int r) { + assert(r == 0); - // commit - commit_request(req, reply, cur, - new EInodeUpdate(cur)); -} + // apply + if (uid >= 0)
in->inode.uid = uid; + if (gid >= 0) in->inode.gid = gid; + in->mark_dirty(pv); + + // unlock + mds->locker->inode_hard_write_finish(in); + + // reply + MClientReply *reply = new MClientReply(req, 0); + reply->set_result(0); + mds->server->reply_request(req, reply, in); + } +}; -// chown void Server::handle_client_chown(MClientRequest *req, - CInode *cur) + CInode *cur) { // write if (!mds->locker->inode_hard_write_start(cur, req)) return; // fw or (wait for) lock - // check permissions + mds->balancer->hit_inode(cur, META_POP_IWR); - // do update + // prepare + version_t pdv = cur->pre_dirty(); int uid = req->get_iarg(); int gid = req->get_iarg2(); - cur->inode.uid = uid; - cur->inode.gid = gid; - cur->mark_dirty(); - - mds->locker->inode_hard_write_finish(cur); - - mds->balancer->hit_inode(cur, META_POP_IWR); - - // start reply - MClientReply *reply = new MClientReply(req, 0); - - // commit - commit_request(req, reply, cur, - new EInodeUpdate(cur)); + C_MDS_chown_finish *fin = new C_MDS_chown_finish(mds, req, cur, pdv, + uid, gid); + + // log + wait + EUpdate *le = new EUpdate("chown"); + le->metablob.add_dir_context(cur->get_parent_dir()); + inode_t *pi = le->metablob.add_dentry(cur->parent, true); + if (uid >= 0) pi->uid = uid; + if (gid >= 0) pi->gid = gid; + pi->version = pdv; + + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); } -bool Server::try_open_dir(CInode *in, MClientRequest *req) -{ - if (!in->dir && in->is_frozen_dir()) { - // doh! 
- dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl; - assert(in->get_parent_dir()); - in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryRequest(mds, req, in)); - return false; - } - in->get_or_open_dir(mds); - return true; -} + +// ================================================================= // DIRECTORY and NAMESPACE OPS // READDIR int Server::encode_dir_contents(CDir *dir, - list& inls, - list& dnls) + list& inls, + list& dnls) { int numfiles = 0; @@ -651,18 +773,14 @@ int Server::encode_dir_contents(CDir *dir, // hashed? if (dir->is_hashed() && - mds->get_nodeid() != mds->hash_dentry( dir->ino(), it->first )) + mds->get_nodeid() != mds->mdcache->hash_dentry( dir->ino(), it->first )) continue; - - // is dentry readable? - if (dn->is_xlocked()) { - // ***** FIXME ***** - // ? - dout(10) << "warning, returning xlocked dentry, we _may_ be fudging on POSIX consistency" << endl; - } - + + if (dn->is_null()) continue; + CInode *in = dn->inode; - if (!in) continue; // null dentry? + if (!in) + continue; // hmm, fixme!, what about REMOTE links? dout(12) << "including inode " << *in << endl; @@ -709,7 +827,7 @@ void Server::handle_hash_readdir(MHashReaddir *m) // sent it back! 
messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num), - m->get_source(), m->get_source_inst(), MDS_PORT_CACHE); + m->get_source_inst(), MDS_PORT_CACHE); } @@ -921,91 +1039,167 @@ void Server::handle_client_readdir(MClientRequest *req, } + +// ------------------------------------------------ + // MKNOD -void Server::handle_client_mknod(MClientRequest *req, CInode *ref) +class C_MDS_mknod_finish : public Context { + MDS *mds; + MClientRequest *req; + CDentry *dn; + CInode *newi; + version_t pv; +public: + C_MDS_mknod_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : + mds(m), req(r), dn(d), newi(ni), + pv(d->get_projected_version()) {} + void finish(int r) { + assert(r == 0); + + // link the inode + dn->get_dir()->link_inode(dn, newi); + + // dirty inode, dn, dir + newi->mark_dirty(pv); + + // unlock + mds->locker->dentry_xlock_finish(dn); + + // hit pop + mds->balancer->hit_inode(newi, META_POP_IWR); + + // reply + MClientReply *reply = new MClientReply(req, 0); + reply->set_result(0); + mds->server->reply_request(req, reply, newi); + } +}; + +void Server::handle_client_mknod(MClientRequest *req, CInode *diri) { - // make dentry and inode, link. - CInode *newi = mknod(req, ref); - if (!newi) return; + CInode *newi = 0; + CDentry *dn = 0; - // it's a file! + // make dentry and inode, xlock dentry. + if (!prepare_mknod(req, diri, &newi, &dn)) + return; + assert(newi); + assert(dn); + + // it's a file. 
newi->inode.mode = req->get_iarg(); newi->inode.mode &= ~INODE_TYPE_MASK; newi->inode.mode |= INODE_MODE_FILE; - mds->balancer->hit_inode(newi, META_POP_IWR); - - // commit - commit_request(req, new MClientReply(req, 0), ref, - new EMknod(newi)); + // prepare finisher + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); + EUpdate *le = new EUpdate("mknod"); + le->metablob.add_dir_context(diri->dir); + inode_t *pi = le->metablob.add_dentry(dn, true, newi); + pi->version = dn->get_projected_version(); + + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); } -// mknod(): used by handle_client_mkdir, handle_client_mknod, which are mostly identical. - -CInode *Server::mknod(MClientRequest *req, CInode *diri, bool okexist) -{ - dout(10) << "mknod " << req->get_filepath() << " in " << *diri << endl; - // get containing directory (without last bit) - filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1); - string name = req->get_filepath().last_bit(); - - // did we get to parent? - dout(10) << "dirpath is " << dirpath << " depth " << dirpath.depth() << endl; +/* + * verify that the dir exists and would own the dname. + * do not check if the dentry exists. returns the open dir, or 0 (null) if the request was already answered, forwarded, or queued for retry. + */ +CDir *Server::validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& name) +{ // make sure parent is a dir? if (!diri->is_dir()) { - dout(7) << "not a dir" << endl; + dout(7) << "validate_new_dentry_dir: not a dir" << endl; reply_request(req, -ENOTDIR); - return 0; + return 0; } // am i not open, not auth? 
if (!diri->dir && !diri->is_auth()) { int dirauth = diri->authority(); - dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl; + dout(7) << "validate_new_dentry_dir: don't know dir auth, not open, auth is i think mds" << dirauth << endl; mdcache->request_forward(req, dirauth); - return 0; + return 0; } - if (!try_open_dir(diri, req)) return 0; + if (!try_open_dir(diri, req)) + return 0; CDir *dir = diri->dir; // make sure it's my dentry int dnauth = dir->dentry_authority(name); if (dnauth != mds->get_nodeid()) { // fw - - dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir << " dn " << name << " not mine, fw to " << dnauth << endl; + dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir + << " dn " << name + << " not mine, fw to " << dnauth << endl; mdcache->request_forward(req, dnauth); - return 0; + return 0; } - // ok, done passing buck. + // dir auth pinnable? + if (!dir->can_auth_pin()) { + dout(7) << "validate_new_dentry_dir: dir " << *dir << " not pinnable, waiting" << endl; + dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, + new C_MDS_RetryRequest(mds, req, diri)); + return 0; + } // frozen? if (dir->is_frozen()) { dout(7) << "dir is frozen " << *dir << endl; dir->add_waiter(CDIR_WAIT_UNFREEZE, new C_MDS_RetryRequest(mds, req, diri)); - return 0; + return 0; } + return dir; +} + +/* + * prepare a mknod-type operation (mknod, mkdir, symlink, open+create). + * create the inode and dentry, but do not link them. + * pre_dirty the dentry+dir. + * xlock the dentry. 
+ * + * return val + * 0 - wait for something + * 1 - created + * 2 - already exists (only if okexist=true) + */ +int Server::prepare_mknod(MClientRequest *req, CInode *diri, + CInode **pin, CDentry **pdn, + bool okexist) +{ + dout(10) << "prepare_mknod " << req->get_filepath() << " in " << *diri << endl; + + // get containing directory (without last bit) + filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1); + string name = req->get_filepath().last_bit(); + + CDir *dir = validate_new_dentry_dir(req, diri, name); + if (!dir) return 0; + // make sure name doesn't already exist - CDentry *dn = dir->lookup(name); - if (dn) { - if (!dn->can_read(req)) { - dout(10) << "waiting on (existing!) dentry " << *dn << endl; + *pdn = dir->lookup(name); + if (*pdn) { + if (!(*pdn)->can_read(req)) { + dout(10) << "waiting on (existing!) dentry " << **pdn << endl; dir->add_waiter(CDIR_WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, diri)); return 0; } - if (!dn->is_null()) { + if (!(*pdn)->is_null()) { // name already exists if (okexist) { dout(10) << "dentry " << name << " exists in " << *dir << endl; - return dn->inode; + *pin = (*pdn)->inode; + return 2; } else { dout(10) << "dentry " << name << " exists in " << *dir << endl; reply_request(req, -EEXIST); @@ -1021,33 +1215,124 @@ CInode *Server::mknod(MClientRequest *req, CInode *diri, bool okexist) return 0; } - // create! - CInode *newi = mdcache->create_inode(); - newi->inode.uid = req->get_caller_uid(); - newi->inode.gid = req->get_caller_gid(); - newi->inode.ctime = newi->inode.mtime = newi->inode.atime = g_clock.gettime(); // now + // make sure dir is pinnable + + + // create inode + *pin = mdcache->create_inode(); + (*pin)->inode.uid = req->get_caller_uid(); + (*pin)->inode.gid = req->get_caller_gid(); + (*pin)->inode.ctime = (*pin)->inode.mtime = (*pin)->inode.atime = g_clock.gettime(); // now + // note: inode.version will get set by finisher's mark_dirty. 
- // link - if (!dn) - dn = dir->add_dentry(name, newi); - else - dir->link_inode(dn, newi); + // create dentry + if (!*pdn) + *pdn = dir->add_dentry(name, 0); + + (*pdn)->pre_dirty(); + + // xlock dentry + bool res = mds->locker->dentry_xlock_start(*pdn, req, diri); + assert(res == true); // bump modify pop mds->balancer->hit_dir(dir, META_POP_DWR); + + return 1; +} + + + + + +// MKDIR + +void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) +{ + CInode *newi = 0; + CDentry *dn = 0; - // mark dirty - dn->mark_dirty(); - newi->mark_dirty(); + // make dentry and inode, xlock dentry. + if (!prepare_mknod(req, diri, &newi, &dn)) + return; + assert(newi); + assert(dn); + + // it's a directory. + newi->inode.mode = req->get_iarg(); + newi->inode.mode &= ~INODE_TYPE_MASK; + newi->inode.mode |= INODE_MODE_DIR; + newi->inode.layout = g_OSD_MDDirLayout; + + // ...and that new dir is empty. + CDir *newdir = newi->get_or_open_dir(mds->mdcache); + newdir->mark_complete(); + newdir->mark_dirty(newdir->pre_dirty()); + + // prepare finisher + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); + EUpdate *le = new EUpdate("mkdir"); + le->metablob.add_dir_context(diri->dir); + inode_t *pi = le->metablob.add_dentry(dn, true, newi); + pi->version = dn->get_projected_version(); + le->metablob.add_dir(newi->dir, true); - // journal it - //mdlog->submit_entry(new EMknod(newi)); + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); - // ok! - return newi; + + /* old export heuristic. pbly need to reimplement this at some point. 
+ if ( + diri->dir->is_auth() && + diri->dir->is_rep() && + newdir->is_auth() && + !newdir->is_hashing()) { + int dest = rand() % mds->mdsmap->get_num_mds(); + if (dest != whoami) { + dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl; + mdcache->migrator->export_dir(newdir, dest); + } + } + */ +} + + + +// SYMLINK + +void Server::handle_client_symlink(MClientRequest *req, CInode *diri) +{ + CInode *newi = 0; + CDentry *dn = 0; + + // make dentry and inode, xlock dentry. + if (!prepare_mknod(req, diri, &newi, &dn)) + return; + assert(newi); + assert(dn); + + // it's a symlink + newi->inode.mode &= ~INODE_TYPE_MASK; + newi->inode.mode |= INODE_MODE_SYMLINK; + newi->symlink = req->get_sarg(); + + // prepare finisher + C_MDS_mknod_finish *fin = new C_MDS_mknod_finish(mds, req, dn, newi); + EUpdate *le = new EUpdate("symlink"); + le->metablob.add_dir_context(diri->dir); + inode_t *pi = le->metablob.add_dentry(dn, true, newi); + pi->version = dn->get_projected_version(); + + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); } + + + // LINK class C_MDS_LinkTraverse : public Context { @@ -1070,41 +1355,13 @@ void Server::handle_client_link(MClientRequest *req, CInode *ref) { // figure out name string dname = req->get_filepath().last_bit(); - dout(7) << "dname is " << dname << endl; - - // make sure parent is a dir? - if (!ref->is_dir()) { - dout(7) << "not a dir " << *ref << endl; - reply_request(req, -EINVAL); - return; - } - - // am i not open, not auth? 
- if (!ref->dir && !ref->is_auth()) { - int dirauth = ref->authority(); - dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl; - mdcache->request_forward(req, dirauth); - return; - } - - if (!try_open_dir(ref, req)) return; - CDir *dir = ref->dir; - dout(7) << "handle_client_link dir is " << *dir << endl; - - - - // make sure it's my dentry - int dauth = dir->dentry_authority(dname); - if (dauth != mds->get_nodeid()) { - // fw - dout(7) << "link on " << req->get_path() << ", dn " << dname << " in " << *dir << " not mine, fw to " << dauth << endl; - mdcache->request_forward(req, dauth); - return; - } - // ok, done passing buck. + dout(7) << "handle_client_link dname is " << dname << endl; + // validate dir + CDir *dir = validate_new_dentry_dir(req, ref, dname); + if (!dir) return; - // exists? + // dentry exists? CDentry *dn = dir->lookup(dname); if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) { dout(7) << "handle_client_link dn exists " << *dn << endl; @@ -1112,14 +1369,15 @@ void Server::handle_client_link(MClientRequest *req, CInode *ref) return; } - // keep src dir in memory - mdcache->request_pin_dir(req, dir); + // xlock dentry + if (!dn->is_xlockedbyme(req)) { + if (!mds->locker->dentry_xlock_start(dn, req, ref)) + return; + } // discover link target filepath target = req->get_sarg(); - dout(7) << "handle_client_link discovering target " << target << endl; - C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref); Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); @@ -1160,7 +1418,7 @@ public: } }; -void Server::handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector& trace) +void Server::handle_client_link_2(int r, MClientRequest *req, CInode *diri, vector& trace) { // target dne? 
if (r < 0) { @@ -1182,59 +1440,38 @@ void Server::handle_client_link_2(int r, MClientRequest *req, CInode *ref, vecto return; } - // keep target inode in memory - mdcache->request_pin_inode(req, targeti); - - dout(7) << "dir is " << *ref << endl; - - // xlock the dentry - CDir *dir = ref->dir; + // what was the new dentry again? + CDir *dir = diri->dir; assert(dir); - string dname = req->get_filepath().last_bit(); - int dauth = dir->dentry_authority(dname); - if (mds->get_nodeid() != dauth) { - // ugh, exported out from under us - dout(7) << "ugh, forwarded out from under us, dentry auth is " << dauth << endl; - mdcache->request_forward(req, dauth); - return; - } - CDentry *dn = dir->lookup(dname); - if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) { - dout(7) << "handle_client_link dn exists " << *dn << endl; - reply_request(req, -EEXIST); - return; - } + assert(dn); + assert(dn->is_xlockedbyme(req)); - if (!dn) dn = dir->add_dentry(dname); - - if (!dn->is_xlockedbyme(req)) { - if (!mds->locker->dentry_xlock_start(dn, req, ref)) { - if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn); - return; - } - } - - // ok xlocked! + // ok! if (targeti->is_auth()) { // mine - if (targeti->is_anchored()) { + + // same dir? + if (targeti->get_parent_dir() == dn->get_dir()) { + dout(7) << "target is in the same dir, sweet" << endl; + } + else if (targeti->is_anchored()) { dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; } else { assert(targeti->inode.nlink == 1); dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; mdcache->anchor_inode(targeti, - new C_MDS_RetryRequest(mds, req, ref)); + new C_MDS_RetryRequest(mds, req, diri)); return; } // ok, inc link! 
targeti->inode.nlink++; dout(7) << "nlink++, now " << targeti->inode.nlink << " on " << *targeti << endl; - targeti->mark_dirty(); + targeti->_mark_dirty(); // fixme } else { // remote: send nlink++ request, wait @@ -1243,20 +1480,20 @@ void Server::handle_client_link_2(int r, MClientRequest *req, CInode *ref, vecto // wait targeti->add_waiter(CINODE_WAIT_LINK, - new C_MDS_RemoteLink(this, req, ref, dn, targeti)); + new C_MDS_RemoteLink(this, req, diri, dn, targeti)); return; } - handle_client_link_finish(req, ref, dn, targeti); + handle_client_link_finish(req, diri, dn, targeti); } void Server::handle_client_link_finish(MClientRequest *req, CInode *ref, - CDentry *dn, CInode *targeti) + CDentry *dn, CInode *targeti) { // create remote link dn->dir->link_inode(dn, targeti->ino()); dn->link_remote( targeti ); // since we have it - dn->mark_dirty(); + dn->_mark_dirty(); // fixme mds->balancer->hit_dir(dn->dir, META_POP_DWR); @@ -1449,7 +1686,7 @@ void Server::handle_client_unlink(MClientRequest *req, MClientReply *reply = new MClientReply(req,0); mdcache->dentry_unlink(dn, new C_MDS_CommitRequest(this, req, reply, diri, - new EInodeUpdate(diri))); // FIXME WRONG EVENT + new EString("unlink fixme"))); return; } @@ -1954,85 +2191,13 @@ void Server::handle_client_rename_local(MClientRequest *req, MClientReply *reply = new MClientReply(req, 0); mdcache->renamer->file_rename( srcdn, destdn, new C_MDS_CommitRequest(this, req, reply, srcdn->inode, - new EInodeUpdate(srcdn->inode)) ); // FIXME WRONG EVENT -} - - - - - - - -// MKDIR - -void Server::handle_client_mkdir(MClientRequest *req, CInode *diri) -{ - // make dentry and inode, link. - CInode *newi = mknod(req, diri); - if (!newi) return; - - // make my new inode a dir. 
- newi->inode.mode = req->get_iarg(); - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_DIR; - - // use dir layout - newi->inode.layout = g_OSD_MDDirLayout; - - // init dir to be empty - assert(!newi->is_frozen_dir()); // bc mknod worked - CDir *newdir = newi->get_or_open_dir(mds); - newdir->mark_complete(); - newdir->mark_dirty(); - - mds->balancer->hit_dir(newdir, META_POP_DWR); - - if ( - diri->dir->is_auth() && - diri->dir->is_rep() && - newdir->is_auth() && - !newdir->is_hashing()) { - int dest = rand() % mds->mdsmap->get_num_mds(); - if (dest != mds->get_nodeid()) { - dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl; - mdcache->migrator->export_dir(newdir, dest); - } - } - - // commit to log - commit_request(req, new MClientReply(req, 0), diri, - new EMkdir(newdir)); - //new EInodeUpdate(newi),//); - //new EDirUpdate(newdir)); // FIXME: weird performance regression here w/ double log; somewhat of a mystery! - return; + new EString("file rename fixme")) ); } -// SYMLINK - -void Server::handle_client_symlink(MClientRequest *req, CInode *diri) -{ - // make dentry and inode, link. 
- CInode *newi = mknod(req, diri); - if (!newi) return; - - // make my new inode a symlink - newi->inode.mode &= ~INODE_TYPE_MASK; - newi->inode.mode |= INODE_MODE_SYMLINK; - - // set target - newi->symlink = req->get_sarg(); - - mds->balancer->hit_dir(diri->dir, META_POP_DWR); - - // commit - commit_request(req, new MClientReply(req, 0), diri, - new EInodeUpdate(newi)); // FIXME should be differnet log entry -} - @@ -2056,7 +2221,7 @@ void Server::handle_client_truncate(MClientRequest *req, CInode *cur) // do update cur->inode.size = req->get_sizearg(); - cur->mark_dirty(); + cur->_mark_dirty(); // fixme mds->locker->inode_file_write_finish(cur); @@ -2067,7 +2232,7 @@ void Server::handle_client_truncate(MClientRequest *req, CInode *cur) // commit commit_request(req, reply, cur, - new EInodeUpdate(cur)); + new EString("truncate fixme")); } @@ -2076,7 +2241,7 @@ void Server::handle_client_truncate(MClientRequest *req, CInode *cur) // open, openc, close void Server::handle_client_open(MClientRequest *req, - CInode *cur) + CInode *cur) { int flags = req->get_iarg(); int mode = req->get_iarg2(); @@ -2133,18 +2298,77 @@ void Server::handle_client_open(MClientRequest *req, } +class C_MDS_openc_finish : public Context { + MDS *mds; + MClientRequest *req; + CDentry *dn; + CInode *newi; + version_t pv; +public: + C_MDS_openc_finish(MDS *m, MClientRequest *r, CDentry *d, CInode *ni) : + mds(m), req(r), dn(d), newi(ni), + pv(d->get_projected_version()) {} + void finish(int r) { + assert(r == 0); + + // link the inode + dn->get_dir()->link_inode(dn, newi); + + // dirty inode, dn, dir + newi->mark_dirty(pv); + + // unlock + mds->locker->dentry_xlock_finish(dn); + + // hit pop + mds->balancer->hit_inode(newi, META_POP_IWR); + + // ok, do the open. 
+ mds->server->handle_client_open(req, newi); + } +}; + -void Server::handle_client_openc(MClientRequest *req, CInode *ref) +void Server::handle_client_openc(MClientRequest *req, CInode *diri) { dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl; - CInode *in = mknod(req, ref, true); - if (!in) return; - - in->inode.mode = 0644; // wtf FIXME - in->inode.mode |= INODE_MODE_FILE; + CInode *in = 0; + CDentry *dn = 0; + + // make dentry and inode, xlock dentry. + int r = prepare_mknod(req, diri, &in, &dn); + if (!r) + return; // wait on something + assert(in); + assert(dn); + + if (r == 1) { + // created. + // it's a file. + in->inode.mode = 0644; // FIXME req should have a umask + in->inode.mode |= INODE_MODE_FILE; + + // prepare finisher + C_MDS_openc_finish *fin = new C_MDS_openc_finish(mds, req, dn, in); + EUpdate *le = new EUpdate("openc"); + le->metablob.add_dir_context(diri->dir); + inode_t *pi = le->metablob.add_dentry(dn, true, in); + pi->version = dn->get_projected_version(); + + // log + wait + mdlog->submit_entry(le); + mdlog->wait_for_sync(fin); - handle_client_open(req, in); + /* + FIXME. this needs to be rewritten when the write capability stuff starts + getting journaled. + */ + } else { + // exists! + // FIXME: do i need to repin path based existant inode? hmm. 
+ handle_client_open(req, in); + } } diff --git a/branches/aleung/security1/ceph/mds/Server.h b/branches/aleung/security1/ceph/mds/Server.h index 53e917386440e..d4509f1418e07 100644 --- a/branches/aleung/security1/ceph/mds/Server.h +++ b/branches/aleung/security1/ceph/mds/Server.h @@ -40,6 +40,11 @@ public: // generic request helpers void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0); void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei); + + void submit_update(MClientRequest *req, CInode *wrlockedi, + LogEvent *event, + Context *oncommit); + void commit_request(MClientRequest *req, MClientReply *reply, CInode *tracei, @@ -117,8 +122,16 @@ public: void handle_client_truncate(MClientRequest *req, CInode *in); void handle_client_fsync(MClientRequest *req, CInode *in); + + // some helpers CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false); // used by mknod, symlink, mkdir, openc + CDir *validate_new_dentry_dir(MClientRequest *req, CInode *diri, string& dname); + int prepare_mknod(MClientRequest *req, CInode *diri, + CInode **pin, CDentry **pdn, + bool okexist=false); + + }; diff --git a/branches/aleung/security1/ceph/mds/events/EAlloc.h b/branches/aleung/security1/ceph/mds/events/EAlloc.h index b3b5f21f84038..9360db4ab49bb 100644 --- a/branches/aleung/security1/ceph/mds/events/EAlloc.h +++ b/branches/aleung/security1/ceph/mds/events/EAlloc.h @@ -61,49 +61,15 @@ class EAlloc : public LogEvent { void print(ostream& out) { if (what == EALLOC_EV_ALLOC) - out << "alloc " << hex << id << dec << " tablev " << table_version; + out << "EAlloc alloc " << hex << id << dec << " tablev " << table_version; else - out << "dealloc " << hex << id << dec << " tablev " << table_version; + out << "EAlloc dealloc " << hex << id << dec << " tablev " << table_version; } - // live journal - bool can_expire(MDS *mds) { - if (mds->idalloc->get_committed_version() < table_version) - return false; // still dirty - else - return true; // 
already flushed - } - - void retire(MDS *mds, Context *c) { - mds->idalloc->save(c, table_version); - } - - - // recovery - bool has_happened(MDS *mds) { - if (mds->idalloc->get_version() >= table_version) { - cout << " event " << table_version << " <= table " << mds->idalloc->get_version() << endl; - return true; - } else - return false; - } - - void replay(MDS *mds) { - assert(table_version-1 == mds->idalloc->get_version()); - - if (what == EALLOC_EV_ALLOC) { - idno_t nid = mds->idalloc->alloc_id(true); - assert(nid == id); // this should match. - } - else if (what == EALLOC_EV_FREE) { - mds->idalloc->reclaim_id(id, true); - } - else - assert(0); - - assert(table_version == mds->idalloc->get_version()); - } + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); }; diff --git a/branches/aleung/security1/ceph/mds/events/EDirUpdate.h b/branches/aleung/security1/ceph/mds/events/EDirUpdate.h deleted file mode 100644 index 9c8881d4c91b9..0000000000000 --- a/branches/aleung/security1/ceph/mds/events/EDirUpdate.h +++ /dev/null @@ -1,97 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EDIRUPDATE_H -#define __EDIRUPDATE_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "ETrace.h" -#include "../CDir.h" -#include "../MDCache.h" -#include "../MDStore.h" - - - -class EDirUpdate : public LogEvent { - protected: - ETrace trace; - inodeno_t dirino; - version_t version; - - public: - EDirUpdate(CDir *dir) : LogEvent(EVENT_DIRUPDATE), - trace(dir->inode) { - this->dirino = dir->ino(); - version = dir->get_version(); - } - EDirUpdate() : LogEvent(EVENT_DIRUPDATE) { - } - - void print(ostream& out) { - out << "up dir " << dirino << " " - << trace - << "/ v " << version; - } - - virtual void encode_payload(bufferlist& bl) { - trace.encode(bl); - bl.append((char*)&version, sizeof(version)); - bl.append((char*)&dirino, sizeof(dirino)); - } - void decode_payload(bufferlist& bl, int& off) { - trace.decode(bl, off); - bl.copy(off, sizeof(version), (char*)&version); - off += sizeof(version); - bl.copy(off, sizeof(dirino), (char*)&dirino); - off += sizeof(dirino); - } - - - virtual bool can_expire(MDS *mds) { - // am i obsolete? - CInode *in = mds->mdcache->get_inode(dirino); - if (!in) return true; - CDir *dir = in->dir; - if (!dir) return true; - - dout(10) << "EDirUpdate v " << version << " on dir " << *dir << endl; - - if (!dir->is_auth()) return true; // not mine! - if (dir->is_frozen()) return true; // frozen -> exporting -> obsolete? 
FIXME - - if (!dir->is_dirty()) return true; - - if (dir->get_committing_version() > version) - return true; - - return false; - } - - virtual void retire(MDS *mds, Context *c) { - // commit directory - CInode *in = mds->mdcache->get_inode(dirino); - assert(in); - CDir *dir = in->dir; - assert(dir); - - dout(10) << "EDirUpdate committing dir " << *dir << endl; - mds->mdstore->commit_dir(dir, c); - } - -}; - -#endif diff --git a/branches/aleung/security1/ceph/mds/events/EExportFinish.h b/branches/aleung/security1/ceph/mds/events/EExportFinish.h new file mode 100644 index 0000000000000..114d580b6a499 --- /dev/null +++ b/branches/aleung/security1/ceph/mds/events/EExportFinish.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __EEXPORTFINISH_H +#define __EEXPORTFINISH_H + +#include +#include "config.h" +#include "include/types.h" + +#include "../MDS.h" + +class EExportFinish : public LogEvent { + protected: + inodeno_t dirino; // exported dir + bool success; + + public: + EExportFinish(CDir *dir, bool s) : LogEvent(EVENT_EXPORTFINISH), + dirino(dir->ino()), + success(s) { } + EExportFinish() : LogEvent(EVENT_EXPORTFINISH) { } + + void print(ostream& out) { + out << "export_finish " << dirino; + if (success) + out << " success"; + else + out << " failure"; + } + + virtual void encode_payload(bufferlist& bl) { + bl.append((char*)&dirino, sizeof(dirino)); + bl.append((char*)&success, sizeof(success)); + } + void decode_payload(bufferlist& bl, int& off) { + bl.copy(off, sizeof(dirino), (char*)&dirino); + off += sizeof(dirino); + bl.copy(off, sizeof(success), (char*)&success); + off += sizeof(success); + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); + +}; + +#endif diff --git a/branches/aleung/security1/ceph/mds/events/EExportStart.h b/branches/aleung/security1/ceph/mds/events/EExportStart.h new file mode 100644 index 0000000000000..37ed92a7239c2 --- /dev/null +++ b/branches/aleung/security1/ceph/mds/events/EExportStart.h @@ -0,0 +1,68 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __EEXPORTSTART_H +#define __EEXPORTSTART_H + +#include +#include "config.h" +#include "include/types.h" + +#include "../MDS.h" + +#include "EMetaBlob.h" + +class EExportStart : public LogEvent { + public: + EMetaBlob metablob; // exported dir + protected: + inodeno_t dirino; + int dest; // dest mds + set bounds; + + public: + EExportStart(CDir *dir, int d) : LogEvent(EVENT_EXPORTSTART), + dirino(dir->ino()), + dest(d) { + metablob.add_dir_context(dir); + } + EExportStart() : LogEvent(EVENT_EXPORTSTART) { } + + set &get_bounds() { return bounds; } + + void print(ostream& out) { + out << "export_start " << dirino << " -> " << dest; + } + + virtual void encode_payload(bufferlist& bl) { + metablob._encode(bl); + bl.append((char*)&dirino, sizeof(dirino)); + bl.append((char*)&dest, sizeof(dest)); + ::_encode(bounds, bl); + } + void decode_payload(bufferlist& bl, int& off) { + metablob._decode(bl, off); + bl.copy(off, sizeof(dirino), (char*)&dirino); + off += sizeof(dirino); + bl.copy(off, sizeof(dest), (char*)&dest); + off += sizeof(dest); + ::_decode(bounds, bl, off); + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); + +}; + +#endif diff --git a/branches/aleung/security1/ceph/mds/events/EImportFinish.h b/branches/aleung/security1/ceph/mds/events/EImportFinish.h new file mode 100644 index 0000000000000..14a9ab6403af6 --- /dev/null +++ b/branches/aleung/security1/ceph/mds/events/EImportFinish.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __EIMPORTFINISH_H +#define __EIMPORTFINISH_H + +#include +#include "config.h" +#include "include/types.h" + +#include "../MDS.h" + +class EImportFinish : public LogEvent { + protected: + inodeno_t dirino; // imported dir + bool success; + + public: + EImportFinish(CDir *dir, bool s) : LogEvent(EVENT_IMPORTFINISH), + dirino(dir->ino()), + success(s) { } + EImportFinish() : LogEvent(EVENT_IMPORTFINISH) { } + + void print(ostream& out) { + out << "import_finish " << dirino; + if (success) + out << " success"; + else + out << " failed"; + } + + virtual void encode_payload(bufferlist& bl) { + bl.append((char*)&dirino, sizeof(dirino)); + bl.append((char*)&success, sizeof(success)); + } + void decode_payload(bufferlist& bl, int& off) { + bl.copy(off, sizeof(dirino), (char*)&dirino); + off += sizeof(dirino); + bl.copy(off, sizeof(success), (char*)&success); + off += sizeof(success); + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); + +}; + +#endif diff --git a/branches/aleung/security1/ceph/mds/events/EImportMap.h b/branches/aleung/security1/ceph/mds/events/EImportMap.h new file mode 100644 index 0000000000000..50f366faaa9fa --- /dev/null +++ b/branches/aleung/security1/ceph/mds/events/EImportMap.h @@ -0,0 +1,66 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MDS_EIMPORTMAP_H +#define __MDS_EIMPORTMAP_H + +#include "../LogEvent.h" +#include "EMetaBlob.h" + +class EImportMap : public LogEvent { +public: + EMetaBlob metablob; + set imports; + set exports; + //set hashdirs; + map > nested_exports; + + EImportMap() : LogEvent(EVENT_IMPORTMAP) { } + + void print(ostream& out) { + out << "import_map " << imports.size() << " imports, " + << exports.size() << " exports" + << " " << metablob; + } + + void encode_payload(bufferlist& bl) { + metablob._encode(bl); + ::_encode(imports, bl); + ::_encode(exports, bl); + for (set::iterator p = imports.begin(); + p != imports.end(); + ++p) { + ::_encode(nested_exports[*p], bl); + if (nested_exports[*p].empty()) + nested_exports.erase(*p); + } + } + void decode_payload(bufferlist& bl, int& off) { + metablob._decode(bl, off); + ::_decode(imports, bl, off); + ::_decode(exports, bl, off); + for (set::iterator p = imports.begin(); + p != imports.end(); + ++p) { + ::_decode(nested_exports[*p], bl, off); + if (nested_exports[*p].empty()) + nested_exports.erase(*p); + } + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); +}; + +#endif diff --git a/branches/aleung/security1/ceph/mds/events/EImportStart.h b/branches/aleung/security1/ceph/mds/events/EImportStart.h new file mode 100644 index 0000000000000..59c074dec6f4f --- /dev/null +++ b/branches/aleung/security1/ceph/mds/events/EImportStart.h @@ -0,0 +1,60 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __EIMPORTSTART_H +#define __EIMPORTSTART_H + +#include +#include "config.h" +#include "include/types.h" + +#include "../MDS.h" + +#include "EMetaBlob.h" + +class EImportStart : public LogEvent { +protected: + inodeno_t dirino; + list bounds; + + public: + EMetaBlob metablob; + + EImportStart(inodeno_t di, + list& b) : LogEvent(EVENT_IMPORTSTART), + dirino(di), bounds(b) { } + EImportStart() : LogEvent(EVENT_IMPORTSTART) { } + + void print(ostream& out) { + out << "EImportStart " << metablob; + } + + virtual void encode_payload(bufferlist& bl) { + bl.append((char*)&dirino, sizeof(dirino)); + metablob._encode(bl); + ::_encode(bounds, bl); + } + void decode_payload(bufferlist& bl, int& off) { + bl.copy(off, sizeof(dirino), (char*)&dirino); + off += sizeof(dirino); + metablob._decode(bl, off); + ::_decode(bounds, bl, off); + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); + +}; + +#endif diff --git a/branches/aleung/security1/ceph/mds/events/EInodeUpdate.h b/branches/aleung/security1/ceph/mds/events/EInodeUpdate.h deleted file mode 100644 index dba233c833883..0000000000000 --- a/branches/aleung/security1/ceph/mds/events/EInodeUpdate.h +++ /dev/null @@ -1,55 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EINODEUPDATE_H -#define __EINODEUPDATE_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "ETrace.h" - - -class EInodeUpdate : public LogEvent { - protected: - ETrace trace; - - public: - EInodeUpdate(CInode *in) : LogEvent(EVENT_INODEUPDATE), - trace(in) { - } - EInodeUpdate() : LogEvent(EVENT_INODEUPDATE) { } - - void print(ostream& out) { - out << "up inode " << trace.back().inode.ino - << " " << trace - << " v " << trace.back().inode.version; - } - - virtual void encode_payload(bufferlist& bl) { - trace.encode(bl); - } - void decode_payload(bufferlist& bl, int& off) { - trace.decode(bl, off); - } - - bool can_expire(MDS *mds); - void retire(MDS *mds, Context *c); - bool has_happened(MDS *mds); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/aleung/security1/ceph/mds/events/EMetaBlob.h b/branches/aleung/security1/ceph/mds/events/EMetaBlob.h new file mode 100644 index 0000000000000..800c6674c91a8 --- /dev/null +++ b/branches/aleung/security1/ceph/mds/events/EMetaBlob.h @@ -0,0 +1,339 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MDS_EMETABLOB_H +#define __MDS_EMETABLOB_H + +#include +#include +using namespace std; + +#include "../CInode.h" +#include "../CDir.h" +#include "../CDentry.h" + + +class MDS; + +/* + * a bunch of metadata in the journal + */ + +/* notes: + * + * - make sure you adjust the inode.version for any modified inode you + * journal. 
CDir and CDentry maintain a projected_version, but CInode + * doesn't, since the journaled inode usually has to be modifed + * manually anyway (to delay the change in the MDS's cache until after + * it is journaled). + * + */ + + +class EMetaBlob { + + /* fullbit - a regular dentry + inode + */ + struct fullbit { + string dn; // dentry + version_t dnv; + inode_t inode; // if it's not + string symlink; + bool dirty; + + fullbit(const string& d, version_t v, inode_t& i, bool dr) : dn(d), dnv(v), inode(i), dirty(dr) { } + fullbit(const string& d, version_t v, inode_t& i, string& sym, bool dr) : dn(d), dnv(v), inode(i), symlink(sym), dirty(dr) { } + fullbit(bufferlist& bl, int& off) { _decode(bl, off); } + void _encode(bufferlist& bl) { + ::_encode(dn, bl); + bl.append((char*)&dnv, sizeof(dnv)); + bl.append((char*)&inode, sizeof(inode)); + if (inode.is_symlink()) + ::_encode(symlink, bl); + bl.append((char*)&dirty, sizeof(dirty)); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(dn, bl, off); + bl.copy(off, sizeof(dnv), (char*)&dnv); + off += sizeof(dnv); + bl.copy(off, sizeof(inode), (char*)&inode); + off += sizeof(inode); + if (inode.is_symlink()) + ::_decode(symlink, bl, off); + bl.copy(off, sizeof(dirty), (char*)&dirty); + off += sizeof(dirty); + } + }; + + /* remotebit - a dentry + remote inode link (i.e. 
just an ino) + */ + struct remotebit { + string dn; + version_t dnv; + inodeno_t ino; + bool dirty; + + remotebit(const string& d, version_t v, inodeno_t i, bool dr) : dn(d), dnv(v), ino(i), dirty(dr) { } + remotebit(bufferlist& bl, int& off) { _decode(bl, off); } + void _encode(bufferlist& bl) { + ::_encode(dn, bl); + bl.append((char*)&dnv, sizeof(dnv)); + bl.append((char*)&ino, sizeof(ino)); + bl.append((char*)&dirty, sizeof(dirty)); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(dn, bl, off); + bl.copy(off, sizeof(dnv), (char*)&dnv); + off += sizeof(dnv); + bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + bl.copy(off, sizeof(dirty), (char*)&dirty); + off += sizeof(dirty); + } + }; + + /* + * nullbit - a null dentry + */ + struct nullbit { + string dn; + version_t dnv; + bool dirty; + nullbit(const string& d, version_t v, bool dr) : dn(d), dnv(v), dirty(dr) { } + nullbit(bufferlist& bl, int& off) { _decode(bl, off); } + void _encode(bufferlist& bl) { + ::_encode(dn, bl); + bl.append((char*)&dnv, sizeof(dnv)); + bl.append((char*)&dirty, sizeof(dirty)); + } + void _decode(bufferlist& bl, int& off) { + ::_decode(dn, bl, off); + bl.copy(off, sizeof(dnv), (char*)&dnv); + off += sizeof(dnv); + bl.copy(off, sizeof(dirty), (char*)&dirty); + off += sizeof(dirty); + } + }; + + + /* dirlump - contains metadata for any dir we have contents for. + */ + struct dirlump { + static const int STATE_IMPORT = (1<<0); + static const int STATE_COMPLETE = (1<<1); + static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is! 
+ + dirslice_t dirslice; + version_t dirv; + int state; + int nfull, nremote, nnull; + bufferlist bfull, bremote, bnull; + + private: + bool dn_decoded; + list dfull; + list dremote; + list dnull; + + public: + dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { } + + bool is_import() { return state & STATE_IMPORT; } + void mark_import() { state |= STATE_IMPORT; } + bool is_complete() { return state & STATE_COMPLETE; } + void mark_complete() { state |= STATE_COMPLETE; } + bool is_dirty() { return state & STATE_DIRTY; } + void mark_dirty() { state |= STATE_DIRTY; } + + list &get_dfull() { return dfull; } + list &get_dremote() { return dremote; } + list &get_dnull() { return dnull; } + + void _encode_bits() { + for (list::iterator p = dfull.begin(); p != dfull.end(); ++p) + p->_encode(bfull); + for (list::iterator p = dremote.begin(); p != dremote.end(); ++p) + p->_encode(bremote); + for (list::iterator p = dnull.begin(); p != dnull.end(); ++p) + p->_encode(bnull); + } + void _decode_bits() { + if (dn_decoded) return; + int off = 0; + for (int i=0; i lump_order; + map lump_map; + + public: + + // remote pointer to to-be-journaled inode iff it's a normal (non-remote) dentry + inode_t *add_dentry(CDentry *dn, bool dirty, CInode *in=0) { + CDir *dir = dn->get_dir(); + if (!in) in = dn->get_inode(); + + // add the dir + dirlump& lump = add_dir(dir, false); + + // add the dirbit + if (dn->is_remote()) { + lump.nremote++; + if (dirty) + lump.get_dremote().push_front(remotebit(dn->get_name(), + dn->get_projected_version(), + dn->get_remote_ino(), + dirty)); + else + lump.get_dremote().push_back(remotebit(dn->get_name(), + dn->get_projected_version(), + dn->get_remote_ino(), + dirty)); + } + else if (!in) { + lump.nnull++; + if (dirty) + lump.get_dnull().push_front(nullbit(dn->get_name(), + dn->get_projected_version(), + dirty)); + else + lump.get_dnull().push_back(nullbit(dn->get_name(), + dn->get_projected_version(), + dirty)); + } + else { + 
lump.nfull++; + if (dirty) { + lump.get_dfull().push_front(fullbit(dn->get_name(), + dn->get_projected_version(), + in->inode, in->symlink, + dirty)); + return &lump.get_dfull().front().inode; + } else { + lump.get_dfull().push_back(fullbit(dn->get_name(), + dn->get_projected_version(), + in->inode, in->symlink, + dirty)); + return &lump.get_dfull().back().inode; + } + } + return 0; + } + + dirlump& add_dir(CDir *dir, bool dirty) { + if (lump_map.count(dir->ino()) == 0) { + lump_order.push_back(dir->ino()); + lump_map[dir->ino()].dirv = dir->get_projected_version(); + } + dirlump& l = lump_map[dir->ino()]; + if (dir->is_complete()) l.mark_complete(); + if (dir->is_import()) l.mark_import(); + if (dirty) l.mark_dirty(); + return l; + } + + void add_dir_context(CDir *dir, bool toroot=false) { + // already have this dir? (we must always add in order) + if (lump_map.count(dir->ino())) + return; + + CInode *diri = dir->get_inode(); + if (!toroot && + (dir->is_import() || dir->is_hashed())) + return; // stop at import point + if (!dir->get_inode()->get_parent_dn()) + return; + + CDentry *parent = diri->get_parent_dn(); + add_dir_context(parent->get_dir(), toroot); + add_dentry(parent, false); + } + + + // encoding + + void _encode(bufferlist& bl) { + int n = lump_map.size(); + bl.append((char*)&n, sizeof(n)); + for (list::iterator i = lump_order.begin(); + i != lump_order.end(); + ++i) { + bl.append((char*)&(*i), sizeof(*i)); + lump_map[*i]._encode(bl); + } + } + void _decode(bufferlist& bl, int& off) { + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EMKDIR_H -#define __EMKDIR_H - -#include -#include "config.h" -#include "include/types.h" - -#include "ETrace.h" -#include "../MDS.h" -#include "../MDStore.h" - - -class EMkdir : public LogEvent { - protected: - ETrace trace; - //version_t pdirv; - - public: - EMkdir(CDir *dir) : LogEvent(EVENT_MKDIR), - trace(dir->inode) { - //pdirv = dir->inode->get_parent_dir()->get_version(); - } - EMkdir() : LogEvent(EVENT_MKDIR) { } - - void print(ostream& out) { - out << "mkdir "; - trace.print(out); - } - - virtual void encode_payload(bufferlist& bl) { - trace.encode(bl); - //bl.append((char*)&pdirv, sizeof(pdirv)); - } - void decode_payload(bufferlist& bl, int& off) { - trace.decode(bl, off); - //bl.copy(off, sizeof(pdirv), (char*)&pdirv); - //off += sizeof(pdirv); - } - - bool can_expire(MDS *mds); - void retire(MDS *mds, Context *c); - - // recovery - bool has_happened(MDS *mds); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/aleung/security1/ceph/mds/events/EMknod.h b/branches/aleung/security1/ceph/mds/events/EMknod.h deleted file mode 100644 index 27ade4671a0c7..0000000000000 --- a/branches/aleung/security1/ceph/mds/events/EMknod.h +++ /dev/null @@ -1,60 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. 
- * - */ - -#ifndef __EMKNOD_H -#define __EMKNOD_H - -#include -#include "config.h" -#include "include/types.h" - -#include "../LogEvent.h" -#include "ETrace.h" -#include "../MDS.h" -#include "../MDStore.h" - - -class EMknod : public LogEvent { - protected: - ETrace trace; - //version_t pdirv; - - public: - EMknod(CInode *in) : LogEvent(EVENT_MKNOD), - trace(in) { - //pdirv = in->get_parent_dir()->get_version(); - } - EMknod() : LogEvent(EVENT_MKNOD) { } - - void print(ostream& out) { - out << "mknod " << trace; - } - - virtual void encode_payload(bufferlist& bl) { - trace.encode(bl); - //bl.append((char*)&pdirv, sizeof(pdirv)); - } - void decode_payload(bufferlist& bl, int& off) { - trace.decode(bl, off); - //bl.copy(off, sizeof(pdirv), (char*)&pdirv); - //off += sizeof(pdirv); - } - - bool can_expire(MDS *mds); - void retire(MDS *mds, Context *c); - bool has_happened(MDS *mds); - void replay(MDS *mds); - -}; - -#endif diff --git a/branches/aleung/security1/ceph/mds/events/EPurgeFinish.h b/branches/aleung/security1/ceph/mds/events/EPurgeFinish.h index bacfa8e93c737..b00f5f90313fc 100644 --- a/branches/aleung/security1/ceph/mds/events/EPurgeFinish.h +++ b/branches/aleung/security1/ceph/mds/events/EPurgeFinish.h @@ -39,9 +39,8 @@ class EPurgeFinish : public LogEvent { bl.copy(off, sizeof(ino), (char*)&ino); } - bool can_expire(MDS *mds); - void retire(MDS *mds, Context *c); - bool has_happened(MDS *mds); + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); void replay(MDS *mds); }; diff --git a/branches/aleung/security1/ceph/mds/events/EString.h b/branches/aleung/security1/ceph/mds/events/EString.h index 6bd10030549ba..0ef7577406454 100644 --- a/branches/aleung/security1/ceph/mds/events/EString.h +++ b/branches/aleung/security1/ceph/mds/events/EString.h @@ -39,7 +39,6 @@ class EString : public LogEvent { event = bl.c_str() + off; off += event.length() + 1; } - void encode_payload(bufferlist& bl) { bl.append(event.c_str(), event.length()+1); } @@ -48,6 
+47,10 @@ class EString : public LogEvent { out << '"' << event << '"'; } + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); + }; #endif diff --git a/branches/aleung/security1/ceph/mds/events/ETrace.h b/branches/aleung/security1/ceph/mds/events/ETrace.h deleted file mode 100644 index a320137512178..0000000000000 --- a/branches/aleung/security1/ceph/mds/events/ETrace.h +++ /dev/null @@ -1,119 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __MDS_ETRACE_H -#define __MDS_ETRACE_H - -#include -#include -using namespace std; - -#include "../CInode.h" -#include "../CDir.h" -#include "../CDentry.h" - - -// path trace for use in journal events - -class ETrace { - - // segment. 
- struct bit { - inodeno_t dirino; - version_t dirv; - string dn; - inode_t inode; - - bit(bufferlist& bl, int& off) { _decode(bl,off); } - bit(inodeno_t di, version_t dv, const string& d, inode_t i) : - dirino(di), dirv(dv), dn(d), inode(i) {} - - void _encode(bufferlist& bl) { - bl.append((char*)&dirino, sizeof(dirino)); - bl.append((char*)&dirv, sizeof(dirv)); - ::_encode(dn, bl); - bl.append((char*)&inode, sizeof(inode)); - } - void _decode(bufferlist& bl, int& off) { - bl.copy(off, sizeof(dirino), (char*)&dirino); off += sizeof(dirino); - bl.copy(off, sizeof(dirv), (char*)&dirv); off += sizeof(dirv); - ::_decode(dn, bl, off); - bl.copy(off, sizeof(inode), (char*)&inode); off += sizeof(inode); - } - }; - - public: - list trace; - - ETrace(CInode *in = 0) { - if (in) { - CDir *dir; - CDentry *dn; - do { - dn = in->get_parent_dn(); - if (!dn) break; - dir = dn->get_dir(); - if (!dir) break; - - trace.push_front(bit(dir->ino(), - dir->get_version(), - dn->get_name(), - in->inode)); - - in = dir->get_inode(); - } while (!dir->is_import()); - } - } - - bit& back() { - return trace.back(); - } - - void decode(bufferlist& bl, int& off) { - int n; - bl.copy(off, sizeof(n), (char*)&n); - off += sizeof(n); - for (int i=0; i::iterator i = trace.begin(); - i != trace.end(); - i++) - i->_encode(bl); - } - - void print(ostream& out) const { - for (list::const_iterator p = trace.begin(); - p != trace.end(); - p++) { - if (p == trace.begin()) - out << "[" << p->dirino << "]/" << p->dn; - else - out << "/" << p->dn; - } - } - - CInode *restore_trace(MDS *mds); - -}; - -inline ostream& operator<<(ostream& out, const ETrace& t) { - t.print(out); - return out; -} - -#endif diff --git a/branches/aleung/security1/ceph/mds/events/EUnlink.h b/branches/aleung/security1/ceph/mds/events/EUnlink.h index 9b7484174886a..7d972488dab1b 100644 --- a/branches/aleung/security1/ceph/mds/events/EUnlink.h +++ b/branches/aleung/security1/ceph/mds/events/EUnlink.h @@ -19,45 +19,52 @@ #include 
"include/types.h" #include "../LogEvent.h" -#include "ETrace.h" +#include "EMetaBlob.h" #include "../CInode.h" #include "../CDentry.h" #include "../CDir.h" +/// help rewrite me + class EUnlink : public LogEvent { protected: - ETrace diritrace; version_t dirv; string dname; - ETrace inodetrace; public: + EMetaBlob metaglob; + + /* EUnlink(CDir *dir, CDentry* dn, CInode *in) : LogEvent(EVENT_UNLINK), diritrace(dir->inode), dirv(dir->get_version()), dname(dn->get_name()), inodetrace(in) {} + */ EUnlink() : LogEvent(EVENT_UNLINK) { } virtual void encode_payload(bufferlist& bl) { + /* diritrace.encode(bl); bl.append((char*)&dirv, sizeof(dirv)); ::_encode(dname, bl); inodetrace.encode(bl); + */ } void decode_payload(bufferlist& bl, int& off) { + /* diritrace.decode(bl,off); bl.copy(off, sizeof(dirv), (char*)&dirv); off += sizeof(dirv); ::_decode(dname, bl, off); inodetrace.decode(bl, off); + */ } - bool can_expire(MDS *mds); - void retire(MDS *mds, Context *c); - bool has_happened(MDS *mds); + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); void replay(MDS *mds); }; diff --git a/branches/aleung/security1/ceph/mds/events/EUpdate.h b/branches/aleung/security1/ceph/mds/events/EUpdate.h new file mode 100644 index 0000000000000..4a8dad5876a62 --- /dev/null +++ b/branches/aleung/security1/ceph/mds/events/EUpdate.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MDS_EUPDATE_H +#define __MDS_EUPDATE_H + +#include "../LogEvent.h" +#include "EMetaBlob.h" + +class EUpdate : public LogEvent { +public: + EMetaBlob metablob; + string type; + + EUpdate() : LogEvent(EVENT_UPDATE) { } + EUpdate(const char *s) : LogEvent(EVENT_UPDATE), + type(s) { } + + void print(ostream& out) { + if (type.length()) + out << type << " "; + out << metablob; + } + + void encode_payload(bufferlist& bl) { + ::_encode(type, bl); + metablob._encode(bl); + } + void decode_payload(bufferlist& bl, int& off) { + ::_decode(type, bl, off); + metablob._decode(bl, off); + } + + bool has_expired(MDS *mds); + void expire(MDS *mds, Context *c); + void replay(MDS *mds); +}; + +#endif diff --git a/branches/aleung/security1/ceph/mds/journal.cc b/branches/aleung/security1/ceph/mds/journal.cc index 9ac2406e2cbc2..2182d33ffc878 100644 --- a/branches/aleung/security1/ceph/mds/journal.cc +++ b/branches/aleung/security1/ceph/mds/journal.cc @@ -11,264 +11,394 @@ * */ -#include "events/ETrace.h" -#include "events/EMknod.h" -#include "events/EMkdir.h" -#include "events/EInodeUpdate.h" +#include "events/EString.h" + +#include "events/EMetaBlob.h" +#include "events/EAlloc.h" +#include "events/EUpdate.h" +#include "events/EImportMap.h" + #include "events/EPurgeFinish.h" #include "events/EUnlink.h" +#include "events/EExportStart.h" +#include "events/EExportFinish.h" +#include "events/EImportStart.h" +#include "events/EImportFinish.h" #include "MDS.h" +#include "MDLog.h" #include "MDCache.h" +#include "MDStore.h" +#include "Migrator.h" #include "config.h" #undef dout -#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " -#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " +#define dout(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " 
+#define derr(l) if (l<=g_conf.debug_mds || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " // ----------------------- -// ETrace +// EString -CInode *ETrace::restore_trace(MDS *mds) +bool EString::has_expired(MDS *mds) { + dout(10) << "EString.has_expired " << event << endl; + return true; +} +void EString::expire(MDS *mds, Context *c) { - CInode *in = 0; - for (list::iterator p = trace.begin(); - p != trace.end(); - ++p) { - // the dir - CInode *diri = mds->mdcache->get_inode(p->dirino); - if (!diri) { - dout(10) << "ETrace.restore_trace adding dir " << p->dirino << endl; - diri = new CInode(mds->mdcache); - diri->inode.ino = p->dirino; - diri->inode.mode = INODE_MODE_DIR; - mds->mdcache->add_inode(diri); - - CDir *dir = diri->get_or_open_dir(mds); - - // root? import? - if (p == trace.begin()) { - mds->mdcache->add_import(dir); - if (dir->ino() == 1) - mds->mdcache->set_root(diri); - } - } else { - dout(20) << "ETrace.restore_trace had dir " << p->dirino << endl; - diri->get_or_open_dir(mds); - } - assert(diri->dir); - dout(20) << "ETrace.restore_trace dir is " << *diri->dir << endl; - - // the inode - in = mds->mdcache->get_inode(p->inode.ino); - if (!in) { - dout(10) << "ETrace.restore_trace adding dn '" << p->dn << "' inode " << p->inode.ino << endl; - in = new CInode(mds->mdcache); - in->inode = p->inode; - mds->mdcache->add_inode(in); - - // the dentry - CDentry *dn = diri->dir->add_dentry( p->dn, in ); - dn->mark_dirty(); - assert(dn); - } else { - dout(20) << "ETrace.restore_trace had dn '" << p->dn << "' inode " << p->inode.ino << endl; - in->inode = p->inode; - } - dout(20) << "ETrace.restore_trace in is " << *in << endl; - } - return in; + dout(10) << "EString.expire " << event << endl; +} +void EString::replay(MDS *mds) +{ + dout(10) << "EString.replay " << event << endl; } + // ----------------------- -// EMkdir -// - trace goes to new dir's inode. 
+// EMetaBlob -bool EMkdir::can_expire(MDS *mds) +/* + * we need to ensure that a journaled item has either + * + * - been safely committed to its dirslice. + * + * - has been safely exported. note that !is_auth() && !is_proxy() + * implies safely exported. if !is_auth() && is_proxy(), we need to + * add a waiter for the export to complete. + * + */ +bool EMetaBlob::has_expired(MDS *mds) { - // am i obsolete? - CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); - if (!in) return true; - CDir *dir = in->dir; - if (!dir) return true; - CDir *pdir = in->get_parent_dir(); - assert(pdir); - - dout(10) << "EMkdir.can_expire in is " << *in << endl; - dout(10) << "EMkdir.can_expire inv is " << trace.back().inode.version << endl; - dout(10) << "EMkdir.can_expire dir is " << *dir << endl; - bool commitparent = in->get_last_committed_version() < trace.back().inode.version; - bool commitnew = dir->get_last_committed_version() == 0; + // examine dirv's for my lumps + for (map::iterator lp = lump_map.begin(); + lp != lump_map.end(); + ++lp) { + CInode *diri = mds->mdcache->get_inode(lp->first); + if (!diri) + continue; // we expired it + CDir *dir = diri->dir; + if (!dir) + continue; // we expired it + + // FIXME: check the slice only + + if (dir->is_proxy()) { + dout(10) << "EMetaBlob.has_expired am proxy, needed dirv " << lp->second.dirv + << " for " << *dir << endl; + return false; // we need to wait until the export flushes! + } + if (!dir->is_auth()) { + dout(10) << "EMetaBlob.has_expired not auth, needed dirv " << lp->second.dirv + << " for " << *dir << endl; + continue; // not our problem + } - if (commitparent || commitnew) return false; - return true; + if (dir->get_last_committed_version() < lp->second.dirv) { + dout(10) << "EMetaBlob.has_expired need dirv " << lp->second.dirv + << " for " << *dir << endl; + return false; // not committed. 
+ } else { + dout(10) << "EMetaBlob.has_expired have dirv " << lp->second.dirv + << " for " << *dir << endl; + } + } + + return true; // all dirlumps expired. } -void EMkdir::retire(MDS *mds, Context *c) +void EMetaBlob::expire(MDS *mds, Context *c) { - // commit parent dir AND my dir - CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); - assert(in); - CDir *dir = in->dir; - assert(dir); - CDir *pdir = in->get_parent_dir(); - assert(pdir); - - dout(10) << "EMkdir.retire in is " << *in << endl; - dout(10) << "EMkdir.retire inv is " << trace.back().inode.version << endl; - dout(10) << "EMkdir.retire dir is " << *dir << endl; - bool commitparent = in->get_last_committed_version() < trace.back().inode.version; - bool commitnew = dir->get_last_committed_version() == 0; - - if (commitparent && commitnew) { - // both - dout(10) << "EMkdir.retire committing parent+new dir " << *dir << endl; - C_Gather *gather = new C_Gather(c); - mds->mdstore->commit_dir(pdir, gather->new_sub()); - mds->mdstore->commit_dir(dir, gather->new_sub()); - } else if (commitparent) { - // just parent - dout(10) << "EMkdir.retire committing parent dir " << *dir << endl; - mds->mdstore->commit_dir(pdir, c); + list commit; + list waitfor_export; + int ncommit = 0; + + // examine dirv's for my lumps + // make list of dir slices i need to commit + for (map::iterator lp = lump_map.begin(); + lp != lump_map.end(); + ++lp) { + CInode *diri = mds->mdcache->get_inode(lp->first); + if (!diri) + continue; // we expired it + CDir *dir = diri->dir; + if (!dir) + continue; // we expired it + + // FIXME: check the slice only + + if (dir->is_proxy()) { + // wait until export is acked (logged on remote) and committed (logged locally) + CDir *ex = mds->mdcache->get_export_container(dir); + dout(10) << "EMetaBlob.expire proxy for " << *dir + << ", waiting for export finish on " << *ex << endl; + waitfor_export.push_back(ex); + continue; + } + if (!dir->is_auth()) { + dout(10) << "EMetaBlob.expire not 
auth, needed dirv " << lp->second.dirv + << " for " << *dir << endl; + continue; // not our problem + } + if (dir->get_last_committed_version() < lp->second.dirv) { + dout(10) << "EMetaBlob.expire need dirv " << lp->second.dirv + << ", committing " << *dir << endl; + commit.push_back(dir); + ncommit++; + } else { + dout(10) << "EMetaBlob.expire have dirv " << lp->second.dirv + << " on " << *dir << endl; + } + } + + // commit + assert(!commit.empty()); + + if (ncommit == 1) { + mds->mdstore->commit_dir(commit.front(), c); } else { - // just new dir - dout(10) << "EMkdir.retire committing new dir " << *dir << endl; - mds->mdstore->commit_dir(dir, c); + C_Gather *gather = new C_Gather(c); + for (list::iterator p = commit.begin(); + p != commit.end(); + ++p) + mds->mdstore->commit_dir(*p, gather->new_sub()); + for (list::iterator p = waitfor_export.begin(); + p != waitfor_export.end(); + ++p) + mds->mdcache->migrator->add_export_finish_waiter(*p, gather->new_sub()); } } -bool EMkdir::has_happened(MDS *mds) -{ - return false; -} - -void EMkdir::replay(MDS *mds) +void EMetaBlob::replay(MDS *mds) { - dout(10) << "EMkdir.replay " << *this << endl; - CInode *in = trace.restore_trace(mds); + dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps" << endl; - // mark dir inode dirty - in->mark_dirty(); + // walk through my dirs (in order!) + for (list::iterator lp = lump_order.begin(); + lp != lump_order.end(); + ++lp) { + dout(10) << "EMetaBlob.replay dir " << *lp << endl; + dirlump &lump = lump_map[*lp]; - // mark parent dir dirty, and set version. - // this may end up being below water when dir is fetched from disk. 
- CDir *pdir = in->get_parent_dir(); - if (!pdir->is_dirty()) pdir->mark_dirty(); - pdir->set_version(trace.back().dirv); - - // mark new dir dirty + complete - CDir *dir = in->get_or_open_dir(mds); - dir->mark_dirty(); - dir->mark_complete(); -} + // the dir + CInode *diri = mds->mdcache->get_inode(*lp); + CDir *dir; + if (!diri) { + assert(*lp == 1); + diri = mds->mdcache->create_root_inode(); + dout(10) << "EMetaBlob.replay created root " << *diri << endl; + } + if (diri->dir) { + dir = diri->dir; + dout(20) << "EMetaBlob.replay had dir " << *dir << endl; + } else { + dir = diri->get_or_open_dir(mds->mdcache); + if (*lp == 1) + dir->set_dir_auth(CDIR_AUTH_UNKNOWN); + dout(10) << "EMetaBlob.replay added dir " << *dir << endl; + } + dir->set_version( lump.dirv ); + if (lump.is_dirty()) + dir->_mark_dirty(); + if (lump.is_complete()) + dir->mark_complete(); + + // decode bits + lump._decode_bits(); + + // full dentry+inode pairs + for (list::iterator p = lump.get_dfull().begin(); + p != lump.get_dfull().end(); + p++) { + CInode *in = mds->mdcache->get_inode(p->inode.ino); + if (!in) { + // inode + in = new CInode(mds->mdcache); + in->inode = p->inode; + if (in->inode.is_symlink()) in->symlink = p->symlink; + mds->mdcache->add_inode(in); + // dentry + CDentry *dn = dir->add_dentry( p->dn, in ); + dn->set_version(p->dnv); + dn->_mark_dirty(); + dout(10) << "EMetaBlob.replay added " << *dn << " " << *in << endl; + } else { + // inode + in->inode = p->inode; + if (in->inode.is_symlink()) in->symlink = p->symlink; + // dentry + CDentry *dn = in->get_parent_dn(); + dn->set_version(p->dnv); + dn->_mark_dirty(); + dout(10) << "EMetaBlob.replay had " << *in->get_parent_dn() << " " << *in << endl; + } + } + // remote dentries + for (list::iterator p = lump.get_dremote().begin(); + p != lump.get_dremote().end(); + p++) { + CDentry *dn = dir->lookup(p->dn); + if (!dn) { + dn = dir->add_dentry(p->dn, p->ino); + dn->set_remote_ino(p->ino); + dn->set_version(p->dnv); + 
dn->_mark_dirty(); + dout(10) << "EMetaBlob.replay added " << *dn << endl; + } else { + dn->set_remote_ino(p->ino); + dn->set_version(p->dnv); + dn->_mark_dirty(); + dout(10) << "EMetaBlob.replay had " << *dn << endl; + } + } + // null dentries + for (list::iterator p = lump.get_dnull().begin(); + p != lump.get_dnull().end(); + p++) { + CDentry *dn = dir->lookup(p->dn); + if (!dn) { + dn = dir->add_dentry(p->dn); + dn->set_version(p->dnv); + dn->_mark_dirty(); + dout(10) << "EMetaBlob.replay added " << *dn << endl; + } else { + dn->set_version(p->dnv); + dn->_mark_dirty(); + dout(10) << "EMetaBlob.replay had " << *dn << endl; + } + } + } +} -// ----------------------- -// EMknod - -bool EMknod::can_expire(MDS *mds) -{ - // am i obsolete? - CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); - if (!in) return true; - if (!in->is_auth()) return true; // not my inode anymore! - if (in->get_version() != trace.back().inode.version) - return true; // i'm obsolete! (another log entry follows) - if (in->get_last_committed_version() >= trace.back().inode.version) - return true; +// ----------------------- +// EAlloc - return false; +bool EAlloc::has_expired(MDS *mds) +{ + version_t cv = mds->idalloc->get_committed_version(); + if (cv < table_version) { + dout(10) << "EAlloc.has_expired v " << table_version << " > " << cv + << ", still dirty" << endl; + return false; // still dirty + } else { + dout(10) << "EAlloc.has_expired v " << table_version << " <= " << cv + << ", already flushed" << endl; + return true; // already flushed + } } -void EMknod::retire(MDS *mds, Context *c) +void EAlloc::expire(MDS *mds, Context *c) { - // commit parent directory - CInode *diri = mds->mdcache->get_inode( trace.back().dirino ); - assert(diri); - CDir *dir = diri->dir; - assert(dir); - - dout(10) << "EMknod.retire committing parent dir " << *dir << endl; - mds->mdstore->commit_dir(dir, c); + dout(10) << "EAlloc.expire saving idalloc table" << endl; + mds->idalloc->save(c, 
table_version); } -bool EMknod::has_happened(MDS *mds) +void EAlloc::replay(MDS *mds) { - return false; + if (mds->idalloc->get_version() >= table_version) { + dout(10) << "EAlloc.replay event " << table_version + << " <= table " << mds->idalloc->get_version() << endl; + } else { + dout(10) << " EAlloc.replay event " << table_version + << " - 1 == table " << mds->idalloc->get_version() << endl; + assert(table_version-1 == mds->idalloc->get_version()); + + if (what == EALLOC_EV_ALLOC) { + idno_t nid = mds->idalloc->alloc_id(true); + assert(nid == id); // this should match. + } + else if (what == EALLOC_EV_FREE) { + mds->idalloc->reclaim_id(id, true); + } + else + assert(0); + + assert(table_version == mds->idalloc->get_version()); + } } - -void EMknod::replay(MDS *mds) -{ - dout(10) << "EMknod.replay " << *this << endl; - CInode *in = trace.restore_trace(mds); - in->mark_dirty(); - // mark parent dir dirty, and set version. - // this may end up being below water when dir is fetched from disk. - CDir *pdir = in->get_parent_dir(); - if (!pdir->is_dirty()) pdir->mark_dirty(); - pdir->set_version(trace.back().dirv); -} +// ----------------------- +// EUpdate +bool EUpdate::has_expired(MDS *mds) +{ + return metablob.has_expired(mds); +} -// ----------------------- -// EInodeUpdate +void EUpdate::expire(MDS *mds, Context *c) +{ + metablob.expire(mds, c); +} -bool EInodeUpdate::can_expire(MDS *mds) +void EUpdate::replay(MDS *mds) { - CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); - if (!in) return true; + metablob.replay(mds); +} - if (!in->is_auth()) return true; // not my inode anymore! - if (in->get_version() != trace.back().inode.version) - return true; // i'm obsolete! (another log entry follows) - /* - // frozen -> exporting -> obsolete (FOR NOW?) 
- if (in->is_frozen()) - return true; - */ +// ----------------------- +// EImportMap - if (in->get_last_committed_version() >= trace.back().inode.version) +bool EImportMap::has_expired(MDS *mds) +{ + if (mds->mdlog->last_import_map > get_end_off()) { + dout(10) << "EImportMap.has_expired -- there's a newer map" << endl; return true; - - return false; + } + else if (mds->mdlog->is_capped()) { + dout(10) << "EImportMap.has_expired -- log is capped, allowing map to expire" << endl; + return true; + } else { + dout(10) << "EImportMap.has_expired -- not until there's a newer map written" << endl; + return false; + } } -void EInodeUpdate::retire(MDS *mds, Context *c) -{ - // commit parent directory - CInode *diri = mds->mdcache->get_inode( trace.back().dirino ); - assert(diri); - CDir *dir = diri->dir; - assert(dir); +/* +class C_MDS_ImportMapFlush : public Context { + MDS *mds; + off_t end_off; +public: + C_MDS_ImportMapFlush(MDS *m, off_t eo) : mds(m), end_off(eo) { } + void finish(int r) { + // am i the last thing in the log? + if (mds->mdlog->get_write_pos() == end_off) { + // yes. we're good. + } else { + // no. submit another import_map so that we can go away. + } + } +}; +*/ - dout(10) << "EMknod.retire committing parent dir " << *dir << endl; - mds->mdstore->commit_dir(dir, c); -} - -bool EInodeUpdate::has_happened(MDS *mds) +void EImportMap::expire(MDS *mds, Context *c) { - return false; + dout(10) << "EImportMap.has_expire -- waiting for a newer map to be written (or for shutdown)" << endl; + mds->mdlog->import_map_expire_waiters.push_back(c); } -void EInodeUpdate::replay(MDS *mds) +void EImportMap::replay(MDS *mds) { - dout(10) << "EInodeUpdate.replay " << *this << endl; - CInode *in = trace.restore_trace(mds); - in->mark_dirty(); + dout(10) << "EImportMap.replay -- reconstructing import/export spanning tree" << endl; + assert(mds->mdcache->imports.empty()); - // mark parent dir dirty, and set version. 
- // this may end up being below water when dir is fetched from disk. - CDir *pdir = in->get_parent_dir(); - if (!pdir->is_dirty()) pdir->mark_dirty(); - pdir->set_version(trace.back().dirv); + // first, stick the spanning tree in my cache + metablob.replay(mds); + + // restore import/export maps + for (set::iterator p = imports.begin(); + p != imports.end(); + ++p) { + mds->mdcache->add_ambiguous_import(*p, nested_exports[*p]); + mds->mdcache->finish_ambiguous_import(*p); + } + + mds->mdcache->show_imports(); } @@ -276,8 +406,9 @@ void EInodeUpdate::replay(MDS *mds) // ----------------------- // EUnlink -bool EUnlink::can_expire(MDS *mds) +bool EUnlink::has_expired(MDS *mds) { + /* // dir CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); CDir *dir = 0; @@ -291,12 +422,13 @@ bool EUnlink::can_expire(MDS *mds) if (in && in->get_last_committed_version() < inodetrace.back().inode.version) return false; } - + */ return true; } -void EUnlink::retire(MDS *mds, Context *c) +void EUnlink::expire(MDS *mds, Context *c) { + /* CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); CDir *dir = diri->dir; assert(dir); @@ -304,42 +436,154 @@ void EUnlink::retire(MDS *mds, Context *c) // okay! 
dout(7) << "commiting dirty (from unlink) dir " << *dir << endl; mds->mdstore->commit_dir(dir, dirv, c); + */ } -bool EUnlink::has_happened(MDS *mds) +void EUnlink::replay(MDS *mds) +{ +} + + + + +// ----------------------- +// EPurgeFinish + + +bool EPurgeFinish::has_expired(MDS *mds) { return true; } -void EUnlink::replay(MDS *mds) +void EPurgeFinish::expire(MDS *mds, Context *c) { } +void EPurgeFinish::replay(MDS *mds) +{ +} + + +// ========================================================================= + // ----------------------- -// EPurgeFinish +// EExportStart +bool EExportStart::has_expired(MDS *mds) +{ + CInode *diri = mds->mdcache->get_inode(dirino); + if (!diri) return true; + CDir *dir = diri->dir; + if (!dir) return true; + if (!mds->mdcache->migrator->is_exporting(dir)) + return true; + dout(10) << "EExportStart.has_expired still exporting " << *dir << endl; + return false; +} -bool EPurgeFinish::can_expire(MDS *mds) +void EExportStart::expire(MDS *mds, Context *c) { + CInode *diri = mds->mdcache->get_inode(dirino); + assert(diri); + CDir *dir = diri->dir; + assert(dir); + assert(mds->mdcache->migrator->is_exporting(dir)); + + dout(10) << "EExportStart.expire waiting for export of " << *dir << endl; + mds->mdcache->migrator->add_export_finish_waiter(dir, c); +} + +void EExportStart::replay(MDS *mds) +{ + dout(10) << "EExportStart.replay " << dirino << " -> " << dest << endl; + metablob.replay(mds); + + // put in pending_exports lists + mds->mdlog->pending_exports[dirino] = bounds; +} + +// ----------------------- +// EExportFinish + +bool EExportFinish::has_expired(MDS *mds) +{ + // we can always expire. return true; } -void EPurgeFinish::retire(MDS *mds, Context *c) +void EExportFinish::expire(MDS *mds, Context *c) { + assert(0); // should never happen. 
} -bool EPurgeFinish::has_happened(MDS *mds) +void EExportFinish::replay(MDS *mds) +{ + dout(10) << "EExportFinish.replay " << dirino << " success=" << success << endl; + + assert(mds->mdlog->pending_exports.count(dirino)); + + // finish? + if (success) + mds->mdcache->finish_ambiguous_export(dirino, mds->mdlog->pending_exports[dirino]); + + // remove from pending_exports list + mds->mdlog->pending_exports.erase(dirino); +} + + +// ----------------------- +// EImportStart + +bool EImportStart::has_expired(MDS *mds) +{ + return metablob.has_expired(mds); +} + +void EImportStart::expire(MDS *mds, Context *c) +{ + dout(10) << "EImportStart.expire " << dirino << endl; + metablob.expire(mds, c); +} + +void EImportStart::replay(MDS *mds) +{ + dout(10) << "EImportStart.replay " << dirino << endl; + metablob.replay(mds); + + // convert list -> set + set b; + for (list::iterator p = bounds.begin(); p != bounds.end(); ++p) + b.insert(*p); + + // put in ambiguous import list + mds->mdcache->add_ambiguous_import(dirino, b); +} + +// ----------------------- +// EImportFinish + +bool EImportFinish::has_expired(MDS *mds) { return true; } +void EImportFinish::expire(MDS *mds, Context *c) +{ + assert(0); // shouldn't ever happen +} -void EPurgeFinish::replay(MDS *mds) +void EImportFinish::replay(MDS *mds) { + dout(10) << "EImportFinish.replay " << dirino << " success=" << success << endl; + if (success) + mds->mdcache->finish_ambiguous_import(dirino); + else + mds->mdcache->cancel_ambiguous_import(dirino); } + diff --git a/branches/aleung/security1/ceph/mds/mdstypes.h b/branches/aleung/security1/ceph/mds/mdstypes.h index b448123bf929e..1ac4525e76559 100644 --- a/branches/aleung/security1/ceph/mds/mdstypes.h +++ b/branches/aleung/security1/ceph/mds/mdstypes.h @@ -4,6 +4,8 @@ #include #include +#include +#include using namespace std; #include "config.h" @@ -12,6 +14,37 @@ using namespace std; #include + +// md ops +#define MDS_OP_STATFS 1 + +#define MDS_OP_STAT 100 +#define 
MDS_OP_LSTAT 101 +#define MDS_OP_UTIME 102 +#define MDS_OP_CHMOD 103 +#define MDS_OP_CHOWN 104 + + +#define MDS_OP_READDIR 200 +#define MDS_OP_MKNOD 201 +#define MDS_OP_LINK 202 +#define MDS_OP_UNLINK 203 +#define MDS_OP_RENAME 204 + +#define MDS_OP_MKDIR 220 +#define MDS_OP_RMDIR 221 +#define MDS_OP_SYMLINK 222 + +#define MDS_OP_OPEN 301 +#define MDS_OP_TRUNCATE 306 +#define MDS_OP_FSYNC 307 +//#define MDS_OP_CLOSE 310 +#define MDS_OP_RELEASE 308 + + + +// ================================================================ + /* meta_load_t * hierarchical load for an inode/dir and it's children */ @@ -41,7 +74,7 @@ class meta_load_t { inline ostream& operator<<( ostream& out, meta_load_t& load ) { - return out << "metaload"; } @@ -132,4 +165,126 @@ inline mds_load_t operator/( mds_load_t& a, double d ) */ +// ================================================================ +// dir slices + +struct dirslice_t { + short hash_mask; + short hash_val; +}; + + + +// ================================================================ + +#define MDS_PIN_REPLICATED 1 + +class MDSCacheObject { + protected: + unsigned state; // state bits + + int ref; // reference count + set ref_set; + + map replicas; // [auth] mds -> nonce + int replica_nonce; // [replica] defined on replica + + public: + MDSCacheObject() : + state(0), + ref(0), + replica_nonce(0) {} + virtual ~MDSCacheObject() {} + + // -------------------------------------------- + // state + unsigned get_state() { return state; } + void state_clear(unsigned mask) { state &= ~mask; } + void state_set(unsigned mask) { state |= mask; } + unsigned state_test(unsigned mask) { return state & mask; } + void state_reset(unsigned s) { state = s; } + + // -------------------------------------------- + // pins + int get_num_ref() { return ref; } + bool is_pinned_by(int by) { return ref_set.count(by); } + set& get_ref_set() { return ref_set; } + + virtual void last_put() {} + virtual void bad_put(int by) { + assert(ref_set.count(by) == 
1); + assert(ref > 0); + } + void put(int by) { + if (ref == 0 || ref_set.count(by) != 1) { + bad_put(by); + } else { + ref--; + ref_set.erase(by); + assert(ref == (int)ref_set.size()); + if (ref == 0) + last_put(); + } + } + + virtual void first_get() {} + virtual void bad_get(int by) { + assert(ref_set.count(by) == 0); + assert(0); + } + void get(int by) { + if (ref_set.count(by)) { + bad_get(by); + } else { + if (ref == 0) + first_get(); + ref++; + ref_set.insert(by); + assert(ref == (int)ref_set.size()); + } + } + + + + // -------------------------------------------- + // replication + bool is_replicated() { return !replicas.empty(); } + bool is_replica(int mds) { return replicas.count(mds); } + int num_replicas() { return replicas.size(); } + int add_replica(int mds) { + if (replicas.count(mds)) + return ++replicas[mds]; // inc nonce + if (replicas.empty()) + get(MDS_PIN_REPLICATED); + return replicas[mds] = 1; + } + void add_replica(int mds, int nonce) { + if (replicas.empty()) + get(MDS_PIN_REPLICATED); + replicas[mds] = nonce; + } + int get_replica_nonce(int mds) { + assert(replicas.count(mds)); + return replicas[mds]; + } + void remove_replica(int mds) { + assert(replicas.count(mds)); + replicas.erase(mds); + if (replicas.empty()) + put(MDS_PIN_REPLICATED); + } + void clear_replicas() { + if (!replicas.empty()) + put(MDS_PIN_REPLICATED); + replicas.clear(); + } + map::iterator replicas_begin() { return replicas.begin(); } + map::iterator replicas_end() { return replicas.end(); } + const map& get_replicas() { return replicas; } + + int get_replica_nonce() { return replica_nonce;} + void set_replica_nonce(int n) { replica_nonce = n; } +}; + + #endif diff --git a/branches/aleung/security1/ceph/mds/oldcachestuff.cc b/branches/aleung/security1/ceph/mds/oldcachestuff.cc deleted file mode 100644 index 31bb9eaa81e3d..0000000000000 --- a/branches/aleung/security1/ceph/mds/oldcachestuff.cc +++ /dev/null @@ -1,944 +0,0 @@ -// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -/* - - -OLD LOCK CRAP: - (old): - sync - soft metadata.. no reads/writes can proceed. (eg no stat) - lock - hard(+soft) metadata.. path traversals stop etc. (??) - - - replication consistency modes: - hard+soft - hard and soft are defined on all replicas. - all reads proceed (in absense of sync lock) - writes require sync lock, fw to auth - -> normal behavior. - - hard - hard only, soft is undefined - reads require a sync - writes proceed if field updates are monotonic (e.g. size, m/c/atime) - -> 'softasync' - - types of access by cache users: - - hard soft - R - read_hard_try path traversal - R <= R read_soft_start stat - R <= W write_soft_start touch - W => W write_hard_start chmod - - note on those implications: - read_soft_start() calls read_hard_try() - write_soft_start() calls read_hard_try() - a hard lock implies/subsumes a soft sync (read_soft_start() returns true if a - lock is held) - - - relationship with frozen directories: - - read_hard_try - can proceed, because any hard changes require a lock, which - requires an active authority, which implies things are unfrozen. - write_hard_start - waits (has to; only auth can initiate) - read_soft_start - ???? waits for now. (FIXME: if !softasync & !syncbyauth) - write_soft_start - ???? waits for now. (FIXME: if (softasync & !syncbyauth)) - - if sticky is on, an export_dir will drop any sync or lock so that the freeze will - proceed (otherwise, deadlock!). likewise, a sync will not stick if is_freezing(). - - - -NAMESPACE: - - none right now. - - -*/ - - -/* soft sync locks: mtime, size, etc. 
- */ - -bool MDCache::read_soft_start(CInode *in, Message *m) -{ - // if (!read_hard_try(in, m)) - // return false; - - // if frozen: i can't proceed (for now, see above) - if (in->is_frozen()) { - dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl; - in->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryMessage(mds, m)); - return false; - } - - - dout(5) << "read_soft_start " << *in << endl; - - // what soft sync mode? - - if (in->is_softasync()) { - // softasync: hard consistency only - - if (in->is_auth()) { - // i am auth: i need sync - if (in->is_syncbyme()) goto yes; - if (in->is_lockbyme()) goto yes; // lock => sync - if (!in->is_cached_by_anyone() && - !in->is_open_write()) goto yes; // i'm alone - } else { - // i am replica: fw to auth - int auth = in->authority(); - dout(5) << "read_soft_start " << *in << " is softasync, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mds->messenger->send_message(m, - MSG_ADDR_MDS(auth), m->get_dest_port(), - MDS_PORT_CACHE); - return false; - } - } else { - // normal: soft+hard consistency - - if (in->is_syncbyauth()) { - // wait for sync - } else { - // i'm consistent - goto yes; - } - } - - // we need sync - if (in->is_syncbyauth() && !in->is_softasync()) { - dout(5) << "read_soft_start " << *in << " is normal+replica+syncbyauth" << endl; - } else if (in->is_softasync() && in->is_auth()) { - dout(5) << "read_soft_start " << *in << " is softasync+auth, waiting on sync" << endl; - } else - assert(2+2==5); - - if (!in->can_auth_pin()) { - dout(5) << "read_soft_start " << *in << " waiting to auth_pin" << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, - new C_MDS_RetryMessage(mds,m)); - return false; - } - - if (in->is_auth()) { - // wait for sync - in->add_waiter(CINODE_WAIT_SYNC, - new C_MDS_RetryMessage(mds, m)); - - if (!in->is_presync()) - inode_sync_start(in); - } else { - // wait for unsync - in->add_waiter(CINODE_WAIT_UNSYNC, - new C_MDS_RetryMessage(mds, m)); - - 
assert(in->is_syncbyauth()); - - if (!in->is_waitonunsync()) - inode_sync_wait(in); - } - - return false; - - yes: - mds->balancer->hit_inode(in, MDS_POP_SOFTRD); - mds->balancer->hit_inode(in, MDS_POP_ANY); - return true; -} - - -int MDCache::read_soft_finish(CInode *in) -{ - dout(5) << "read_soft_finish " << *in << endl; // " soft_sync_count " << in->soft_sync_count << endl; - return 0; // do nothing, actually.. -} - - -bool MDCache::write_soft_start(CInode *in, Message *m) -{ - // if (!read_hard_try(in, m)) - //return false; - - // if frozen: i can't proceed (for now, see above) - if (in->is_frozen()) { - dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl; - in->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryMessage(mds, m)); - return false; - } - - dout(5) << "write_soft_start " << *in << endl; - // what soft sync mode? - - if (in->is_softasync()) { - // softasync: hard consistency only - - if (in->is_syncbyauth()) { - // wait for sync release - } else { - // i'm inconsistent; write away! 
- goto yes; - } - - } else { - // normal: soft+hard consistency - - if (in->is_auth()) { - // i am auth: i need sync - if (in->is_syncbyme()) goto yes; - if (in->is_lockbyme()) goto yes; // lock => sync - if (!in->is_cached_by_anyone() && - !in->is_open_write()) goto yes; // i'm alone - } else { - // i am replica: fw to auth - int auth = in->authority(); - dout(5) << "write_soft_start " << *in << " is !softasync, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mds->messenger->send_message(m, - MSG_ADDR_MDS(auth), m->get_dest_port(), - MDS_PORT_CACHE); - return false; - } - } - - // we need sync - if (in->is_syncbyauth() && in->is_softasync() && !in->is_auth()) { - dout(5) << "write_soft_start " << *in << " is softasync+replica+syncbyauth" << endl; - } else if (!in->is_softasync() && in->is_auth()) { - dout(5) << "write_soft_start " << *in << " is normal+auth, waiting on sync" << endl; - } else - assert(2+2==5); - - if (!in->can_auth_pin()) { - dout(5) << "write_soft_start " << *in << " waiting to auth_pin" << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, - new C_MDS_RetryMessage(mds,m)); - return false; - } - - if (in->is_auth()) { - // wait for sync - in->add_waiter(CINODE_WAIT_SYNC, - new C_MDS_RetryMessage(mds, m)); - - if (!in->is_presync()) - inode_sync_start(in); - } else { - // wait for unsync - in->add_waiter(CINODE_WAIT_UNSYNC, - new C_MDS_RetryMessage(mds, m)); - - assert(in->is_syncbyauth()); - assert(in->is_softasync()); - - if (!in->is_waitonunsync()) - inode_sync_wait(in); - } - - return false; - - yes: - mds->balancer->hit_inode(in, MDS_POP_SOFTWR); - mds->balancer->hit_inode(in, MDS_POP_ANY); - return true; -} - - -int MDCache::write_soft_finish(CInode *in) -{ - dout(5) << "write_soft_finish " << *in << endl; //" soft_sync_count " << in->soft_sync_count << endl; - return 0; // do nothing, actually.. 
-} - - - - - - - - -/* hard locks: owner, mode - */ - -/* -bool MDCache::read_hard_try(CInode *in, - Message *m) -{ - //dout(5) << "read_hard_try " << *in << endl; - - if (in->is_auth()) { - // auth - goto yes; // fine - } else { - // replica - if (in->is_lockbyauth()) { - // locked by auth; wait! - dout(7) << "read_hard_try waiting on " << *in << endl; - in->add_waiter(CINODE_WAIT_UNLOCK, new C_MDS_RetryMessage(mds, m)); - if (!in->is_waitonunlock()) - inode_lock_wait(in); - return false; - } else { - // not locked. - goto yes; - } - } - - yes: - mds->balancer->hit_inode(in, MDS_POP_HARDRD); - mds->balancer->hit_inode(in, MDS_POP_ANY); - return true; -} - - -bool MDCache::write_hard_start(CInode *in, - Message *m) -{ - // if frozen: i can't proceed; only auth can initiate lock - if (in->is_frozen()) { - dout(7) << "write_hard_start " << *in << " is frozen, waiting" << endl; - in->add_waiter(CDIR_WAIT_UNFREEZE, - new C_MDS_RetryMessage(mds, m)); - return false; - } - - // NOTE: if freezing, and locked, we must proceed, to avoid deadlock (where - // the freeze is waiting for our lock to be released) - - - if (in->is_auth()) { - // auth - if (in->is_lockbyme()) goto success; - if (!in->is_cached_by_anyone()) goto success; - - // need lock - if (!in->can_auth_pin()) { - dout(5) << "write_hard_start " << *in << " waiting to auth_pin" << endl; - in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryMessage(mds, m)); - return false; - } - - in->add_waiter(CINODE_WAIT_LOCK, new C_MDS_RetryMessage(mds, m)); - - if (!in->is_prelock()) - inode_lock_start(in); - - return false; - } else { - // replica - // fw to auth - int auth = in->authority(); - dout(5) << "write_hard_start " << *in << " on replica, fw to auth " << auth << endl; - assert(auth != mds->get_nodeid()); - mds->messenger->send_message(m, - MSG_ADDR_MDS(auth), m->get_dest_port(), - MDS_PORT_CACHE); - return false; - } - - success: - in->lock_active_count++; - dout(5) << "write_hard_start " << *in << " count now 
" << in->lock_active_count << endl; - assert(in->lock_active_count > 0); - - mds->balancer->hit_inode(in, MDS_POP_HARDWR); - mds->balancer->hit_inode(in, MDS_POP_ANY); - return true; -} - -void MDCache::write_hard_finish(CInode *in) -{ - in->lock_active_count--; - dout(5) << "write_hard_finish " << *in << " count now " << in->lock_active_count << endl; - assert(in->lock_active_count >= 0); - - // release lock? - if (in->lock_active_count == 0 && - in->is_lockbyme() && - !g_conf.mdcache_sticky_lock) { - dout(7) << "write_hard_finish " << *in << " !sticky, releasing lock immediately" << endl; - inode_lock_release(in); - } -} - - -void MDCache::inode_lock_start(CInode *in) -{ - dout(5) << "lock_start on " << *in << ", waiting for " << in->cached_by << endl; - - assert(in->is_auth()); - assert(!in->is_prelock()); - assert(!in->is_lockbyme()); - assert(!in->is_lockbyauth()); - - in->lock_waiting_for_ack = in->cached_by; - in->dist_state |= CINODE_DIST_PRELOCK; - in->get(CINODE_PIN_PRELOCK); - in->auth_pin(); - - // send messages - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - mds->messenger->send_message(new MInodeLockStart(in->inode.ino, mds->get_nodeid()), - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } -} - - -void MDCache::inode_lock_release(CInode *in) -{ - dout(5) << "lock_release on " << *in << ", messages to " << in->get_cached_by() << endl; - - assert(in->is_lockbyme()); - assert(in->is_auth()); - - in->dist_state &= ~CINODE_DIST_LOCKBYME; - - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - mds->messenger->send_message(new MInodeLockRelease(in), - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } - - in->auth_unpin(); -} - -void MDCache::inode_lock_wait(CInode *in) -{ - dout(5) << "lock_wait on " << *in << endl; - assert(!in->is_auth()); - assert(in->is_lockbyauth()); - - in->dist_state |= CINODE_DIST_WAITONUNLOCK; - in->get(CINODE_PIN_WAITONUNLOCK); -} - - 
-void MDCache::handle_inode_lock_start(MInodeLockStart *m) -{ - // authority is requesting a lock - CInode *in = get_inode(m->get_ino()); - if (!in) { - // don't have it anymore! - dout(7) << "handle_lock_start " << m->get_ino() << ": don't have it anymore, nak" << endl; - mds->messenger->send_message(new MInodeLockAck(m->get_ino(), false), - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); - delete m; // done - return; - } - - // we shouldn't be authoritative... - assert(!in->is_auth()); - - dout(7) << "handle_lock_start " << *in << ", sending ack" << endl; - - // lock it - in->dist_state |= CINODE_DIST_LOCKBYAUTH; - - // sanity check: make sure we know who _is_ authoritative! - assert(m->get_asker() == in->authority()); - - // send ack - mds->messenger->send_message(new MInodeLockAck(in->ino()), - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); - - delete m; // done -} - - -void MDCache::handle_inode_lock_ack(MInodeLockAck *m) -{ - CInode *in = get_inode(m->get_ino()); - int from = m->get_source(); - dout(7) << "handle_lock_ack from " << from << " on " << *in << endl; - - assert(in); - assert(in->is_auth()); - assert(in->dist_state & CINODE_DIST_PRELOCK); - - // remove it from waiting list - in->lock_waiting_for_ack.erase(from); - - if (!m->did_have()) { - // erase from cached_by too! - in->cached_by_remove(from); - } - - if (in->lock_waiting_for_ack.size()) { - - // more coming - dout(7) << "handle_lock_ack " << *in << " from " << from << ", still waiting for " << in->lock_waiting_for_ack << endl; - - } else { - - // yay! - dout(7) << "handle_lock_ack " << *in << " from " << from << ", last one" << endl; - - in->dist_state &= ~CINODE_DIST_PRELOCK; - in->dist_state |= CINODE_DIST_LOCKBYME; - in->put(CINODE_PIN_PRELOCK); - - // do waiters! 
- in->finish_waiting(CINODE_WAIT_LOCK); - } - - delete m; // done -} - - -void MDCache::handle_inode_lock_release(MInodeLockRelease *m) -{ - CInode *in = get_inode(m->get_ino()); - - if (!in) { - dout(7) << "handle_lock_release " << m->get_ino() << ", don't have it, dropping" << endl; - delete m; // done - return; - } - - if (!in->is_lockbyauth()) { - dout(7) << "handle_lock_release " << m->get_ino() << ", not flagged as locked, wtf" << endl; - assert(0); // i should have it, locked, or not have it at all! - delete m; // done - return; - } - - dout(7) << "handle_lock_release " << *in << endl; - assert(!in->is_auth()); - - // release state - in->dist_state &= ~CINODE_DIST_LOCKBYAUTH; - - // waiters? - if (in->is_waitonunlock()) { - in->put(CINODE_PIN_WAITONUNLOCK); - in->dist_state &= ~CINODE_DIST_WAITONUNLOCK; - - // finish - in->finish_waiting(CINODE_WAIT_UNLOCK); - } - - // done - delete m; -} -*/ - - - - - - - - - -// sync interface - -void MDCache::inode_sync_wait(CInode *in) -{ - assert(!in->is_auth()); - - int auth = in->authority(); - dout(5) << "inode_sync_wait on " << *in << ", auth " << auth << endl; - - assert(in->is_syncbyauth()); - assert(!in->is_waitonunsync()); - - in->dist_state |= CINODE_DIST_WAITONUNSYNC; - in->get(CINODE_PIN_WAITONUNSYNC); - - if ((in->is_softasync() && g_conf.mdcache_sticky_sync_softasync) || - (!in->is_softasync() && g_conf.mdcache_sticky_sync_normal)) { - // actually recall; if !sticky, auth will immediately release. 
- dout(5) << "inode_sync_wait on " << *in << " sticky, recalling from auth" << endl; - mds->messenger->send_message(new MInodeSyncRecall(in->inode.ino), - MSG_ADDR_MDS(auth), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } -} - - -void MDCache::inode_sync_start(CInode *in) -{ - // wait for all replicas - dout(5) << "inode_sync_start on " << *in << ", waiting for " << in->cached_by << " " << in->get_open_write()<< endl; - - assert(in->is_auth()); - assert(!in->is_presync()); - assert(!in->is_sync()); - - in->sync_waiting_for_ack.clear(); - in->dist_state |= CINODE_DIST_PRESYNC; - in->get(CINODE_PIN_PRESYNC); - in->auth_pin(); - - in->sync_replicawantback = false; - - // send messages - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - in->sync_waiting_for_ack.insert(MSG_ADDR_MDS(*it)); - mds->messenger->send_message(new MInodeSyncStart(in->inode.ino, mds->get_nodeid()), - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } - - // sync clients - int last = -1; - for (multiset::iterator it = in->get_open_write().begin(); - it != in->get_open_write().end(); - it++) { - if (*it == last) continue; last = *it; // only 1 per client (even if open multiple times) - in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it)); - mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()), - MSG_ADDR_CLIENT(*it), 0, - MDS_PORT_CACHE); - } - -} - -void MDCache::inode_sync_release(CInode *in) -{ - dout(5) << "inode_sync_release on " << *in << ", messages to " << in->get_cached_by() << " " << in->get_open_write() << endl; - - assert(in->is_syncbyme()); - assert(in->is_auth()); - - in->dist_state &= ~CINODE_DIST_SYNCBYME; - - // release replicas - for (set::iterator it = in->cached_by_begin(); - it != in->cached_by_end(); - it++) { - mds->messenger->send_message(new MInodeSyncRelease(in), - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } - - // release writers - for (multiset::iterator it = in->get_open_write().begin(); - it 
!= in->get_open_write().end(); - it++) { - mds->messenger->send_message(new MInodeSyncRelease(in), - MSG_ADDR_CLIENT(*it), 0, - MDS_PORT_CACHE); - } - - in->auth_unpin(); -} - - - - -// messages -void MDCache::handle_inode_sync_start(MInodeSyncStart *m) -{ - // assume asker == authority for now. - - // authority is requesting a lock - CInode *in = get_inode(m->get_ino()); - if (!in) { - // don't have it anymore! - dout(7) << "handle_sync_start " << m->get_ino() << ": don't have it anymore, nak" << endl; - mds->messenger->send_message(new MInodeSyncAck(m->get_ino(), false), - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); - delete m; // done - return; - } - - dout(10) << "handle_sync_start " << *in << endl; - - // we shouldn't be authoritative... - assert(!in->is_auth()); - - // sanity check: make sure we know who _is_ authoritative! - assert(m->get_asker() == in->authority()); - - // lock it - in->dist_state |= CINODE_DIST_SYNCBYAUTH; - - // open for write by clients? - if (in->is_open_write()) { - dout(7) << "handle_sync_start " << *in << " syncing write clients " << in->get_open_write() << endl; - - // sync clients - in->sync_waiting_for_ack.clear(); - for (multiset::iterator it = in->get_open_write().begin(); - it != in->get_open_write().end(); - it++) { - in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it)); - mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()), - MSG_ADDR_CLIENT(*it), 0, - MDS_PORT_CACHE); - } - - in->pending_sync_request = m; - } else { - // no writers, ack. 
- dout(7) << "handle_sync_start " << *in << ", sending ack" << endl; - - inode_sync_ack(in, m); - } -} - -void MDCache::inode_sync_ack(CInode *in, MInodeSyncStart *m, bool wantback) -{ - dout(7) << "sending inode_sync_ack " << *in << endl; - - // send ack - mds->messenger->send_message(new MInodeSyncAck(in->ino(), true, wantback), - MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, - MDS_PORT_CACHE); - - delete m; -} - -void MDCache::handle_inode_sync_ack(MInodeSyncAck *m) -{ - CInode *in = get_inode(m->get_ino()); - assert(in); - - dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << endl; - - if (in->is_auth()) { - assert(in->is_presync()); - } else { - assert(in->is_syncbyauth()); - assert(in->pending_sync_request); - } - - // remove it from waiting list - in->sync_waiting_for_ack.erase(m->get_source()); - - if (MSG_ADDR_ISCLIENT(m->get_source()) && !m->did_have()) { - // erase from cached_by too! - in->cached_by_remove(m->get_source()); - } - - if (m->replica_wantsback()) - in->sync_replicawantback = true; - - if (in->sync_waiting_for_ack.size()) { - - // more coming - dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", still waiting for " << in->sync_waiting_for_ack << endl; - - } else { - - // yay! - dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", last one" << endl; - - if (!in->is_auth()) { - // replica, sync ack back to auth - assert(in->pending_sync_request); - inode_sync_ack(in, in->pending_sync_request, true); - in->pending_sync_request = 0; - delete m; - return; - } - - in->dist_state &= ~CINODE_DIST_PRESYNC; - in->dist_state |= CINODE_DIST_SYNCBYME; - in->put(CINODE_PIN_PRESYNC); - - // do waiters! - in->finish_waiting(CINODE_WAIT_SYNC); - - - // release sync right away? 
- if (in->is_syncbyme()) { - if (in->is_freezing()) { - dout(7) << "handle_sync_ack freezing " << *in << ", dropping sync immediately" << endl; - inode_sync_release(in); - } - else if (in->sync_replicawantback) { - dout(7) << "handle_sync_ack replica wantback, releasing sync immediately" << endl; - inode_sync_release(in); - } - else if ((in->is_softasync() && !g_conf.mdcache_sticky_sync_softasync) || - (!in->is_softasync() && !g_conf.mdcache_sticky_sync_normal)) { - dout(7) << "handle_sync_ack !sticky, releasing sync immediately" << endl; - inode_sync_release(in); - } - else { - dout(7) << "handle_sync_ack sticky sync is on, keeping sync for now" << endl; - } - } else { - dout(7) << "handle_sync_ack don't have sync anymore, something must have just released it?" << endl; - } - } - - delete m; // done -} - - -void MDCache::handle_inode_sync_release(MInodeSyncRelease *m) -{ - CInode *in = get_inode(m->get_ino()); - - if (!in) { - dout(7) << "handle_sync_release " << m->get_ino() << ", don't have it, dropping" << endl; - delete m; // done - return; - } - - if (!in->is_syncbyauth()) { - dout(7) << "handle_sync_release " << *in << ", not flagged as sync" << endl; - assert(0); // this shouldn't happen. - delete m; // done - return; - } - - dout(7) << "handle_sync_release " << *in << endl; - assert(!in->is_auth()); - - // release state - in->dist_state &= ~CINODE_DIST_SYNCBYAUTH; - - // waiters? - if (in->is_waitonunsync()) { - in->put(CINODE_PIN_WAITONUNSYNC); - in->dist_state &= ~CINODE_DIST_WAITONUNSYNC; - - // finish - in->finish_waiting(CINODE_WAIT_UNSYNC); - } - - // client readers? 
- if (in->is_open_write()) { - dout(7) << "handle_sync_release releasing clients " << in->get_open_write() << endl; - for (multiset::iterator it = in->get_open_write().begin(); - it != in->get_open_write().end(); - it++) { - mds->messenger->send_message(new MInodeSyncRelease(in), - MSG_ADDR_CLIENT(*it), 0, - MDS_PORT_CACHE); - } - } - - - // done - delete m; -} - - -void MDCache::handle_inode_sync_recall(MInodeSyncRecall *m) -{ - CInode *in = get_inode(m->get_ino()); - - if (!in) { - dout(7) << "handle_sync_recall " << m->get_ino() << ", don't have it, wtf" << endl; - assert(0); // shouldn't happen - delete m; // done - return; - } - if(!in->is_auth()) { - do_ino_proxy(in, m); - return; - } - - if (in->is_syncbyme()) { - dout(7) << "handle_sync_recall " << *in << ", releasing" << endl; - inode_sync_release(in); - } - else if (in->is_presync()) { - dout(7) << "handle_sync_recall " << *in << " is presync, flagging" << endl; - in->sync_replicawantback = true; - } - else { - dout(7) << "handle_sync_recall " << m->get_ino() << ", not flagged as sync or presync, dropping" << endl; - } - - // done - delete m; -} - - - - - - - - - - -// DIR SYNC - -/* - - dir sync - - - this are used when a directory is HASHED only. namely, - - to stat the dir inode we need an accurate directory size (????) 
- - for a readdir - -*/ - -void MDCache::dir_sync_start(CDir *dir) -{ - // wait for all replicas - dout(5) << "sync_start on " << *dir << endl; - - assert(dir->is_hashed()); - assert(dir->is_auth()); - assert(!dir->is_presync()); - assert(!dir->is_sync()); - - dir->sync_waiting_for_ack = mds->get_cluster()->get_mds_set(); - dir->state_set(CDIR_STATE_PRESYNC); - dir->auth_pin(); - - //dir->sync_replicawantback = false; - - // send messages - for (set::iterator it = dir->sync_waiting_for_ack.begin(); - it != dir->sync_waiting_for_ack.end(); - it++) { - mds->messenger->send_message(new MDirSyncStart(dir->ino(), mds->get_nodeid()), - MSG_ADDR_MDS(*it), MDS_PORT_CACHE, - MDS_PORT_CACHE); - } -} - - -void MDCache::dir_sync_release(CDir *dir) -{ - - -} - -void MDCache::dir_sync_wait(CDir *dir) -{ - -} - - -void handle_dir_sync_start(MDirSyncStart *m) -{ -} - - - - diff --git a/branches/aleung/security1/ceph/messages/MCacheExpire.h b/branches/aleung/security1/ceph/messages/MCacheExpire.h index 11d941f5131d1..461d283c23072 100644 --- a/branches/aleung/security1/ceph/messages/MCacheExpire.h +++ b/branches/aleung/security1/ceph/messages/MCacheExpire.h @@ -15,80 +15,71 @@ #ifndef __MCACHEEXPIRE_H #define __MCACHEEXPIRE_H - class MCacheExpire : public Message { + int from; map inodes; map dirs; - int from; + map > dentries; public: + int get_from() { return from; } map& get_inodes() { return inodes; } map& get_dirs() { return dirs; } - int get_from() { return from; } + map >& get_dentries() { return dentries; } MCacheExpire() {} - MCacheExpire(int from) : Message(MSG_MDS_CACHEEXPIRE) { - this->from = from; - } + MCacheExpire(int f) : + Message(MSG_MDS_CACHEEXPIRE), + from(f) { } + virtual char *get_type_name() { return "CEx";} void add_inode(inodeno_t ino, int nonce) { - inodes.insert(pair(ino,nonce)); + inodes[ino] = nonce; } void add_dir(inodeno_t ino, int nonce) { - dirs.insert(pair(ino,nonce)); + dirs[ino] = nonce; + } + void add_dentry(inodeno_t dirino, const string& dn, 
int nonce) { + dentries[dirino][dn] = nonce; + } + void add_dentries(inodeno_t dirino, map& dmap) { + dentries[dirino] = dmap; } - virtual void decode_payload(crope& s, int& off) { - int n; + void decode_payload() { + int off = 0; - s.copy(off, sizeof(from), (char*)&from); + payload.copy(off, sizeof(from), (char*)&from); off += sizeof(from); - // inodes - s.copy(off, sizeof(int), (char*)&n); - off += sizeof(int); - for (int i=0; i(ino,nonce)); - } + ::_decode(inodes, payload, off); + ::_decode(dirs, payload, off); - // dirs - s.copy(off, sizeof(int), (char*)&n); + int n; + payload.copy(off, sizeof(int), (char*)&n); off += sizeof(int); for (int i=0; i(ino,nonce)); + ::_decode(dentries[ino], payload, off); } } - - void rope_map(crope& s, map& mp) { - int n = mp.size(); - s.append((char*)&n, sizeof(int)); - for (map::iterator it = mp.begin(); - it != mp.end(); - it++) { - inodeno_t ino = it->first; - int nonce = it->second; - s.append((char*)&ino, sizeof(ino)); - s.append((char*)&nonce, sizeof(nonce)); - } - } - virtual void encode_payload(crope& s) { - s.append((char*)&from, sizeof(from)); - rope_map(s, inodes); - rope_map(s, dirs); + void encode_payload() { + payload.append((char*)&from, sizeof(from)); + ::_encode(inodes, payload); + ::_encode(dirs, payload); + + int n = dentries.size(); + payload.append((char*)&n, sizeof(n)); + for (map >::iterator p = dentries.begin(); + p != dentries.end(); + ++p) { + payload.append((char*)&p->first, sizeof(p->first)); + ::_encode(p->second, payload); + } } }; diff --git a/branches/aleung/security1/ceph/messages/MClientBoot.h b/branches/aleung/security1/ceph/messages/MClientBoot.h index 1f2bd70fa6618..257ab465c3b08 100644 --- a/branches/aleung/security1/ceph/messages/MClientBoot.h +++ b/branches/aleung/security1/ceph/messages/MClientBoot.h @@ -20,15 +20,12 @@ class MClientBoot : public Message { public: - MClientBoot() : Message(MSG_CLIENT_BOOT) { - } + MClientBoot() : Message(MSG_CLIENT_BOOT) { } - char *get_type_name() { return 
"Cboot"; } + char *get_type_name() { return "client_boot"; } - virtual void decode_payload(crope& s, int& off) { - } - virtual void encode_payload(crope& s) { - } + void encode_payload() { } + void decode_payload() { } }; #endif diff --git a/branches/aleung/security1/ceph/messages/MClientRequest.h b/branches/aleung/security1/ceph/messages/MClientRequest.h index 235b17618a20c..6bd18c3dae8c2 100644 --- a/branches/aleung/security1/ceph/messages/MClientRequest.h +++ b/branches/aleung/security1/ceph/messages/MClientRequest.h @@ -19,6 +19,7 @@ #include "msg/Message.h" #include "include/filepath.h" +#include "mds/mdstypes.h" #include "mds/MDS.h" /** diff --git a/branches/aleung/security1/ceph/messages/MDiscoverReply.h b/branches/aleung/security1/ceph/messages/MDiscoverReply.h index 78e5d001086ec..c759bc9a76bd1 100644 --- a/branches/aleung/security1/ceph/messages/MDiscoverReply.h +++ b/branches/aleung/security1/ceph/messages/MDiscoverReply.h @@ -71,26 +71,27 @@ class MDiscoverReply : public Message { string error_dentry; // dentry that was not found (to trigger waiters on asker) - vector dirs; // not inode-aligned if no_base_dir = true. - filepath path; // not inode-aligned if no_base_dentry = true - vector path_xlock; - vector inodes; + vector dirs; // not inode-aligned if no_base_dir = true. 
+ vector dentries; // not inode-aligned if no_base_dentry = true + vector inodes; + + string path; public: // accessors inodeno_t get_base_ino() { return base_ino; } int get_num_inodes() { return inodes.size(); } - int get_num_dentries() { return path.depth(); } + int get_num_dentries() { return dentries.size(); } int get_num_dirs() { return dirs.size(); } int get_depth() { // return depth of deepest object (in dir/dentry/inode units) return max( inodes.size(), // at least this many - max( no_base_dentry + path.depth() + flag_error_dn, // inode start + path + possible error + max( no_base_dentry + dentries.size() + flag_error_dn, // inode start + path + possible error dirs.size() + no_base_dir )); // dn/inode + dirs } bool has_base_dir() { return !no_base_dir && dirs.size(); } - bool has_base_dentry() { return !no_base_dentry && path.depth(); } + bool has_base_dentry() { return !no_base_dentry && dentries.size(); } bool has_root() { if (base_ino == 0) { assert(no_base_dir && no_base_dentry); @@ -98,8 +99,8 @@ class MDiscoverReply : public Message { } return false; } - const string& get_path() { return path.get_path(); } - bool get_path_xlock(int i) { return path_xlock[i]; } + + const string& get_path() { return path; } // bool is_flag_forward() { return flag_forward; } bool is_flag_error_dn() { return flag_error_dn; } @@ -108,8 +109,7 @@ class MDiscoverReply : public Message { // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. 
CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } - const string& get_dentry(int n) { return path[n - no_base_dentry]; } - bool get_dentry_xlock(int n) { return path_xlock[n - no_base_dentry]; } + CDentryDiscover& get_dentry(int n) { return *(dentries[n - no_base_dentry]); } CInodeDiscover& get_inode(int n) { return *(inodes[n]); } inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } @@ -136,20 +136,19 @@ class MDiscoverReply : public Message { // builders bool is_empty() { - return dirs.empty() && path.depth() == 0 && - inodes.empty() && + return dirs.empty() && dentries.empty() && inodes.empty() && !flag_error_dn && !flag_error_dir; } - void set_path(const filepath& dp) { path = dp; } - void add_dentry(const string& dn, bool xlock) { - if (path.depth() == 0 && dirs.empty()) no_base_dir = true; - path.add_dentry(dn); - path_xlock.push_back(xlock); + void add_dentry(CDentryDiscover* ddis) { + if (dentries.empty() && dirs.empty()) no_base_dir = true; + dentries.push_back(ddis); + if (path.length()) path += "/"; + path += ddis->get_dname(); } - + void add_inode(CInodeDiscover* din) { - if (inodes.empty() && path.depth() == 0) no_base_dir = no_base_dentry = true; + if (inodes.empty() && dentries.empty()) no_base_dir = no_base_dentry = true; inodes.push_back( din ); } @@ -204,18 +203,12 @@ class MDiscoverReply : public Message { } //dout(12) << n << " inodes out" << endl; - // filepath - path._decode(payload, off); - //dout(12) << path.depth() << " dentries out" << endl; - - // path_xlock + // dentries payload.copy(off, sizeof(int), (char*)&n); off += sizeof(int); for (int i=0; i_decode(payload, off); } } void encode_payload() { @@ -246,19 +239,14 @@ class MDiscoverReply : public Message { (*it)->_encode( payload ); //dout(12) << n << " inodes in" << endl; - // path - path._encode( payload ); - //dout(12) << path.depth() << " dentries in" << endl; - - // path_xlock - n = path_xlock.size(); + // dentries + n = dentries.size(); 
payload.append((char*)&n, sizeof(int)); - for (vector::iterator it = path_xlock.begin(); - it != path_xlock.end(); - it++) { - bool b = *it; - payload.append((char*)&b, sizeof(bool)); - } + for (vector::iterator it = dentries.begin(); + it != dentries.end(); + it++) + (*it)->_encode( payload ); + //dout(12) << n << " dentries in" << endl; } }; diff --git a/branches/aleung/security1/ceph/messages/MExportDir.h b/branches/aleung/security1/ceph/messages/MExportDir.h index 2879579f6929f..8fdda89466b1e 100644 --- a/branches/aleung/security1/ceph/messages/MExportDir.h +++ b/branches/aleung/security1/ceph/messages/MExportDir.h @@ -21,80 +21,42 @@ class MExportDir : public Message { inodeno_t ino; - int ndirs; - bufferlist state; - - list exports; - - // hashed pre-discovers - //map > hashed_prediscover; + list dirstate; // a bl for reach dir + list exports; public: MExportDir() {} - MExportDir(CInode *in) : - Message(MSG_MDS_EXPORTDIR) { - this->ino = in->inode.ino; - ndirs = 0; + MExportDir(inodeno_t dirino) : + Message(MSG_MDS_EXPORTDIR), + ino(dirino) { } virtual char *get_type_name() { return "Ex"; } inodeno_t get_ino() { return ino; } - int get_ndirs() { return ndirs; } - bufferlist& get_state() { return state; } + list& get_dirstate() { return dirstate; } list& get_exports() { return exports; } - + void add_dir(bufferlist& dir) { - state.claim_append( dir ); - ndirs++; + dirstate.push_back(dir); + } + void set_dirstate(const list& ls) { + dirstate = ls; + } + void add_export(inodeno_t dirino) { + exports.push_back(dirino); } - void add_export(CDir *dir) { exports.push_back(dir->ino()); } - virtual void decode_payload() { int off = 0; payload.copy(off, sizeof(ino), (char*)&ino); off += sizeof(ino); - payload.copy(off, sizeof(ndirs), (char*)&ndirs); - off += sizeof(ndirs); - - // exports - int nex; - payload.copy(off, sizeof(nex), (char*)&nex); - off += sizeof(int); - dout(12) << nex << " nested exports out" << endl; - for (int i=0; i::iterator it = exports.begin(); - 
it != exports.end(); - it++) { - inodeno_t ino = *it; - payload.append((char*)&ino, sizeof(ino)); - } - - // dir data - size_t len = state.length(); - payload.append((char*)&len, sizeof(len)); - payload.claim_append(state); + ::_encode(exports, payload); + ::_encode(dirstate, payload); } }; diff --git a/branches/aleung/security1/ceph/messages/MExportDirPrep.h b/branches/aleung/security1/ceph/messages/MExportDirPrep.h index 6e814212ac98b..6967d950afad9 100644 --- a/branches/aleung/security1/ceph/messages/MExportDirPrep.h +++ b/branches/aleung/security1/ceph/messages/MExportDirPrep.h @@ -85,7 +85,7 @@ class MExportDirPrep : public Message { void add_export(inodeno_t dirino) { exports.push_back( dirino ); } - void add_inode(inodeno_t dirino, string& dentry, CInodeDiscover *in) { + void add_inode(inodeno_t dirino, const string& dentry, CInodeDiscover *in) { inodes.push_back(in); inode_dirino.insert(pair(in->get_ino(), dirino)); inode_dentry.insert(pair(in->get_ino(), dentry)); diff --git a/branches/aleung/security1/ceph/messages/MFailure.h b/branches/aleung/security1/ceph/messages/MFailure.h index 1663565b692dd..0ec53f6e36b18 100644 --- a/branches/aleung/security1/ceph/messages/MFailure.h +++ b/branches/aleung/security1/ceph/messages/MFailure.h @@ -20,15 +20,15 @@ class MFailure : public Message { public: - msg_addr_t failed; + entity_name_t failed; entity_inst_t inst; MFailure() {} - MFailure(msg_addr_t f, entity_inst_t& i) : + MFailure(entity_name_t f, entity_inst_t& i) : Message(MSG_FAILURE), failed(f), inst(i) {} - msg_addr_t get_failed() { return failed; } + entity_name_t get_failed() { return failed; } entity_inst_t& get_inst() { return inst; } void decode_payload() { diff --git a/branches/aleung/security1/ceph/messages/MFailureAck.h b/branches/aleung/security1/ceph/messages/MFailureAck.h index ee9a0d04d0fd4..ec0036dcdac55 100644 --- a/branches/aleung/security1/ceph/messages/MFailureAck.h +++ b/branches/aleung/security1/ceph/messages/MFailureAck.h @@ -20,13 
+20,13 @@ class MFailureAck : public Message { public: - msg_addr_t failed; + entity_name_t failed; MFailureAck(MFailure *m) : Message(MSG_FAILURE_ACK) { this->failed = m->get_failed(); } MFailureAck() {} - msg_addr_t get_failed() { return failed; } + entity_name_t get_failed() { return failed; } virtual void decode_payload(crope& s, int& off) { s.copy(0, sizeof(failed), (char*)&failed); diff --git a/branches/aleung/security1/ceph/messages/MMDSBeacon.h b/branches/aleung/security1/ceph/messages/MMDSBeacon.h new file mode 100644 index 0000000000000..86eccc689d396 --- /dev/null +++ b/branches/aleung/security1/ceph/messages/MMDSBeacon.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MMDSBEACON_H +#define __MMDSBEACON_H + +#include "msg/Message.h" + +#include "include/types.h" + +#include "mds/MDSMap.h" + +class MMDSBeacon : public Message { + int state; + version_t seq; + + public: + MMDSBeacon() : Message(MSG_MDS_BEACON) {} + MMDSBeacon(int st, version_t se) : Message(MSG_MDS_BEACON), + state(st), seq(se) { } + + int get_state() { return state; } + version_t get_seq() { return seq; } + char *get_type_name() { return "mdsbeacon"; } + + void print(ostream& out) { + out << "mdsbeacon(" << MDSMap::get_state_name(state) + << " seq " << seq << ")"; + } + + void encode_payload() { + payload.append((char*)&state, sizeof(state)); + payload.append((char*)&seq, sizeof(seq)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(state), (char*)&state); + off += sizeof(state); + payload.copy(off, sizeof(seq), (char*)&seq); + off += sizeof(seq); + } +}; + +#endif diff --git a/branches/aleung/security1/ceph/messages/MMDSCacheRejoin.h b/branches/aleung/security1/ceph/messages/MMDSCacheRejoin.h new file mode 100644 index 0000000000000..2789e30844743 --- /dev/null +++ b/branches/aleung/security1/ceph/messages/MMDSCacheRejoin.h @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MMDSCACHEREJOIN_H +#define __MMDSCACHEREJOIN_H + +#include "msg/Message.h" + +#include "include/types.h" + +// sent from replica to auth + +class MMDSCacheRejoin : public Message { + public: + map inodes; // ino -> caps_wanted + set dirs; + map > dentries; // dir -> (dentries...) 
+ + MMDSCacheRejoin() : Message(MSG_MDS_CACHEREJOIN) {} + + char *get_type_name() { return "cache_rejoin"; } + + void print(ostream& out) { + out << "cache_rejoin" << endl; + } + + void add_dir(inodeno_t dirino) { + dirs.insert(dirino); + } + void add_dentry(inodeno_t dirino, const string& dn) { + dentries[dirino].insert(dn); + } + void add_inode(inodeno_t ino, int cw) { + inodes[ino] = cw; + } + + void encode_payload() { + ::_encode(inodes, payload); + ::_encode(dirs, payload); + for (set::iterator p = dirs.begin(); p != dirs.end(); ++p) + ::_encode(dentries[*p], payload); + } + void decode_payload() { + int off = 0; + ::_decode(inodes, payload, off); + ::_decode(dirs, payload, off); + for (set::iterator p = dirs.begin(); p != dirs.end(); ++p) + ::_decode(dentries[*p], payload, off); + } +}; + +#endif diff --git a/branches/aleung/security1/ceph/messages/MMDSCacheRejoinAck.h b/branches/aleung/security1/ceph/messages/MMDSCacheRejoinAck.h new file mode 100644 index 0000000000000..b8f0d23ebbba0 --- /dev/null +++ b/branches/aleung/security1/ceph/messages/MMDSCacheRejoinAck.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MMDSCACHEREJOINACK_H +#define __MMDSCACHEREJOINACK_H + +#include "msg/Message.h" + +#include "include/types.h" + +// sent from auth back to replica + +class MMDSCacheRejoinAck : public Message { + public: + struct inodeinfo { + inodeno_t ino; + int hardlock; + int filelock; + int nonce; + inodeinfo() {} + inodeinfo(inodeno_t i, int h, int f, int n) : ino(i), hardlock(h), filelock(f), nonce(n) {} + }; + struct dninfo { + int lock; + int nonce; + dninfo() {} + dninfo(int l, int n) : lock(l), nonce(n) {} + }; + struct dirinfo { + inodeno_t dirino; + int nonce; + dirinfo() {} + dirinfo(inodeno_t i, int n) : dirino(i), nonce(n) {} + }; + list inodes; + map > dentries; + list dirs; + + MMDSCacheRejoinAck() : Message(MSG_MDS_CACHEREJOINACK) {} + + char *get_type_name() { return "cache_rejoin_ack"; } + + void print(ostream& out) { + out << "cache_rejoin" << endl; + } + + void add_dir(inodeno_t dirino, int nonce) { + dirs.push_back(dirinfo(dirino,nonce)); + } + void add_dentry(inodeno_t dirino, const string& dn, int ls, int nonce) { + dentries[dirino][dn] = dninfo(ls, nonce); + } + void add_inode(inodeno_t ino, int hl, int fl, int nonce) { + inodes.push_back(inodeinfo(ino, hl, fl, nonce)); + } + + void encode_payload() { + ::_encode(inodes, payload); + ::_encode(dirs, payload); + for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) + ::_encode(dentries[p->dirino], payload); + } + void decode_payload() { + int off = 0; + ::_decode(inodes, payload, off); + ::_decode(dirs, payload, off); + for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) + ::_decode(dentries[p->dirino], payload, off); + } +}; + +#endif diff --git a/branches/aleung/security1/ceph/messages/MMDSImportMap.h b/branches/aleung/security1/ceph/messages/MMDSImportMap.h new file mode 100644 index 0000000000000..22774cdabc2ec --- /dev/null +++ b/branches/aleung/security1/ceph/messages/MMDSImportMap.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; 
indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MMDSIMPORTMAP_H +#define __MMDSIMPORTMAP_H + +#include "msg/Message.h" + +#include "include/types.h" + + +class MMDSImportMap : public Message { + public: + map > imap; + map > ambiguous_imap; + + MMDSImportMap() : Message(MSG_MDS_IMPORTMAP) {} + + char *get_type_name() { return "mdsimportmap"; } + + void print(ostream& out) { + out << "mdsimportmap(" << imap.size() + << "+" << ambiguous_imap.size() + << " imports)"; + } + + void add_import(inodeno_t im) { + imap[im].clear(); + } + void add_import_export(inodeno_t im, inodeno_t ex) { + imap[im].insert(ex); + } + + void add_ambiguous_import(inodeno_t im, const set& m) { + ambiguous_imap[im] = m; + } + + void encode_payload() { + ::_encode(imap, payload); + ::_encode(ambiguous_imap, payload); + } + void decode_payload() { + int off = 0; + ::_decode(imap, payload, off); + ::_decode(ambiguous_imap, payload, off); + } +}; + +#endif diff --git a/branches/aleung/security1/ceph/messages/MMDSMap.h b/branches/aleung/security1/ceph/messages/MMDSMap.h index c8dd60abcb331..701ba9a050cc3 100644 --- a/branches/aleung/security1/ceph/messages/MMDSMap.h +++ b/branches/aleung/security1/ceph/messages/MMDSMap.h @@ -21,6 +21,7 @@ class MMDSMap : public Message { public: + /* map maps; map incremental_maps; @@ -42,25 +43,33 @@ class MMDSMap : public Message { (e == 0 || i->first > e)) e = i->first; return e; } + */ + version_t epoch; + bufferlist encoded; + + version_t get_epoch() const { return epoch; } + bufferlist& get_encoded() { return encoded; } MMDSMap() : Message(MSG_MDS_MAP) {} MMDSMap(MDSMap *mm) : Message(MSG_MDS_MAP) { - mm->encode(maps[mm->get_epoch()]); + epoch = 
mm->get_epoch(); + mm->encode(encoded); } // marshalling virtual void decode_payload() { int off = 0; - ::_decode(maps, payload, off); - ::_decode(incremental_maps, payload, off); + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + ::_decode(encoded, payload, off); } virtual void encode_payload() { - ::_encode(maps, payload); - ::_encode(incremental_maps, payload); + payload.append((char*)&epoch, sizeof(epoch)); + ::_encode(encoded, payload); } virtual char *get_type_name() { return "mdsmap"; } diff --git a/branches/aleung/security1/ceph/messages/MMonElectionAck.h b/branches/aleung/security1/ceph/messages/MMonElectionAck.h index dbfa30c9cb099..2399cca73d60c 100644 --- a/branches/aleung/security1/ceph/messages/MMonElectionAck.h +++ b/branches/aleung/security1/ceph/messages/MMonElectionAck.h @@ -20,27 +20,12 @@ class MMonElectionAck : public Message { public: - int q; - int refresh_num; + MMonElectionAck() : Message(MSG_MON_ELECTION_ACK) {} + + virtual char *get_type_name() { return "election_ack"; } - MMonElectionAck() {} - MMonElectionAck(int _q, int _n) : - Message(MSG_MON_ELECTION_ACK), - q(_q), refresh_num(_n) {} - - void decode_payload() { - int off = 0; - payload.copy(off, sizeof(q), (char*)&q); - off += sizeof(q); - payload.copy(off, sizeof(refresh_num), (char*)&refresh_num); - off += sizeof(refresh_num); - } - void encode_payload() { - payload.append((char*)&q, sizeof(q)); - payload.append((char*)&refresh_num, sizeof(refresh_num)); - } - - virtual char *get_type_name() { return "MonElAck"; } + void encode_payload() {} + void decode_payload() {} }; #endif diff --git a/branches/aleung/security1/ceph/messages/MMonElectionPropose.h b/branches/aleung/security1/ceph/messages/MMonElectionPropose.h new file mode 100644 index 0000000000000..d9310f222bc7b --- /dev/null +++ b/branches/aleung/security1/ceph/messages/MMonElectionPropose.h @@ -0,0 +1,32 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - 
scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MMONELECTIONPROPOSE_H +#define __MMONELECTIONPROPOSE_H + +#include "msg/Message.h" + + +class MMonElectionPropose : public Message { + public: + MMonElectionPropose() : Message(MSG_MON_ELECTION_PROPOSE) {} + + virtual char *get_type_name() { return "election_propose"; } + + void encode_payload() {} + void decode_payload() {} + +}; + +#endif diff --git a/branches/aleung/security1/ceph/messages/MMonElectionVictory.h b/branches/aleung/security1/ceph/messages/MMonElectionVictory.h new file mode 100644 index 0000000000000..8bdbf2f85a3aa --- /dev/null +++ b/branches/aleung/security1/ceph/messages/MMonElectionVictory.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MMONELECTIONVICTORY_H +#define __MMONELECTIONVICTORY_H + +#include "msg/Message.h" + + +class MMonElectionVictory : public Message { + public: + //set active_set; + + MMonElectionVictory(/*set& as*/) : Message(MSG_MON_ELECTION_VICTORY)//, + //active_set(as) + {} + + virtual char *get_type_name() { return "election_victory"; } + + void encode_payload() { + //::_encode(active_set, payload); + } + void decode_payload() { + //int off = 0; + //::_decode(active_set, payload, off); + } +}; + +#endif diff --git a/branches/aleung/security1/ceph/messages/MMonPaxos.h b/branches/aleung/security1/ceph/messages/MMonPaxos.h new file mode 100644 index 0000000000000..b3f6e850a9c5d --- /dev/null +++ b/branches/aleung/security1/ceph/messages/MMonPaxos.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MMONPAXOS_H +#define __MMONPAXOS_H + +#include "msg/Message.h" + +class MMonPaxos : public Message { + public: + // op types + const static int OP_COLLECT = 1; // proposer: propose round + const static int OP_LAST = 2; // voter: accept proposed round + const static int OP_OLDROUND = 3; // voter: notify proposer he proposed an old round + const static int OP_BEGIN = 4; // proposer: value proposed for this round + const static int OP_ACCEPT = 5; // voter: accept propsed value + const static int OP_SUCCESS = 7; // proposer: notify learners of agreed value + const static int OP_ACK = 8; // learner: notify proposer that new value has been saved + + int op; + int machine_id; + version_t proposal; + version_t n; + bufferlist value; + + MMonPaxos() : Message(MSG_MON_PAXOS) {} + MMonPaxos(int o, int mid, + version_t pn, version_t v) : Message(MSG_MON_PAXOS), + op(o), machine_id(mid), + proposal(pn), n(v) {} + MMonPaxos(int o, int mid, + version_t pn, version_t v, + bufferlist& b) : Message(MSG_MON_PAXOS), + op(o), machine_id(mid), + proposal(pn), n(v), + value(b) {} + + virtual char *get_type_name() { return "paxos"; } + + void print(ostream& out) { + out << "paxos(op " << op + << ", machine " << machine_id + << ", proposal " << proposal + << ", state " << n + << ", " << value.length() << " bytes)"; + } + + void encode_payload() { + payload.append((char*)&op, sizeof(op)); + payload.append((char*)&machine_id, sizeof(machine_id)); + payload.append((char*)&proposal, sizeof(proposal)); + payload.append((char*)&n, sizeof(n)); + ::_encode(value, payload); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(op), (char*)&op); + off += sizeof(op); + payload.copy(off, sizeof(machine_id), (char*)&machine_id); + off += sizeof(machine_id); + payload.copy(off, sizeof(proposal), (char*)&proposal); + off += sizeof(proposal); + payload.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + ::_decode(value, payload, off); + } +}; + +#endif diff 
--git a/branches/aleung/security1/ceph/messages/MNSLookup.h b/branches/aleung/security1/ceph/messages/MNSLookup.h index cbea43092908a..b6df663a15a88 100644 --- a/branches/aleung/security1/ceph/messages/MNSLookup.h +++ b/branches/aleung/security1/ceph/messages/MNSLookup.h @@ -18,18 +18,18 @@ #include "msg/Message.h" class MNSLookup : public Message { - msg_addr_t entity; + entity_name_t entity; public: MNSLookup() {} - MNSLookup(msg_addr_t e) : + MNSLookup(entity_name_t e) : Message(MSG_NS_LOOKUP) { entity = e; } char *get_type_name() { return "NSLook"; } - msg_addr_t get_entity() { return entity; } + entity_name_t get_entity() { return entity; } void encode_payload() { payload.append((char*)&entity, sizeof(entity)); diff --git a/branches/aleung/security1/ceph/messages/MNSLookupReply.h b/branches/aleung/security1/ceph/messages/MNSLookupReply.h index e87b48435c92a..e6720eba397d8 100644 --- a/branches/aleung/security1/ceph/messages/MNSLookupReply.h +++ b/branches/aleung/security1/ceph/messages/MNSLookupReply.h @@ -20,7 +20,7 @@ class MNSLookupReply : public Message { public: - map entity_map; + map entity_map; public: MNSLookupReply() {} diff --git a/branches/aleung/security1/ceph/messages/MNSRegister.h b/branches/aleung/security1/ceph/messages/MNSRegister.h index 9af0dd15aa1dc..01d29a2315fa9 100644 --- a/branches/aleung/security1/ceph/messages/MNSRegister.h +++ b/branches/aleung/security1/ceph/messages/MNSRegister.h @@ -19,13 +19,13 @@ #include "msg/TCPMessenger.h" class MNSRegister : public Message { - msg_addr_t addr; + entity_name_t addr; int rank; long tid; public: MNSRegister() {} - MNSRegister(msg_addr_t a, int r, int ti) : + MNSRegister(entity_name_t a, int r, int ti) : Message(MSG_NS_REGISTER) { addr = a; rank = r; @@ -34,7 +34,7 @@ class MNSRegister : public Message { char *get_type_name() { return "NSReg"; } - msg_addr_t get_entity() { return addr; } + entity_name_t get_entity() { return addr; } int get_rank() { return rank; } long get_tid() { return tid; } 
diff --git a/branches/aleung/security1/ceph/messages/MNSRegisterAck.h b/branches/aleung/security1/ceph/messages/MNSRegisterAck.h index 54e4b93db2118..fa2f88ac10e82 100644 --- a/branches/aleung/security1/ceph/messages/MNSRegisterAck.h +++ b/branches/aleung/security1/ceph/messages/MNSRegisterAck.h @@ -19,12 +19,12 @@ #include "msg/TCPMessenger.h" class MNSRegisterAck : public Message { - msg_addr_t entity; + entity_name_t entity; long tid; public: MNSRegisterAck() {} - MNSRegisterAck(long t, msg_addr_t e) : + MNSRegisterAck(long t, entity_name_t e) : Message(MSG_NS_REGISTERACK) { entity = e; tid = t; @@ -32,7 +32,7 @@ class MNSRegisterAck : public Message { char *get_type_name() { return "NSRegA"; } - msg_addr_t get_entity() { return entity; } + entity_name_t get_entity() { return entity; } long get_tid() { return tid; } void encode_payload() { diff --git a/branches/aleung/security1/ceph/messages/MOSDBoot.h b/branches/aleung/security1/ceph/messages/MOSDBoot.h index 2f60fd4ae9fc2..b22e3fb8d4eeb 100644 --- a/branches/aleung/security1/ceph/messages/MOSDBoot.h +++ b/branches/aleung/security1/ceph/messages/MOSDBoot.h @@ -17,6 +17,7 @@ #include "msg/Message.h" #include "include/types.h" +#include "osd/osd_types.h" class MOSDBoot : public Message { public: diff --git a/branches/aleung/security1/ceph/messages/MOSDFailure.h b/branches/aleung/security1/ceph/messages/MOSDFailure.h index 7dd75758ff0d6..c4a557856594a 100644 --- a/branches/aleung/security1/ceph/messages/MOSDFailure.h +++ b/branches/aleung/security1/ceph/messages/MOSDFailure.h @@ -20,31 +20,26 @@ class MOSDFailure : public Message { public: - msg_addr_t failed; - entity_inst_t inst; + entity_inst_t failed; epoch_t epoch; MOSDFailure() {} - MOSDFailure(msg_addr_t f, const entity_inst_t& i, epoch_t e) : + MOSDFailure(entity_inst_t f, epoch_t e) : Message(MSG_OSD_FAILURE), - failed(f), inst(i), epoch(e) {} + failed(f), epoch(e) {} - msg_addr_t get_failed() { return failed; } - entity_inst_t& get_inst() { return inst; 
} + entity_inst_t get_failed() { return failed; } epoch_t get_epoch() { return epoch; } void decode_payload() { int off = 0; payload.copy(off, sizeof(failed), (char*)&failed); off += sizeof(failed); - payload.copy(off, sizeof(inst), (char*)&inst); - off += sizeof(inst); payload.copy(off, sizeof(epoch), (char*)&epoch); off += sizeof(epoch); } void encode_payload() { payload.append((char*)&failed, sizeof(failed)); - payload.append((char*)&inst, sizeof(inst)); payload.append((char*)&epoch, sizeof(epoch)); } diff --git a/branches/aleung/security1/ceph/messages/MOSDOp.h b/branches/aleung/security1/ceph/messages/MOSDOp.h index e90cea08f8161..8a9d56d863b77 100644 --- a/branches/aleung/security1/ceph/messages/MOSDOp.h +++ b/branches/aleung/security1/ceph/messages/MOSDOp.h @@ -16,6 +16,7 @@ #define __MOSDOP_H #include "msg/Message.h" +#include "osd/osd_types.h" #include "crypto/CryptoLib.h" #include "crypto/ExtCap.h" @@ -29,9 +30,7 @@ using namespace CryptoLib; * */ -//#define OSD_OP_MKFS 20 - -// client ops +// osd client ops #define OSD_OP_READ 1 #define OSD_OP_STAT 2 @@ -52,34 +51,6 @@ using namespace CryptoLib; #define OSD_OP_PUSH 31 -typedef struct { - long pcid; - - // who's asking? - tid_t tid; - msg_addr_t client; - entity_inst_t client_inst; - - // for replication - tid_t rep_tid; - - object_t oid; - objectrev_t rev; - pg_t pg; - - epoch_t map_epoch; - - eversion_t pg_trim_to; // primary->replica: trim to here - - int op; - size_t length, offset; - eversion_t version; - eversion_t old_version; - - bool want_ack; - bool want_commit; -} MOSDOp_st; - class MOSDOp : public Message { public: static const char* get_opname(int op) { @@ -107,7 +78,33 @@ public: } private: - MOSDOp_st st; + struct { + long pcid; + + // who's asking? + entity_inst_t client; + reqid_t reqid; // minor weirdness: entity_name_t is in reqid_t too. 
+ + // for replication + tid_t rep_tid; + + object_t oid; + objectrev_t rev; + pg_t pg; + + epoch_t map_epoch; + + eversion_t pg_trim_to; // primary->replica: trim to here + + int op; + size_t length, offset; + eversion_t version; + eversion_t old_version; + + bool want_ack; + bool want_commit; + } st; + bufferlist data; map attrset; @@ -117,16 +114,19 @@ private: friend class MOSDOpReply; public: - const tid_t get_tid() { return st.tid; } - const msg_addr_t& get_client() { return st.client; } - const entity_inst_t& get_client_inst() { return st.client_inst; } - void set_client_inst(const entity_inst_t& i) { st.client_inst = i; } + const reqid_t& get_reqid() { return st.reqid; } + const tid_t get_client_tid() { return st.reqid.tid; } + int get_client_inc() { return st.reqid.inc; } + + const entity_name_t& get_client() { return st.client.name; } + const entity_inst_t& get_client_inst() { return st.client; } + void set_client_inst(const entity_inst_t& i) { st.client = i; } const tid_t get_rep_tid() { return st.rep_tid; } void set_rep_tid(tid_t t) { st.rep_tid = t; } - const object_t get_oid() { return st.oid; } - const pg_t get_pg() { return st.pg; } + const object_t get_oid() { return st.oid; } + const pg_t get_pg() { return st.pg; } const epoch_t get_map_epoch() { return st.map_epoch; } //const int get_pg_role() { return st.pg_role; } // who am i asking for? 
@@ -168,19 +168,22 @@ private: void set_pcid(long pcid) { this->st.pcid = pcid; } long get_pcid() { return st.pcid; } - MOSDOp(long tid, msg_addr_t asker, + MOSDOp(entity_inst_t asker, int inc, long tid, object_t oid, pg_t pg, epoch_t mapepoch, int op) : Message(MSG_OSD_OP) { memset(&st, 0, sizeof(st)); this->st.client = asker; - this->st.tid = tid; - this->st.rep_tid = 0; + this->st.reqid.name = asker.name; + this->st.reqid.inc = inc; + this->st.reqid.tid = tid; this->st.oid = oid; this->st.pg = pg; this->st.map_epoch = mapepoch; this->st.op = op; + this->st.rep_tid = 0; + this->st.want_ack = true; this->st.want_commit = true; } @@ -214,13 +217,15 @@ private: } virtual char *get_type_name() { return "oop"; } + + void print(ostream& out) { + out << "osd_op(" << st.reqid + << " " << get_opname(st.op) + << " " << st.oid + //<< " " << this + << ")"; + } }; -inline ostream& operator<<(ostream& out, MOSDOp& op) -{ - return out << "MOSDOp(" << op.get_client() << "." << op.get_tid() - << " op " << MOSDOp::get_opname(op.get_op()) - << " oid " << hex << op.get_oid() << dec << " " << &op << ")"; -} #endif diff --git a/branches/aleung/security1/ceph/messages/MOSDOpReply.h b/branches/aleung/security1/ceph/messages/MOSDOpReply.h index 35c6ad5898b0b..05106e096d176 100644 --- a/branches/aleung/security1/ceph/messages/MOSDOpReply.h +++ b/branches/aleung/security1/ceph/messages/MOSDOpReply.h @@ -28,38 +28,36 @@ * */ - -typedef struct { - // req - long pcid; - tid_t tid; - tid_t rep_tid; - - object_t oid; - pg_t pg; - - int op; - - // reply - int result; - bool commit; - size_t length, offset; - size_t object_size; - eversion_t version; - - eversion_t pg_complete_thru; - - epoch_t map_epoch; -} MOSDOpReply_st; - - class MOSDOpReply : public Message { - MOSDOpReply_st st; + struct { + // req + reqid_t reqid; + + tid_t rep_tid; + + object_t oid; + pg_t pg; + + int op; + + // reply + int result; + bool commit; + size_t length, offset; + size_t object_size; + eversion_t version; + + 
eversion_t pg_complete_thru; + + epoch_t map_epoch; + } st; + bufferlist data; map attrset; public: - long get_tid() { return st.tid; } + const reqid_t& get_reqid() { return st.reqid; } + long get_tid() { return st.reqid.tid; } long get_rep_tid() { return st.rep_tid; } object_t get_oid() { return st.oid; } pg_t get_pg() { return st.pg; } @@ -84,7 +82,6 @@ class MOSDOpReply : public Message { void set_attrset(map &as) { attrset = as; } void set_op(int op) { st.op = op; } - void set_tid(tid_t t) { st.tid = t; } void set_rep_tid(tid_t t) { st.rep_tid = t; } // data payload @@ -98,18 +95,13 @@ class MOSDOpReply : public Message { // osdmap epoch_t get_map_epoch() { return st.map_epoch; } - // keep a pcid (procedure call id) to match up request+reply - void set_pcid(long pcid) { this->st.pcid = pcid; } - long get_pcid() { return st.pcid; } public: MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) : Message(MSG_OSD_OPREPLY) { memset(&st, 0, sizeof(st)); - this->st.pcid = req->st.pcid; - + this->st.reqid = req->st.reqid; this->st.op = req->st.op; - this->st.tid = req->st.tid; this->st.rep_tid = req->st.rep_tid; this->st.oid = req->st.oid; @@ -141,6 +133,16 @@ public: } virtual char *get_type_name() { return "oopr"; } + + void print(ostream& out) { + out << "osd_op_reply(" << st.reqid + << " " << MOSDOp::get_opname(st.op) + << " " << st.oid << " = " << st.result + //<< " " << this + << ")"; + } + }; + #endif diff --git a/branches/aleung/security1/ceph/mkmonmap.cc b/branches/aleung/security1/ceph/mkmonmap.cc index 6d049f4bd7186..1ec4c808d6204 100644 --- a/branches/aleung/security1/ceph/mkmonmap.cc +++ b/branches/aleung/security1/ceph/mkmonmap.cc @@ -25,48 +25,7 @@ using namespace std; #include "mon/MonMap.h" -bool parse_ip_port(const char *s, tcpaddr_t& tcpaddr) -{ - unsigned char addr[4]; - int port = 0; - - int count = 0; // digit count - - while (1) { - // parse the #. 
- int val = 0; - int numdigits = 0; - - while (*s >= '0' && *s <= '9') { - int digit = *s - '0'; - //cout << "digit " << digit << endl; - val *= 10; - val += digit; - numdigits++; - s++; - } - //cout << "val " << val << endl; - - if (numdigits == 0) return false; // no digits - if (count < 3 && *s != '.') return false; // should have 3 periods - if (count == 3 && *s != ':') return false; // then a colon - s++; - - if (count <= 3) - addr[count] = val; - else - port = val; - - count++; - if (count == 5) break; - } - - // copy into inst - memcpy((char*)&tcpaddr.sin_addr.s_addr, (char*)addr, 4); - tcpaddr.sin_port = port; - return true; -} int main(int argc, char **argv) @@ -83,14 +42,13 @@ int main(int argc, char **argv) outfn = args[++i]; else { // parse ip:port - tcpaddr_t addr; - if (!parse_ip_port(args[i], addr)) { + entity_inst_t inst; + if (!parse_ip_port(args[i], inst.addr)) { cerr << "mkmonmap: invalid ip:port '" << args[i] << "'" << endl; return -1; } - entity_inst_t inst; - inst.set_addr(addr); - cout << "mkmonmap: mon" << monmap.num_mon << " " << inst << endl; + inst.name = MSG_ADDR_MON(monmap.num_mon); + cout << "mkmonmap: adding " << inst << endl; monmap.add_mon(inst); } } diff --git a/branches/aleung/security1/ceph/mon/ClientMonitor.cc b/branches/aleung/security1/ceph/mon/ClientMonitor.cc index 055a74237f21a..cb454ed598231 100644 --- a/branches/aleung/security1/ceph/mon/ClientMonitor.cc +++ b/branches/aleung/security1/ceph/mon/ClientMonitor.cc @@ -60,17 +60,19 @@ void ClientMonitor::handle_client_boot(MClientBoot *m) assert(m->get_source().is_client()); int from = m->get_source().num(); - // choose an MDS id + // choose a client id if (from < 0 || - (client_map.count(m->get_source()) && client_map[m->get_source()] != m->get_source_inst())) { + (client_map.count(m->get_source()) && client_map[m->get_source()] != m->get_source_addr())) { from = ++num_clients; dout(10) << "client_boot assigned client" << from << endl; } - client_map[MSG_ADDR_CLIENT(from)] = 
m->get_source_inst(); + client_map[MSG_ADDR_CLIENT(from)] = m->get_source_addr(); // reply with latest mds map - mon->mdsmon->send_latest(MSG_ADDR_CLIENT(from), m->get_source_inst()); + entity_inst_t to = m->get_source_inst(); + to.name = MSG_ADDR_CLIENT(from); + mon->mdsmon->send_latest(to); delete m; } @@ -107,7 +109,7 @@ void ClientMonitor::handle_client_auth_user(MClientAuthUser *m) userTicket = user_tickets[uid]; // reply to auth_user messenger->send_message(new MClientAuthUserAck(userTicket), - m->get_source(), m->get_source_inst()); + m->get_source_inst()); } diff --git a/branches/aleung/security1/ceph/mon/ClientMonitor.h b/branches/aleung/security1/ceph/mon/ClientMonitor.h index 2900e23e5115a..5c593e8930cc3 100644 --- a/branches/aleung/security1/ceph/mon/ClientMonitor.h +++ b/branches/aleung/security1/ceph/mon/ClientMonitor.h @@ -37,7 +37,7 @@ class ClientMonitor : public Dispatcher { private: int num_clients; - map client_map; + map client_map; map user_tickets; void bcast_latest_mds(); @@ -47,7 +47,7 @@ class ClientMonitor : public Dispatcher { void handle_client_boot(class MClientBoot *m); void handle_client_auth_user(class MClientAuthUser *m); - void send_ticket(msg_addr_t dest, const entity_inst_t& inst); + void send_ticket(const entity_inst_t& inst); public: ClientMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l), diff --git a/branches/aleung/security1/ceph/mon/Elector.cc b/branches/aleung/security1/ceph/mon/Elector.cc index 5b793777ab3dd..d3098ba065a47 100644 --- a/branches/aleung/security1/ceph/mon/Elector.cc +++ b/branches/aleung/security1/ceph/mon/Elector.cc @@ -1,227 +1,216 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + 
* Foundation. See file COPYING. + * + */ #include "Elector.h" #include "Monitor.h" #include "common/Timer.h" -#include "messages/MMonElectionRefresh.h" -#include "messages/MMonElectionStatus.h" +#include "messages/MMonElectionPropose.h" #include "messages/MMonElectionAck.h" -#include "messages/MMonElectionCollect.h" +#include "messages/MMonElectionVictory.h" #include "config.h" #undef dout -#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << "mon" << whoami << " " -#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << "mon" << whoami << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector " +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".elector " - -class C_Elect_ReadTimer : public Context { - Elector *mon; -public: - C_Elect_ReadTimer(Elector *m) : mon(m){} - void finish(int r) { - mon->read_timer(); - } -}; - -void Elector::read_timer() +void Elector::start() { - lock.Lock(); - { - read_num++; - status_msg_count = 0; - old_views = views; // TODO deep copy - for (unsigned i=0; imessenger->send_message(new MMonElectionCollect(read_num), - MSG_ADDR_MON(processes[i]), - mon->monmap->get_inst(processes[i])); - } - } - lock.Unlock(); -}; + dout(5) << "start -- can i be leader?" 
<< endl; -class C_Elect_TripTimer : public Context { - Elector *mon; -public: - C_Elect_TripTimer(Elector *m) : mon(m){} - void finish(int r) { - mon->trip_timer(); + leader_acked = -1; + + // start by trying to elect me + start_stamp = g_clock.now(); + acked_me.clear(); + acked_me.insert(whoami); + electing_me = true; + + // bcast to everyone else + for (int i=0; imonmap->num_mon; ++i) { + if (i == whoami) continue; + mon->messenger->send_message(new MMonElectionPropose, + mon->monmap->get_inst(i)); } -}; + + reset_timer(); +} -void Elector::trip_timer() +void Elector::defer(int who) { - lock.Lock(); - { - views[whoami].expired = true; - registry[whoami].epoch.s_num++; - dout(1) << "Process " << whoami - << " timed out (" << ack_msg_count << "/" << (f + 1) - << ") ... increasing epoch. Now epoch is " - << registry[whoami].epoch.s_num - << endl; + dout(5) << "defer to " << who << endl; + + if (electing_me) { + acked_me.clear(); + electing_me = false; } - lock.Unlock(); -}; + // ack them + leader_acked = who; + ack_stamp = g_clock.now(); + mon->messenger->send_message(new MMonElectionAck, + mon->monmap->get_inst(who)); + + // set a timer + reset_timer(1.0); // give the leader some extra time to declare victory +} -class C_Elect_RefreshTimer : public Context { - Elector *mon; +class C_Mon_ElectionExpire : public Context { + Elector *elector; public: - C_Elect_RefreshTimer(Elector *m) : mon(m) {} + C_Mon_ElectionExpire(Elector *e) : elector(e) { } void finish(int r) { - mon->refresh_timer(); + elector->expire(); } }; -void Elector::refresh_timer() +void Elector::reset_timer(double plus) { - lock.Lock(); - { - ack_msg_count = 0; - refresh_num++; - MMonElectionRefresh *msg = new MMonElectionRefresh(whoami, registry[whoami], refresh_num); - for (unsigned i=0; imessenger->send_message(msg, MSG_ADDR_MON(processes[i]), mon->monmap->get_inst(processes[i])); - } - - // Start the trip timer - //round_trip_timer = new C_Elect_TripTimer(this); - 
g_timer.add_event_after(trip_delta, new C_Elect_TripTimer(this)); - } - lock.Unlock(); -}; + // set the timer + cancel_timer(); + expire_event = new C_Mon_ElectionExpire(this); + g_timer.add_event_after(g_conf.mon_lease + plus, + expire_event); +} +void Elector::cancel_timer() +{ + if (expire_event) + g_timer.cancel_event(expire_event); +} -////////////////////////// +void Elector::expire() +{ + dout(5) << "election timer expired" << endl; + + // did i win? + if (electing_me && + acked_me.size() > (unsigned)(mon->monmap->num_mon / 2)) { + // i win + victory(); + } else { + // whoever i deferred to didn't declare victory quickly enough. + start(); + } +} -Elector::Epoch Elector::get_min_epoch() +void Elector::victory() { - assert(!views.empty()); - Epoch min = views[0].state.epoch; - for (unsigned i=1; imonmap->num_mon; ++i) { + if (i == whoami) continue; + mon->messenger->send_message(new MMonElectionVictory, + mon->monmap->get_inst(i)); } - return min; + + // tell monitor + mon->win_election(acked_me); } -void Elector::dispatch(Message *m) +void Elector::handle_propose(MMonElectionPropose *m) { - lock.Lock(); - { - switch (m->get_type()) { - case MSG_MON_ELECTION_ACK: - handle_ack((MMonElectionAck*)m); - break; - - case MSG_MON_ELECTION_STATUS: - handle_status((MMonElectionStatus*)m); - break; - - case MSG_MON_ELECTION_COLLECT: - handle_collect((MMonElectionCollect*)m); - break; - - case MSG_MON_ELECTION_REFRESH: - handle_refresh((MMonElectionRefresh*)m); - break; - - default: - assert(0); + dout(5) << "handle_propose from " << m->get_source() << endl; + int from = m->get_source().num(); + + if (from > whoami) { + // wait, i should win! + if (!electing_me) + start(); + } else { + // they would win over me + if (leader_acked < 0 || // haven't acked anyone yet, or + leader_acked > from || // they would win over who you did ack, or + leader_acked == from) { // this is the guy we're already deferring to + defer(from); + } else { + // ignore them! 
+ dout(5) << "no, we already acked " << leader_acked << endl; } } - lock.Unlock(); + + delete m; } - -void Elector::handle_ack(MMonElectionAck* msg) + +void Elector::handle_ack(MMonElectionAck *m) { - assert(refresh_num >= msg->refresh_num); + dout(5) << "handle_ack from " << m->get_source() << endl; + int from = m->get_source().num(); - if (refresh_num > msg->refresh_num) { - // we got the message too late... discard it - return; - } - ack_msg_count++; - if (ack_msg_count >= f + 1) { - dout(5) << "Received _f+1 acks, increase freshness" << endl; - //g_timer.cancel_event(round_trip_task); - //round_trip_timer->cancel(); - registry[whoami].freshness++; + if (electing_me) { + // thanks + acked_me.insert(from); + dout(5) << " so far i have " << acked_me << endl; + + // is that _everyone_? + if (acked_me.size() == (unsigned)mon->monmap->num_mon) { + // if yes, shortcut to election finish + victory(); + } + } else { + // ignore, i'm deferring already. } - delete msg; + delete m; } -void Elector::handle_collect(MMonElectionCollect* msg) +void Elector::handle_victory(MMonElectionVictory *m) { - mon->messenger->send_message(new MMonElectionStatus(msg->get_source().num(), - msg->read_num, - registry), - msg->get_source(), - mon->monmap->get_inst(msg->get_source().num())); - delete msg; + dout(5) << "handle_victory from " << m->get_source() << endl; + int from = m->get_source().num(); + + if (from < whoami) { + // ok, fine, they win + mon->lose_election(from); + + // cancel my timer + cancel_timer(); + } else { + // no, that makes no sense, i should win. start over! 
+ start(); + } } -void Elector::handle_refresh(MMonElectionRefresh* msg) -{ - if (registry[msg->p] < msg->state) { - // update local data - registry[msg->p] = msg->state; - - // reply to msg - mon->messenger->send_message(new MMonElectionAck(msg->p, - msg->refresh_num), - msg->get_source(), - mon->monmap->get_inst(msg->get_source().num())); - } - delete msg; -} -void Elector::handle_status(MMonElectionStatus* msg) +void Elector::dispatch(Message *m) { - if (read_num != msg->read_num) { - dout(1) << "handle_status " - << ":DISCARDED B/C OF READNUM(" << read_num << ":" - << msg->read_num << ")" - << endl; - return; - } - for (unsigned i=0; iregistry[r] > views[r].state ) { - views[r].state = msg->registry[r]; - } - } - - status_msg_count++; - if (status_msg_count >= (int)processes.size() - f) { // Responses from quorum collected - for (unsigned i=0; i old_views[r].state )) { - dout(5) << ":Other process (" << r << ") has expired" << endl; - views[r].expired = true; - } - if (views[r].state.epoch > old_views[r].state.epoch) { - views[r].expired = false; - } - } - Epoch leader_epoch = get_min_epoch(); - leader_id = leader_epoch.p_id; - dout(1) << " thinks leader has ID: " << leader_id << endl; + switch (m->get_type()) { + case MSG_MON_ELECTION_ACK: + handle_ack((MMonElectionAck*)m); + break; + + case MSG_MON_ELECTION_PROPOSE: + handle_propose((MMonElectionPropose*)m); + break; + + case MSG_MON_ELECTION_VICTORY: + handle_victory((MMonElectionVictory*)m); + break; - // Restarts the timer for the next iteration - g_timer.add_event_after(main_delta + trip_delta, new C_Elect_ReadTimer(this)); + default: + assert(0); } } diff --git a/branches/aleung/security1/ceph/mon/Elector.h b/branches/aleung/security1/ceph/mon/Elector.h index 7ec3a40a59130..67ed59945c46b 100644 --- a/branches/aleung/security1/ceph/mon/Elector.h +++ b/branches/aleung/security1/ceph/mon/Elector.h @@ -21,143 +21,52 @@ using namespace std; #include "include/types.h" #include "msg/Message.h" +#include 
"include/Context.h" + +#include "common/Timer.h" class Monitor; class Elector { - public: - - //// sub-classes - - // Epoch - class Epoch { - public: - int p_id; - int s_num; - - Epoch(int p_id=0, int s_num=0) { - this->p_id = p_id; - this->s_num = s_num; - } - }; - - - // State - class State { - public: - Epoch epoch; - int freshness; - - State() : freshness(0) {}; - State(Epoch& e, int f) : - epoch(e), freshness(f) {} - }; - - - class View { - public: - State state; - bool expired; - View() : expired(false) {} - View(State& s, bool e) : state(s), expired(e) {} - }; - - - /////////////// private: Monitor *mon; int whoami; - Mutex lock; - // used during refresh phase - int ack_msg_count; - int refresh_num; - - // used during read phase - int read_num; - int status_msg_count; - - // the leader process id - int leader_id; - // f-accessible - int f; - - // the processes that compose the group - vector processes; - // parameters for the process - int main_delta; - int trip_delta; - - // state variables - map registry; - map views; - map old_views; + Context *expire_event; - // get the minimum epoch in the view map - Epoch get_min_epoch(); + void reset_timer(double plus=0.0); + void cancel_timer(); + + // electing me + bool electing_me; + utime_t start_stamp; + set acked_me; + + // electing them + int leader_acked; // who i've acked + utime_t ack_stamp; // and when - // handlers for election messages + public: + + void start(); // start an electing me + void defer(int who); + void expire(); // timer goes off + void victory(); + + void handle_propose(class MMonElectionPropose *m); void handle_ack(class MMonElectionAck *m); - void handle_collect(class MMonElectionCollect *m); - void handle_refresh(class MMonElectionRefresh *m); - void handle_status(class MMonElectionStatus *m); + void handle_victory(class MMonElectionVictory *m); + public: Elector(Monitor *m, int w) : mon(m), whoami(w) { // initialize all those values! // ... 
} - // timer methods - void read_timer(); - void trip_timer(); - void refresh_timer(); - void dispatch(Message *m); - }; -inline bool operator>(const Elector::Epoch& l, const Elector::Epoch& r) { - if (l.s_num == r.s_num) - return (l.p_id > r.p_id); - else - return (l.s_num > r.s_num); -} - -inline bool operator<(const Elector::Epoch& l, const Elector::Epoch& r) { - if (l.s_num == r.s_num) - return (l.p_id < r.p_id); - else - return (l.s_num < r.s_num); -} - -inline bool operator==(const Elector::Epoch& l, const Elector::Epoch& r) { - return ((l.s_num == r.s_num) && (l.p_id > r.p_id)); -} - - -inline bool operator>(const Elector::State& l, const Elector::State& r) -{ - if (l.epoch == r.epoch) - return (l.freshness > r.freshness); - else - return l.epoch > r.epoch; -} - -inline bool operator<(const Elector::State& l, const Elector::State& r) -{ - if (l.epoch == r.epoch) - return (l.freshness < r.freshness); - else - return l.epoch < r.epoch; -} - -inline bool operator==(const Elector::State& l, const Elector::State& r) -{ - return ( (l.epoch == r.epoch) && (l.freshness == r.freshness) ); -} - - #endif diff --git a/branches/aleung/security1/ceph/mon/MDSMonitor.cc b/branches/aleung/security1/ceph/mon/MDSMonitor.cc index a31d264b529c4..24beadf85e9f0 100644 --- a/branches/aleung/security1/ceph/mon/MDSMonitor.cc +++ b/branches/aleung/security1/ceph/mon/MDSMonitor.cc @@ -14,11 +14,11 @@ #include "MDSMonitor.h" #include "Monitor.h" +#include "MonitorStore.h" -#include "messages/MMDSBoot.h" #include "messages/MMDSMap.h" #include "messages/MMDSGetMap.h" -//#include "messages/MMDSFailure.h" +#include "messages/MMDSBeacon.h" #include "common/Timer.h" @@ -31,95 +31,220 @@ /********* MDS map **************/ -void MDSMonitor::create_initial() -{ - mdsmap.epoch = 0; // until everyone boots - mdsmap.ctime = g_clock.now(); - for (int i=0; iget_type()) { - case MSG_MDS_BOOT: - handle_mds_boot((MMDSBoot*)m); + case MSG_MDS_BEACON: + handle_mds_beacon((MMDSBeacon*)m); break; case 
MSG_MDS_GETMAP: handle_mds_getmap((MMDSGetMap*)m); break; - /* - case MSG_MDS_FAILURE: - handle_mds_failure((MMDSFailure*)m); - break; - */ - - case MSG_SHUTDOWN: - handle_mds_shutdown(m); - break; - default: assert(0); } } -void MDSMonitor::handle_mds_boot(MMDSBoot *m) + + +void MDSMonitor::election_finished() { - dout(7) << "mds_boot from " << m->get_source() << " at " << m->get_source_inst() << endl; - assert(m->get_source().is_mds()); - int from = m->get_source().num(); + if (mon->is_leader()) { - // choose an MDS id - if (from < 0 || !mdsmap.is_down(from)) { - for (from=0; ; ++from) - if (mdsmap.is_down(from)) break; - dout(10) << "mds_boot assigned mds" << from << endl; - } - - if (mdsmap.get_epoch() == 0) { - // waiting for boot! - mdsmap.mds_inst[from] = m->get_source_inst(); - mdsmap.down_mds.erase(from); - - if ((int)mdsmap.mds_inst.size() == mdsmap.get_num_mds()) { - mdsmap.inc_epoch(); - dout(-7) << "mds_boot all MDSs booted." << endl; - mdsmap.encode(maps[mdsmap.get_epoch()]); // 1 - - bcast_latest_mds(); - send_current(); + // FIXME be smarter later. + + if (g_conf.mkfs) { + create_initial(); + save_map(); } else { - dout(7) << "mds_boot waiting for " - << (mdsmap.get_num_mds() - mdsmap.mds_inst.size()) - << " mdss to boot" << endl; + load_map(); } - return; - } else { - dout(0) << "mds_boot everyone already booted, so who is this? write me." 
<< endl; - assert(0); } } -void MDSMonitor::handle_mds_shutdown(Message *m) + +void MDSMonitor::create_initial() { - assert(m->get_source().is_mds()); - int from = m->get_source().num(); + mdsmap.epoch = 0; // until everyone boots + mdsmap.ctime = g_clock.now(); + + mdsmap.encode(encoded_map); + + print_map(); +} + +void MDSMonitor::load_map() +{ + int r = mon->store->get_bl_ss(encoded_map, "mdsmap", "current"); + assert(r > 0); + mdsmap.decode(encoded_map); + dout(7) << "load_map epoch " << mdsmap.get_epoch() << endl; +} - mdsmap.mds_inst.erase(from); - mdsmap.all_mds.erase(from); +void MDSMonitor::save_map() +{ + dout(7) << "save_map epoch " << mdsmap.get_epoch() << endl; + + int r = mon->store->put_bl_ss(encoded_map, "mdsmap", "current"); + assert(r>=0); +} - dout(7) << "mds_shutdown from " << m->get_source() - << ", still have " << mdsmap.all_mds +void MDSMonitor::print_map() +{ + dout(7) << "print_map epoch " << mdsmap.get_epoch() << endl; + entity_inst_t blank; + set all; + mdsmap.get_mds_set(all); + for (set::iterator p = all.begin(); + p != all.end(); + ++p) { + dout(7) << " mds" << *p << "." << mdsmap.mds_inc[*p] + << " : " << MDSMap::get_state_name(mdsmap.get_state(*p)) + << " : " << (mdsmap.have_inst(*p) ? mdsmap.get_inst(*p) : blank) + << endl; + } +} + + +void MDSMonitor::issue_map() +{ + mdsmap.inc_epoch(); + encoded_map.clear(); + mdsmap.encode(encoded_map); + + dout(7) << "issue_map epoch " << mdsmap.get_epoch() << endl; + + save_map(); + print_map(); + + // bcast map + bcast_latest_mds(); + send_current(); +} + + +void MDSMonitor::handle_mds_beacon(MMDSBeacon *m) +{ + dout(7) << "mds_beacon " << *m + << " from " << m->get_source() + << " " << m->get_source_inst() << endl; + int from = m->get_source().num(); + int state = m->get_state(); + version_t seq = m->get_seq(); + + // initial boot? + bool booted = false; - // tell someone? - // fixme + // choose an MDS id + if (from >= 0) { + // wants to be (or already is) a specific MDS. 
+ if (mdsmap.is_down(from)) { + dout(10) << "mds_beacon assigning requested mds" << from << endl; + booted = true; + } else if (mdsmap.get_inst(from) != m->get_source_inst()) { + dout(10) << "mds_beacon not assigning requested mds" << from + << ", that mds is up and someone else" << endl; + from = -1; + } + } + if (from < 0) { + // pick a failed mds? + set failed; + mdsmap.get_failed_mds_set(failed); + if (!failed.empty()) { + from = *failed.begin(); + dout(10) << "mds_beacon assigned failed mds" << from << endl; + booted = true; + } + } + if (from < 0) { + // ok, just pick any unused mds id. + for (from=0; ; ++from) { + if (mdsmap.is_dne(from) || + mdsmap.is_out(from)) { + dout(10) << "mds_beacon assigned out|dne mds" << from << endl; + booted = true; + break; + } + } + } + + // old beacon? + if (mdsmap.mds_state_seq[from] > seq) { + dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << endl; + delete m; + return; + } + + // reply to beacon? + if (state != MDSMap::STATE_OUT) { + last_beacon[from] = g_clock.now(); // note time + messenger->send_message(new MMDSBeacon(state, seq), + m->get_source_inst()); + } + + + // make sure it's in the map + if (booted) { + mdsmap.mds_inst[from].addr = m->get_source_addr(); + mdsmap.mds_inst[from].name = MSG_ADDR_MDS(from); + mdsmap.mds_inc[from]++; + + // starting -> creating|starting|replay + if (mdsmap.is_degraded() && + !mdsmap.is_failed(from)) { + dout(10) << "mds_beacon currently degraded, mds" << from << " will be standby" << endl; + state = MDSMap::STATE_STANDBY; + } + else if (state == MDSMap::STATE_STARTING) { + if (mdsmap.is_failed(from)) { + dout(10) << "mds_beacon will recover mds" << from << endl; + state = MDSMap::STATE_REPLAY; + } + else if (mdsmap.is_out(from)) { + dout(10) << "mds_beacon will start mds" << from << endl; + state = MDSMap::STATE_STARTING; + } + else { + dout(10) << "mds_beacon will create mds" << from << endl; + state = MDSMap::STATE_CREATING; + } + } + } + + // if creating -> active, go 
to standby instead + if (state == MDSMap::STATE_ACTIVE && mdsmap.is_creating(from)) { + mdsmap.mds_created.insert(from); + dout(10) << "mds_beacon created mds" << from << endl; + + if (mdsmap.is_degraded()) { + dout(10) << "mds_beacon current degraded, marking mds" << from << " as standby" << endl; + state = MDSMap::STATE_STANDBY; + } + } + + + // did we update the map? + if (mdsmap.mds_state.count(from) == 0 || + mdsmap.mds_state[from] != state) { + // update mds state + dout(10) << "mds_beacon mds" << from << " " << MDSMap::get_state_name(mdsmap.mds_state[from]) + << " -> " << MDSMap::get_state_name(state) + << endl; + mdsmap.mds_state[from] = state; + if (mdsmap.is_up(from)) + mdsmap.mds_state_seq[from] = seq; + else + mdsmap.mds_state_seq.erase(from); + + issue_map(); + } + delete m; } @@ -128,9 +253,9 @@ void MDSMonitor::handle_mds_getmap(MMDSGetMap *m) { dout(7) << "mds_getmap from " << m->get_source() << " " << m->get_source_inst() << endl; if (mdsmap.get_epoch() > 0) - send_full(m->get_source(), m->get_source_inst()); + send_full(m->get_source_inst()); else - awaiting_map[m->get_source()] = m->get_source_inst(); + awaiting_map.push_back( m->get_source_inst() ); } @@ -139,35 +264,107 @@ void MDSMonitor::bcast_latest_mds() dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl; // tell mds - for (set::iterator p = mdsmap.get_mds().begin(); - p != mdsmap.get_mds().end(); - p++) { - if (mdsmap.is_down(*p)) continue; - send_full(MSG_ADDR_MDS(*p), mdsmap.get_inst(*p)); - } + set up; + mdsmap.get_up_mds_set(up); + for (set::iterator p = up.begin(); + p != up.end(); + p++) + send_full(mdsmap.get_inst(*p)); } -void MDSMonitor::send_full(msg_addr_t dest, const entity_inst_t& inst) +void MDSMonitor::send_full(entity_inst_t dest) { - dout(11) << "send_full to " << dest << " inst " << inst << endl; - messenger->send_message(new MMDSMap(&mdsmap), dest, inst); + dout(11) << "send_full to " << dest << endl; + messenger->send_message(new MMDSMap(&mdsmap), dest); } 
void MDSMonitor::send_current() { dout(10) << "mds_send_current " << mdsmap.get_epoch() << endl; - for (map::iterator i = awaiting_map.begin(); + for (list::iterator i = awaiting_map.begin(); i != awaiting_map.end(); i++) - send_full(i->first, i->second); + send_full(*i); awaiting_map.clear(); } -void MDSMonitor::send_latest(msg_addr_t dest, const entity_inst_t& inst) +void MDSMonitor::send_latest(entity_inst_t dest) { // FIXME: check if we're locked, etc. if (mdsmap.get_epoch() > 0) - send_full(dest, inst); + send_full(dest); else - awaiting_map[dest] = inst; + awaiting_map.push_back(dest); +} + + +void MDSMonitor::tick() +{ + // make sure mds's are still alive + utime_t now = g_clock.now(); + if (now > g_conf.mds_beacon_grace) { + utime_t cutoff = now; + cutoff -= g_conf.mds_beacon_grace; + + bool changed = false; + + set up; + mdsmap.get_up_mds_set(up); + + for (set::iterator p = up.begin(); + p != up.end(); + ++p) { + if (last_beacon.count(*p)) { + if (last_beacon[*p] < cutoff) { + + // failure! 
+ int newstate; + switch (mdsmap.get_state(*p)) { + case MDSMap::STATE_CREATING: + // didn't finish creating + newstate = MDSMap::STATE_DNE; + break; + + case MDSMap::STATE_STANDBY: + if (mdsmap.has_created(*p)) + newstate = MDSMap::STATE_OUT; + else + newstate = MDSMap::STATE_DNE; + break; + + case MDSMap::STATE_REPLAY: + case MDSMap::STATE_REJOIN: + case MDSMap::STATE_ACTIVE: + case MDSMap::STATE_STOPPING: + newstate = MDSMap::STATE_FAILED; + break; + + case MDSMap::STATE_STARTING: + case MDSMap::STATE_STOPPED: + newstate = MDSMap::STATE_OUT; + break; + + default: + assert(0); + } + + dout(10) << "no beacon from mds" << *p << " since " << last_beacon[*p] + << ", marking " << mdsmap.get_state_name(newstate) + << endl; + + // update map + mdsmap.mds_state[*p] = newstate; + mdsmap.mds_state_seq.erase(*p); + changed = true; + } + } else { + dout(10) << "no beacons from mds" << *p << ", assuming one " << now << endl; + last_beacon[*p] = now; + } + } + + if (changed) { + issue_map(); + } + } } diff --git a/branches/aleung/security1/ceph/mon/MDSMonitor.h b/branches/aleung/security1/ceph/mon/MDSMonitor.h index 58cb8912f0bf6..c3bc3d165883c 100644 --- a/branches/aleung/security1/ceph/mon/MDSMonitor.h +++ b/branches/aleung/security1/ceph/mon/MDSMonitor.h @@ -35,39 +35,52 @@ class MDSMonitor : public Dispatcher { MDSMap mdsmap; private: - map maps; + bufferlist encoded_map; //map inc_maps; //MDSMap::Incremental pending_inc; - map awaiting_map; - + list awaiting_map; + + // beacons + map last_beacon; + + bool is_alive(int mds); + // maps void create_initial(); void send_current(); // send current map to waiters. - void send_full(msg_addr_t dest, const entity_inst_t& inst); + void send_full(entity_inst_t dest); void bcast_latest_mds(); + void issue_map(); + + void save_map(); + void load_map(); + void print_map(); + //void accept_pending(); // accept pending, new map. 
//void send_incremental(epoch_t since, msg_addr_t dest); - void handle_mds_boot(class MMDSBoot *m); - void handle_mds_failure(class MMDSFailure *m); + void handle_mds_state(class MMDSState *m); + void handle_mds_beacon(class MMDSBeacon *m); + //void handle_mds_failure(class MMDSFailure *m); void handle_mds_getmap(class MMDSGetMap *m); - void handle_mds_shutdown(Message *m); public: MDSMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l) { - create_initial(); } void dispatch(Message *m); void tick(); // check state, take actions - void send_latest(msg_addr_t dest, const entity_inst_t& inst); + void election_starting(); + void election_finished(); + + void send_latest(entity_inst_t dest); }; diff --git a/branches/aleung/security1/ceph/mon/MonMap.h b/branches/aleung/security1/ceph/mon/MonMap.h index 37dd983cec04e..6f4d9117bf020 100644 --- a/branches/aleung/security1/ceph/mon/MonMap.h +++ b/branches/aleung/security1/ceph/mon/MonMap.h @@ -33,18 +33,17 @@ class MonMap { esignPub pub_key; bool keyConvert; - // no key supplied, so create one - MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) { + MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {} + + void generate_key_pair(string& private_key) { esignPriv tempKey = esignPrivKey("crypto/esig1536.dat"); pub_key = esignPubKey(tempKey); pub_str_key = pubToString(pub_key); // now throw away the private key keyConvert = false; + assert(0); // FIXME } - // the map constructor when I have a public key - MonMap(int s,string key) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1), - pub_str_key(key), keyConvert(false) { - } + void add_mon(entity_inst_t inst) { mon_inst.push_back(inst); @@ -60,7 +59,7 @@ class MonMap { return last_mon; } - const entity_inst_t get_inst(int m) { + const entity_inst_t &get_inst(int m) { assert(m < num_mon); return mon_inst[m]; } diff --git a/branches/aleung/security1/ceph/mon/Monitor.cc b/branches/aleung/security1/ceph/mon/Monitor.cc index 
8e2abf57524cc..a9dd22f2e648c 100644 --- a/branches/aleung/security1/ceph/mon/Monitor.cc +++ b/branches/aleung/security1/ceph/mon/Monitor.cc @@ -17,7 +17,7 @@ #include "osd/OSDMap.h" -#include "ebofs/Ebofs.h" +#include "MonitorStore.h" #include "msg/Message.h" #include "msg/Messenger.h" @@ -26,6 +26,8 @@ #include "messages/MPingAck.h" #include "messages/MGenericMessage.h" +#include "messages/MMonPaxos.h" + #include "common/Timer.h" #include "common/Clock.h" @@ -39,20 +41,51 @@ #define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " +void Monitor::set_new_private_key(string& pk) +{ + dout(10) << "set_new_private_key" << endl; + + // FIXME. + assert(0); +} void Monitor::init() { + lock.Lock(); + dout(1) << "init" << endl; // store char s[80]; - sprintf(s, "dev/mon%d", whoami); - store = new Ebofs(s); + sprintf(s, "mondata/mon%d", whoami); + store = new MonitorStore(s); - if (g_conf.mkfs) + if (g_conf.mkfs) { store->mkfs(); - int r = store->mount(); - assert(r >= 0); + + // i should have already been provided a key via set_new_private_key(). + // save it. + // FIXME. + bufferlist bl; + //bl.append(myPrivKey.c_str(), myPrivKey.length()); + store->put_bl_ss(bl, "private_key", 0); + assert(0); + } + else { + store->mount(); + + // load private key + // FIXME. + bufferlist bl; + store->get_bl_ss(bl, "private_key", 0); + //myPrivKey = bl.c_str(); + + // der? + myPrivKey = esignPrivKey("crypto/esig1536.dat"); + myPubKey = esignPubKey(myPrivKey); + + assert(0); + } // create osdmon = new OSDMonitor(this, messenger, lock); @@ -64,18 +97,29 @@ void Monitor::init() // start ticker reset_tick(); + + // call election? + if (monmap->num_mon > 1) { + assert(monmap->num_mon != 2); + call_election(); + } else { + // we're standalone. 
+ set q; + q.insert(whoami); + win_election(q); + } + + lock.Unlock(); } void Monitor::shutdown() { dout(1) << "shutdown" << endl; + // cancel all events cancel_tick(); - - if (store) { - store->umount(); - delete store; - } + timer.cancel_all(); + timer.join(); // stop osds. for (set::iterator it = osdmon->osdmap.get_osds().begin(); @@ -84,15 +128,20 @@ void Monitor::shutdown() if (osdmon->osdmap.is_down(*it)) continue; dout(10) << "sending shutdown to osd" << *it << endl; messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - MSG_ADDR_OSD(*it), osdmon->osdmap.get_inst(*it)); + osdmon->osdmap.get_inst(*it)); } + osdmon->mark_all_down(); // monitors too. for (int i=0; inum_mon; i++) if (i != whoami) messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), - MSG_ADDR_MON(i), monmap->get_inst(i)); + monmap->get_inst(i)); + // unmount my local storage + if (store) + delete store; + // clean up if (monmap) delete monmap; if (osdmon) delete osdmon; @@ -107,14 +156,38 @@ void Monitor::shutdown() void Monitor::call_election() { + if (monmap->num_mon == 1) return; + dout(10) << "call_election" << endl; state = STATE_STARTING; + elector.start(); + osdmon->election_starting(); //mdsmon->election_starting(); } +void Monitor::win_election(set& active) +{ + state = STATE_LEADER; + leader = whoami; + quorum = active; + dout(10) << "win_election, quorum is " << quorum << endl; + // init + osdmon->election_finished(); + mdsmon->election_finished(); + + // init paxos + test_paxos.leader_start(); +} + +void Monitor::lose_election(int l) +{ + state = STATE_PEON; + leader = l; + dout(10) << "lose_election, leader is mon" << leader << endl; +} @@ -130,14 +203,8 @@ void Monitor::dispatch(Message *m) break; case MSG_SHUTDOWN: - if (m->get_source().is_mds()) { - mdsmon->dispatch(m); - if (mdsmon->mdsmap.get_num_mds() == 0) - shutdown(); - } - else if (m->get_source().is_osd()) { - osdmon->dispatch(m); - } + assert(m->get_source().is_osd()); + osdmon->dispatch(m); break; @@ -152,9 
+219,15 @@ void Monitor::dispatch(Message *m) // MDSs - case MSG_MDS_BOOT: + case MSG_MDS_BEACON: case MSG_MDS_GETMAP: mdsmon->dispatch(m); + + // hackish: did all mds's shut down? + if (g_conf.mon_stop_with_last_mds && + mdsmon->mdsmap.get_num_up_or_failed_mds() == 0) + shutdown(); + break; // clients @@ -164,11 +237,25 @@ void Monitor::dispatch(Message *m) break; + // paxos + case MSG_MON_PAXOS: + // send it to the right paxos instance + switch (((MMonPaxos*)m)->machine_id) { + case PAXOS_TEST: + test_paxos.dispatch(m); + break; + case PAXOS_OSDMAP: + //... + + default: + assert(0); + } + break; + // elector messages + case MSG_MON_ELECTION_PROPOSE: case MSG_MON_ELECTION_ACK: - case MSG_MON_ELECTION_STATUS: - case MSG_MON_ELECTION_COLLECT: - case MSG_MON_ELECTION_REFRESH: + case MSG_MON_ELECTION_VICTORY: elector.dispatch(m); break; @@ -200,65 +287,42 @@ void Monitor::handle_ping_ack(MPingAck *m) -/************ TIMER ***************/ +/************ TICK ***************/ class C_Mon_Tick : public Context { Monitor *mon; public: C_Mon_Tick(Monitor *m) : mon(m) {} void finish(int r) { - mon->tick(this); + mon->tick(); } }; - void Monitor::cancel_tick() { - if (!tick_timer) return; - - if (g_timer.cancel_event(tick_timer)) { - dout(10) << "cancel_tick canceled" << endl; - } else { - // already dispatched! - dout(10) << "cancel_tick timer dispatched, waiting to cancel" << endl; - tick_timer = (Context*)1; // hackish. 
- while (tick_timer) - tick_timer_cond.Wait(lock); - } + if (tick_timer) timer.cancel_event(tick_timer); } void Monitor::reset_tick() { - if (tick_timer) - cancel_tick(); + cancel_tick(); tick_timer = new C_Mon_Tick(this); - g_timer.add_event_after(g_conf.mon_tick_interval, tick_timer); + timer.add_event_after(g_conf.mon_tick_interval, tick_timer); } -void Monitor::tick(Context *timer) +void Monitor::tick() { - lock.Lock(); - { - if (tick_timer != timer) { - dout(10) << "tick - canceled" << endl; - tick_timer = 0; - tick_timer_cond.Signal(); - lock.Unlock(); - return; - } - - tick_timer = 0; - - // ok go. - dout(10) << "tick" << endl; + tick_timer = 0; - osdmon->tick(); - - // next tick! - reset_tick(); - } - lock.Unlock(); + // ok go. + dout(11) << "tick" << endl; + + osdmon->tick(); + mdsmon->tick(); + + // next tick! + reset_tick(); } diff --git a/branches/aleung/security1/ceph/mon/Monitor.h b/branches/aleung/security1/ceph/mon/Monitor.h index b071e39b18a86..47e41af8f3f06 100644 --- a/branches/aleung/security1/ceph/mon/Monitor.h +++ b/branches/aleung/security1/ceph/mon/Monitor.h @@ -18,17 +18,25 @@ #include "include/types.h" #include "msg/Messenger.h" +#include "common/Timer.h" + #include "MonMap.h" #include "Elector.h" +#include "Paxos.h" #include "crypto/CryptoLib.h" using namespace CryptoLib; -class ObjectStore; +class MonitorStore; class OSDMonitor; class MDSMonitor; class ClientMonitor; +#define PAXOS_TEST 0 +#define PAXOS_OSDMAP 1 +#define PAXOS_MDSMAP 2 +#define PAXOS_CLIENTMAP 3 + class Monitor : public Dispatcher { protected: // me @@ -43,14 +51,15 @@ protected: esignPub myPubKey; // timer. 
+ SafeTimer timer; Context *tick_timer; - Cond tick_timer_cond; void cancel_tick(); void reset_tick(); friend class C_Mon_Tick; // my local store - ObjectStore *store; + //ObjectStore *store; + MonitorStore *store; const static int INO_ELECTOR = 1; const static int INO_MON_MAP = 2; @@ -65,12 +74,17 @@ protected: epoch_t mon_epoch; // monitor epoch (election instance) set quorum; // current active set of monitors (if !starting) - void call_election(); + //void call_election(); + + // paxos + Paxos test_paxos; + friend class Paxos; + // monitor state - const static int STATE_STARTING = 0; - const static int STATE_LEADER = 1; - const static int STATE_PEON = 2; + const static int STATE_STARTING = 0; // electing + const static int STATE_LEADER = 1; + const static int STATE_PEON = 2; int state; int leader; // current leader (to best of knowledge) @@ -93,57 +107,39 @@ protected: friend class MDSMonitor; friend class ClientMonitor; + // initiate election + void call_election(); + + // end election (called by Elector) + void win_election(set& q); + void lose_election(int l); + + + public: Monitor(int w, Messenger *m, MonMap *mm) : whoami(w), messenger(m), monmap(mm), - tick_timer(0), + timer(lock), tick_timer(0), store(0), elector(this, w), mon_epoch(0), + + test_paxos(this, w, PAXOS_TEST, "tester"), // tester state machine + state(STATE_STARTING), leader(0), osdmon(0), mdsmon(0), clientmon(0) { - // hack leader, until election works. - if (whoami == 0) - state = STATE_LEADER; - else - state = STATE_PEON; - - // init keys - myPrivKey = esignPrivKey("crypto/esig1536.dat"); - myPubKey = esignPubKey(myPrivKey); - } - Monitor(int w, Messenger *m, MonMap *mm, esignPriv key) : - whoami(w), - messenger(m), - monmap(mm), - myPrivKey(key), - tick_timer(0), - store(0), - elector(this, w), - mon_epoch(0), - state(STATE_STARTING), - leader(0), - osdmon(0), - mdsmon(0) - { - // hack leader, until election works. 
- if (whoami == 0) - state = STATE_LEADER; - else - state = STATE_PEON; - - // init keys - myPubKey = esignPubKey(myPrivKey); } + void set_new_private_key(string& pk); + void init(); void shutdown(); void dispatch(Message *m); - void tick(Context *timer); + void tick(); }; diff --git a/branches/aleung/security1/ceph/mon/MonitorStore.cc b/branches/aleung/security1/ceph/mon/MonitorStore.cc new file mode 100644 index 0000000000000..f93bb2082427b --- /dev/null +++ b/branches/aleung/security1/ceph/mon/MonitorStore.cc @@ -0,0 +1,198 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "MonitorStore.h" +#include "common/Clock.h" + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " store(" << dir <<") " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " store(" << dir <<") " + +#include +#include +#include +#include + + +void MonitorStore::mount() +{ + dout(1) << "mount" << endl; + // verify dir exists + DIR *d = ::opendir(dir.c_str()); + if (!d) { + derr(1) << "basedir " << dir << " dne" << endl; + assert(0); + } + ::closedir(d); +} + + +void MonitorStore::mkfs() +{ + dout(1) << "mkfs" << endl; + + char cmd[200]; + sprintf(cmd, "test -d %s && /bin/rm -r %s ; mkdir -p %s", dir.c_str(), dir.c_str(), dir.c_str()); + dout(1) << cmd << endl; + system(cmd); +} + + +version_t MonitorStore::get_int(const char *a, const char *b) +{ + char fn[200]; + if (b) + sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); + else + sprintf(fn, "%s/%s", dir.c_str(), a); + + FILE *f = ::fopen(fn, "r"); + if (!f) + return 0; + + char buf[20]; + 
::fgets(buf, 20, f); + ::fclose(f); + + version_t val = atoi(buf); + + if (b) { + dout(15) << "get_int " << a << "/" << b << " = " << val << endl; + } else { + dout(15) << "get_int " << a << " = " << val << endl; + } + return val; +} + + +void MonitorStore::put_int(version_t val, const char *a, const char *b) +{ + char fn[200]; + sprintf(fn, "%s/%s", dir.c_str(), a); + if (b) { + ::mkdir(fn, 0755); + dout(15) << "set_int " << a << "/" << b << " = " << val << endl; + sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); + } else { + dout(15) << "set_int " << a << " = " << val << endl; + } + + char vs[30]; + sprintf(vs, "%lld\n", val); + + char tfn[200]; + sprintf(tfn, "%s.new", fn); + + int fd = ::open(tfn, O_WRONLY|O_CREAT); + assert(fd > 0); + ::fchmod(fd, 0644); + ::write(fd, vs, strlen(vs)); + ::close(fd); + ::rename(tfn, fn); +} + + +// ---------------------------------------- +// buffers + +bool MonitorStore::exists_bl_ss(const char *a, const char *b) +{ + char fn[200]; + if (b) { + dout(15) << "exists_bl " << a << "/" << b << endl; + sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); + } else { + dout(15) << "exists_bl " << a << endl; + sprintf(fn, "%s/%s", dir.c_str(), a); + } + + struct stat st; + int r = ::stat(fn, &st); + return r == 0; +} + + +int MonitorStore::get_bl_ss(bufferlist& bl, const char *a, const char *b) +{ + char fn[200]; + if (b) { + sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); + } else { + sprintf(fn, "%s/%s", dir.c_str(), a); + } + + int fd = ::open(fn, O_RDONLY); + if (!fd) { + if (b) { + dout(15) << "get_bl " << a << "/" << b << " DNE" << endl; + } else { + dout(15) << "get_bl " << a << " DNE" << endl; + } + return 0; + } + + // read size + __int32_t len = 0; + ::read(fd, &len, sizeof(len)); + + // read buffer + bl.clear(); + bufferptr bp(len); + ::read(fd, bp.c_str(), len); + bl.append(bp); + ::close(fd); + + if (b) { + dout(15) << "get_bl " << a << "/" << b << " = " << bl.length() << " bytes" << endl; + } else { + dout(15) << "get_bl " << a << " = " << 
bl.length() << " bytes" << endl; + } + + return len; +} + +int MonitorStore::put_bl_ss(bufferlist& bl, const char *a, const char *b) +{ + char fn[200]; + sprintf(fn, "%s/%s", dir.c_str(), a); + if (b) { + ::mkdir(fn, 0755); + dout(15) << "put_bl " << a << "/" << b << " = " << bl.length() << " bytes" << endl; + sprintf(fn, "%s/%s/%s", dir.c_str(), a, b); + } else { + dout(15) << "put_bl " << a << " = " << bl.length() << " bytes" << endl; + } + + char tfn[200]; + sprintf(tfn, "%s.new", fn); + int fd = ::open(tfn, O_WRONLY|O_CREAT); + assert(fd); + + // write size + __int32_t len = bl.length(); + ::write(fd, &len, sizeof(len)); + + // write data + for (list::const_iterator it = bl.buffers().begin(); + it != bl.buffers().end(); + it++) + ::write(fd, it->c_str(), it->length()); + + ::fchmod(fd, 0644); + ::fsync(fd); + ::close(fd); + ::rename(tfn, fn); + + return 0; +} diff --git a/branches/aleung/security1/ceph/mon/MonitorStore.h b/branches/aleung/security1/ceph/mon/MonitorStore.h new file mode 100644 index 0000000000000..f1d5f67ab3473 --- /dev/null +++ b/branches/aleung/security1/ceph/mon/MonitorStore.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MON_MONITORSTORE_H +#define __MON_MONITORSTORE_H + +#include "include/types.h" +#include "include/buffer.h" + +#include + +class MonitorStore { + string dir; + +public: + MonitorStore(char *d) : dir(d) { + } + ~MonitorStore() { + } + + void mkfs(); // wipe + void mount(); + + // ints (stored as ascii) + version_t get_int(const char *a, const char *b=0); + void put_int(version_t v, const char *a, const char *b=0); + + // buffers + // ss and sn varieties. 
+ bool exists_bl_ss(const char *a, const char *b=0); + int get_bl_ss(bufferlist& bl, const char *a, const char *b); + int put_bl_ss(bufferlist& bl, const char *a, const char *b); + bool exists_bl_sn(const char *a, version_t b) { + char bs[20]; + sprintf(bs, "%llu", b); + return exists_bl_ss(a, bs); + } + int get_bl_sn(bufferlist& bl, const char *a, version_t b) { + char bs[20]; + sprintf(bs, "%llu", b); + return get_bl_ss(bl, a, bs); + } + int put_bl_sn(bufferlist& bl, const char *a, version_t b) { + char bs[20]; + sprintf(bs, "%llu", b); + return put_bl_ss(bl, a, bs); + } + + /* + version_t get_incarnation() { return get_int("incarnation"); } + void set_incarnation(version_t i) { set_int(i, "incarnation"); } + + version_t get_last_proposal() { return get_int("last_proposal"); } + void set_last_proposal(version_t i) { set_int(i, "last_proposal"); } + */ +}; + + +#endif diff --git a/branches/aleung/security1/ceph/mon/OSDMonitor.cc b/branches/aleung/security1/ceph/mon/OSDMonitor.cc index 90b45fe84697a..1325ab1730348 100644 --- a/branches/aleung/security1/ceph/mon/OSDMonitor.cc +++ b/branches/aleung/security1/ceph/mon/OSDMonitor.cc @@ -15,7 +15,7 @@ #include "Monitor.h" #include "MDSMonitor.h" -#include "osd/ObjectStore.h" +#include "MonitorStore.h" #include "messages/MOSDFailure.h" #include "messages/MOSDMap.h" @@ -59,7 +59,7 @@ void OSDMonitor::fake_osdmap_update() // tell a random osd int osd = rand() % g_conf.num_osd; send_incremental(osdmap.get_epoch()-1, // ick! FIXME - MSG_ADDR_OSD(osd), osdmap.get_inst(osd)); + osdmap.get_inst(osd)); } @@ -78,19 +78,20 @@ void OSDMonitor::fake_reorg() accept_pending(); // tell him! - send_incremental(osdmap.get_epoch()-1, MSG_ADDR_OSD(r), osdmap.get_inst(r)); + send_incremental(osdmap.get_epoch()-1, osdmap.get_inst(r)); // do it again? 
/* if (g_conf.num_osd - d > 4 && g_conf.num_osd - d > g_conf.num_osd/2) - g_timer.add_event_after(g_conf.fake_osdmap_expand, + mon->timer.add_event_after(g_conf.fake_osdmap_expand, new C_Mon_Faker(this)); */ } +/* void OSDMonitor::init() { // start with blank map @@ -104,15 +105,9 @@ void OSDMonitor::init() // set up pending_inc pending_inc.epoch = osdmap.get_epoch()+1; - - } else { - // FIXME. when elections work! - if (mon->is_leader()) { - create_initial(); - issue_leases(); - } } } +*/ @@ -222,33 +217,31 @@ void OSDMonitor::create_initial() i != g_fake_osd_down.end(); i++) { dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl; - g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1)); + mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1)); } for (map::iterator i = g_fake_osd_out.begin(); i != g_fake_osd_out.end(); i++) { dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl; - g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); + mon->timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); } } bool OSDMonitor::get_map_bl(epoch_t epoch, bufferlist& bl) { - object_t oid(Monitor::INO_OSD_MAP, epoch); - if (!mon->store->exists(oid)) + if (!mon->store->exists_bl_sn("osdmap", epoch)) return false; - int r = mon->store->read(oid, 0, 0, bl); + int r = mon->store->get_bl_sn(bl, "osdmap", epoch); assert(r > 0); return true; } bool OSDMonitor::get_inc_map_bl(epoch_t epoch, bufferlist& bl) { - object_t oid(Monitor::INO_OSD_INC_MAP, epoch); - if (!mon->store->exists(oid)) + if (!mon->store->exists_bl_sn("osdincmap", epoch)) return false; - int r = mon->store->read(oid, 0, 0, bl); + int r = mon->store->get_bl_sn(bl, "osdincmap", epoch); assert(r > 0); return true; } @@ -259,11 +252,8 @@ void OSDMonitor::save_map() bufferlist bl; osdmap.encode(bl); - ObjectStore::Transaction t; - t.write(object_t(Monitor::INO_OSD_MAP,0), 0, 
bl.length(), bl); - t.write(object_t(Monitor::INO_OSD_MAP,osdmap.get_epoch()), 0, bl.length(), bl); - mon->store->apply_transaction(t); - mon->store->sync(); + mon->store->put_bl_sn(bl, "osdmap", osdmap.get_epoch()); + mon->store->put_int(osdmap.get_epoch(), "osd_epoch"); } void OSDMonitor::save_inc_map(OSDMap::Incremental &inc) @@ -274,12 +264,9 @@ void OSDMonitor::save_inc_map(OSDMap::Incremental &inc) bufferlist incbl; inc.encode(incbl); - ObjectStore::Transaction t; - t.write(object_t(Monitor::INO_OSD_MAP,0), 0, bl.length(), bl); - t.write(object_t(Monitor::INO_OSD_MAP,osdmap.get_epoch()), 0, bl.length(), bl); // not strictly needed?? - t.write(object_t(Monitor::INO_OSD_INC_MAP,osdmap.get_epoch()), 0, incbl.length(), incbl); - mon->store->apply_transaction(t); - mon->store->sync(); + mon->store->put_bl_sn(bl, "osdmap", osdmap.get_epoch()); + mon->store->put_bl_sn(incbl, "osdincmap", osdmap.get_epoch()); + mon->store->put_int(osdmap.get_epoch(), "osd_epoch"); } @@ -338,11 +325,11 @@ void OSDMonitor::handle_osd_failure(MOSDFailure *m) // FIXME // take their word for it - int from = m->get_failed().num(); + int from = m->get_failed().name.num(); if (osdmap.is_up(from) && (osdmap.osd_inst.count(from) == 0 || - osdmap.osd_inst[from] == m->get_inst())) { - pending_inc.new_down[from] = m->get_inst(); + osdmap.osd_inst[from] == m->get_failed())) { + pending_inc.new_down[from] = m->get_failed(); if (osdmap.is_in(from)) down_pending_out[from] = g_clock.now(); @@ -351,7 +338,7 @@ void OSDMonitor::handle_osd_failure(MOSDFailure *m) accept_pending(); - send_incremental(m->get_epoch(), m->get_source(), m->get_source_inst()); + send_incremental(m->get_epoch(), m->get_source_inst()); send_waiting(); bcast_latest_mds(); @@ -363,22 +350,33 @@ void OSDMonitor::handle_osd_failure(MOSDFailure *m) void OSDMonitor::fake_osd_failure(int osd, bool down) { - lock.Lock(); - { - if (down) { - dout(1) << "fake_osd_failure DOWN osd" << osd << endl; - pending_inc.new_down[osd] = 
osdmap.osd_inst[osd]; - } else { - dout(1) << "fake_osd_failure OUT osd" << osd << endl; - pending_inc.new_out.push_back(osd); - } - accept_pending(); - bcast_latest_osd(); - bcast_latest_mds(); + if (down) { + dout(1) << "fake_osd_failure DOWN osd" << osd << endl; + pending_inc.new_down[osd] = osdmap.osd_inst[osd]; + } else { + dout(1) << "fake_osd_failure OUT osd" << osd << endl; + pending_inc.new_out.push_back(osd); } - lock.Unlock(); + accept_pending(); + bcast_latest_osd(); + bcast_latest_mds(); } +void OSDMonitor::mark_all_down() +{ + dout(7) << "mark_all_down" << endl; + + for (set::iterator it = osdmap.get_osds().begin(); + it != osdmap.get_osds().end(); + it++) { + if (osdmap.is_down(*it)) continue; + pending_inc.new_down[*it] = osdmap.get_inst(*it); + } + accept_pending(); +} + + + void OSDMonitor::handle_osd_boot(MOSDBoot *m) { @@ -430,7 +428,7 @@ void OSDMonitor::handle_osd_boot(MOSDBoot *m) accept_pending(); // the booting osd will spread word - send_incremental(m->sb.current_epoch, m->get_source(), m->get_source_inst()); + send_incremental(m->sb.current_epoch, m->get_source_inst()); delete m; // tell mds @@ -445,7 +443,7 @@ void OSDMonitor::handle_osd_in(MOSDIn *m) if (osdmap.is_out(from)) pending_inc.new_in.push_back(from); accept_pending(); - send_incremental(m->map_epoch, m->get_source(), m->get_source_inst()); + send_incremental(m->map_epoch, m->get_source_inst()); } void OSDMonitor::handle_osd_out(MOSDOut *m) @@ -455,7 +453,7 @@ void OSDMonitor::handle_osd_out(MOSDOut *m) if (osdmap.is_in(from)) { pending_inc.new_out.push_back(from); accept_pending(); - send_incremental(m->map_epoch, m->get_source(), m->get_source_inst()); + send_incremental(m->map_epoch, m->get_source_inst()); } } @@ -468,7 +466,7 @@ void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) awaiting_map[m->get_source()].second = m->get_since(); } else { //if (m->get_since()) - send_incremental(m->get_since(), m->get_source(), m->get_source_inst()); + send_incremental(m->get_since(), 
m->get_source_inst()); //else //send_full(m->get_source(), m->get_source_inst()); } @@ -497,14 +495,13 @@ void OSDMonitor::accept_pending() i++) { dout(0) << "osd" << i->first << " UP " << i->second << endl; derr(0) << "osd" << i->first << " UP " << i->second << endl; - messenger->mark_up(MSG_ADDR_OSD(i->first), i->second); } for (map::iterator i = pending_inc.new_down.begin(); i != pending_inc.new_down.end(); i++) { dout(0) << "osd" << i->first << " DOWN " << i->second << endl; derr(0) << "osd" << i->first << " DOWN " << i->second << endl; - messenger->mark_down(MSG_ADDR_OSD(i->first), i->second); + messenger->mark_down(i->second.addr); } for (list::iterator i = pending_inc.new_in.begin(); i != pending_inc.new_in.end(); @@ -528,19 +525,19 @@ void OSDMonitor::send_waiting() { dout(10) << "send_waiting " << osdmap.get_epoch() << endl; - for (map >::iterator i = awaiting_map.begin(); + for (map >::iterator i = awaiting_map.begin(); i != awaiting_map.end(); i++) - send_incremental(i->second.second, i->first, i->second.first); + send_incremental(i->second.second, i->second.first); } -void OSDMonitor::send_full(msg_addr_t who, const entity_inst_t& inst) +void OSDMonitor::send_full(entity_inst_t who) { - messenger->send_message(new MOSDMap(&osdmap), who, inst); + messenger->send_message(new MOSDMap(&osdmap), who); } -void OSDMonitor::send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst) +void OSDMonitor::send_incremental(epoch_t since, entity_inst_t dest) { dout(5) << "osd_send_incremental " << since << " -> " << osdmap.get_epoch() << " to " << dest << endl; @@ -564,7 +561,7 @@ void OSDMonitor::send_incremental(epoch_t since, msg_addr_t dest, const entity_i } } - messenger->send_message(m, dest, inst); + messenger->send_message(m, dest); } @@ -575,11 +572,12 @@ void OSDMonitor::bcast_latest_mds() dout(1) << "bcast_latest_mds epoch " << e << endl; // tell mds - for (set::iterator i = mon->mdsmon->mdsmap.get_mds().begin(); - i != 
mon->mdsmon->mdsmap.get_mds().end(); + set up; + mon->mdsmon->mdsmap.get_up_mds_set(up); + for (set::iterator i = up.begin(); + i != up.end(); i++) { - if (mon->mdsmon->mdsmap.is_down(*i)) continue; - send_incremental(osdmap.get_epoch()-1, MSG_ADDR_MDS(*i), mon->mdsmon->mdsmap.get_inst(*i)); + send_incremental(osdmap.get_epoch()-1, mon->mdsmon->mdsmap.get_inst(*i)); } } @@ -596,7 +594,7 @@ void OSDMonitor::bcast_latest_osd() it++) { if (osdmap.is_down(*it)) continue; - send_incremental(osdmap.get_epoch()-1, MSG_ADDR_OSD(*it), osdmap.get_inst(*it)); + send_incremental(osdmap.get_epoch()-1, osdmap.get_inst(*it)); } } @@ -639,10 +637,38 @@ void OSDMonitor::election_starting() void OSDMonitor::election_finished() { - dout(10) << "election_starting" << endl; + dout(10) << "election_finished" << endl; + + if (mon->is_leader()) { + if (g_conf.mkfs) { + create_initial(); + save_map(); + } else { + // + epoch_t epoch = mon->store->get_int("osd_epoch"); + dout(10) << " last epoch was " << epoch << endl; + bufferlist bl, blinc; + int r = mon->store->get_bl_sn(bl, "osdmap", epoch); + assert(r>0); + osdmap.decode(bl); + + // pending_inc + pending_inc.epoch = epoch+1; + } + } + + /* state = STATE_INIT; + // map? + if (osdmap.get_epoch() == 0 && + mon->is_leader()) { + create_initial(); + } + + + if (mon->is_leader()) { // leader. if (mon->monmap->num_mon == 1) { @@ -652,10 +678,10 @@ void OSDMonitor::election_finished() } else if (mon->is_peon()) { // peon. 
send info - messenger->send_message(new MMonOSDMapInfo(osdmap.epoch, osdmap.mon_epoch), - MSG_ADDR_MON(mon->leader), mon->monmap->get_inst(mon->leader)); + //messenger->send_message(new MMonOSDMapInfo(osdmap.epoch, osdmap.mon_epoch), + // mon->monmap->get_inst(mon->leader)); } - + */ } @@ -694,7 +720,7 @@ void OSDMonitor::handle_info(MMonOSDMapInfo *m) // bring up to date if (epoch < osdmap.get_epoch()) - send_incremental(epoch, m->get_source(), m->get_source_inst()); + send_incremental(epoch, m->get_source_inst()); delete m; } @@ -716,7 +742,7 @@ void OSDMonitor::issue_leases() i++) { if (*i == mon->whoami) continue; messenger->send_message(new MMonOSDMapLease(osdmap.get_epoch(), lease_expire), - MSG_ADDR_MON(*i), mon->monmap->get_inst(*i)); + mon->monmap->get_inst(*i)); pending_ack.insert(*i); } } @@ -795,7 +821,7 @@ void OSDMonitor::update_map() if (*i == mon->whoami) continue; messenger->send_message(new MMonOSDMapUpdatePrepare(epoch, map_bl, inc_map_bl), - MSG_ADDR_MON(*i), mon->monmap->get_inst(*i)); + mon->monmap->get_inst(*i)); pending_ack.insert(*i); } } @@ -822,7 +848,7 @@ void OSDMonitor::handle_update_prepare(MMonOSDMapUpdatePrepare *m) // ack messenger->send_message(new MMonOSDMapUpdateAck(osdmap.get_epoch()), - m->get_source(), m->get_source_inst()); + m->get_source_inst()); delete m; } diff --git a/branches/aleung/security1/ceph/mon/OSDMonitor.h b/branches/aleung/security1/ceph/mon/OSDMonitor.h index 9936ecc1ff70e..bf393f17d9f7a 100644 --- a/branches/aleung/security1/ceph/mon/OSDMonitor.h +++ b/branches/aleung/security1/ceph/mon/OSDMonitor.h @@ -36,7 +36,7 @@ public: OSDMap osdmap; private: - map > awaiting_map; + map > awaiting_map; void create_initial(); bool get_map_bl(epoch_t epoch, bufferlist &bl); @@ -60,13 +60,13 @@ private: int state; utime_t lease_expire; // when lease expires - void init(); + //void init(); // maps void accept_pending(); // accept pending, new map. void send_waiting(); // send current map to waiters. 
- void send_full(msg_addr_t dest, const entity_inst_t& inst); - void send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst); + void send_full(entity_inst_t dest); + void send_incremental(epoch_t since, entity_inst_t dest); void bcast_latest_mds(); void bcast_latest_osd(); @@ -89,7 +89,7 @@ private: OSDMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l), state(STATE_SYNC) { - init(); + //init(); } void dispatch(Message *m); @@ -100,6 +100,8 @@ private: void issue_leases(); + void mark_all_down(); + void fake_osd_failure(int osd, bool down); void fake_osdmap_update(); void fake_reorg(); diff --git a/branches/aleung/security1/ceph/mon/Paxos.cc b/branches/aleung/security1/ceph/mon/Paxos.cc new file mode 100644 index 0000000000000..67c4e2e99e179 --- /dev/null +++ b/branches/aleung/security1/ceph/mon/Paxos.cc @@ -0,0 +1,182 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "Paxos.h" +#include "Monitor.h" +#include "MonitorStore.h" + +#include "messages/MMonPaxos.h" + +#include "config.h" +#undef dout +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << ") " +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << ".paxos(" << machine_name << ") " + + +// --------------------------------- +// proposer +void Paxos::propose(version_t v, bufferlist& value) +{ +//todo high rf +} + +void Paxos::handle_last(MMonPaxos *m) +{ +//todo high rf + dout(10) << "handle_last " << *m << endl; + delete m; +} + +void Paxos::handle_accept(MMonPaxos *m) +{ +//todo high rf + dout(10) << "handle_accept " << *m << endl; + delete m; + +} + +void Paxos::handle_ack(MMonPaxos *m) +{ +//todo high rf + dout(10) << "handle_ack " << *m << endl; + delete m; +} + +void Paxos::handle_old_round(MMonPaxos *m) +{ +//todo high rf + dout(10) << "handle_old_round " << *m << endl; + delete m; +} + + +/* + * return a globally unique, monotonically increasing proposal number + */ +version_t Paxos::get_new_proposal_number(version_t gt) +{ + // read last + version_t last = mon->store->get_int("last_paxos_proposal"); + if (last < gt) + last = gt; + + // update + last /= 100; + last++; + + // make it unique among all monitors. + version_t pn = last*100 + (version_t)whoami; + + // write + mon->store->put_int(pn, "last_paxos_proposal"); + + dout(10) << "get_new_proposal_number = " << pn << endl; + return pn; +} + + +// --------------------------------- +// accepter +void Paxos::handle_collect(MMonPaxos *m) +{ +//todo high rf + // ... + + delete m; +} + + + + +// --------------------------------- +// learner +void Paxos::handle_success(MMonPaxos *m) +{ + //todo high rf + delete m; +} + +void Paxos::handle_begin(MMonPaxos *m) +{ + //todo high rf + delete m; +} + +// --------------------------------- + +void Paxos::leader_start() +{ + dout(10) << "i am the leader" << endl; + + // .. do something else too + version_t pn = get_new_proposal_number(); + for (int i=0; imonmap->num_mon; ++i) { + if (i == whoami) continue; + // todo high rf I pass the pn twice... what is the last parameter for? 
+ mon->messenger->send_message(new MMonPaxos(MMonPaxos::OP_COLLECT, whoami, pn, pn), + mon->monmap->get_inst(i)); + } +} + + + +void Paxos::dispatch(Message *m) +{ + switch (m->get_type()) { + + case MSG_MON_PAXOS: + { + MMonPaxos *pm = (MMonPaxos*)m; + + // NOTE: these ops are defined in messages/MMonPaxos.h + switch (pm->op) { + // learner + case MMonPaxos::OP_COLLECT: + handle_collect(pm); + break; + + case MMonPaxos::OP_LAST: + handle_last(pm); + break; + + case MMonPaxos::OP_OLDROUND: + handle_old_round(pm); + break; + + case MMonPaxos::OP_BEGIN: + handle_begin(pm); + break; + + case MMonPaxos::OP_ACCEPT: + handle_accept(pm); + break; + + case MMonPaxos::OP_SUCCESS: + handle_success(pm); + break; + + case MMonPaxos::OP_ACK: + handle_ack(pm); + break; + + default: + assert(0); + } + } + break; + + default: + assert(0); + } +} + diff --git a/branches/aleung/security1/ceph/mon/Paxos.h b/branches/aleung/security1/ceph/mon/Paxos.h new file mode 100644 index 0000000000000..52a509d25aa76 --- /dev/null +++ b/branches/aleung/security1/ceph/mon/Paxos.h @@ -0,0 +1,73 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MON_PAXOS_H +#define __MON_PAXOS_H + +#include "include/types.h" +#include "include/buffer.h" +#include "msg/Message.h" + +#include "include/Context.h" + +#include "common/Timer.h" + +class Monitor; +class MMonPaxos; + +// i am one state machine. 
+class Paxos { + Monitor *mon; + int whoami; + + // my state machine info + int machine_id; + const char *machine_name; + map accepted_values; + map accepted_proposal_number; + + // proposer + void propose(version_t v, bufferlist& value); + + void handle_last(MMonPaxos*); + void handle_accept(MMonPaxos*); + void handle_ack(MMonPaxos*); + void handle_old_round(MMonPaxos*); + + version_t get_new_proposal_number(version_t gt=0); + + // accepter + void handle_collect(MMonPaxos*); + + // learner + void handle_success(MMonPaxos*); + void handle_begin(MMonPaxos*); + + +public: + Paxos(Monitor *m, int w, + int mid,const char *mnm) : mon(m), whoami(w), + machine_id(mid), machine_name(mnm) { + } + + void dispatch(Message *m); + + void leader_start(); + +}; + + + +#endif + diff --git a/branches/aleung/security1/ceph/msg/Dispatcher.h b/branches/aleung/security1/ceph/msg/Dispatcher.h index e6fe8d8da47ce..8b6fe92381427 100644 --- a/branches/aleung/security1/ceph/msg/Dispatcher.h +++ b/branches/aleung/security1/ceph/msg/Dispatcher.h @@ -26,15 +26,8 @@ class Dispatcher { // how i receive messages virtual void dispatch(Message *m) = 0; - // how i deal with transmission failures. 
- virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) { delete m; } - - // lookups - virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst) { assert(0); return 0; } - - // this is how i send messages - //int send_message(Message *m, msg_addr_t dest, int dest_port); + virtual void ms_handle_failure(Message *m, const entity_inst_t& inst) { delete m; } }; #endif diff --git a/branches/aleung/security1/ceph/msg/FakeMessenger.cc b/branches/aleung/security1/ceph/msg/FakeMessenger.cc index 2a89ebb5faeb3..d2db8c8f7e11c 100644 --- a/branches/aleung/security1/ceph/msg/FakeMessenger.cc +++ b/branches/aleung/security1/ceph/msg/FakeMessenger.cc @@ -50,47 +50,24 @@ using namespace __gnu_cxx; int nranks = 0; // this identify each entity_inst_t -map directory; +map directory; hash_map loggers; LogType fakemsg_logtype; -set shutdown_set; +set shutdown_set; Mutex lock; Cond cond; -bool pending_timer = false; - bool awake = false; bool fm_shutdown = false; pthread_t thread_id; -class C_FakeKicker : public Context { - void finish(int r) { - dout(18) << "timer kick" << endl; - pending_timer = true; - lock.Lock(); - cond.Signal(); // why not - lock.Unlock(); - } -}; - -void FakeMessenger::callback_kick() -{ - pending_timer = true; - lock.Lock(); - cond.Signal(); // why not - lock.Unlock(); -} void *fakemessenger_thread(void *ptr) { - //dout(1) << "thread start, setting timer kicker" << endl; - //g_timer.set_messenger_kicker(new C_FakeKicker()); - //msgr_callback_kicker = new C_FakeKicker(); - lock.Lock(); while (1) { dout(20) << "thread waiting" << endl; @@ -107,11 +84,6 @@ void *fakemessenger_thread(void *ptr) } lock.Unlock(); - //cout << "unsetting messenger" << endl; - //g_timer.unset_messenger_kicker(); - //g_timer.unset_messenger(); - //msgr_callback_kicker = 0; - dout(1) << "thread finish (i woke up but no messages, bye)" << endl; return 0; } @@ -164,29 +136,16 @@ int fakemessenger_do_loop_2() dout(18) << "do_loop top" << endl; - /*// timer? 
- if (pending_timer) { - pending_timer = false; - dout(5) << "pending timer" << endl; - g_timer.execute_pending(); - } - */ - - // callbacks - lock.Unlock(); - Messenger::do_callbacks(); - lock.Lock(); - // messages - map::iterator it = directory.begin(); + map::iterator it = directory.begin(); while (it != directory.end()) { FakeMessenger *mgr = it->second; - dout(18) << "messenger " << mgr << " at " << mgr->get_myaddr() << " has " << mgr->num_incoming() << " queued" << endl; + dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has " << mgr->num_incoming() << " queued" << endl; if (!mgr->is_ready()) { - dout(18) << "messenger " << mgr << " at " << mgr->get_myaddr() << " has no dispatcher, skipping" << endl; + dout(18) << "messenger " << mgr << " at " << mgr->get_myname() << " has no dispatcher, skipping" << endl; it++; continue; } @@ -196,10 +155,9 @@ int fakemessenger_do_loop_2() if (m) { //dout(18) << "got " << m << endl; - dout(1) << "---- '" << m->get_type_name() - << "' from " << m->get_source() // << ':' << m->get_source_port() - << " to " << m->get_dest() //<< ':' << m->get_dest_port() - << " ---- " << m + dout(1) << "---- " << m->get_dest() + << " <- " << m->get_source() + << " ---- " << *m << endl; if (g_conf.fakemessenger_serialize) { @@ -228,7 +186,7 @@ int fakemessenger_do_loop_2() // deal with shutdowns.. 
dleayed to avoid concurrent directory modification if (!shutdown_set.empty()) { - for (set::iterator it = shutdown_set.begin(); + for (set::iterator it = shutdown_set.begin(); it != shutdown_set.end(); it++) { dout(7) << "fakemessenger: removing " << *it << " from directory" << endl; @@ -253,25 +211,23 @@ int fakemessenger_do_loop_2() } -FakeMessenger::FakeMessenger(msg_addr_t me) : Messenger(me) +FakeMessenger::FakeMessenger(entity_name_t me) : Messenger(me) { - entity_inst_t fakeinst; lock.Lock(); { // assign rank - fakeinst.addr.sin_port = - fakeinst.rank = nranks++; - set_myinst(fakeinst); + _myinst.name = me; + _myinst.addr.port = nranks++; + //if (!me.is_mon()) + //_myinst.addr.nonce = getpid(); // add to directory - directory[ fakeinst.rank ] = this; + directory[ _myinst.addr ] = this; } lock.Unlock(); - cout << "fakemessenger " << get_myaddr() << " messenger is " << this << " at " << fakeinst << endl; - - //g_timer.set_messenger(this); + cout << "fakemessenger " << get_myname() << " messenger is " << this << " at " << _myinst << endl; qlen = 0; @@ -299,8 +255,8 @@ int FakeMessenger::shutdown() { //cout << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << endl; lock.Lock(); - assert(directory.count(get_myinst().rank) == 1); - shutdown_set.insert(get_myinst().rank); + assert(directory.count(_myinst.addr) == 1); + shutdown_set.insert(_myinst.addr); /* directory.erase(myaddr); @@ -322,31 +278,27 @@ int FakeMessenger::shutdown() return 0; } -/* -void FakeMessenger::trigger_timer(Timer *t) -{ - // note timer to call - pending_timer = t; - - // wake up thread? 
- cond.Signal(); // why not -} -*/ -void FakeMessenger::reset_myaddr(msg_addr_t m) +void FakeMessenger::reset_myname(entity_name_t m) { - dout(1) << "reset_myaddr from " << get_myaddr() << " to " << m << endl; - _set_myaddr(m); + dout(1) << "reset_myname from " << get_myname() << " to " << m << endl; + _set_myname(m); + + directory.erase(_myinst.addr); + _myinst.name = m; + directory[_myinst.addr] = this; + } -int FakeMessenger::send_message(Message *m, msg_addr_t dest, entity_inst_t inst, int port, int fromport) +int FakeMessenger::send_message(Message *m, entity_inst_t inst, int port, int fromport) { - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - //m->set_lamport_send_stamp( get_lamport() ); + entity_name_t dest = inst.name; + + m->set_source(get_myname(), fromport); + m->set_source_addr(get_myaddr()); - m->set_source_inst(get_myinst()); + m->set_dest(inst.name, port); lock.Lock(); @@ -365,16 +317,19 @@ int FakeMessenger::send_message(Message *m, msg_addr_t dest, entity_inst_t inst, #endif // queue - FakeMessenger *dm = directory[inst.rank]; + FakeMessenger *dm = directory[inst.addr]; if (!dm) { - dout(1) << "** destination " << dest << " (" << inst << ") dne" << endl; - assert(dm); + dout(1) << "** destination " << inst << " dne" << endl; + for (map::iterator p = directory.begin(); + p != directory.end(); + ++p) { + dout(1) << "** have " << p->first << " to " << p->second << endl; + } + //assert(dm); } dm->queue_incoming(m); - dout(1) << "--> " << get_myaddr() << " sending " << m << " '" << m->get_type_name() << "'" - << " to " << dest - << endl;//" m " << dm << " has " << dm->num_incoming() << " queued" << endl; + dout(1) << "--> " << get_myname() << " -> " << inst.name << " " << *m << endl; } catch (...) 
{ diff --git a/branches/aleung/security1/ceph/msg/FakeMessenger.h b/branches/aleung/security1/ceph/msg/FakeMessenger.h index 7833f224f8bbd..13cd6f95326d1 100644 --- a/branches/aleung/security1/ceph/msg/FakeMessenger.h +++ b/branches/aleung/security1/ceph/msg/FakeMessenger.h @@ -31,24 +31,31 @@ class FakeMessenger : public Messenger { int qlen; list incoming; // incoming queue + entity_inst_t _myinst; + public: - FakeMessenger(msg_addr_t me); + FakeMessenger(entity_name_t me); ~FakeMessenger(); virtual int shutdown(); - void reset_myaddr(msg_addr_t m); + const entity_inst_t& get_myinst() { + return _myinst; + }; + const entity_addr_t& get_myaddr() { + return _myinst.addr; + } + + void reset_myname(entity_name_t m); // msg interface - virtual int send_message(Message *m, msg_addr_t dest, entity_inst_t inst, int port=0, int fromport=0); + virtual int send_message(Message *m, entity_inst_t dest, int port=0, int fromport=0); // events //virtual void trigger_timer(Timer *t); int get_dispatch_queue_len() { return qlen; } - void callback_kick(); - // -- incoming queue -- // (that nothing uses) Message *get_message() { diff --git a/branches/aleung/security1/ceph/msg/HostMonitor.cc b/branches/aleung/security1/ceph/msg/HostMonitor.cc index 33bef09565df2..44ab35a9fcc10 100644 --- a/branches/aleung/security1/ceph/msg/HostMonitor.cc +++ b/branches/aleung/security1/ceph/msg/HostMonitor.cc @@ -107,7 +107,7 @@ void HostMonitor::schedule_heartbeat() // take note of a live host -void HostMonitor::host_is_alive(msg_addr_t host) +void HostMonitor::host_is_alive(entity_name_t host) { if (hosts.count(host)) status[host].last_heard_from = g_clock.gettime(); @@ -122,7 +122,7 @@ void HostMonitor::initiate_heartbeat() // send out pings inflight_pings.clear(); - for (set::iterator it = hosts.begin(); + for (set::iterator it = hosts.begin(); it != hosts.end(); it++) { // have i heard from them recently? 
@@ -154,7 +154,7 @@ void HostMonitor::check_heartbeat() dout(DBL) << "check_heartbeat()" << endl; // check inflight pings - for (set::iterator it = inflight_pings.begin(); + for (set::iterator it = inflight_pings.begin(); it != inflight_pings.end(); it++) { status[*it].num_heartbeats_missed++; @@ -208,7 +208,7 @@ void HostMonitor::proc_message(Message *m) void HostMonitor::handle_ping_ack(MPingAck *m) { - msg_addr_t from = m->get_source(); + entity_name_t from = m->get_source(); dout(DBL) << "ping ack from " << from << endl; status[from].last_pinged = g_clock.gettime(); @@ -224,7 +224,7 @@ void HostMonitor::handle_failure_ack(MFailureAck *m) // FIXME: this doesn't handle failed -> alive transitions gracefully at all.. // the higher-up's acknowledged our failure notification, we can stop resending it. - msg_addr_t failed = m->get_failed(); + entity_name_t failed = m->get_failed(); dout(DBL) << "handle_failure_ack " << failed << endl; unacked_failures.erase(failed); acked_failures.insert(failed); diff --git a/branches/aleung/security1/ceph/msg/HostMonitor.h b/branches/aleung/security1/ceph/msg/HostMonitor.h index 20ef24eff8daf..fffe798b71450 100644 --- a/branches/aleung/security1/ceph/msg/HostMonitor.h +++ b/branches/aleung/security1/ceph/msg/HostMonitor.h @@ -38,19 +38,19 @@ class HostMonitor { string whoami; // hosts i monitor - set hosts; + set hosts; // who i tell when they fail - set notify; + set notify; int notify_port; // their status - map status; + map status; - set inflight_pings; // pings we sent that haven't replied yet + set inflight_pings; // pings we sent that haven't replied yet - set unacked_failures; // failed hosts that haven't been acked yet. - set acked_failures; // these failures have been acked. + set unacked_failures; // failed hosts that haven't been acked yet. + set acked_failures; // these failures have been acked. 
float heartbeat_interval; // how often to do a heartbeat float max_ping_time; // how long before it's a miss @@ -69,11 +69,11 @@ class HostMonitor { this->whoami = whoami; notify_port = 0; } - set& get_hosts() { return hosts; } - set& get_notify() { return notify; } + set& get_hosts() { return hosts; } + set& get_notify() { return notify; } void set_notify_port(int p) { notify_port = p; } - void remove_host(msg_addr_t h) { + void remove_host(entity_name_t h) { hosts.erase(h); status.erase(h); unacked_failures.erase(h); @@ -83,7 +83,7 @@ class HostMonitor { void init(); void shutdown(); - void host_is_alive(msg_addr_t who); + void host_is_alive(entity_name_t who); void proc_message(Message *m); void handle_ping_ack(class MPingAck *m); diff --git a/branches/aleung/security1/ceph/msg/MPIMessenger.cc b/branches/aleung/security1/ceph/msg/MPIMessenger.cc index 3dfcd3224a4b9..6c4e65d063fc9 100644 --- a/branches/aleung/security1/ceph/msg/MPIMessenger.cc +++ b/branches/aleung/security1/ceph/msg/MPIMessenger.cc @@ -505,7 +505,7 @@ class C_MPIKicker : public Context { } }; -MPIMessenger::MPIMessenger(msg_addr_t myaddr) : Messenger(myaddr) +MPIMessenger::MPIMessenger(entity_name_t myaddr) : Messenger(myaddr) { // my address this->myaddr = myaddr; @@ -576,7 +576,7 @@ int MPIMessenger::shutdown() /* note: send_message _MUST_ be non-blocking */ -int MPIMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport) +int MPIMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) { // set envelope m->set_source(myaddr, fromport); diff --git a/branches/aleung/security1/ceph/msg/MPIMessenger.h b/branches/aleung/security1/ceph/msg/MPIMessenger.h index d050f5bf49470..88e753de89749 100644 --- a/branches/aleung/security1/ceph/msg/MPIMessenger.h +++ b/branches/aleung/security1/ceph/msg/MPIMessenger.h @@ -28,18 +28,18 @@ class Timer; class MPIMessenger : public Messenger { protected: - msg_addr_t myaddr; // my address + entity_name_t myaddr; // my 
address //class Logger *logger; // for logging public: - MPIMessenger(msg_addr_t myaddr); + MPIMessenger(entity_name_t myaddr); ~MPIMessenger(); // init, shutdown MPI and associated event loop thread. virtual int shutdown(); // message interface - virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0); + virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); }; /** diff --git a/branches/aleung/security1/ceph/msg/MTMessenger.cc b/branches/aleung/security1/ceph/msg/MTMessenger.cc index 301915a336ea5..02ab9981ff353 100644 --- a/branches/aleung/security1/ceph/msg/MTMessenger.cc +++ b/branches/aleung/security1/ceph/msg/MTMessenger.cc @@ -151,7 +151,7 @@ MTMessenger::~MTMessenger() } // send a request and wait for the response -Message *MTMessenger::sendrecv(Message *m, msg_addr_t dest) +Message *MTMessenger::sendrecv(Message *m, entity_name_t dest) { int dest_tag = 0; // servers listen for any tag int my_tag = get_tag(); diff --git a/branches/aleung/security1/ceph/msg/MTMessenger.h b/branches/aleung/security1/ceph/msg/MTMessenger.h index 6489de407ba2f..477a39c60561d 100644 --- a/branches/aleung/security1/ceph/msg/MTMessenger.h +++ b/branches/aleung/security1/ceph/msg/MTMessenger.h @@ -33,7 +33,7 @@ public: ~MTMessenger(); // send a request to a server and wait (block) for the response; - virtual Message *sendrecv(Message *m, msg_addr_t dest); + virtual Message *sendrecv(Message *m, entity_name_t dest); // wait (block) for a request from anyone Message *recvreq(); diff --git a/branches/aleung/security1/ceph/msg/Message.cc b/branches/aleung/security1/ceph/msg/Message.cc index ec7f991f42989..cb2af2704ecb9 100644 --- a/branches/aleung/security1/ceph/msg/Message.cc +++ b/branches/aleung/security1/ceph/msg/Message.cc @@ -9,6 +9,7 @@ using namespace std; #include "messages/MGenericMessage.h" +/* #include "messages/MNSConnect.h" #include "messages/MNSConnectAck.h" #include "messages/MNSRegister.h" @@ -16,16 +17,18 @@ 
using namespace std; #include "messages/MNSLookup.h" #include "messages/MNSLookupReply.h" #include "messages/MNSFailure.h" +*/ + +#include "messages/MMonPaxos.h" #include "messages/MMonElectionAck.h" -#include "messages/MMonElectionCollect.h" -#include "messages/MMonElectionRefresh.h" -#include "messages/MMonElectionStatus.h" +#include "messages/MMonElectionPropose.h" +#include "messages/MMonElectionVictory.h" #include "messages/MPing.h" #include "messages/MPingAck.h" -#include "messages/MFailure.h" -#include "messages/MFailureAck.h" +//#include "messages/MFailure.h" +//#include "messages/MFailureAck.h" #include "messages/MOSDBoot.h" #include "messages/MOSDIn.h" @@ -52,7 +55,10 @@ using namespace std; #include "messages/MMDSGetMap.h" #include "messages/MMDSMap.h" -#include "messages/MMDSBoot.h" +#include "messages/MMDSBeacon.h" +#include "messages/MMDSImportMap.h" +#include "messages/MMDSCacheRejoin.h" +#include "messages/MMDSCacheRejoinAck.h" #include "messages/MDirUpdate.h" #include "messages/MDiscover.h" @@ -130,6 +136,7 @@ decode_message(msg_envelope_t& env, bufferlist& payload) // -- with payload -- + /* case MSG_NS_CONNECT: m = new MNSConnect(); break; @@ -151,18 +158,20 @@ decode_message(msg_envelope_t& env, bufferlist& payload) case MSG_NS_FAILURE: m = new MNSFailure(); break; + */ - case MSG_MON_ELECTION_ACK: - m = new MMonElectionAck(); + case MSG_MON_PAXOS: + m = new MMonPaxos; break; - case MSG_MON_ELECTION_COLLECT: - m = new MMonElectionCollect(); + + case MSG_MON_ELECTION_PROPOSE: + m = new MMonElectionPropose; break; - case MSG_MON_ELECTION_REFRESH: - m = new MMonElectionRefresh(); + case MSG_MON_ELECTION_ACK: + m = new MMonElectionAck; break; - case MSG_MON_ELECTION_STATUS: - m = new MMonElectionStatus(); + case MSG_MON_ELECTION_VICTORY: + m = new MMonElectionVictory; break; case MSG_PING: @@ -171,12 +180,14 @@ decode_message(msg_envelope_t& env, bufferlist& payload) case MSG_PING_ACK: m = new MPingAck(); break; + /* case MSG_FAILURE: m = new 
MFailure(); break; case MSG_FAILURE_ACK: m = new MFailureAck(); break; + */ case MSG_OSD_BOOT: m = new MOSDBoot(); @@ -253,8 +264,17 @@ decode_message(msg_envelope_t& env, bufferlist& payload) case MSG_MDS_MAP: m = new MMDSMap(); break; - case MSG_MDS_BOOT: - m = new MMDSBoot(); + case MSG_MDS_BEACON: + m = new MMDSBeacon; + break; + case MSG_MDS_IMPORTMAP: + m = new MMDSImportMap; + break; + case MSG_MDS_CACHEREJOIN: + m = new MMDSCacheRejoin; + break; + case MSG_MDS_CACHEREJOINACK: + m = new MMDSCacheRejoinAck; break; case MSG_MDS_DIRUPDATE: diff --git a/branches/aleung/security1/ceph/msg/Message.h b/branches/aleung/security1/ceph/msg/Message.h index eaace16354a9a..f95c3199e427d 100644 --- a/branches/aleung/security1/ceph/msg/Message.h +++ b/branches/aleung/security1/ceph/msg/Message.h @@ -11,8 +11,6 @@ * */ - - #ifndef __MESSAGE_H #define __MESSAGE_H @@ -38,10 +36,10 @@ #define MSG_SHUTDOWN 99999 + #define MSG_MON_ELECTION_ACK 15 -#define MSG_MON_ELECTION_COLLECT 16 -#define MSG_MON_ELECTION_REFRESH 17 -#define MSG_MON_ELECTION_STATUS 18 +#define MSG_MON_ELECTION_PROPOSE 16 +#define MSG_MON_ELECTION_VICTORY 17 #define MSG_MON_OSDMAP_INFO 20 #define MSG_MON_OSDMAP_LEASE 21 @@ -50,6 +48,8 @@ #define MSG_MON_OSDMAP_UPDATE_ACK 24 #define MSG_MON_OSDMAP_UPDATE_COMMIT 25 +#define MSG_MON_PAXOS 30 + #define MSG_OSD_OP 40 // delete, etc. #define MSG_OSD_OPREPLY 41 // delete, etc. 
#define MSG_OSD_PING 42 @@ -89,10 +89,14 @@ // *** MDS *** -#define MSG_MDS_BOOT 100 -#define MSG_MDS_GETMAP 101 -#define MSG_MDS_MAP 102 -#define MSG_MDS_HEARTBEAT 103 +#define MSG_MDS_GETMAP 102 +#define MSG_MDS_MAP 103 +#define MSG_MDS_HEARTBEAT 104 // for mds load balancer +#define MSG_MDS_BEACON 105 // to monitor + +#define MSG_MDS_IMPORTMAP 106 +#define MSG_MDS_CACHEREJOIN 107 +#define MSG_MDS_CACHEREJOINACK 108 #define MSG_MDS_DISCOVER 110 #define MSG_MDS_DISCOVERREPLY 111 @@ -177,130 +181,14 @@ using std::list; using __gnu_cxx::crope; +#include "include/types.h" #include "include/buffer.h" - -#include "tcp.h" +#include "msg_types.h" -// use fixed offsets and static entity -> logical addr mapping! -#define MSG_ADDR_NAMER_BASE 0 -#define MSG_ADDR_RANK_BASE 1 -#define MSG_ADDR_MDS_BASE 2 -#define MSG_ADDR_OSD_BASE 3 -#define MSG_ADDR_MON_BASE 4 -#define MSG_ADDR_CLIENT_BASE 5 - -#define MSG_ADDR_NEW -1 - - -// new typed msg_addr_t way! -class msg_addr_t { -public: - int _type; - int _num; - - msg_addr_t() : _type(0), _num(0) {} - msg_addr_t(int t, int n) : _type(t), _num(n) {} - - int num() const { return _num; } - int type() const { return _type; } - const char *type_str() const { - switch (type()) { - case MSG_ADDR_RANK_BASE: return "rank"; - case MSG_ADDR_MDS_BASE: return "mds"; - case MSG_ADDR_OSD_BASE: return "osd"; - case MSG_ADDR_MON_BASE: return "mon"; - case MSG_ADDR_CLIENT_BASE: return "client"; - case MSG_ADDR_NAMER_BASE: return "namer"; - } - return "unknown"; - } - - bool is_new() const { return num() == MSG_ADDR_NEW; } - - bool is_client() const { return type() == MSG_ADDR_CLIENT_BASE; } - bool is_mds() const { return type() == MSG_ADDR_MDS_BASE; } - bool is_osd() const { return type() == MSG_ADDR_OSD_BASE; } - bool is_mon() const { return type() == MSG_ADDR_MON_BASE; } - bool is_namer() const { return type() == MSG_ADDR_NAMER_BASE; } -}; - -inline bool operator== (const msg_addr_t& l, const msg_addr_t& r) { return (l._type == r._type) && (l._num 
== r._num); } -inline bool operator!= (const msg_addr_t& l, const msg_addr_t& r) { return (l._type != r._type) || (l._num != r._num); } -inline bool operator< (const msg_addr_t& l, const msg_addr_t& r) { return (l._type < r._type) || (l._type == r._type && l._num < r._num); } - -inline std::ostream& operator<<(std::ostream& out, const msg_addr_t& addr) { - //if (addr.is_namer()) return out << "namer"; - if (addr.is_new() || addr.num() < 0) - return out << addr.type_str() << "?"; - else - return out << addr.type_str() << addr.num(); -} - -namespace __gnu_cxx { - template<> struct hash< msg_addr_t > - { - size_t operator()( const msg_addr_t m ) const - { - static hash H; - return H(m.type() ^ m.num()); - } - }; -} - -#define MSG_ADDR_RANK(x) msg_addr_t(MSG_ADDR_RANK_BASE,x) -#define MSG_ADDR_MDS(x) msg_addr_t(MSG_ADDR_MDS_BASE,x) -#define MSG_ADDR_OSD(x) msg_addr_t(MSG_ADDR_OSD_BASE,x) -#define MSG_ADDR_MON(x) msg_addr_t(MSG_ADDR_MON_BASE,x) -#define MSG_ADDR_CLIENT(x) msg_addr_t(MSG_ADDR_CLIENT_BASE,x) -#define MSG_ADDR_NAMER(x) msg_addr_t(MSG_ADDR_NAMER_BASE,x) - -#define MSG_ADDR_UNDEF msg_addr_t() -#define MSG_ADDR_DIRECTORY MSG_ADDR_NAMER(0) - -#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(MSG_ADDR_NEW) -#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(MSG_ADDR_NEW) -#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(MSG_ADDR_NEW) -#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(MSG_ADDR_NEW) -#define MSG_ADDR_NAMER_NEW MSG_ADDR_NAMER(MSG_ADDR_NEW) - - -class entity_inst_t { - public: - tcpaddr_t addr; - __int64_t rank; - - entity_inst_t() : rank(-1) { - memset(&addr, 0, sizeof(addr)); - } - entity_inst_t(tcpaddr_t& a, int r) : addr(a), rank(r) { - memset(&addr, 0, sizeof(addr)); - } - - void set_addr(tcpaddr_t a) { - addr = a; - - // figure out rank - rank = *((unsigned*)&a.sin_addr.s_addr); - rank |= (__uint64_t)a.sin_port << 32; - } -}; - -inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return a.rank == b.rank && a.addr == b.addr; } -inline bool operator!=(const 
entity_inst_t& a, const entity_inst_t& b) { return !(a == b); } -inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return a.rank > b.rank; } -inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return a.rank >= b.rank; } -inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return a.rank < b.rank; } -inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return a.rank <= b.rank; } - -inline ostream& operator<<(ostream& out, const entity_inst_t &i) -{ - //return out << "rank" << i.rank << "_" << i.addr; - return out << i.addr; -} - +// ====================================================== // abstract Message class @@ -308,12 +196,9 @@ inline ostream& operator<<(ostream& out, const entity_inst_t &i) typedef struct { int type; - msg_addr_t source, dest; - entity_inst_t source_inst; + entity_inst_t src, dst; int source_port, dest_port; int nchunks; - __uint64_t lamport_send_stamp; - __uint64_t lamport_recv_stamp; } msg_envelope_t; #define MSG_ENVELOPE_LEN sizeof(msg_envelope_t) @@ -332,27 +217,16 @@ public: public: Message() { env.source_port = env.dest_port = -1; - env.source = env.dest = MSG_ADDR_UNDEF; env.nchunks = 0; - env.lamport_send_stamp = 0; - env.lamport_recv_stamp = 0; }; Message(int t) { env.source_port = env.dest_port = -1; - env.source = env.dest = MSG_ADDR_UNDEF; env.nchunks = 0; env.type = t; - env.lamport_send_stamp = 0; - env.lamport_recv_stamp = 0; } virtual ~Message() { } - void set_lamport_send_stamp(__uint64_t t) { env.lamport_send_stamp = t; } - void set_lamport_recv_stamp(__uint64_t t) { env.lamport_recv_stamp = t; } - __uint64_t get_lamport_send_stamp() { return env.lamport_send_stamp; } - __uint64_t get_lamport_recv_stamp() { return env.lamport_recv_stamp; } - // for rpc-type procedural messages (pcid = procedure call id) virtual long get_pcid() { return 0; } @@ -382,16 +256,23 @@ public: virtual char *get_type_name() = 0; // source/dest - msg_addr_t& get_dest() 
{ return env.dest; } - void set_dest(msg_addr_t a, int p) { env.dest = a; env.dest_port = p; } + entity_inst_t& get_dest_inst() { return env.dst; } + void set_dest_inst(entity_inst_t& inst) { env.dst = inst; } + + entity_inst_t& get_source_inst() { return env.src; } + void set_source_inst(entity_inst_t& inst) { env.src = inst; } + + entity_name_t& get_dest() { return env.dst.name; } + void set_dest(entity_name_t a, int p) { env.dst.name = a; env.dest_port = p; } int get_dest_port() { return env.dest_port; } + void set_dest_port(int p) { env.dest_port = p; } - msg_addr_t& get_source() { return env.source; } - void set_source(msg_addr_t a, int p) { env.source = a; env.source_port = p; } + entity_name_t& get_source() { return env.src.name; } + void set_source(entity_name_t a, int p) { env.src.name = a; env.source_port = p; } int get_source_port() { return env.source_port; } - entity_inst_t& get_source_inst() { return env.source_inst; } - void set_source_inst(const entity_inst_t &i) { env.source_inst = i; } + entity_addr_t& get_source_addr() { return env.src.addr; } + void set_source_addr(const entity_addr_t &i) { env.src.addr = i; } // PAYLOAD ---- void reset_payload() { diff --git a/branches/aleung/security1/ceph/msg/Messenger.cc b/branches/aleung/security1/ceph/msg/Messenger.cc index b033bbfc08638..a6133260e9b9e 100644 --- a/branches/aleung/security1/ceph/msg/Messenger.cc +++ b/branches/aleung/security1/ceph/msg/Messenger.cc @@ -25,52 +25,6 @@ using namespace std; -#include "config.h" -#undef dout -#define dout(l) if (l<=g_conf.debug) cout << "messenger: " -#define DEBUGLVL 10 // debug level of output - - - -// -------- -// callbacks - -Mutex msgr_callback_lock; -list msgr_callback_queue; -//Context* msgr_callback_kicker = 0; - -void Messenger::queue_callback(Context *c) { - msgr_callback_lock.Lock(); - msgr_callback_queue.push_back(c); - msgr_callback_lock.Unlock(); - - callback_kick(); -} -void Messenger::queue_callbacks(list& ls) { - msgr_callback_lock.Lock(); - 
msgr_callback_queue.splice(msgr_callback_queue.end(), ls); - msgr_callback_lock.Unlock(); - - callback_kick(); -} - -void Messenger::do_callbacks() { - // take list - msgr_callback_lock.Lock(); - list ls; - ls.splice(ls.begin(), msgr_callback_queue); - msgr_callback_lock.Unlock(); - - // do them - for (list::iterator it = ls.begin(); - it != ls.end(); - it++) { - dout(10) << "--- doing callback " << *it << endl; - (*it)->finish(0); - delete *it; - } -} - // --------- // incoming messages diff --git a/branches/aleung/security1/ceph/msg/Messenger.h b/branches/aleung/security1/ceph/msg/Messenger.h index 85ef499eb97bb..991e80c839112 100644 --- a/branches/aleung/security1/ceph/msg/Messenger.h +++ b/branches/aleung/security1/ceph/msg/Messenger.h @@ -26,8 +26,6 @@ using namespace std; #include "include/Context.h" -typedef __uint64_t lamport_t; - class MDS; class Timer; @@ -35,32 +33,23 @@ class Timer; class Messenger { private: Dispatcher *dispatcher; - msg_addr_t _myaddr; - entity_inst_t _myinst; - + entity_name_t _myname; public: - Messenger(msg_addr_t w) : dispatcher(0), _myaddr(w) { } + Messenger(entity_name_t w) : dispatcher(0), _myname(w) { } virtual ~Messenger() { } - const entity_inst_t &get_myinst() { return _myinst; } - void set_myinst(entity_inst_t& v) { _myinst = v; } - - msg_addr_t get_myaddr() { return _myaddr; } - void _set_myaddr(msg_addr_t m) { _myaddr = m; } + // accessors + entity_name_t get_myname() { return _myname; } + void _set_myname(entity_name_t m) { _myname = m; } - virtual void reset_myaddr(msg_addr_t m) = 0; + virtual void reset_myname(entity_name_t m) = 0; + virtual const entity_addr_t &get_myaddr() = 0; - virtual int shutdown() = 0; + entity_inst_t get_myinst() { return entity_inst_t(_myname, get_myaddr()); } - // callbacks - static void do_callbacks(); - - void queue_callback(Context *c); - void queue_callbacks(list& ls); - virtual void callback_kick() = 0; - + // hrmpf. 
virtual int get_dispatch_queue_len() { return 0; }; // setup @@ -70,22 +59,23 @@ class Messenger { bool is_ready() { return dispatcher != 0; } // dispatch incoming messages - virtual void dispatch(Message *m); + virtual void dispatch(Message *m) { + assert(dispatcher); + dispatcher->dispatch(m); + } + + // shutdown + virtual int shutdown() = 0; // send message - virtual void prepare_dest(const entity_inst_t& inst) {} - //virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0; - virtual int send_message(Message *m, msg_addr_t dest, entity_inst_t inst, + virtual void prepare_dest(const entity_addr_t& addr) {} + virtual int send_message(Message *m, entity_inst_t dest, int port=0, int fromport=0) = 0; - // make a procedure call - //virtual Message* sendrecv(Message *m, msg_addr_t dest, int port=0); - + //virtual Message* sendrecv(Message *m, msg_name_t dest, int port=0); - virtual void mark_down(msg_addr_t a, entity_inst_t& i) {} - virtual void mark_up(msg_addr_t a, entity_inst_t& i) {} - //virtual void reset(msg_addr_t a) { mark_down(a); mark_up(a); } + virtual void mark_down(entity_addr_t a) {} }; diff --git a/branches/aleung/security1/ceph/msg/NewMessenger.cc b/branches/aleung/security1/ceph/msg/NewMessenger.cc index 6cd5d291b60c3..1455c31724c68 100644 --- a/branches/aleung/security1/ceph/msg/NewMessenger.cc +++ b/branches/aleung/security1/ceph/msg/NewMessenger.cc @@ -124,7 +124,7 @@ void Rank::Namer::handle_register(MNSRegister *m) << " addr " << m->get_entity() << endl; // pick id - msg_addr_t entity = m->get_entity(); + entity_name_t entity = m->get_entity(); if (entity.is_new()) { // make up a new address! 
@@ -172,7 +172,7 @@ void Rank::Namer::handle_register(MNSRegister *m) void Rank::Namer::handle_started(Message *m) { - msg_addr_t who = m->get_source(); + entity_name_t who = m->get_source(); dout(10) << "namer.handle_started from entity " << who << endl; assert(rank.entity_unstarted.count(who)); @@ -195,7 +195,7 @@ void Rank::Namer::handle_started(Message *m) void Rank::Namer::handle_unregister(Message *m) { - msg_addr_t who = m->get_source(); + entity_name_t who = m->get_source(); dout(1) << "namer.handle_unregister entity " << who << endl; rank.show_dir(); @@ -252,14 +252,14 @@ void Rank::Namer::handle_failure(MNSFailure *m) << endl; // search for entities on this instance - list rm; - for (hash_map::iterator i = rank.entity_map.begin(); + list rm; + for (hash_map::iterator i = rank.entity_map.begin(); i != rank.entity_map.end(); i++) { if (i->second != m->get_inst()) continue; rm.push_back(i->first); } - for (list::iterator i = rm.begin(); + for (list::iterator i = rm.begin(); i != rm.end(); i++) { dout(10) << "namer.handle_failure inst " << m->get_inst() @@ -555,7 +555,7 @@ void Rank::Sender::fail_and_requeue(list& out) // FIXME: possible race before i reclaim lock here? 
Dispatcher *dis = 0; - msg_addr_t dis_dest; + entity_name_t dis_dest; list lost; @@ -897,7 +897,7 @@ int Rank::start_rank() my_inst.rank = my_rank; // create my rank - msg_addr_t raddr = MSG_ADDR_RANK(my_rank); + entity_name_t raddr = MSG_ADDR_RANK(my_rank); entity_map[raddr] = my_inst; entity_unstarted.insert(raddr); local[raddr] = messenger = new EntityMessenger(raddr); @@ -913,7 +913,7 @@ int Rank::start_rank() void Rank::start_namer() { // create namer0 - msg_addr_t naddr = MSG_ADDR_NAMER(0); + entity_name_t naddr = MSG_ADDR_NAMER(0); entity_map[naddr] = my_inst; local[naddr] = new EntityMessenger(naddr); namer = new Namer(local[naddr]); @@ -961,7 +961,7 @@ void Rank::show_dir() { dout(10) << "show_dir ---" << endl; - for (hash_map::iterator i = entity_map.begin(); + for (hash_map::iterator i = entity_map.begin(); i != entity_map.end(); i++) { if (local.count(i->first)) { @@ -976,7 +976,7 @@ void Rank::show_dir() /* lookup * NOTE: assumes directory.lock held */ -void Rank::lookup(msg_addr_t addr) +void Rank::lookup(entity_name_t addr) { dout(10) << "lookup " << addr << endl; assert(lock.is_locked()); @@ -992,7 +992,7 @@ void Rank::lookup(msg_addr_t addr) /* register_entity */ -Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr) +Rank::EntityMessenger *Rank::register_entity(entity_name_t addr) { dout(10) << "register_entity " << addr << endl; lock.Lock(); @@ -1080,7 +1080,7 @@ void Rank::submit_messages(list& ls) } -void Rank::prepare_dest(msg_addr_t dest) +void Rank::prepare_dest(entity_name_t dest) { lock.Lock(); @@ -1117,7 +1117,7 @@ void Rank::prepare_dest(msg_addr_t dest) void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) { - const msg_addr_t dest = m->get_dest(); + const entity_name_t dest = m->get_dest(); // lookup EntityMessenger *entity = 0; @@ -1175,7 +1175,7 @@ void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) void Rank::submit_message(Message *m) { - const msg_addr_t dest = m->get_dest(); + const 
entity_name_t dest = m->get_dest(); // lookup EntityMessenger *entity = 0; @@ -1320,11 +1320,11 @@ void Rank::handle_lookup_reply(MNSLookupReply *m) list waiting; dout(10) << "got lookup reply" << endl; - for (map::iterator it = m->entity_map.begin(); + for (map::iterator it = m->entity_map.begin(); it != m->entity_map.end(); it++) { dout(10) << "lookup got " << it->first << " at " << it->second << endl; - msg_addr_t addr = it->first; + entity_name_t addr = it->first; entity_inst_t inst = it->second; if (down.count(addr)) { @@ -1492,7 +1492,7 @@ int Rank::find_ns_addr(tcpaddr_t &nsa) * EntityMessenger */ -Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) : +Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : Messenger(myaddr), stop(false), dispatch_thread(this) @@ -1583,12 +1583,12 @@ int Rank::EntityMessenger::shutdown() } -void Rank::EntityMessenger::prepare_send_message(msg_addr_t dest) +void Rank::EntityMessenger::prepare_send_message(entity_name_t dest) { rank.prepare_dest(dest); } -int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst) +int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst) { // set envelope m->set_source(get_myaddr(), 0); @@ -1610,7 +1610,7 @@ int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entit } -int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport) +int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) { // set envelope m->set_source(get_myaddr(), fromport); @@ -1632,13 +1632,13 @@ int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, i } -void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i) +void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i) { assert(a != get_myaddr()); rank.mark_down(a,i); } -void Rank::mark_down(msg_addr_t a, entity_inst_t& inst) +void 
Rank::mark_down(entity_name_t a, entity_inst_t& inst) { if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer lock.Lock(); @@ -1676,13 +1676,13 @@ void Rank::mark_down(msg_addr_t a, entity_inst_t& inst) lock.Unlock(); } -void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i) +void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i) { assert(a != get_myaddr()); rank.mark_up(a, i); } -void Rank::mark_up(msg_addr_t a, entity_inst_t& i) +void Rank::mark_up(entity_name_t a, entity_inst_t& i) { if (my_rank == 0) return; lock.Lock(); diff --git a/branches/aleung/security1/ceph/msg/NewMessenger.h b/branches/aleung/security1/ceph/msg/NewMessenger.h index a1c7af6e5c83b..0e04315a10883 100644 --- a/branches/aleung/security1/ceph/msg/NewMessenger.h +++ b/branches/aleung/security1/ceph/msg/NewMessenger.h @@ -39,7 +39,7 @@ class Rank : public Dispatcher { int nrank; int nclient, nmds, nosd, nmon; - map > waiting; + map > waiting; Namer(EntityMessenger *msgr); ~Namer(); @@ -101,7 +101,7 @@ class Rank : public Dispatcher { bool done; int sd; - set entities; + set entities; list q; Mutex lock; @@ -174,7 +174,7 @@ class Rank : public Dispatcher { } public: - EntityMessenger(msg_addr_t myaddr); + EntityMessenger(entity_name_t myaddr); ~EntityMessenger(); void ready(); @@ -186,12 +186,12 @@ class Rank : public Dispatcher { virtual void callback_kick() {} virtual int shutdown(); - virtual void prepare_send_message(msg_addr_t dest); - virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0); - virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst); + virtual void prepare_send_message(entity_name_t dest); + virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); + virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst); - virtual void mark_down(msg_addr_t a, entity_inst_t& i); - virtual void mark_up(msg_addr_t a, entity_inst_t& 
i); + virtual void mark_down(entity_name_t a, entity_inst_t& i); + virtual void mark_up(entity_name_t a, entity_inst_t& i); //virtual void reset(msg_addr_t a); }; @@ -210,7 +210,7 @@ class Rank : public Dispatcher { bool single_dispatch_stop; list single_dispatch_queue; - map > waiting_for_ready; + map > waiting_for_ready; void single_dispatcher_entry(); void _submit_single_dispatch(Message *m); @@ -229,20 +229,20 @@ class Rank : public Dispatcher { entity_inst_t my_inst; // lookup - hash_map entity_map; - hash_set entity_unstarted; + hash_map entity_map; + hash_set entity_unstarted; - map > waiting_for_lookup; - set looking_up; + map > waiting_for_lookup; + set looking_up; - hash_set down; + hash_set down; // register map waiting_for_register_cond; - map waiting_for_register_result; + map waiting_for_register_result; // local - map local; + map local; // remote hash_map rank_sender; @@ -258,7 +258,7 @@ class Rank : public Dispatcher { void show_dir(); - void lookup(msg_addr_t addr); + void lookup(entity_name_t addr); void dispatch(Message *m); void handle_connect_ack(class MNSConnectAck *m); @@ -267,8 +267,8 @@ class Rank : public Dispatcher { Sender *connect_rank(const entity_inst_t& inst); - void mark_down(msg_addr_t addr, entity_inst_t& i); - void mark_up(msg_addr_t addr, entity_inst_t& i); + void mark_down(entity_name_t addr, entity_inst_t& i); + void mark_up(entity_name_t addr, entity_inst_t& i); tcpaddr_t get_listen_addr() { return accepter.listen_addr; } @@ -287,16 +287,16 @@ public: int start_rank(); void wait(); - EntityMessenger *register_entity(msg_addr_t addr); + EntityMessenger *register_entity(entity_name_t addr); void unregister_entity(EntityMessenger *ms); void submit_message(Message *m, const entity_inst_t& inst); - void prepare_dest(msg_addr_t dest); + void prepare_dest(entity_name_t dest); void submit_message(Message *m); void submit_messages(list& ls); // create a new messenger - EntityMessenger *new_entity(msg_addr_t addr); + EntityMessenger 
*new_entity(entity_name_t addr); } ; diff --git a/branches/aleung/security1/ceph/msg/NewerMessenger.cc b/branches/aleung/security1/ceph/msg/NewerMessenger.cc index d1ed3fb00fdb3..c277eea4b409b 100644 --- a/branches/aleung/security1/ceph/msg/NewerMessenger.cc +++ b/branches/aleung/security1/ceph/msg/NewerMessenger.cc @@ -136,7 +136,7 @@ void Rank::Namer::handle_register(MNSRegister *m) << " addr " << m->get_entity() << endl; // pick id - msg_addr_t entity = m->get_entity(); + entity_name_t entity = m->get_entity(); if (entity.is_new()) { // make up a new address! @@ -184,7 +184,7 @@ void Rank::Namer::handle_register(MNSRegister *m) void Rank::Namer::handle_started(Message *m) { - msg_addr_t who = m->get_source(); + entity_name_t who = m->get_source(); dout(10) << "namer.handle_started from entity " << who << endl; assert(rank.entity_unstarted.count(who)); @@ -207,7 +207,7 @@ void Rank::Namer::handle_started(Message *m) void Rank::Namer::handle_unregister(Message *m) { - msg_addr_t who = m->get_source(); + entity_name_t who = m->get_source(); dout(1) << "namer.handle_unregister entity " << who << endl; rank.show_dir(); @@ -264,14 +264,14 @@ void Rank::Namer::handle_failure(MNSFailure *m) << endl; // search for entities on this instance - list rm; - for (hash_map::iterator i = rank.entity_map.begin(); + list rm; + for (hash_map::iterator i = rank.entity_map.begin(); i != rank.entity_map.end(); i++) { if (i->second != m->get_inst()) continue; rm.push_back(i->first); } - for (list::iterator i = rm.begin(); + for (list::iterator i = rm.begin(); i != rm.end(); i++) { dout(10) << "namer.handle_failure inst " << m->get_inst() @@ -852,7 +852,7 @@ void Rank::Pipe::fail(list& out) // what do i do about reader()? FIXME // sort my messages by (source) dispatcher, dest. 
- map > > by_dis; + map > > by_dis; lock.Lock(); { // include out at front of queue @@ -878,10 +878,10 @@ void Rank::Pipe::fail(list& out) lock.Unlock(); // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); + for (map > >::iterator i = by_dis.begin(); i != by_dis.end(); ++i) - for (map >::iterator j = i->second.begin(); + for (map >::iterator j = i->second.begin(); j != i->second.end(); ++j) for (list::iterator k = j->second.begin(); @@ -1032,7 +1032,7 @@ int Rank::start_rank() messenger->set_dispatcher(this); } else { // create my rank - msg_addr_t raddr = MSG_ADDR_RANK(my_rank); + entity_name_t raddr = MSG_ADDR_RANK(my_rank); entity_map[raddr] = my_inst; entity_unstarted.insert(raddr); local[raddr] = messenger = new EntityMessenger(raddr); @@ -1048,7 +1048,7 @@ int Rank::start_rank() void Rank::start_namer() { // create namer0 - msg_addr_t naddr = MSG_ADDR_NAMER(0); + entity_name_t naddr = MSG_ADDR_NAMER(0); entity_map[naddr] = my_inst; local[naddr] = new EntityMessenger(naddr); namer = new Namer(local[naddr]); @@ -1087,7 +1087,7 @@ void Rank::show_dir() { dout(10) << "show_dir ---" << endl; - for (hash_map::iterator i = entity_map.begin(); + for (hash_map::iterator i = entity_map.begin(); i != entity_map.end(); i++) { if (local.count(i->first)) { @@ -1102,7 +1102,7 @@ void Rank::show_dir() /* lookup * NOTE: assumes directory.lock held */ -void Rank::lookup(msg_addr_t addr) +void Rank::lookup(entity_name_t addr) { dout(10) << "lookup " << addr << endl; assert(lock.is_locked()); @@ -1118,7 +1118,7 @@ void Rank::lookup(msg_addr_t addr) /* register_entity */ -Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr) +Rank::EntityMessenger *Rank::register_entity(entity_name_t addr) { dout(10) << "register_entity " << addr << endl; lock.Lock(); @@ -1209,7 +1209,7 @@ void Rank::submit_messages(list& ls) void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) { - const msg_addr_t dest = m->get_dest(); + const entity_name_t 
dest = m->get_dest(); // lookup EntityMessenger *entity = 0; @@ -1267,7 +1267,7 @@ void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) void Rank::submit_message(Message *m) { - const msg_addr_t dest = m->get_dest(); + const entity_name_t dest = m->get_dest(); // lookup EntityMessenger *entity = 0; @@ -1404,11 +1404,11 @@ void Rank::handle_lookup_reply(MNSLookupReply *m) list waiting; dout(10) << "got lookup reply" << endl; - for (map::iterator it = m->entity_map.begin(); + for (map::iterator it = m->entity_map.begin(); it != m->entity_map.end(); it++) { dout(10) << "lookup got " << it->first << " at " << it->second << endl; - msg_addr_t addr = it->first; + entity_name_t addr = it->first; entity_inst_t inst = it->second; if (entity_map.count(addr) && @@ -1563,7 +1563,7 @@ int Rank::find_ns_addr(tcpaddr_t &nsa) * EntityMessenger */ -Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) : +Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : Messenger(myaddr), stop(false), dispatch_thread(this) @@ -1665,7 +1665,7 @@ void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst) rank.lock.Unlock(); } -int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst, +int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, const entity_inst_t& inst, int port, int fromport) { // set envelope @@ -1688,7 +1688,7 @@ int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entit } -int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport) +int Rank::EntityMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) { // set envelope m->set_source(get_myaddr(), fromport); @@ -1710,13 +1710,13 @@ int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, i } -void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i) +void Rank::EntityMessenger::mark_down(entity_name_t a, entity_inst_t& i) { 
assert(a != get_myaddr()); rank.mark_down(a,i); } -void Rank::mark_down(msg_addr_t a, entity_inst_t& inst) +void Rank::mark_down(entity_name_t a, entity_inst_t& inst) { //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer lock.Lock(); @@ -1755,13 +1755,13 @@ void Rank::mark_down(msg_addr_t a, entity_inst_t& inst) lock.Unlock(); } -void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i) +void Rank::EntityMessenger::mark_up(entity_name_t a, entity_inst_t& i) { assert(a != get_myaddr()); rank.mark_up(a, i); } -void Rank::mark_up(msg_addr_t a, entity_inst_t& i) +void Rank::mark_up(entity_name_t a, entity_inst_t& i) { if (my_rank == 0) return; lock.Lock(); diff --git a/branches/aleung/security1/ceph/msg/NewerMessenger.h b/branches/aleung/security1/ceph/msg/NewerMessenger.h index 6a4e003352aa8..29b885745df48 100644 --- a/branches/aleung/security1/ceph/msg/NewerMessenger.h +++ b/branches/aleung/security1/ceph/msg/NewerMessenger.h @@ -51,7 +51,7 @@ class Rank : public Dispatcher { int nrank; int nclient, nmds, nosd, nmon; - map > waiting; + map > waiting; Namer(EntityMessenger *msgr); ~Namer(); @@ -212,7 +212,7 @@ class Rank : public Dispatcher { } public: - EntityMessenger(msg_addr_t myaddr); + EntityMessenger(entity_name_t myaddr); ~EntityMessenger(); void ready(); @@ -225,12 +225,12 @@ class Rank : public Dispatcher { virtual void callback_kick() {} virtual int shutdown(); virtual void prepare_dest(const entity_inst_t& inst); - virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0); - virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst, + virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); + virtual int send_message(Message *m, entity_name_t dest, const entity_inst_t& inst, int port=0, int fromport=0); - virtual void mark_down(msg_addr_t a, entity_inst_t& i); - virtual void mark_up(msg_addr_t a, entity_inst_t& i); + virtual void 
mark_down(entity_name_t a, entity_inst_t& i); + virtual void mark_up(entity_name_t a, entity_inst_t& i); //virtual void reset(msg_addr_t a); }; @@ -249,7 +249,7 @@ class Rank : public Dispatcher { bool single_dispatch_stop; list single_dispatch_queue; - map > waiting_for_ready; + map > waiting_for_ready; void single_dispatcher_entry(); void _submit_single_dispatch(Message *m); @@ -268,18 +268,18 @@ class Rank : public Dispatcher { entity_inst_t my_inst; // lookup - hash_map entity_map; - hash_set entity_unstarted; + hash_map entity_map; + hash_set entity_unstarted; - map > waiting_for_lookup; - set looking_up; + map > waiting_for_lookup; + set looking_up; // register map waiting_for_register_cond; - map waiting_for_register_result; + map waiting_for_register_result; // local - map local; + map local; // remote hash_map rank_pipe; @@ -294,7 +294,7 @@ class Rank : public Dispatcher { void show_dir(); - void lookup(msg_addr_t addr); + void lookup(entity_name_t addr); void dispatch(Message *m); void handle_connect_ack(class MNSConnectAck *m); @@ -303,8 +303,8 @@ class Rank : public Dispatcher { Pipe *connect_rank(const entity_inst_t& inst); - void mark_down(msg_addr_t addr, entity_inst_t& i); - void mark_up(msg_addr_t addr, entity_inst_t& i); + void mark_down(entity_name_t addr, entity_inst_t& i); + void mark_up(entity_name_t addr, entity_inst_t& i); tcpaddr_t get_listen_addr() { return accepter.listen_addr; } @@ -323,7 +323,7 @@ public: int start_rank(); void wait(); - EntityMessenger *register_entity(msg_addr_t addr); + EntityMessenger *register_entity(entity_name_t addr); void unregister_entity(EntityMessenger *ms); void submit_message(Message *m, const entity_inst_t& inst); @@ -332,7 +332,7 @@ public: void submit_messages(list& ls); // create a new messenger - EntityMessenger *new_entity(msg_addr_t addr); + EntityMessenger *new_entity(entity_name_t addr); } ; diff --git a/branches/aleung/security1/ceph/msg/SerialMessenger.h 
b/branches/aleung/security1/ceph/msg/SerialMessenger.h index d03e7377d2826..1c5c9e9c3961a 100644 --- a/branches/aleung/security1/ceph/msg/SerialMessenger.h +++ b/branches/aleung/security1/ceph/msg/SerialMessenger.h @@ -21,8 +21,8 @@ class SerialMessenger : public Dispatcher { public: virtual void dispatch(Message *m) = 0; // i receive my messages here - virtual void send(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0; // doesn't block - virtual Message *sendrecv(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0; // blocks for matching reply + virtual void send(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // doesn't block + virtual Message *sendrecv(Message *m, entity_name_t dest, int port=0, int fromport=0) = 0; // blocks for matching reply }; #endif diff --git a/branches/aleung/security1/ceph/msg/SimpleMessenger.cc b/branches/aleung/security1/ceph/msg/SimpleMessenger.cc index f6377e828c77d..5bb9e84d188d6 100644 --- a/branches/aleung/security1/ceph/msg/SimpleMessenger.cc +++ b/branches/aleung/security1/ceph/msg/SimpleMessenger.cc @@ -17,17 +17,11 @@ #include #include #include +#include #include "config.h" #include "messages/MGenericMessage.h" -#include "messages/MNSConnect.h" -#include "messages/MNSConnectAck.h" -#include "messages/MNSRegister.h" -#include "messages/MNSRegisterAck.h" -#include "messages/MNSLookup.h" -#include "messages/MNSLookupReply.h" -#include "messages/MNSFailure.h" //#include "messages/MFailure.h" @@ -35,8 +29,8 @@ #undef dout -#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- " << rank.my_inst.addr << " " -#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- " << rank.my_inst.addr << " " +#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- " << rank.my_addr << " " +#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- " << rank.my_addr << " " @@ -51,11 +45,31 @@ Rank rank; * Accepter */ +void simplemessenger_sigint(int r) +{ + 
rank.sigint(); +} + +void Rank::sigint() +{ + lock.Lock(); + derr(0) << "got control-c, exiting" << endl; + ::close(accepter.listen_sd); + _exit(-1); + lock.Unlock(); +} + + + + int Rank::Accepter::start() { // bind to a socket dout(10) << "accepter.start binding to listen " << endl; + // use whatever user specified.. + g_my_addr.make_addr(rank.listen_addr); + /* socket creation */ listen_sd = socket(AF_INET,SOCK_STREAM,0); assert(listen_sd > 0); @@ -66,18 +80,17 @@ int Rank::Accepter::start() derr(0) << "accepter.start unable to bind to " << rank.listen_addr << endl; assert(rc >= 0); + // what port did we get? socklen_t llen = sizeof(rank.listen_addr); getsockname(listen_sd, (sockaddr*)&rank.listen_addr, &llen); - int myport = rank.listen_addr.sin_port; + dout(10) << "accepter.start bound to " << rank.listen_addr << endl; // listen! rc = ::listen(listen_sd, 1000); assert(rc >= 0); - - //dout(10) << "accepter.start listening on " << myport << endl; - // my address is... + // my address is... HELP HELP HELP! char host[100]; bzero(host, 100); gethostname(host, 100); @@ -85,18 +98,26 @@ int Rank::Accepter::start() struct hostent *myhostname = gethostbyname( host ); - struct sockaddr_in my_addr; - memset(&my_addr, 0, sizeof(my_addr)); + // figure out my_addr + if (g_my_addr.port > 0) { + // user specified it, easy peasy. + rank.my_addr = g_my_addr; + } else { + // look up my hostname. blech! this sucks. + rank.listen_addr.sin_family = myhostname->h_addrtype; + memcpy((char *) &rank.listen_addr.sin_addr.s_addr, + myhostname->h_addr_list[0], + myhostname->h_length); + + // set up my_addr with a nonce + rank.my_addr.set_addr(rank.listen_addr); + rank.my_addr.nonce = getpid(); // FIXME: pid might not be best choice here. 
+ } + + dout(10) << "accepter.start my addr is " << rank.my_addr << endl; - my_addr.sin_family = myhostname->h_addrtype; - memcpy((char *) &my_addr.sin_addr.s_addr, - myhostname->h_addr_list[0], - myhostname->h_length); - my_addr.sin_port = myport; - - rank.listen_addr = my_addr; - - dout(10) << "accepter.start listen addr is " << rank.listen_addr << endl; + // set up signal handler + signal(SIGINT, simplemessenger_sigint); // start thread create(); @@ -117,8 +138,10 @@ void *Rank::Accepter::entry() dout(10) << "accepted incoming on sd " << sd << endl; rank.lock.Lock(); - Pipe *p = new Pipe(sd); - rank.pipes.insert(p); + if (!rank.local.empty()) { + Pipe *p = new Pipe(sd); + rank.pipes.insert(p); + } rank.lock.Unlock(); } else { dout(10) << "no incoming connection?" << endl; @@ -140,7 +163,7 @@ int Rank::Pipe::accept() // my creater gave me sd via accept() // announce myself. - int rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); + int rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); if (rc < 0) { ::close(sd); done = true; @@ -148,7 +171,7 @@ int Rank::Pipe::accept() } // identify peer - rc = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst)); + rc = tcp_read(sd, (char*)&peer_addr, sizeof(peer_addr)); if (rc < 0) { dout(10) << "pipe(? " << this << ").accept couldn't read peer inst" << endl; ::close(sd); @@ -161,39 +184,34 @@ int Rank::Pipe::accept() writer_thread.create(); // register pipe. - if (peer_inst.rank >= 0) { - rank.lock.Lock(); - { - if (rank.rank_pipe.count(peer_inst.rank) == 0) { - // install a pipe! - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl; - rank.rank_pipe[peer_inst.rank] = this; + rank.lock.Lock(); + { + if (rank.rank_pipe.count(peer_addr) == 0) { + // install a pipe! 
+ dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr << endl; + rank.rank_pipe[peer_addr] = this; + } else { + // low ranks' Pipes "win" + if (peer_addr < rank.my_addr) { + dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr + << ", already had pipe, but switching to this new one" << endl; + // switch to this new Pipe + rank.rank_pipe[peer_addr]->close(); // close old one + rank.rank_pipe[peer_addr] = this; } else { - // low ranks' Pipes "win" - if (peer_inst.rank < rank.my_inst.rank || - rank.my_inst.rank < 0) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst - << ", already had pipe, but switching to this new one" << endl; - // switch to this new Pipe - rank.rank_pipe[peer_inst.rank]->close(); // close old one - rank.rank_pipe[peer_inst.rank] = this; - } else { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst - << ", already had pipe, sticking with it" << endl; - } + dout(10) << "pipe(" << peer_addr << ' ' << this << ").accept peer is " << peer_addr + << ", already had pipe, sticking with it" << endl; } } - rank.lock.Unlock(); - } else { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl; } + rank.lock.Unlock(); return 0; // success. } int Rank::Pipe::connect() { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl; + dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect" << endl; // create socket? sd = socket(AF_INET,SOCK_STREAM,0); @@ -209,34 +227,42 @@ int Rank::Pipe::connect() assert(rc>=0); // connect! 
- rc = ::connect(sd, (sockaddr*)&peer_inst.addr, sizeof(myAddr)); - if (rc < 0) return rc; + tcpaddr_t tcpaddr; + peer_addr.make_addr(tcpaddr); + rc = ::connect(sd, (sockaddr*)&tcpaddr, sizeof(myAddr)); + if (rc < 0) { + dout(10) << "connect error " << peer_addr + << ", " << errno << ": " << strerror(errno) << endl; + return rc; + } // identify peer - entity_inst_t inst; - rc = tcp_read(sd, (char*)&inst, sizeof(inst)); - if (inst.rank < 0) - inst = peer_inst; // i know better than they do. - if (peer_inst != inst && inst.rank > 0) { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl; + entity_addr_t paddr; + rc = tcp_read(sd, (char*)&paddr, sizeof(paddr)); + if (!rc) { // bool + dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect couldn't read peer addr" << endl; + return -1; + } + if (peer_addr != paddr) { + derr(0) << "pipe(" << peer_addr << ' ' << this << ").connect peer is " << paddr << ", wtf" << endl; assert(0); return -1; } // identify myself - rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); + rc = tcp_write(sd, (char*)&rank.my_addr, sizeof(rank.my_addr)); if (rc < 0) return -1; // register pipe rank.lock.Lock(); { - if (rank.rank_pipe.count(peer_inst.rank) == 0) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl; - rank.rank_pipe[peer_inst.rank] = this; + if (rank.rank_pipe.count(peer_addr) == 0) { + dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect registering pipe" << endl; + rank.rank_pipe[peer_addr] = this; } else { // this is normal. - dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl; + dout(10) << "pipe(" << peer_addr << ' ' << this << ").connect pipe already registered." 
<< endl; } } rank.lock.Unlock(); @@ -251,32 +277,37 @@ int Rank::Pipe::connect() void Rank::Pipe::close() { - if (sent_close) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl; - return; - } - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl; + dout(10) << "pipe(" << peer_addr << ' ' << this << ").close" << endl; // unreg ourselves rank.lock.Lock(); { - if (rank.rank_pipe.count(peer_inst.rank) && - rank.rank_pipe[peer_inst.rank] == this) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl; - rank.rank_pipe.erase(peer_inst.rank); + if (rank.rank_pipe.count(peer_addr) && + rank.rank_pipe[peer_addr] == this) { + dout(10) << "pipe(" << peer_addr << ' ' << this + << ").close unregistering pipe" << endl; + rank.rank_pipe.erase(peer_addr); } } rank.lock.Unlock(); - // queue close message. - if (socket_error) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close not queueing MSG_CLOSE, socket error" << endl; + // queue close message? 
+ if (!need_to_send_close) { + dout(10) << "pipe(" << peer_addr << ' ' << this + << ").close already closing/closed" << endl; + return; + } + + if (!writer_running) { + dout(10) << "pipe(" << peer_addr << ' ' << this + << ").close not queueing MSG_CLOSE, no writer running" << endl; } else { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl; + dout(10) << "pipe(" << peer_addr << ' ' << this + << ").close queueing MSG_CLOSE" << endl; lock.Lock(); q.push_back(new MGenericMessage(MSG_CLOSE)); cond.Signal(); - sent_close = true; + need_to_send_close = false; lock.Unlock(); } } @@ -297,34 +328,25 @@ void Rank::Pipe::reader() if (!m || m->get_type() == 0) { if (m) { delete m; - dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl; + dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader read MSG_CLOSE message" << endl; + need_to_send_close = false; } else { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl; + derr(10) << "pipe(" << peer_addr << ' ' << this << ").reader read null message" << endl; } - if (!sent_close) - close(); + close(); done = true; cond.Signal(); // wake up writer too. 
break; } - dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl; + dout(10) << "pipe(" << peer_addr << ' ' << this << ").reader got message for " << m->get_dest() << endl; EntityMessenger *entity = 0; rank.lock.Lock(); { - if (rank.entity_map.count(m->get_source()) && - rank.entity_map[m->get_source()] > m->get_source_inst()) { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source() - << " inst " << m->get_source_inst() - << " > " << rank.entity_map[m->get_source()] - << ", WATCH OUT " << *m << endl; - assert(0); - } - if (g_conf.ms_single_dispatch) { // submit to single dispatch queue rank._submit_single_dispatch(m); @@ -335,8 +357,12 @@ void Rank::Pipe::reader() } else { entity = rank.find_unnamed(m->get_dest()); if (!entity) { - derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; - assert(0); // FIXME do this differently + if (rank.stopped.count(m->get_dest())) { + // ignore it + } else { + derr(0) << "pipe(" << peer_addr << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; + assert(0); // FIXME do this differently + } } } } @@ -358,7 +384,7 @@ void Rank::Pipe::reader() lock.Unlock(); if (reap) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl; + dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader queueing for reap" << endl; ::close(sd); rank.lock.Lock(); { @@ -378,7 +404,9 @@ void Rank::Pipe::writer() if (!server) { int rc = connect(); if (rc < 0) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting" << endl; + derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error connecting, " + << errno << ": " << strerror(errno) + << endl; done = true; list out; fail(out); @@ -390,7 +418,7 @@ void Rank::Pipe::writer() while (!q.empty() || !done) { if 
(!q.empty()) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl; + dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer grabbing message(s)" << endl; // grab outgoing list list out; @@ -403,10 +431,10 @@ void Rank::Pipe::writer() Message *m = out.front(); out.pop_front(); - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *m << endl; + dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << *m << endl; // stamp. - m->set_source_inst(rank.my_inst); + m->set_source_addr(rank.my_addr); // marshall if (m->empty_payload()) @@ -414,7 +442,9 @@ void Rank::Pipe::writer() if (write_message(m) < 0) { // failed! - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() << endl; + derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() + << ", " << errno << ": " << strerror(errno) + << endl; out.push_front(m); fail(out); done = true; @@ -434,12 +464,12 @@ void Rank::Pipe::writer() } // wait - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl; + dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sleeping" << endl; cond.Wait(lock); } lock.Unlock(); - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl; + dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer finishing" << endl; // reap? 
bool reap = false; @@ -451,7 +481,7 @@ void Rank::Pipe::writer() lock.Unlock(); if (reap) { - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl; + dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer queueing for reap" << endl; ::close(sd); rank.lock.Lock(); { @@ -470,12 +500,12 @@ Message *Rank::Pipe::read_message() msg_envelope_t env; if (!tcp_read( sd, (char*)&env, sizeof(env) )) { - socket_error = true; + need_to_send_close = false; return 0; } - dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type - << " src " << env.source << " dst " << env.dest + dout(20) << "pipe(" << peer_addr << ' ' << this << ").reader got envelope type=" << env.type + << " src " << env.src << " dst " << env.dst << " nchunks=" << env.nchunks << endl; @@ -484,7 +514,7 @@ Message *Rank::Pipe::read_message() for (int i=0; iget_source() << endl; return m; @@ -528,16 +558,16 @@ int Rank::Pipe::write_message(Message *m) env->nchunks = 1; #endif - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m + dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer sending " << m << " " << *m << " to " << m->get_dest() << endl; // send envelope int r = tcp_write( sd, (char*)env, sizeof(*env) ); if (r < 0) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m + derr(1) << "pipe(" << peer_addr << ' ' << this << ").writer error sending envelope for " << *m << " to " << m->get_dest() << endl; - socket_error = true; + need_to_send_close = false; return -1; } @@ -548,18 +578,18 @@ int Rank::Pipe::write_message(Message *m) for (list::const_iterator it = blist.buffers().begin(); it != blist.buffers().end(); it++) { - dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl; + dout(10) << "pipe(" << peer_addr << ' ' << this << ").writer tcp_sending frag " << i << " len " << 
(*it).length() << endl; int size = (*it).length(); r = tcp_write( sd, (char*)&size, sizeof(size) ); if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl; - socket_error = true; + derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending chunk len for " << *m << " to " << m->get_dest() << endl; + need_to_send_close = false; return -1; } r = tcp_write( sd, (*it).c_str(), size ); if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; - socket_error = true; + derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; + need_to_send_close = false; return -1; } i++; @@ -569,11 +599,11 @@ int Rank::Pipe::write_message(Message *m) int size = blist.length(); r = tcp_write( sd, (char*)&size, sizeof(size) ); if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; - socket_error = true; + derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; + need_to_send_close = false; return -1; } - dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl; + dout(20) << "pipe(" << peer_addr << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl; for (list::const_iterator it = blist.buffers().begin(); it != blist.buffers().end(); @@ -581,8 +611,8 @@ int Rank::Pipe::write_message(Message *m) if ((*it).length() == 0) continue; // blank buffer. 
r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() ); if (r < 0) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; - socket_error = true; + derr(10) << "pipe(" << peer_addr << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; + need_to_send_close = false; return -1; } } @@ -594,23 +624,23 @@ int Rank::Pipe::write_message(Message *m) void Rank::Pipe::fail(list& out) { - derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl; + derr(10) << "pipe(" << peer_addr << ' ' << this << ").fail" << endl; // FIXME: possible race before i reclaim lock here? // deactivate myself rank.lock.Lock(); { - if (rank.rank_pipe.count(peer_inst.rank) && - rank.rank_pipe[peer_inst.rank] == this) - rank.rank_pipe.erase(peer_inst.rank); + if (rank.rank_pipe.count(peer_addr) && + rank.rank_pipe[peer_addr] == this) + rank.rank_pipe.erase(peer_addr); } rank.lock.Unlock(); // what do i do about reader()? FIXME // sort my messages by (source) dispatcher, dest. - map > > by_dis; + map > > by_dis; lock.Lock(); { // include out at front of queue @@ -626,7 +656,7 @@ void Rank::Pipe::fail(list& out) Dispatcher *dis = mgr->get_dispatcher(); if (mgr->is_stopped()) { // ignore. - dout(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << *q.front() << ", dispatcher stopping, ignoring." << endl; + dout(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << *q.front() << ", dispatcher stopping, ignoring." 
<< endl; delete q.front(); } else { by_dis[dis][q.front()->get_dest()].push_back(q.front()); @@ -643,17 +673,17 @@ void Rank::Pipe::fail(list& out) lock.Unlock(); // report failure(s) to dispatcher(s) - for (map > >::iterator i = by_dis.begin(); + for (map > >::iterator i = by_dis.begin(); i != by_dis.end(); ++i) - for (map >::iterator j = i->second.begin(); + for (map >::iterator j = i->second.begin(); j != i->second.end(); ++j) for (list::iterator k = j->second.begin(); k != j->second.end(); ++k) { - derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **k << " to " << j->first << " inst " << peer_inst << endl; - i->first->ms_handle_failure(*k, j->first, peer_inst); + derr(1) << "pipe(" << peer_addr << ' ' << this << ").fail on " << **k << " to " << (*k)->get_dest_inst() << endl; + i->first->ms_handle_failure(*k, (*k)->get_dest_inst()); } } @@ -678,13 +708,14 @@ Rank::~Rank() { } +/* void Rank::set_listen_addr(tcpaddr_t& a) { dout(10) << "set_listen_addr " << a << endl; memcpy((char*)&listen_addr.sin_addr.s_addr, (char*)&a.sin_addr.s_addr, 4); listen_addr.sin_port = a.sin_port; } - +*/ void Rank::_submit_single_dispatch(Message *m) { @@ -715,7 +746,7 @@ void Rank::single_dispatcher_entry() ls.pop_front(); dout(1) << m->get_dest() - << " <-- " << m->get_source() << " " << m->get_source_inst() + << " <-- " << m->get_source_inst() << " ---- " << *m << " -- " << m << endl; @@ -743,12 +774,12 @@ void Rank::reaper() while (!pipe_reap_queue.empty()) { Pipe *p = pipe_reap_queue.front(); - dout(10) << "reaper reaping pipe " << p->get_peer_inst() << endl; + dout(10) << "reaper reaping pipe " << p->get_peer_addr() << endl; pipe_reap_queue.pop_front(); assert(pipes.count(p)); pipes.erase(p); p->join(); - dout(10) << "reaper reaped pipe " << p->get_peer_inst() << endl; + dout(10) << "reaper reaped pipe " << p->get_peer_addr() << endl; delete p; } } @@ -770,10 +801,7 @@ int Rank::start_rank() lock.Lock(); - // my_inst - my_inst.set_addr( listen_addr ); - - dout(1) 
<< "start_rank at " << my_inst << endl; + dout(1) << "start_rank at " << listen_addr << endl; lock.Unlock(); return 0; @@ -784,16 +812,16 @@ int Rank::start_rank() /* connect_rank * NOTE: assumes rank.lock held. */ -Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst) +Rank::Pipe *Rank::connect_rank(const entity_addr_t& addr) { assert(rank.lock.is_locked()); - assert(inst != rank.my_inst); + assert(addr != rank.my_addr); - dout(10) << "connect_rank to " << inst << endl; + dout(10) << "connect_rank to " << addr << endl; // create pipe - Pipe *pipe = new Pipe(inst); - rank.rank_pipe[inst.rank] = pipe; + Pipe *pipe = new Pipe(addr); + rank.rank_pipe[addr] = pipe; pipes.insert(pipe); return pipe; @@ -803,25 +831,11 @@ Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst) -void Rank::show_dir() -{ - dout(10) << "show_dir ---" << endl; - - for (hash_map::iterator i = entity_map.begin(); - i != entity_map.end(); - i++) { - if (local.count(i->first)) { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl; - } else { - dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl; - } - } -} -Rank::EntityMessenger *Rank::find_unnamed(msg_addr_t a) +Rank::EntityMessenger *Rank::find_unnamed(entity_name_t a) { // find an unnamed local entity of the right type - for (map::iterator p = local.begin(); + for (map::iterator p = local.begin(); p != local.end(); ++p) { if (p->first.type() == a.type() && p->first.is_new()) @@ -835,17 +849,17 @@ Rank::EntityMessenger *Rank::find_unnamed(msg_addr_t a) /* register_entity */ -Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr) +Rank::EntityMessenger *Rank::register_entity(entity_name_t name) { - dout(10) << "register_entity " << addr << endl; + dout(10) << "register_entity " << name << endl; lock.Lock(); // create messenger - EntityMessenger *msgr = new EntityMessenger(addr); + EntityMessenger *msgr = new EntityMessenger(name); // add to directory - 
entity_map[addr] = my_inst; - local[addr] = msgr; + assert(local.count(name) == 0); + local[name] = msgr; lock.Unlock(); return msgr; @@ -855,23 +869,23 @@ Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr) void Rank::unregister_entity(EntityMessenger *msgr) { lock.Lock(); - dout(10) << "unregister_entity " << msgr->get_myaddr() << endl; + dout(10) << "unregister_entity " << msgr->get_myname() << endl; // remove from local directory. - assert(local.count(msgr->get_myaddr())); - local.erase(msgr->get_myaddr()); - assert(entity_map.count(msgr->get_myaddr())); - entity_map.erase(msgr->get_myaddr()); - + entity_name_t name = msgr->get_myname(); + assert(local.count(name)); + local.erase(name); + + stopped.insert(name); wait_cond.Signal(); lock.Unlock(); } -void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) +void Rank::submit_message(Message *m, const entity_addr_t& dest_addr) { - const msg_addr_t dest = m->get_dest(); + const entity_name_t dest = m->get_dest(); // lookup EntityMessenger *entity = 0; @@ -880,7 +894,7 @@ void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) lock.Lock(); { // local? - if (dest_inst.rank == my_inst.rank) { + if (dest_addr == my_addr) { if (local.count(dest)) { // local dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; @@ -890,20 +904,20 @@ void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) entity = local[dest]; } } else { - derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but not in local map?" << endl; - assert(0); // hmpf + derr(0) << "submit_message " << *m << " dest " << dest << " " << dest_addr << " local but not in local map?" << endl; + //assert(0); // hmpf, this is probably mds->mon beacon from newsyn. } } else { // remote. - if (rank_pipe.count( dest_inst.rank )) { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." 
<< endl; + if (rank_pipe.count( dest_addr )) { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", already connected." << endl; // connected. - pipe = rank_pipe[ dest_inst.rank ]; + pipe = rank_pipe[ dest_addr ]; } else { - dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl; + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_addr << ", connecting." << endl; // not connected. - pipe = connect_rank( dest_inst ); + pipe = connect_rank( dest_addr ); } } } @@ -936,6 +950,8 @@ void Rank::wait() if (local.empty()) { dout(10) << "wait: everything stopped" << endl; break; // everything stopped. + } else { + dout(10) << "wait: local still has " << local.size() << " items, waiting" << endl; } wait_cond.Wait(lock); @@ -944,6 +960,9 @@ void Rank::wait() // done! clean up. + //dout(10) << "wait: stopping accepter thread" << endl; + //accepter.stop(); + // stop dispatch thread if (g_conf.ms_single_dispatch) { dout(10) << "wait: stopping dispatch thread" << endl; @@ -959,7 +978,7 @@ void Rank::wait() { dout(10) << "wait: closing pipes" << endl; list toclose; - for (hash_map<__int64_t,Pipe*>::iterator i = rank_pipe.begin(); + for (hash_map::iterator i = rank_pipe.begin(); i != rank_pipe.end(); i++) toclose.push_back(i->second); @@ -988,12 +1007,11 @@ void Rank::wait() * EntityMessenger */ -Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) : +Rank::EntityMessenger::EntityMessenger(entity_name_t myaddr) : Messenger(myaddr), stop(false), dispatch_thread(this) { - set_myinst(rank.my_inst); } Rank::EntityMessenger::~EntityMessenger() { @@ -1011,10 +1029,15 @@ void Rank::EntityMessenger::dispatch_entry() { // deliver while (!ls.empty()) { + if (stop) { + dout(1) << "dispatch: stop=true, discarding " << ls.size() + << " messages in dispatch queue" << endl; + break; + } Message *m = ls.front(); ls.pop_front(); dout(1) << m->get_dest() - << " <-- " << 
m->get_source() << " " << m->get_source_inst() + << " <-- " << m->get_source_inst() << " ---- " << *m << " -- " << m << endl; @@ -1027,6 +1050,9 @@ void Rank::EntityMessenger::dispatch_entry() cond.Wait(lock); } lock.Unlock(); + + // deregister + rank.unregister_entity(this); } void Rank::EntityMessenger::ready() @@ -1035,10 +1061,10 @@ void Rank::EntityMessenger::ready() if (g_conf.ms_single_dispatch) { rank.lock.Lock(); - if (rank.waiting_for_ready.count(get_myaddr())) { + if (rank.waiting_for_ready.count(get_myname())) { rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), - rank.waiting_for_ready[get_myaddr()]); - rank.waiting_for_ready.erase(get_myaddr()); + rank.waiting_for_ready[get_myname()]); + rank.waiting_for_ready.erase(get_myname()); rank.single_dispatch_cond.Signal(); } rank.lock.Unlock(); @@ -1053,9 +1079,6 @@ int Rank::EntityMessenger::shutdown() { dout(10) << "shutdown " << get_myaddr() << endl; - // deregister - rank.unregister_entity(this); - // stop my dispatch thread if (dispatch_thread.am_self()) { dout(1) << "shutdown i am dispatch, setting stop flag" << endl; @@ -1073,63 +1096,68 @@ int Rank::EntityMessenger::shutdown() } -void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst) +void Rank::EntityMessenger::prepare_dest(const entity_addr_t& addr) { rank.lock.Lock(); { - if (rank.rank_pipe.count(inst.rank) == 0) - rank.connect_rank(inst); + if (rank.rank_pipe.count(addr) == 0) + rank.connect_rank(addr); } rank.lock.Unlock(); } -int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, entity_inst_t inst, +int Rank::EntityMessenger::send_message(Message *m, entity_inst_t dest, int port, int fromport) { // set envelope - m->set_source(get_myaddr(), fromport); - m->set_dest(dest, port); - - m->set_source_inst(rank.my_inst); - + m->set_source(get_myname(), fromport); + m->set_source_addr(rank.my_addr); + m->set_dest_inst(dest); + m->set_dest_port(port); + dout(1) << m->get_source() - << " --> " << 
m->get_dest() << " " << inst + << " --> " << dest.name << " " << dest.addr << " -- " << *m << " -- " << m << endl; - rank.submit_message(m, inst); + rank.submit_message(m, dest.addr); return 0; } -void Rank::EntityMessenger::reset_myaddr(msg_addr_t newaddr) + +const entity_addr_t &Rank::EntityMessenger::get_myaddr() +{ + return rank.my_addr; +} + + +void Rank::EntityMessenger::reset_myname(entity_name_t newname) { - msg_addr_t oldaddr = get_myaddr(); - dout(10) << "set_myaddr " << oldaddr << " to " << newaddr << endl; + entity_name_t oldname = get_myname(); + dout(10) << "reset_myname " << oldname << " to " << newname << endl; - rank.entity_map.erase(oldaddr); - rank.local.erase(oldaddr); - rank.entity_map[newaddr] = rank.my_inst; - rank.local[newaddr] = this; + rank.local.erase(oldname); + rank.local[newname] = this; - _set_myaddr(newaddr); + _set_myname(newname); } -void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i) +void Rank::EntityMessenger::mark_down(entity_addr_t a) { - assert(a != get_myaddr()); - rank.mark_down(a,i); + rank.mark_down(a); } -void Rank::mark_down(msg_addr_t a, entity_inst_t& inst) +void Rank::mark_down(entity_addr_t addr) { //if (my_rank == 0) return; // ugh.. 
rank0 already handles this stuff in the namer lock.Lock(); + /* if (entity_map.count(a) && entity_map[a] > inst) { dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; @@ -1148,43 +1176,14 @@ void Rank::mark_down(msg_addr_t a, entity_inst_t& inst) entity_map.erase(a); - if (rank_pipe.count(inst.rank)) { - rank_pipe[inst.rank]->close(); - rank_pipe.erase(inst.rank); + if (rank_pipe.count(inst)) { + rank_pipe[inst]->close(); + rank_pipe.erase(inst); } } } + */ lock.Unlock(); } -void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i) -{ - assert(a != get_myaddr()); - rank.mark_up(a, i); -} - -void Rank::mark_up(msg_addr_t a, entity_inst_t& i) -{ - lock.Lock(); - { - dout(10) << "mark_up " << a << " inst " << i << endl; - derr(10) << "mark_up " << a << " inst " << i << endl; - - if (entity_map.count(a) == 0 || - entity_map[a] < i) { - entity_map[a] = i; - connect_rank(i); - } else if (entity_map[a] == i) { - dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl; - derr(10) << "mark_up " << a << " inst " << i << " ... 
knew it" << endl; - } else { - dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; - } - - //if (waiting_for_lookup.count(a)) - //lookup(a); - } - lock.Unlock(); -} diff --git a/branches/aleung/security1/ceph/msg/SimpleMessenger.h b/branches/aleung/security1/ceph/msg/SimpleMessenger.h index 0860f65d62cdb..e1265423edb13 100644 --- a/branches/aleung/security1/ceph/msg/SimpleMessenger.h +++ b/branches/aleung/security1/ceph/msg/SimpleMessenger.h @@ -39,7 +39,10 @@ using namespace __gnu_cxx; /* Rank - per-process */ class Rank { - +public: + void sigint(); + +private: class EntityMessenger; class Pipe; @@ -60,6 +63,8 @@ class Rank { } int start(); } accepter; + + void sigint(int r); // pipe @@ -67,10 +72,9 @@ class Rank { protected: int sd; bool done; - entity_inst_t peer_inst; + entity_addr_t peer_addr; bool server; - bool sent_close; - bool socket_error; + bool need_to_send_close; bool reader_running; bool writer_running; @@ -104,22 +108,22 @@ class Rank { void *entry() { pipe->writer(); return 0; } } writer_thread; friend class Writer; - + public: Pipe(int s) : sd(s), done(false), server(true), - sent_close(false), socket_error(false), + need_to_send_close(true), reader_running(false), writer_running(false), reader_thread(this), writer_thread(this) { // server reader_running = true; reader_thread.create(); } - Pipe(const entity_inst_t &pi) : sd(0), - done(false), peer_inst(pi), server(false), - sent_close(false), - reader_running(false), writer_running(false), - reader_thread(this), writer_thread(this) { + Pipe(const entity_addr_t &pi) : sd(0), + done(false), peer_addr(pi), server(false), + need_to_send_close(true), + reader_running(false), writer_running(false), + reader_thread(this), writer_thread(this) { // client writer_running = true; writer_thread.create(); @@ -127,9 +131,9 @@ class Rank { // public constructors static const Pipe& Server(int s); - 
static const Pipe& Client(const entity_inst_t& pi); + static const Pipe& Client(const entity_addr_t& pi); - entity_inst_t& get_peer_inst() { return peer_inst; } + entity_addr_t& get_peer_addr() { return peer_addr; } void close(); void join() { @@ -186,7 +190,7 @@ class Rank { } public: - EntityMessenger(msg_addr_t myaddr); + EntityMessenger(entity_name_t myaddr); ~EntityMessenger(); void ready(); @@ -196,16 +200,17 @@ class Rank { dispatch_thread.join(); } - void reset_myaddr(msg_addr_t m); + const entity_addr_t &get_myaddr(); + + void reset_myname(entity_name_t m); - void callback_kick() {} int shutdown(); - void prepare_dest(const entity_inst_t& inst); - int send_message(Message *m, msg_addr_t dest, entity_inst_t inst, + void prepare_dest(const entity_addr_t& addr); + int send_message(Message *m, entity_inst_t dest, int port=0, int fromport=0); - void mark_down(msg_addr_t a, entity_inst_t& i); - void mark_up(msg_addr_t a, entity_inst_t& i); + void mark_down(entity_addr_t a); + void mark_up(entity_name_t a, entity_addr_t& i); }; @@ -223,7 +228,7 @@ class Rank { bool single_dispatch_stop; list single_dispatch_queue; - map > waiting_for_ready; + map > waiting_for_ready; void single_dispatcher_entry(); void _submit_single_dispatch(Message *m); @@ -236,54 +241,48 @@ class Rank { // where i listen tcpaddr_t listen_addr; - - // my instance - entity_inst_t my_inst; + entity_addr_t my_addr; - // lookup - hash_map entity_map; - hash_set entity_unstarted; - // local - map local; + map local; + set stopped; + //hash_set entity_unstarted; // remote - hash_map<__int64_t, Pipe*> rank_pipe; + hash_map rank_pipe; set pipes; list pipe_reap_queue; - - void show_dir(); - - Pipe *connect_rank(const entity_inst_t& inst); + + Pipe *connect_rank(const entity_addr_t& addr); - void mark_down(msg_addr_t addr, entity_inst_t& i); - void mark_up(msg_addr_t addr, entity_inst_t& i); + void mark_down(entity_addr_t addr); + //void mark_up(entity_name_t addr, entity_addr_t& i); tcpaddr_t 
get_listen_addr() { return listen_addr; } void reaper(); - EntityMessenger *find_unnamed(msg_addr_t a); + EntityMessenger *find_unnamed(entity_name_t a); public: Rank(); ~Rank(); - void set_listen_addr(tcpaddr_t& a); + //void set_listen_addr(tcpaddr_t& a); int start_rank(); void wait(); - EntityMessenger *register_entity(msg_addr_t addr); - void rename_entity(EntityMessenger *ms, msg_addr_t newaddr); + EntityMessenger *register_entity(entity_name_t addr); + void rename_entity(EntityMessenger *ms, entity_name_t newaddr); void unregister_entity(EntityMessenger *ms); - void submit_message(Message *m, const entity_inst_t& inst); - void prepare_dest(const entity_inst_t& inst); + void submit_message(Message *m, const entity_addr_t& addr); + void prepare_dest(const entity_addr_t& addr); // create a new messenger - EntityMessenger *new_entity(msg_addr_t addr); + EntityMessenger *new_entity(entity_name_t addr); } ; diff --git a/branches/aleung/security1/ceph/msg/TCPDirectory.cc b/branches/aleung/security1/ceph/msg/TCPDirectory.cc index 111f6ee69f2f3..57000ac30d74c 100644 --- a/branches/aleung/security1/ceph/msg/TCPDirectory.cc +++ b/branches/aleung/security1/ceph/msg/TCPDirectory.cc @@ -54,7 +54,7 @@ void TCPDirectory::handle_register(MNSRegister *m) // pick id int rank = m->get_rank(); - msg_addr_t entity = m->get_entity(); + entity_name_t entity = m->get_entity(); if (entity.is_new()) { // make up a new address! 
@@ -105,7 +105,7 @@ void TCPDirectory::handle_register(MNSRegister *m) void TCPDirectory::handle_started(Message *m) { - msg_addr_t entity = m->get_source(); + entity_name_t entity = m->get_source(); dout(3) << "start signal from " << MSG_ADDR_NICE(entity) << endl; hold.erase(entity); @@ -128,7 +128,7 @@ void TCPDirectory::handle_started(Message *m) void TCPDirectory::handle_unregister(Message *m) { - msg_addr_t who = m->get_source(); + entity_name_t who = m->get_source(); dout(2) << "unregister from entity " << MSG_ADDR_NICE(who) << endl; assert(dir.count(who)); @@ -142,7 +142,7 @@ void TCPDirectory::handle_unregister(Message *m) else { if (0) { dout(10) << "dir size now " << dir.size() << endl; - for (hash_map::iterator it = dir.begin(); + for (hash_map::iterator it = dir.begin(); it != dir.end(); it++) { dout(10) << " dir: " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl; diff --git a/branches/aleung/security1/ceph/msg/TCPDirectory.h b/branches/aleung/security1/ceph/msg/TCPDirectory.h index 1b54bb010e906..7f450e9a64be5 100644 --- a/branches/aleung/security1/ceph/msg/TCPDirectory.h +++ b/branches/aleung/security1/ceph/msg/TCPDirectory.h @@ -40,17 +40,17 @@ class TCPDirectory : public Dispatcher { TCPMessenger *messenger; // directory - hash_map dir; // entity -> rank + hash_map dir; // entity -> rank hash_map rank_addr; // rank -> ADDR (e.g. 
host:port) __uint64_t version; - map<__uint64_t, msg_addr_t> update_log; + map<__uint64_t, entity_name_t> update_log; int nrank; int nclient, nmds, nosd; - set hold; - map > waiting; + set hold; + map > waiting; // messages void handle_connect(class MNSConnect*); diff --git a/branches/aleung/security1/ceph/msg/TCPMessenger.cc b/branches/aleung/security1/ceph/msg/TCPMessenger.cc index 2c594bb528df6..f40ea9b162e6b 100644 --- a/branches/aleung/security1/ceph/msg/TCPMessenger.cc +++ b/branches/aleung/security1/ceph/msg/TCPMessenger.cc @@ -76,8 +76,8 @@ off_t stat_outq = 0, stat_outqb = 0; // local directory -hash_map directory; // local -hash_set directory_ready; +hash_map directory; // local +hash_set directory_ready; Mutex directory_lock; // connecting @@ -89,7 +89,7 @@ Cond waiting_for_rank; // register long regid = 0; map waiting_for_register_cond; -map waiting_for_register_result; +map waiting_for_register_result; // incoming messages list incoming; @@ -132,11 +132,11 @@ public: } single_out_thread; Mutex lookup_lock; // -hash_map entity_rank; // entity -> rank +hash_map entity_rank; // entity -> rank hash_map rank_sd; // outgoing sockets, rank -> sd hash_map rank_out; hash_map rank_addr; // rank -> tcpaddr -map > waiting_for_lookup; +map > waiting_for_lookup; /* this process */ @@ -292,7 +292,7 @@ public: list waiting; dout(DBL) << "got lookup reply" << endl; - for (map::iterator it = m->entity_rank.begin(); + for (map::iterator it = m->entity_rank.begin(); it != m->entity_rank.end(); it++) { dout(DBL) << "lookup got " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl; @@ -660,7 +660,7 @@ void tcp_marshall(Message *m) OutThread *tcp_lookup(Message *m) { - msg_addr_t addr = m->get_dest(); + entity_name_t addr = m->get_dest(); if (!entity_rank.count(m->get_dest())) { // lookup and wait. 
@@ -822,7 +822,7 @@ void *tcp_inthread(void *r) while (!tcp_done) { Message *m = tcp_recv(sd); if (!m) break; - msg_addr_t who = m->get_source(); + entity_name_t who = m->get_source(); dout(20) << g_clock.now() << " inthread got " << m << " from sd " << sd << " who is " << who << endl; @@ -830,7 +830,7 @@ void *tcp_inthread(void *r) size_t sz = m->get_payload().length(); if (g_conf.tcp_multi_dispatch) { - const msg_addr_t dest = m->get_dest(); + const entity_name_t dest = m->get_dest(); directory_lock.Lock(); TCPMessenger *messenger = directory[ dest ]; directory_lock.Unlock(); @@ -1023,7 +1023,7 @@ void* tcp_dispatchthread(void*) } // ok - msg_addr_t dest = m->get_dest(); + entity_name_t dest = m->get_dest(); directory_lock.Lock(); if (directory.count(dest)) { Messenger *who = directory[ dest ]; @@ -1145,7 +1145,7 @@ void tcpmessenger_wait() -msg_addr_t register_entity(msg_addr_t addr) +entity_name_t register_entity(entity_name_t addr) { lookup_lock.Lock(); @@ -1185,7 +1185,7 @@ msg_addr_t register_entity(msg_addr_t addr) cond.Wait(lookup_lock); // get result, clean up - msg_addr_t entity = waiting_for_register_result[id]; + entity_name_t entity = waiting_for_register_result[id]; waiting_for_register_result.erase(id); waiting_for_register_cond.erase(id); @@ -1204,7 +1204,7 @@ msg_addr_t register_entity(msg_addr_t addr) */ -TCPMessenger::TCPMessenger(msg_addr_t myaddr) : +TCPMessenger::TCPMessenger(entity_name_t myaddr) : Messenger(myaddr), dispatch_thread(this) { @@ -1266,7 +1266,7 @@ tcpaddr_t& TCPMessenger::get_tcpaddr() return listen_addr; } -void TCPMessenger::map_entity_rank(msg_addr_t e, int r) +void TCPMessenger::map_entity_rank(entity_name_t e, int r) { lookup_lock.Lock(); entity_rank[e] = r; @@ -1341,7 +1341,7 @@ int TCPMessenger::shutdown() if (g_conf.tcp_multi_dispatch) { // kill off dispatch threads dout(DBL) << "killing dispatch threads" << endl; - for (hash_map::iterator it = directory.begin(); + for (hash_map::iterator it = directory.begin(); it != 
directory.end(); it++) it->second->dispatch_stop(); @@ -1391,7 +1391,7 @@ int TCPMessenger::shutdown() /* note: send_message _MUST_ be non-blocking */ -int TCPMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport) +int TCPMessenger::send_message(Message *m, entity_name_t dest, int port, int fromport) { // set envelope m->set_source(get_myaddr(), fromport); diff --git a/branches/aleung/security1/ceph/msg/TCPMessenger.h b/branches/aleung/security1/ceph/msg/TCPMessenger.h index 5cafbe470214b..414e50f5fef87 100644 --- a/branches/aleung/security1/ceph/msg/TCPMessenger.h +++ b/branches/aleung/security1/ceph/msg/TCPMessenger.h @@ -71,13 +71,13 @@ public: } public: - TCPMessenger(msg_addr_t myaddr); + TCPMessenger(entity_name_t myaddr); ~TCPMessenger(); void ready(); tcpaddr_t& get_tcpaddr(); - void map_entity_rank(msg_addr_t e, int r); + void map_entity_rank(entity_name_t e, int r); void map_rank_addr(int r, tcpaddr_t a); int get_dispatch_queue_len(); @@ -88,7 +88,7 @@ public: virtual int shutdown(); // message interface - virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0); + virtual int send_message(Message *m, entity_name_t dest, int port=0, int fromport=0); }; /** diff --git a/branches/aleung/security1/ceph/msg/msg_types.h b/branches/aleung/security1/ceph/msg/msg_types.h new file mode 100644 index 0000000000000..0b92df47020d0 --- /dev/null +++ b/branches/aleung/security1/ceph/msg/msg_types.h @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MSG_TYPES_H +#define __MSG_TYPES_H + +#include "include/types.h" +#include "tcp.h" + +// new typed msg_addr_t way! 
+class entity_name_t { + int _type; + int _num; + +public: + static const int TYPE_MON = 1; + static const int TYPE_MDS = 2; + static const int TYPE_OSD = 3; + static const int TYPE_CLIENT = 4; + + static const int NEW = -1; + + // cons + entity_name_t() : _type(0), _num(0) {} + entity_name_t(int t, int n) : _type(t), _num(n) {} + + int num() const { return _num; } + int type() const { return _type; } + const char *type_str() const { + switch (type()) { + case TYPE_MDS: return "mds"; + case TYPE_OSD: return "osd"; + case TYPE_MON: return "mon"; + case TYPE_CLIENT: return "client"; + default: return "unknown"; + } + } + + bool is_new() const { return num() == NEW; } + + bool is_client() const { return type() == TYPE_CLIENT; } + bool is_mds() const { return type() == TYPE_MDS; } + bool is_osd() const { return type() == TYPE_OSD; } + bool is_mon() const { return type() == TYPE_MON; } +}; + +inline bool operator== (const entity_name_t& l, const entity_name_t& r) { + return (l.type() == r.type()) && (l.num() == r.num()); } +inline bool operator!= (const entity_name_t& l, const entity_name_t& r) { + return (l.type() != r.type()) || (l.num() != r.num()); } +inline bool operator< (const entity_name_t& l, const entity_name_t& r) { + return (l.type() < r.type()) || (l.type() == r.type() && l.num() < r.num()); } + +inline std::ostream& operator<<(std::ostream& out, const entity_name_t& addr) { + //if (addr.is_namer()) return out << "namer"; + if (addr.is_new() || addr.num() < 0) + return out << addr.type_str() << "?"; + else + return out << addr.type_str() << addr.num(); +} + +namespace __gnu_cxx { + template<> struct hash< entity_name_t > + { + size_t operator()( const entity_name_t m ) const + { + static blobhash H; + return H((const char*)&m, sizeof(m)); + } + }; +} + +// get rid of these +#define MSG_ADDR_MDS(x) entity_name_t(entity_name_t::TYPE_MDS,x) +#define MSG_ADDR_OSD(x) entity_name_t(entity_name_t::TYPE_OSD,x) +#define MSG_ADDR_MON(x) 
entity_name_t(entity_name_t::TYPE_MON,x) +#define MSG_ADDR_CLIENT(x) entity_name_t(entity_name_t::TYPE_CLIENT,x) + +#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(entity_name_t::NEW) +#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(entity_name_t::NEW) +#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(entity_name_t::NEW) +#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(entity_name_t::NEW) + + +/* + * an entity's network address. + * includes a random value that prevents it from being reused. + * thus identifies a particular process instance. + * ipv4 for now. + */ +struct entity_addr_t { + __uint8_t ipq[4]; + __uint32_t port; + __uint32_t nonce; // bind time, or pid, or something unique! + + entity_addr_t() : port(0), nonce(0) { + ipq[0] = ipq[1] = ipq[2] = ipq[3] = 0; + } + + void set_addr(tcpaddr_t a) { + memcpy((char*)ipq, (char*)&a.sin_addr.s_addr, 4); + port = ntohs(a.sin_port); + } + void make_addr(tcpaddr_t& a) const { + memset(&a, 0, sizeof(a)); + a.sin_family = AF_INET; + memcpy((char*)&a.sin_addr.s_addr, (char*)ipq, 4); + a.sin_port = htons(port); + } +}; + +inline ostream& operator<<(ostream& out, const entity_addr_t &addr) +{ + return out << (int)addr.ipq[0] + << '.' << (int)addr.ipq[1] + << '.' << (int)addr.ipq[2] + << '.' << (int)addr.ipq[3] + << ':' << addr.port + << '.' 
<< addr.nonce; +} + +inline bool operator==(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } +inline bool operator!=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } +inline bool operator<(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } +inline bool operator<=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } +inline bool operator>(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } +inline bool operator>=(const entity_addr_t& a, const entity_addr_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } + +namespace __gnu_cxx { + template<> struct hash< entity_addr_t > + { + size_t operator()( const entity_addr_t& x ) const + { + static blobhash H; + return H((const char*)&x, sizeof(x)); + } + }; +} + + +/* + * a particular entity instance + */ +struct entity_inst_t { + entity_name_t name; + entity_addr_t addr; + entity_inst_t() {} + entity_inst_t(entity_name_t n, const entity_addr_t& a) : name(n), addr(a) {} +}; + + +inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) == 0; } +inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) != 0; } +inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) < 0; } +inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) <= 0; } +inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) > 0; } +inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return memcmp(&a, &b, sizeof(a)) >= 0; } + +namespace __gnu_cxx { + template<> struct hash< entity_inst_t > + { + size_t operator()( const entity_inst_t& x ) const + { + static blobhash H; + return H((const char*)&x, 
sizeof(x)); + } + }; +} + +inline ostream& operator<<(ostream& out, const entity_inst_t &i) +{ + return out << i.name << " " << i.addr; +} + + +#endif diff --git a/branches/aleung/security1/ceph/msg/tcp.h b/branches/aleung/security1/ceph/msg/tcp.h index f38388d456a8c..65043cda8e2ac 100644 --- a/branches/aleung/security1/ceph/msg/tcp.h +++ b/branches/aleung/security1/ceph/msg/tcp.h @@ -18,7 +18,7 @@ inline ostream& operator<<(ostream& out, const tcpaddr_t &a) << (unsigned)addr[1] << "." << (unsigned)addr[2] << "." << (unsigned)addr[3] << ":" - << (int)a.sin_port; + << ntohs(a.sin_port); return out; } diff --git a/branches/aleung/security1/ceph/newsyn.cc b/branches/aleung/security1/ceph/newsyn.cc index f4f6309a1aaa0..ecf90c2319929 100644 --- a/branches/aleung/security1/ceph/newsyn.cc +++ b/branches/aleung/security1/ceph/newsyn.cc @@ -68,7 +68,8 @@ pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) rank.start_rank(); // bind and listen if (mpi_rank < g_conf.num_mon) { - moninst[mpi_rank].set_addr( rank.get_listen_addr() ); + moninst[mpi_rank].addr = rank.my_addr; + moninst[mpi_rank].name = MSG_ADDR_MON(mpi_rank); //cerr << mpi_rank << " at " << rank.get_listen_addr() << endl; } @@ -193,6 +194,11 @@ int main(int argc, char **argv) // start up messenger via MPI MonMap *monmap = new MonMap(g_conf.num_mon); + + // need a key pair + string mon_private_key; + monmap->generate_key_pair(mon_private_key); + pair mpiwho = mpi_bootstrap_new(argc, argv, monmap); int myrank = mpiwho.first; int world = mpiwho.second; @@ -225,6 +231,7 @@ int main(int argc, char **argv) // create mon if (myrank < g_conf.num_mon) { Monitor *mon = new Monitor(myrank, rank.register_entity(MSG_ADDR_MON(myrank)), monmap); + mon->set_new_private_key(mon_private_key); mon->init(); } @@ -242,7 +249,7 @@ int main(int argc, char **argv) for (int i=0; iinit(); started++; @@ -268,7 +275,7 @@ int main(int argc, char **argv) g_timer.add_event_after(kill_osd_after[i], new C_Die); Messenger *m = 
rank.register_entity(MSG_ADDR_OSD(i)); - cerr << "osd" << i << " at " << rank.my_inst << " " << hostname << "." << pid << endl; + cerr << "osd" << i << " at " << rank.my_addr << " " << hostname << "." << pid << endl; osd[i] = new OSD(i, m, monmap); osd[i]->init(); started++; @@ -286,6 +293,7 @@ int main(int argc, char **argv) set clientlist; map client;//[NUMCLIENT]; map syn;//[NUMCLIENT]; + int nclients = 0; for (int i=0; imount(); + nclients++; } if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl; - int nclients = 0; for (set::iterator it = clientlist.begin(); it != clientlist.end(); it++) { int i = *it; //cerr << "starting synthetic client" << i << " on rank " << myrank << endl; - client[i]->mount(); syn[i]->start_thread(); - nclients++; } if (nclients) { - cerr << nclients << " clients at " << rank.my_inst << " " << hostname << "." << pid << endl; + cerr << nclients << " clients at " << rank.my_addr << " " << hostname << "." << pid << endl; } for (set::iterator it = clientlist.begin(); @@ -359,7 +367,7 @@ int main(int argc, char **argv) if (myrank && !started) { //dout(1) << "IDLE" << endl; - cerr << "idle at " << rank.my_inst << " " << hostname << "." << pid << endl; + cerr << "idle at " << rank.my_addr << " " << hostname << "." << pid << endl; //rank.stop_rank(); } diff --git a/branches/aleung/security1/ceph/osbdb/OSBDB.cc b/branches/aleung/security1/ceph/osbdb/OSBDB.cc new file mode 100644 index 0000000000000..c4f4f5a71acbc --- /dev/null +++ b/branches/aleung/security1/ceph/osbdb/OSBDB.cc @@ -0,0 +1,1395 @@ +/* OSBDB.cc -- ObjectStore on top of Berkeley DB. + Copyright (C) 2007 Casey Marshall + +Ceph - scalable distributed file system + +This is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License version 2.1, as published by the Free Software +Foundation. See file COPYING. 
*/ + + +#include +#include "OSBDB.h" + +using namespace std; + +#undef dout +#define dout(x) if (x <= g_conf.debug_bdbstore) cout << "bdbstore(" << device << ")." +#undef derr +#define derr(x) if (x <= g_conf.debug_bdbstore) cerr << "bdbstore(" << device << ")." + + // Utilities. + +// Starting off with my own bsearch; mail reader to follow... + +// Perform a binary search on a sorted array, returning the insertion +// point for key, or key if it is exactly found. In other words, this +// will return a pointer to the element that will come after key if +// key were to be inserted into the sorted array. +// +// Requires that T have < and > operators defined. +template +uint32_t binary_search (T *array, size_t size, T key) +{ + int low = 0; + int high = size; + int p = (low + high) / 2; + + while (low < high - 1) + { + if (array[p] > key) + { + high = p; + } + else if (array[p] < key) + { + low = p; + } + else + return p; + + p = (low + high) / 2; + } + + if (array[p] < key) + p++; + else if (array[p] > key && p > 0) + p--; + return p; +} + + // Management. 
+ +int OSBDB::opendb(DBTYPE type, int flags) +{ + db = new Db(env, 0); + db->set_error_stream (&std::cerr); + db->set_message_stream (&std::cout); + db->set_flags (0); + if (!g_conf.bdbstore_btree) + { + if (g_conf.bdbstore_pagesize > 0) + db->set_pagesize (g_conf.bdbstore_pagesize); + if (g_conf.bdbstore_ffactor > 0 && g_conf.bdbstore_nelem > 0) + { + db->set_h_ffactor (g_conf.bdbstore_ffactor); + db->set_h_nelem (g_conf.bdbstore_nelem); + } + } + if (g_conf.bdbstore_cachesize > 0) + { + db->set_cachesize (0, g_conf.bdbstore_cachesize, 0); + } + + int ret; + if ((ret = db->open (NULL, device.c_str(), NULL, type, flags, 0)) != 0) + { + derr(1) << "failed to open database: " << device << ": " + << strerror(ret) << std::endl; + return -EINVAL; + } + opened = true; + return 0; +} + +int OSBDB::mount() +{ + dout(2) << "mount " << device << endl; + + if (mounted) + return 0; + + if (!opened) + { + int ret; + if ((ret = opendb ()) != 0) + return ret; + } + + // XXX Do we want anything else in the superblock? + + Dbt key (OSBDB_SUPERBLOCK_KEY, 1); + stored_superblock super; + Dbt value (&super, sizeof (super)); + value.set_dlen (sizeof (super)); + value.set_ulen (sizeof (super)); + value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); + + if (db->get (NULL, &key, &value, 0) != 0) + return -EINVAL; // XXX how to say "badly formed fs?" 
+ + dout(2) << ".mount " << super << endl; + + if (super.version != OSBDB_THIS_VERSION) + return -EINVAL; + + DBTYPE t; + db->get_type (&t); + + if (t == DB_BTREE) + { + u_int32_t minkey; + u_int32_t flags; + db->get_bt_minkey (&minkey); + db->get_flags (&flags); + dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Btree; " + << "min keys per page: " << minkey << "; flags: " + << hex << flags << endl; + cout << dec; + } + else + { + u_int32_t ffactor; + u_int32_t nelem; + u_int32_t flags; + db->get_h_ffactor (&ffactor); + db->get_h_nelem (&nelem); + db->get_flags (&flags); + dout(1) << "mounted version " << OSBDB_THIS_VERSION << "; Hash; " + << "fill factor: " << ffactor + << " table size: " << nelem << "; flags: " + << hex << flags << endl; + cout << dec; + } + + mounted = true; + return 0; +} + +int OSBDB::umount() +{ + if (!mounted) + return -EINVAL; + sync(); + int ret; + if (opened) + { + if ((ret = db->close (0)) != 0) + { + derr(1) << "close: " << db_strerror(ret) << endl; + return -EINVAL; + } + delete db; + db = NULL; + } + mounted = false; + opened = false; + return 0; +} + +int OSBDB::mkfs() +{ + if (mounted) + return -EINVAL; + + dout(2) << "mkfs" << endl; + + unlink (device.c_str()); + int ret; + if ((ret = opendb((g_conf.bdbstore_btree ? DB_BTREE : DB_HASH), DB_CREATE)) != 0) + { + derr(1) << "failed to open database: " << device << ": " + << strerror(ret) << std::endl; + return -EINVAL; + } + opened = true; + dout(3) << "..opened " << device << endl; + + uint32_t c; + ret = db->truncate (NULL, &c, 0); + if (ret != 0) + { + return -EIO; // ??? + } + + Dbt key (OSBDB_SUPERBLOCK_KEY, 1); + struct stored_superblock sb; + sb.version = OSBDB_THIS_VERSION; + Dbt value (&sb, sizeof (sb)); + + dout(3) << "..writing superblock" << endl; + if (db->put (NULL, &key, &value, 0) != 0) + { + return -EIO; // ??? + } + dout(3) << "..wrote superblock" << endl; + + return 0; +} + + // Objects. 
+ +int OSBDB::pick_object_revision_lt(object_t& oid) +{ + if (!mounted) + return -EINVAL; + + // XXX this is pretty lame. Can we do better? + assert(oid.rev > 0); + oid.rev--; + while (oid.rev > 0) + { + if (exists (oid)) + { + return 0; + } + oid.rev--; + } + return -EEXIST; // FIXME +} + +bool OSBDB::exists(object_t oid) +{ + dout(2) << "exists " << oid << endl; + struct stat st; + return (stat (oid, &st) == 0); +} + +int OSBDB::statfs (struct statfs *st) +{ + return -ENOSYS; +} + +int OSBDB::stat(object_t oid, struct stat *st) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "stat " << oid << endl; + + object_inode_key ikey = new_object_inode_key(oid); + stored_object obj; + Dbt key (&ikey, sizeof_object_inode_key()); + Dbt value (&obj, sizeof (obj)); + value.set_flags (DB_DBT_USERMEM); + value.set_ulen (sizeof (obj)); + + dout(3) << " lookup " << ikey << endl; + int ret; + if ((ret = db->get (NULL, &key, &value, 0)) != 0) + { + derr(1) << " get returned " << ret << endl; + return -ENOENT; + } + + st->st_size = obj.length; + dout(3) << "stat length:" << obj.length << endl; + return 0; +} + +int OSBDB::remove(object_t oid, Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "remove " << oid << endl; + + oid_t id; + mkoid(id, oid); + Dbt key (&id, sizeof (oid_t)); + db->del (NULL, &key, 0); + object_inode_key _ikey = new_object_inode_key (oid); + Dbt ikey (&_ikey, sizeof_object_inode_key()); + db->del (NULL, &ikey, 0); + + attrs_id aids = new_attrs_id (oid); + Dbt askey (&aids, sizeof_attrs_id()); + Dbt asval; + asval.set_flags (DB_DBT_MALLOC); + if (db->get (NULL, &askey, &asval, 0) == 0) + { + // We have attributes; remove them. 
+ stored_attrs *sap = (stored_attrs *) asval.get_data(); + auto_ptr sa (sap); + for (unsigned i = 0; i < sap->count; i++) + { + attr_id aid = new_attr_id (oid, sap->names[i].name); + Dbt akey (&aid, sizeof (aid)); + db->del (NULL, &akey, 0); + } + db->del (NULL, &askey, 0); + } + + return 0; +} + +int OSBDB::truncate(object_t oid, off_t size, Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "truncate " << size << endl; + + if (size > 0xFFFFFFFF) + return -ENOSPC; + + object_inode_key ikey = new_object_inode_key (oid); + stored_object obj; + Dbt key (&ikey, sizeof_object_inode_key()); + Dbt value (&obj, sizeof (obj)); + value.set_dlen (sizeof (obj)); + value.set_ulen (sizeof (obj)); + value.set_flags (DB_DBT_USERMEM); + + if (db->get (NULL, &key, &value, 0) != 0) + return -ENOENT; + + if (obj.length < size) + { + oid_t id; + mkoid (id, oid); + Dbt okey (&id, sizeof (oid_t)); + char b[] = { '\0' }; + Dbt newVal (b, 1); + newVal.set_doff ((size_t) size); + newVal.set_dlen (1); + newVal.set_ulen (1); + newVal.set_flags (DB_DBT_PARTIAL); + if (db->put (NULL, &okey, &newVal, 0) != 0) + return -EIO; + + obj.length = size; + value.set_ulen (sizeof (obj)); + if (db->put (NULL, &key, &value, 0) != 0) + return -EIO; + } + else if (obj.length > size) + { + obj.length = size; + Dbt tval (&obj, sizeof (obj)); + tval.set_ulen (sizeof (obj)); + tval.set_flags (DB_DBT_USERMEM); + if (db->put (NULL, &key, &tval, 0) != 0) + return -EIO; + if (size == 0) + { + char x[1]; + oid_t id; + mkoid (id, oid); + Dbt okey (&id, sizeof (oid_t)); + Dbt oval (&x, 0); + if (db->put (NULL, &okey, &oval, 0) != 0) + return -EIO; + } + else + { + oid_t id; + mkoid (id, oid); + Dbt okey (&id, sizeof (oid_t)); + Dbt oval; + oval.set_flags (DB_DBT_MALLOC); + if (db->get (NULL, &okey, &oval, 0) != 0) + return -EIO; + auto_ptr ovalPtr ((char *) oval.get_data()); + oval.set_size ((size_t) size); + oval.set_ulen ((size_t) size); + if (db->put (NULL, &okey, &oval, 0) != 0) + return -EIO; + 
} + } + + return 0; +} + +// Read up to len bytes of object oid, starting at offset, into bl.c_str(). +// Returns the number of bytes read (clamped to the stored length), 0 when +// offset is past the end, or -EINVAL/-ENOENT/-EIO on failure. +// NOTE(review): assumes bl already has room for len bytes -- confirm. +int OSBDB::read(object_t oid, off_t offset, size_t len, bufferlist& bl) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "read " << oid << " " << offset << " " + << len << endl; + + DbTxn *txn = NULL; + //env->txn_begin (NULL, &txn, 0); + + object_inode_key _ikey = new_object_inode_key (oid); + stored_object obj; + Dbt ikey (&_ikey, sizeof_object_inode_key()); + Dbt ival (&obj, sizeof (obj)); + ival.set_flags (DB_DBT_USERMEM); + ival.set_ulen (sizeof(obj)); + + dout(3) << " get " << _ikey << endl; + int ret; + if ((ret = db->get (txn, &ikey, &ival, 0)) != 0) + { + //txn->abort(); + derr(1) << "get returned " << db_strerror (ret) << endl; + return -ENOENT; + } + + if (offset == 0 && len >= obj.length) + { + len = obj.length; + dout(3) << " doing full read of " << len << endl; + oid_t id; + mkoid (id, oid); + Dbt key (&id, sizeof (oid_t)); + Dbt value (bl.c_str(), len); + value.set_ulen (len); + value.set_flags (DB_DBT_USERMEM); + dout(3) << " getting " << oid << endl; + if ((ret = db->get (txn, &key, &value, 0)) != 0) + { + derr(1) << " get returned " << db_strerror (ret) << endl; + //txn->abort(); + return -EIO; + } + } + else + { + if (offset > obj.length) + return 0; + if (offset + len > obj.length) + len = obj.length - (size_t) offset; + dout(3) << " doing partial read of " << len << endl; + oid_t id; + mkoid (id, oid); + // the key buffer is the oid_t built by mkoid, so size it as oid_t + // (sizeof (oid) would measure the object_t argument instead) + Dbt key (&id, sizeof (oid_t)); + Dbt value (bl.c_str(), len); + value.set_doff ((size_t) offset); + value.set_dlen (len); + value.set_ulen (len); + value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); + dout(3) << " getting " << oid << endl; + if ((ret = db->get (NULL, &key, &value, 0)) != 0) + { + derr(1) << "get returned " << db_strerror (ret) << endl; + //txn->abort(); + return -EIO; + } + } + + //txn->commit (0); + return len; +} + +int OSBDB::write(object_t oid, off_t offset, size_t len, + bufferlist& bl, Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "write " << oid << " " << offset << " " + 
<< len << endl; + + if (offset > 0xFFFFFFFFL || offset + len > 0xFFFFFFFFL) + return -ENOSPC; + + DbTxn *txn = NULL; + //env->txn_begin (NULL, &txn, 0); + + object_inode_key _ikey = new_object_inode_key (oid); + stored_object obj; + Dbt ikey (&_ikey, sizeof_object_inode_key()); + Dbt ival (&obj, sizeof (obj)); + ival.set_ulen (sizeof (obj)); + ival.set_flags (DB_DBT_USERMEM); + + int ret; + dout(3) << " getting " << _ikey << endl; + if (db->get (txn, &ikey, &ival, 0) != 0) + { + dout(3) << " writing new object" << endl; + + // New object. + obj.length = (size_t) offset + len; + dout(3) << " mapping " << _ikey << " => " + << obj << endl; + if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) + { + derr(1) << " put returned " << db_strerror (ret) << endl; + return -EIO; + } + + oid_t id; + mkoid (id, oid); + Dbt key (&id, sizeof (oid_t)); + Dbt value (bl.c_str(), len); + if (offset == 0) // whole object + { + value.set_flags (DB_DBT_USERMEM); + value.set_ulen (len); + } + else + { + value.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); + value.set_ulen (len); + value.set_doff ((size_t) offset); + value.set_dlen (len); + } + dout(3) << " mapping " << oid << " => (" + << obj.length << " bytes)" << endl; + if ((ret = db->put (txn, &key, &value, 0)) != 0) + { + derr(1) << " put returned " << db_strerror (ret) << endl; + return -EIO; + } + return len; + } + + if (offset == 0 && len >= obj.length) + { + if (len != obj.length) + { + obj.length = len; + if ((ret = db->put (txn, &ikey, &ival, 0)) != 0) + { + derr(1) << " put returned " << db_strerror (ret) << endl; + return -EIO; + } + } + oid_t id; + mkoid(id, oid); + Dbt key (&id, sizeof (oid_t)); + Dbt value (bl.c_str(), len); + if (db->put (txn, &key, &value, 0) != 0) + { + return -EIO; + } + } + else + { + if (offset + len > obj.length) + { + obj.length = (size_t) offset + len; + if (db->put (NULL, &ikey, &ival, 0) != 0) + { + return -EIO; + } + } + oid_t id; + mkoid(id, oid); + Dbt key (&id, sizeof (oid_t)); + Dbt value 
(bl.c_str(), len); + value.set_doff ((size_t) offset); + value.set_dlen (len); + value.set_ulen (len); + value.set_flags (DB_DBT_PARTIAL); + if (db->put (NULL, &key, &value, 0) != 0) + { + return -EIO; + } + } + + return len; +} + +// Copy object oid's inode record and data to a new object noid. +// Returns 0 on success, -EINVAL if not mounted, -EEXIST if noid already +// exists, -ENOENT if oid cannot be read, -EIO if a put fails. +int OSBDB::clone(object_t oid, object_t noid) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "clone " << oid << ", " << noid << endl; + + if (exists (noid)) + return -EEXIST; + + object_inode_key _ikey = new_object_inode_key (oid); + object_inode_key _nikey = new_object_inode_key (noid); + stored_object obj; + Dbt ikey (&_ikey, sizeof_object_inode_key()); + Dbt ival (&obj, sizeof (obj)); + Dbt nikey (&_nikey, sizeof_object_inode_key()); + ival.set_ulen (sizeof (obj)); + ival.set_flags (DB_DBT_USERMEM); + + oid_t id, nid; + mkoid(id, oid); + mkoid(nid, noid); + Dbt key (&id, sizeof (oid_t)); + // the copied data must be keyed by the new object's id (nid), + // not by the source object + Dbt nkey (&nid, sizeof (oid_t)); + Dbt value; + value.set_flags (DB_DBT_MALLOC); + + if (db->get (NULL, &ikey, &ival, 0) != 0) + return -ENOENT; + if (db->get (NULL, &key, &value, 0) != 0) + return -ENOENT; + auto_ptr valueptr ((char *) value.get_data()); + + if (db->put (NULL, &nikey, &ival, 0) != 0) + return -EIO; + if (db->put (NULL, &nkey, &value, 0) != 0) + return -EIO; + + return 0; +} + + // Collections + +int OSBDB::list_collections(list& ls) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "list_collections" << endl; + + Dbt key (COLLECTIONS_KEY, 1); + Dbt value; + value.set_flags (DB_DBT_MALLOC); + + if (db->get (NULL, &key, &value, 0) != 0) + return 0; // no collections. 
+ + auto_ptr sc ((stored_colls *) value.get_data()); + stored_colls *scp = sc.get(); + for (uint32_t i = 0; i < sc->count; i++) + ls.push_back (scp->colls[i]); + + return scp->count; +} + +int OSBDB::create_collection(coll_t c, Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "create_collection " << c << endl; + + Dbt key (COLLECTIONS_KEY, 1); + Dbt value; + value.set_flags (DB_DBT_MALLOC); + + stored_colls *scp = NULL; + size_t sz = 0; + bool created = false; + if (db->get (NULL, &key, &value, 0) != 0) + { + sz = sizeof (stored_colls) + sizeof (coll_t); + scp = (stored_colls *) malloc (sz); + scp->count = 0; + created = true; + } + else + { + scp = (stored_colls *) value.get_data(); + sz = value.get_size(); + } + + auto_ptr sc (scp); + int ins = 0; + if (scp->count > 0) + ins = binary_search (scp->colls, scp->count, c); + if (scp->colls[ins] == c) + return -EEXIST; + + dout(3) << "..insertion point: " << ins << endl; + + // Make room for a new collection ID. + if (!created) + { + sz += sizeof (coll_t); + dout(3) << "..increase size to " << sz << endl; + stored_colls *scp2 = (stored_colls *) realloc (scp, sz); + sc.release (); + sc.reset (scp2); + scp = scp2; + } + + int n = (scp->count - ins) * sizeof (coll_t); + if (n > 0) + { + dout(3) << "..moving " << n << " bytes up" << endl; + memmove (&scp->colls[ins + 1], &scp->colls[ins], n); + } + scp->count++; + scp->colls[ins] = c; + + dout(3) << "..collections: " << scp << endl; + + // Put the modified collection list back. + { + Dbt value2 (scp, sz); + if (db->put (NULL, &key, &value2, 0) != 0) + { + return -EIO; + } + } + + // Create the new collection. 
+ { + stored_coll new_coll; + new_coll.count = 0; + Dbt coll_key (&c, sizeof (coll_t)); + Dbt coll_value (&new_coll, sizeof (stored_coll)); + if (db->put (NULL, &coll_key, &coll_value, 0) != 0) + { + return -EIO; + } + } + + return 0; +} + +// Remove collection c from the stored collection list, then delete the +// collection's own record. Returns 0 on success, -EINVAL if not mounted, +// -ENOENT if c is not in the list, -EIO if a put/del fails. +int OSBDB::destroy_collection(coll_t c, Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "destroy_collection " << c << endl; + + Dbt key (COLLECTIONS_KEY, 1); + Dbt value; + value.set_flags (DB_DBT_MALLOC); + + if (db->get (NULL, &key, &value, 0) != 0) + { + return -ENOENT; // XXX + } + + stored_colls *scp = (stored_colls *) value.get_data(); + auto_ptr valueBuf (scp); + if (scp->count == 0) + { + return -ENOENT; + } + uint32_t ins = binary_search (scp->colls, scp->count, c); + // binary_search can return count (insertion point past the last entry) + if (ins >= scp->count || scp->colls[ins] != c) + { + return -ENOENT; + } + + // Move the rest of the list down in memory, if needed. + if (ins < scp->count - 1) + { + // memmove takes a byte count, not an element count + size_t n = (scp->count - ins - 1) * sizeof (coll_t); + memmove (&scp->colls[ins], &scp->colls[ins + 1], n); + } + // keep the stored element count in step with the shrunken record + scp->count--; + + // Modify the record size to be one less. + Dbt nvalue (scp, value.get_size() - sizeof (coll_t)); + nvalue.set_flags (DB_DBT_USERMEM); + if (db->put (NULL, &key, &nvalue, 0) != 0) + { + return -EIO; + } + + // Delete the collection. 
+ Dbt collKey (&c, sizeof (coll_t)); + if (db->del (NULL, &collKey, 0) != 0) + { + return -EIO; + } + + return 0; +} + +// Report whether collection c is present in the stored collection list. +// Returns false when the store is not mounted or the list cannot be read. +bool OSBDB::collection_exists(coll_t c) +{ + if (!mounted) + return false; // -EINVAL here would convert to true in a bool return + + dout(2) << "collection_exists " << c << endl; + + Dbt key (COLLECTIONS_KEY, 1); + Dbt value; + value.set_flags (DB_DBT_MALLOC); + + if (db->get (NULL, &key, &value, 0) != 0) + return false; + + stored_colls *scp = (stored_colls *) value.get_data(); + auto_ptr sc (scp); + if (scp->count == 0) + return false; + uint32_t ins = binary_search (scp->colls, scp->count, c); + + // binary_search can return count (insertion point past the last entry) + return (ins < scp->count && scp->colls[ins] == c); +} + +int OSBDB::collection_stat(coll_t c, struct stat *st) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "collection_stat " << c << endl; + return -ENOSYS; +} + +int OSBDB::collection_add(coll_t c, object_t o, Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "collection_add " << c << " " << o << endl; + + Dbt key (&c, sizeof (coll_t)); + Dbt value; + value.set_flags (DB_DBT_MALLOC); + + if (db->get (NULL, &key, &value, 0) != 0) + { + return -ENOENT; + } + + size_t sz = value.get_size(); + stored_coll *scp = (stored_coll *) value.get_data(); + auto_ptr sc (scp); + + // Find the insertion point for the new object ID. + uint32_t ins = 0; + if (scp->count > 0) + { + ins = binary_search (scp->objects, scp->count, o); + // Already there? (ins == count means append; nothing to compare) + if (ins < scp->count && scp->objects[ins] == o) + { + return -EEXIST; + } + } + + // Make room for the new value, and add it. 
+ sz += sizeof (object_t); + scp = (stored_coll *) realloc (scp, sz); + sc.release(); + sc.reset (scp); + if (ins < scp->count) + { + size_t n = (scp->count - ins) * sizeof (object_t); + memmove (&scp->objects[ins + 1], &scp->objects[ins], n); + } + scp->count++; + scp->objects[ins] = o; + + dout(3) << "..collection: " << scp << endl; + + Dbt nvalue (scp, sz); + if (db->put (NULL, &key, &nvalue, 0) != 0) + { + return -EIO; + } + + return 0; +} + +int OSBDB::collection_remove(coll_t c, object_t o, Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "collection_remove " << c << " " << o << endl; + + Dbt key (&c, sizeof (coll_t)); + Dbt value; + value.set_flags (DB_DBT_MALLOC); + + if (db->get (NULL, &key, &value, 0) != 0) + { + return -ENOENT; + } + + stored_coll *scp = (stored_coll *) value.get_data(); + auto_ptr sc (scp); + + if (scp->count == 0) + { + return -ENOENT; + } + uint32_t ins = binary_search (scp->objects, scp->count, o); + // binary_search can return count (insertion point past the last entry) + if (ins >= scp->count || scp->objects[ins] != o) + { + return -ENOENT; + } + + if (ins < scp->count - 1) + { + size_t n = (scp->count - ins - 1) * sizeof (object_t); + memmove (&scp->objects[ins], &scp->objects[ins + 1], n); + } + scp->count--; + + dout(3) << "..collection " << scp << endl; + + Dbt nval (scp, value.get_size() - sizeof (object_t)); + if (db->put (NULL, &key, &nval, 0) != 0) + { + return -EIO; + } + + return 0; +} + +// Append every object id stored in collection c to o. +// Returns 0 on success, -EINVAL if not mounted, -ENOENT if c has no record. +int OSBDB::collection_list(coll_t c, list& o) +{ + if (!mounted) + return -EINVAL; + + Dbt key (&c, sizeof (coll_t)); + Dbt value; + // ask the library for a caller-owned buffer; sc below releases it, + // which is only valid for memory we own + value.set_flags (DB_DBT_MALLOC); + if (db->get (NULL, &key, &value, 0) != 0) + return -ENOENT; + + stored_coll *scp = (stored_coll *) value.get_data(); + auto_ptr sc (scp); + for (uint32_t i = 0; i < scp->count; i++) + o.push_back (scp->objects[i]); + + return 0; +} + + // Attributes + +int OSBDB::_setattr(object_t oid, const char *name, + const void *value, size_t size, Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + if (strlen (name) >= OSBDB_MAX_ATTR_LEN) + return -ENAMETOOLONG; + + // Add name to 
attribute list, if needed. + attrs_id aids = new_attrs_id (oid); + Dbt attrs_key (&aids, sizeof_attrs_id()); + Dbt attrs_val; + attrs_val.set_flags (DB_DBT_MALLOC); + stored_attrs *sap = NULL; + size_t sz = 0; + + dout(3) << " getting " << aids << endl; + if (db->get (NULL, &attrs_key, &attrs_val, 0) != 0) + { + dout(2) << " first attribute" << endl; + sz = sizeof (stored_attrs); + sap = (stored_attrs *) malloc(sz); + sap->count = 0; + } + else + { + sz = attrs_val.get_size(); + sap = (stored_attrs *) attrs_val.get_data(); + dout(2) << " add to list of " << sap->count << " attrs" << endl; + } + auto_ptr sa (sap); + + attr_name _name; + strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); + + int ins = 0; + if (sap->count > 0) + ins = binary_search (sap->names, sap->count, _name); + dout(3) << " insertion point is " << ins << endl; + if (sap->count == 0 || strcmp (sap->names[ins].name, name) != 0) + { + sz += sizeof (attr_name); + dout(3) << " realloc 0x" << hex << ((void *) sap) << " to " + << dec << sz << endl; + sap = (stored_attrs *) realloc (sap, sz); + dout(3) << " returns 0x" << hex << ((void *) sap) << endl; + sa.release (); + sa.reset (sap); + int n = (sap->count - ins) * sizeof (attr_name); + if (n > 0) + { + dout(3) << " move " << n << " bytes from 0x" + << hex << (&sap->names[ins]) << " to 0x" + << hex << (&sap->names[ins+1]) << endl; + memmove (&sap->names[ins+1], &sap->names[ins], n); + } + memset (&sap->names[ins], 0, sizeof (attr_name)); + strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); + sap->count++; + + Dbt newAttrs_val (sap, sz); + newAttrs_val.set_ulen (sz); + newAttrs_val.set_flags (DB_DBT_USERMEM); + dout(3) << " putting " << aids << endl; + if (db->put (NULL, &attrs_key, &newAttrs_val, 0) != 0) + return -EIO; + } + else + { + dout(3) << " attribute " << name << " already exists" << endl; + } + + dout(3) << " attributes list: " << sap << endl; + + // Add the attribute. 
+ attr_id aid = new_attr_id (oid, name); + Dbt attr_key (&aid, sizeof (aid)); + Dbt attr_val ((void *) value, size); + dout(3) << " writing attribute key " << aid << endl; + if (db->put (NULL, &attr_key, &attr_val, 0) != 0) + return -EIO; + + return 0; +} + +int OSBDB::setattr(object_t oid, const char *name, + const void *value, size_t size, + Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "setattr " << oid << ":" << name << " => (" + << size << " bytes)" << endl; + int ret = _setattr (oid, name, value, size, onsafe); + return ret; +} + +int OSBDB::setattrs(object_t oid, map& aset, + Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + map::iterator it; + for (it = aset.begin(); it != aset.end(); it++) + { + string name = it->first; + bufferptr value = it->second; + int ret = _setattr (oid, name.c_str(), value.c_str(), + value.length(), onsafe); + if (ret != 0) + { + return ret; + } + } + return 0; +} + +int OSBDB::_getattr (object_t oid, const char *name, void *value, size_t size) +{ + if (!mounted) + return -EINVAL; + + attr_id aid = new_attr_id (oid, name); + Dbt key (&aid, sizeof (aid)); + Dbt val (value, size); + val.set_ulen (size); + val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); + + if (db->get (NULL, &key, &val, 0) != 0) + { + return -ENOENT; + } + + return val.get_size(); +} + +int OSBDB::getattr(object_t oid, const char *name, void *value, size_t size) +{ + if (!mounted) + return -EINVAL; + + return _getattr (oid, name, value, size); +} + +int OSBDB::getattrs(object_t oid, map& aset) +{ + if (!mounted) + return -EINVAL; + + int count = 0; + for (map::iterator it = aset.begin(); + it != aset.end(); it++) + { + int ret = _getattr (oid, (*it).first.c_str(), + (*it).second.c_str(), + (*it).second.length()); + if (ret < 0) + return ret; + count += ret; + } + return count; +} + +int OSBDB::rmattr(object_t oid, const char *name, Context *onsafe) +{ + if (!mounted) + return -EINVAL; + attrs_id aids = new_attrs_id (oid); + Dbt askey 
(&aids, sizeof_attrs_id()); + Dbt asvalue; + asvalue.set_flags (DB_DBT_MALLOC); + + if (db->get (NULL, &askey, &asvalue, 0) != 0) + return -ENOENT; + + stored_attrs *sap = (stored_attrs *) asvalue.get_data(); + auto_ptr sa (sap); + + if (sap->count == 0) + return -ENOENT; + + attr_name _name; + memset(&name, 0, sizeof (_name)); + strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); + int ins = binary_search (sap->names, sap->count, _name); + if (strcmp (sap->names[ins].name, name) != 0) + return -ENOENT; + + // Shift the later elements down by one, if needed. + int n = (sap->count - ins) * sizeof (attr_name); + if (n > 0) + memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); + sap->count--; + asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); + int ret; + if ((ret = db->put (NULL, &askey, &asvalue, 0)) != 0) + { + derr(1) << "put stored_attrs " << db_strerror (ret) << endl; + return -EIO; + } + + // Remove the attribute. + attr_id aid = new_attr_id (oid, name); + Dbt key (&aid, sizeof (aid)); + if ((ret = db->del (NULL, &key, 0)) != 0) + derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl; + + return 0; +} + +int OSBDB::listattr(object_t oid, char *attrs, size_t size) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "listattr " << oid << endl; + + attrs_id aids = new_attrs_id (oid); + Dbt key (&aids, sizeof_attrs_id()); + Dbt value; + value.set_flags (DB_DBT_MALLOC); + + int ret; + if ((ret = db->get (NULL, &key, &value, 0)) != 0) + { + derr(1) << "fetching " << aids << ": " << db_strerror (ret) + << endl; + return -ENOENT; + } + + stored_attrs *attrsp = (stored_attrs *) value.get_data(); + auto_ptr _attrs (attrsp); + size_t s = 0; + char *p = attrs; + for (unsigned i = 0; i < attrsp->count && s < size; i++) + { + int n = MIN (OSBDB_MAX_ATTR_LEN, + MIN (strlen (attrsp->names[i].name), size - s - 1)); + strncpy (p, attrsp->names[i].name, n); + p[n] = '\0'; + p = p + n + 1; + } + return 0; +} + + // Collection attributes. 
+ +int OSBDB::collection_setattr(coll_t cid, const char *name, + const void *value, size_t size, + Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "collection_setattr" << cid << " " << name + << " (" << size << " bytes)" << endl; + if (strlen (name) >= OSBDB_MAX_ATTR_LEN) + return -ENAMETOOLONG; + + // Add name to attribute list, if needed. + coll_attrs_id aids = new_coll_attrs_id (cid); + Dbt attrs_key (&aids, sizeof_coll_attrs_id()); + Dbt attrs_val; + attrs_val.set_flags (DB_DBT_MALLOC); + stored_attrs *sap = NULL; + size_t sz = 0; + + dout(3) << " getting " << aids << endl; + if (db->get (NULL, &attrs_key, &attrs_val, 0) != 0) + { + dout(2) << " first attribute" << endl; + sz = sizeof (stored_attrs); + sap = (stored_attrs *) malloc(sz); + sap->count = 0; + } + else + { + sz = attrs_val.get_size(); + sap = (stored_attrs *) attrs_val.get_data(); + dout(2) << " add to list of " << sap->count << " attrs" << endl; + } + auto_ptr sa (sap); + + attr_name _name; + strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); + + int ins = 0; + if (sap->count > 0) + ins = binary_search (sap->names, sap->count, _name); + dout(3) << " insertion point is " << ins << endl; + if (sap->count == 0 || strcmp (sap->names[ins].name, name) != 0) + { + sz += sizeof (attr_name); + dout(3) << " realloc 0x" << hex << ((void *) sap) << " to " + << dec << sz << endl; + sap = (stored_attrs *) realloc (sap, sz); + dout(3) << " returns 0x" << hex << ((void *) sap) << endl; + sa.release (); + sa.reset (sap); + int n = (sap->count - ins) * sizeof (attr_name); + if (n > 0) + { + dout(3) << " move " << n << " bytes from 0x" + << hex << (&sap->names[ins]) << " to 0x" + << hex << (&sap->names[ins+1]) << endl; + memmove (&sap->names[ins+1], &sap->names[ins], n); + } + memset (&sap->names[ins], 0, sizeof (attr_name)); + strncpy (sap->names[ins].name, name, OSBDB_MAX_ATTR_LEN); + sap->count++; + + Dbt newAttrs_val (sap, sz); + newAttrs_val.set_ulen (sz); + newAttrs_val.set_flags 
(DB_DBT_USERMEM); + dout(3) << " putting " << aids << endl; + if (db->put (NULL, &attrs_key, &newAttrs_val, 0) != 0) + return -EIO; + } + else + { + dout(3) << " attribute " << name << " already exists" << endl; + } + + dout(3) << " attributes list: " << sap << endl; + + // Add the attribute. + coll_attr_id aid = new_coll_attr_id (cid, name); + Dbt attr_key (&aid, sizeof (aid)); + Dbt attr_val ((void *) value, size); + dout(3) << " writing attribute key " << aid << endl; + if (db->put (NULL, &attr_key, &attr_val, 0) != 0) + return -EIO; + + return 0; +} + +int OSBDB::collection_rmattr(coll_t cid, const char *name, + Context *onsafe) +{ + if (!mounted) + return -EINVAL; + + coll_attrs_id aids = new_coll_attrs_id (cid); + Dbt askey (&aids, sizeof_coll_attrs_id()); + Dbt asvalue; + asvalue.set_flags (DB_DBT_MALLOC); + + if (db->get (NULL, &askey, &asvalue, 0) != 0) + return -ENOENT; + + stored_attrs *sap = (stored_attrs *) asvalue.get_data(); + auto_ptr sa (sap); + + if (sap->count == 0) + return -ENOENT; + + attr_name _name; + memset(&name, 0, sizeof (_name)); + strncpy (_name.name, name, OSBDB_MAX_ATTR_LEN); + int ins = binary_search (sap->names, sap->count, _name); + if (strcmp (sap->names[ins].name, name) != 0) + return -ENOENT; + + // Shift the later elements down by one, if needed. + int n = (sap->count - ins) * sizeof (attr_name); + if (n > 0) + memmove (&(sap->names[ins]), &(sap->names[ins + 1]), n); + sap->count--; + asvalue.set_size(asvalue.get_size() - sizeof (attr_name)); + int ret; + if ((ret = db->put (NULL, &askey, &asvalue, 0)) != 0) + { + derr(1) << "put stored_attrs " << db_strerror (ret) << endl; + return -EIO; + } + + // Remove the attribute. 
+ coll_attr_id aid = new_coll_attr_id (cid, name); + Dbt key (&aid, sizeof (aid)); + if ((ret = db->del (NULL, &key, 0)) != 0) + derr(1) << "deleting " << aid << ": " << db_strerror(ret) << endl; + + return 0; +} + +int OSBDB::collection_getattr(coll_t cid, const char *name, + void *value, size_t size) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "collection_getattr " << cid << " " << name << endl; + + coll_attr_id caid = new_coll_attr_id (cid, name); + Dbt key (&caid, sizeof (caid)); + Dbt val (value, size); + val.set_ulen (size); + val.set_dlen (size); + val.set_flags (DB_DBT_USERMEM | DB_DBT_PARTIAL); + + if (db->get (NULL, &key, &val, 0) != 0) + return -ENOENT; + + return val.get_size(); +} + +int OSBDB::collection_listattr(coll_t cid, char *attrs, size_t size) +{ + if (!mounted) + return -EINVAL; + + dout(2) << "collection_listattr " << cid << endl; + + coll_attrs_id caids = new_coll_attrs_id (cid); + Dbt key (&caids, sizeof_coll_attrs_id()); + Dbt value; + value.set_flags (DB_DBT_MALLOC); + + int ret; + if ((ret = db->get (NULL, &key, &value, 0)) != 0) + { + derr(1) << "fetching " << caids << ": " << db_strerror (ret) + << endl; + return -ENOENT; + } + + stored_attrs *attrsp = (stored_attrs *) value.get_data(); + auto_ptr _attrs (attrsp); + size_t s = 0; + char *p = attrs; + for (unsigned i = 0; i < attrsp->count && s < size; i++) + { + int n = MIN (OSBDB_MAX_ATTR_LEN, + MIN (strlen (attrsp->names[i].name), size - s - 1)); + strncpy (p, attrsp->names[i].name, n); + p[n] = '\0'; + p = p + n + 1; + } + return 0; +} + + // Sync. + +void OSBDB::sync (Context *onsync) +{ + if (!mounted) + return; + + sync(); + // huh? 
+} + +void OSBDB::sync() +{ + if (!mounted) + return; + + db->sync(0); +} diff --git a/branches/aleung/security1/ceph/osbdb/OSBDB.h b/branches/aleung/security1/ceph/osbdb/OSBDB.h new file mode 100644 index 0000000000000..9ba42d206d290 --- /dev/null +++ b/branches/aleung/security1/ceph/osbdb/OSBDB.h @@ -0,0 +1,507 @@ +/* OSBDB.h -- ObjectStore on Berkeley DB. -*- c++ -*- + Copyright (C) 2007 Casey Marshall + +Ceph - scalable distributed file system + +This is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License version 2.1, as published by the Free Software +Foundation. See file COPYING. */ + + +#include +#include "osd/ObjectStore.h" + +// Redefine this to use a different BDB access type. DB_BTREE is +// probably the only other one that makes sense. +#ifndef OSBDB_DB_TYPE +#define OSBDB_DB_TYPE DB_HASH +#endif // OSBDB_DB_TYPE + +/* + * Maximum length of an attribute name. + */ +#define OSBDB_MAX_ATTR_LEN 256 + +#define OSBDB_THIS_VERSION 1 + +#define OSBDB_SUPERBLOCK_KEY ((void *) "s") + +/* + * The "superblock" of the BDB object store. We store one of these in + * the DB, to store version and other information. We don't record + * anything special here, just the version number the database was + * written with. + * + * In principle, this structure is variable-length, depending on the + * software version writing the superblock. + */ +struct stored_superblock +{ + uint32_t version; +}; + +inline ostream& operator<<(ostream& out, const stored_superblock sb) +{ + out << "osbdb.super(" << sb.version << ")" << endl; + return out; +} + +/** + * An object identifier; we define this so we can have a POD object to + * work with. + */ +struct oid_t // POD +{ + char id[16]; +}; + +inline void mkoid (oid_t& id, object_t& oid) +{ + // XXX byte order? 
+ memcpy (id.id, &oid, sizeof (oid_t)); +} + +inline ostream& operator<<(ostream& out, const oid_t id) +{ + for (int i = 0; i < 16; i++) + { + out.fill('0'); + out << setw(2) << hex << (id.id[i] & 0xFF); + if ((i & 3) == 3) + out << ':'; + } + out.unsetf(ios::right); + out << dec; + return out; +} + +/** + * An "inode" key. We map a 'stored_object' struct to this key for + * every object. + */ +struct object_inode_key // POD +{ + oid_t oid; + char tag; +}; + +/** + * "Constructor" for an object_inode_key. + */ +inline object_inode_key new_object_inode_key (object_t& oid) +{ + object_inode_key key; + memset(&key, 0, sizeof (object_inode_key)); + mkoid (key.oid, oid); + key.tag = 'i'; + return key; +} + +/* + * We use this, instead of sizeof(), to try and guarantee that we + * don't include the structure padding, if any. + * + * This *should* return 17: sizeof (oid_t) == 16; sizeof (char) == 1. + */ +inline size_t sizeof_object_inode_key() +{ + return offsetof(object_inode_key, tag) + sizeof (char); +} + + // Frank Poole: Unfortunately, that sounds a little + // like famous last words. + // -- 2001: A Space Odyssey + +inline ostream& operator<<(ostream& out, const object_inode_key o) +{ + out << o.tag << "/" << o.oid; + return out; +} + +/** + * A stored object. This is essentially the "inode" of the object, + * containing things like the object's length. The object itself is + * stored as-is, mapped by the 128-bit object ID. + */ +struct stored_object +{ + uint32_t length; +}; + +inline ostream& operator<<(ostream& out, const stored_object s) +{ + out << "inode(l:" << s.length << ")"; + return out; +} + +/* + * Key referencing the list of attribute names for an object. This is + * simply the object's ID, with an additional character 'a' appended. + */ +struct attrs_id // POD +{ + oid_t oid; + char tag; +}; + +/* + * "Construtor" for attrs_id. 
+ */ +inline struct attrs_id new_attrs_id (object_t& oid) +{ + attrs_id aid; + memset (&aid, 0, sizeof (attrs_id)); + mkoid(aid.oid, oid); + aid.tag = 'a'; + return aid; +} + +/* + * See explanation for sizeof_object_inode_id. + */ +inline size_t sizeof_attrs_id() +{ + return offsetof(struct attrs_id, tag) + sizeof (char); +} + +inline ostream& operator<<(ostream& out, const attrs_id id) +{ + out << id.tag << "/" << id.oid; + return out; +} + +/* + * Encapsulation of a single attribute name. + */ +struct attr_name // POD +{ + char name[OSBDB_MAX_ATTR_LEN]; +}; + +inline ostream& operator<<(ostream& out, const attr_name n) +{ + out << n.name; + return out; +} + +inline bool operator<(const attr_name n1, const attr_name n2) +{ + return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) < 0); +} + +inline bool operator>(const attr_name n1, const attr_name n2) +{ + return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) > 0); +} + +inline bool operator==(const attr_name n1, const attr_name n2) +{ + std::cerr << n1.name << " == " << n2.name << "?" << endl; + return (strncmp (n1.name, n2.name, OSBDB_MAX_ATTR_LEN) == 0); +} + +inline bool operator!=(const attr_name n1, const attr_name n2) +{ + return !(n1 == n2); +} + +inline bool operator>=(const attr_name n1, const attr_name n2) +{ + return !(n1 < n2); +} + +inline bool operator<=(const attr_name n1, const attr_name n2) +{ + return !(n1 > n2); +} + +/* + * A list of an object or collection's attribute names. + */ +struct stored_attrs +{ + uint32_t count; + attr_name names[0]; // actually variable-length +}; + +inline ostream& operator<<(ostream& out, const stored_attrs *sa) +{ + out << sa->count << " [ "; + for (unsigned i = 0; i < sa->count; i++) + out << sa->names[i] << (i == sa->count - 1 ? " " : ", "); + out << "]"; + return out; +} + +/* + * An object attribute key. An object attribute is mapped simply by + * the object ID appended with the attribute name. 
/*
 * A collection attribute key. Similar to attr_id, but keyed by the
 * collection ID instead of an object ID.
 */
+ coll_t colls[0]; // actually variable-length +}; + +inline ostream& operator<<(ostream& out, stored_colls *c) +{ + out << c->count << " [ "; + for (unsigned i = 0; i < c->count; i++) + { + out << hex << c->colls[i]; + if (i < c->count - 1) + out << ", "; + } + out << " ]" << dec; + return out; +} + +/* + * A stored collection (a bag of object IDs). These are referenced by + * the bare collection identifier type, a coll_t (thus, a 32-bit + * integer). Internally this is stored as a sorted list of object IDs. + * + * Note, this structure places all collection items in a single + * record; this may be a memory burden for large collections. + */ +struct stored_coll +{ + // The size of this collection. + uint32_t count; + + // The object IDs in this collection. This is a sorted list of all + // object ID's in this collection. + object_t objects[0]; // actually variable-length +}; + +inline ostream& operator<<(ostream& out, stored_coll *c) +{ + out << c->count << " [ "; + for (unsigned i = 0; i < c->count; i++) + { + out << c->objects[i]; + if (i < c->count - 1) + out << ", "; + } + out << " ]"; + return out; +} + +/* + * The object store interface for Berkeley DB. + */ +class OSBDB : public ObjectStore +{ + private: + DbEnv *env; + Db *db; + string device; + bool mounted; + bool opened; + + public: + + OSBDB(const char *dev) + : env(0), db (0), device (dev), mounted(false), opened(false) + { + /*env = new DbEnv (DB_CXX_NO_EXCEPTIONS); + env->set_error_stream (&std::cerr); + // WTF? You can't open an env if you set this flag here, but BDB + // says you also can't set it after you open the env. 
+ //env->set_flags (DB_LOG_INMEMORY, 1); + char *p = strrchr (dev, '/'); + int env_flags = (DB_CREATE | DB_THREAD | DB_INIT_LOCK + | DB_INIT_MPOOL | DB_INIT_TXN | DB_INIT_LOG); + if (p != NULL) + { + *p = '\0'; + if (env->open (dev, env_flags, 0) != 0) + { + std::cerr << "failed to open environment: " + << dev << std::endl; + ::abort(); + } + *p = '/'; + dev = p+1; + } + else + { + if (env->open (NULL, env_flags, 0) != 0) + { + std::cerr << "failed to open environment: ." << std::endl; + ::abort(); + } + } + + // Double WTF: if you remove the DB_LOG_INMEMORY bit, db->open + // fails, inexplicably, with EINVAL!*/ + // env->set_flags (DB_DIRECT_DB | /*DB_AUTO_COMMIT |*/ DB_LOG_INMEMORY, 1); + } + + ~OSBDB() + { + if (mounted) + { + umount(); + } + if (env != NULL) + { + env->close (0); + delete env; + } + } + + int mount(); + int umount(); + int mkfs(); + + int statfs(struct statfs *buf); + + int pick_object_revision_lt(object_t& oid); + + bool exists(object_t oid); + int stat(object_t oid, struct stat *st); + + int remove(object_t oid, Context *onsafe=0); + + int truncate(object_t oid, off_t size, Context *onsafe=0); + + int read(object_t oid, off_t offset, size_t len, + bufferlist& bl); + int write(object_t oid, off_t offset, size_t len, + bufferlist& bl, Context *onsafe); + + int setattr(object_t oid, const char *name, + const void *value, size_t size, Context *onsafe=0); + int setattrs(object_t oid, map& aset, + Context *onsafe=0); + int getattr(object_t oid, const char *name, + void *value, size_t size); + int getattrs(object_t oid, map& aset); + int rmattr(object_t oid, const char *name, + Context *onsafe=0); + int listattr(object_t oid, char *attrs, size_t size); + + int clone(object_t oid, object_t noid); + + // Collections. 
 * sorry, these are sensitive to the object_t and coll_t typing.
+ */ +void FakeStore::get_oname(object_t oid, char *s) +{ + static hash H; + assert(sizeof(oid) == 16); + sprintf(s, "%s/objects/%02x/%016llx.%016llx", basedir.c_str(), H(oid) & HASH_MASK, + *((__uint64_t*)&oid), + *(((__uint64_t*)&oid) + 1)); +} -int FakeStore::mount() +void FakeStore::get_cdir(coll_t cid, char *s) +{ + assert(sizeof(cid) == 8); + sprintf(s, "%s/collections/%016llx", basedir.c_str(), + cid); +} + +void FakeStore::get_coname(coll_t cid, object_t oid, char *s) +{ + assert(sizeof(oid) == 16); + sprintf(s, "%s/collections/%016llx/%016llx.%016llx", basedir.c_str(), cid, + *((__uint64_t*)&oid), + *(((__uint64_t*)&oid) + 1)); +} + + + + +int FakeStore::mkfs() { + char cmd[200]; if (g_conf.fakestore_dev) { dout(0) << "mounting" << endl; - char cmd[100]; sprintf(cmd,"mount %s", g_conf.fakestore_dev); system(cmd); } - string mydir; - get_dir(mydir); - - dout(5) << "init with basedir " << mydir << endl; - - // make sure global base dir exists - struct stat st; - int r = ::stat(basedir.c_str(), &st); - if (r != 0) { - dout(1) << "unable to stat basedir " << basedir << ", r = " << r << endl; - return r; - } + dout(1) << "mkfs in " << basedir << endl; - // all okay. 
- return 0; -} + // wipe + sprintf(cmd, "test -d %s && rm -r %s ; mkdir -p %s/collections && mkdir -p %s/objects", + basedir.c_str(), basedir.c_str(), basedir.c_str(), basedir.c_str()); + + dout(5) << "wipe: " << cmd << endl; + system(cmd); -int FakeStore::umount() -{ - dout(5) << "finalize" << endl; + // hashed bits too + for (int i=0; i H; - sprintf(s, "%d/%02x/%016llx.%08x.%d", whoami, H(oid) & HASH_MASK, oid.ino, oid.bno, oid.rev); - fn = basedir + "/" + s; - // dout(1) << "oname is " << fn << endl; -} - - - -void FakeStore::wipe_dir(string mydir) -{ - DIR *dir = ::opendir(mydir.c_str()); - if (dir) { - dout(10) << "wiping " << mydir << endl; - struct dirent *ent = 0; - - while ((ent = ::readdir(dir)) != 0) { - if (ent->d_name[0] == '.') continue; - dout(25) << "mkfs unlinking " << ent->d_name << endl; - string fn = mydir + "/" + ent->d_name; - ::unlink(fn.c_str()); - } - - ::closedir(dir); - } else { - dout(1) << "mkfs couldn't read dir " << mydir << endl; - } + return 0; } -int FakeStore::mkfs() +int FakeStore::mount() { if (g_conf.fakestore_dev) { dout(0) << "mounting" << endl; char cmd[100]; sprintf(cmd,"mount %s", g_conf.fakestore_dev); - system(cmd); + //system(cmd); } - - int r = 0; + dout(5) << "basedir " << basedir << endl; + + // make sure global base dir exists struct stat st; - string mydir; - get_dir(mydir); - - dout(1) << "mkfs in " << mydir << endl; - - - // make sure my dir exists - r = ::stat(mydir.c_str(), &st); + int r = ::stat(basedir.c_str(), &st); if (r != 0) { - dout(10) << "creating " << mydir << endl; - mkdir(mydir.c_str(), 0755); - r = ::stat(mydir.c_str(), &st); - if (r != 0) { - dout(1) << "couldnt create dir, r = " << r << endl; - return r; - } + derr(0) << "unable to stat basedir " << basedir << ", r = " << r << endl; + return r; + } + + if (g_conf.fakestore_fake_collections) { + dout(0) << "faking collections (in memory)" << endl; + fake_collections = true; } - else wipe_dir(mydir); - // hashed bits too - for (int i=0; i 0) did += 
r; else { - dout(1) << "couldn't write to " << fn.c_str() << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; + derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; } } if (did < 0) { - dout(1) << "couldn't write to " << fn.c_str() << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; + derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; } ::flock(fd, LOCK_UN); @@ -341,24 +332,44 @@ int FakeStore::write(object_t oid, class C_FakeSync : public Context { -public: Context *c; int *n; - C_FakeSync(Context *c_, int *n_) : c(c_), n(n_) { + Mutex *lock; + Cond *cond; + +public: + C_FakeSync(Context *c_, int *n_, Mutex *lo, Cond *co) : + c(c_), n(n_), + lock(lo), cond(co) { + lock->Lock(); ++*n; + lock->Unlock(); } void finish(int r) { c->finish(r); + + lock->Lock(); --(*n); - //cout << "sync, " << *n << " still unsync" << endl; + if (*n == 0) cond->Signal(); + lock->Unlock(); } }; +void FakeStore::sync() +{ + synclock.Lock(); + while (unsync > 0) { + dout(0) << "sync waiting for " << unsync << " items to (fake) sync" << endl; + synccond.Wait(synclock); + } + synclock.Unlock(); +} + void FakeStore::sync(Context *onsafe) { if (g_conf.fakestore_fake_sync) { g_timer.add_event_after((float)g_conf.fakestore_fake_sync, - new C_FakeSync(onsafe, &unsync)); + new C_FakeSync(onsafe, &unsync, &synclock, &synccond)); } else { assert(0); // der..no implemented anymore @@ -366,4 +377,250 @@ void FakeStore::sync(Context *onsafe) } +// ------------------------------- +// attributes + +// objects + +int FakeStore::setattr(object_t oid, const char *name, + const void *value, size_t size, + Context *onsafe) +{ + if (fake_attrs) return attrs.setattr(oid, name, value, size, onsafe); + + char fn[100]; + get_oname(oid, fn); + int r = ::setxattr(fn, 
name, value, size, 0); + return r; +} + +int FakeStore::setattrs(object_t oid, map& aset) +{ + if (fake_attrs) return attrs.setattrs(oid, aset); + + char fn[100]; + get_oname(oid, fn); + int r = 0; + for (map::iterator p = aset.begin(); + p != aset.end(); + ++p) { + r = ::setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length(), 0); + if (r < 0) break; + } + return r; +} + +int FakeStore::getattr(object_t oid, const char *name, + void *value, size_t size) +{ + if (fake_attrs) return attrs.getattr(oid, name, value, size); + char fn[100]; + get_oname(oid, fn); + int r = ::getxattr(fn, name, value, size); + return r; +} + +int FakeStore::getattrs(object_t oid, map& aset) +{ + if (fake_attrs) return attrs.getattrs(oid, aset); + + char fn[100]; + get_oname(oid, fn); + + char val[1000]; + char names[1000]; + int num = ::listxattr(fn, names, 1000); + + char *name = names; + for (int i=0; i& ls) +{ + if (fake_collections) return collections.list_collections(ls); + + char fn[200]; + sprintf(fn, "%s/collections", basedir.c_str()); + + DIR *dir = ::opendir(fn); + assert(dir); + + struct dirent *de; + while ((de = ::readdir(dir)) != 0) { + // parse + coll_t c = strtoll(de->d_name, 0, 16); + dout(0) << " got " << c << " errno " << errno << " on " << de->d_name << endl; + if (errno) continue; + ls.push_back(c); + } + + ::closedir(dir); + return 0; +} + +int FakeStore::create_collection(coll_t c, + Context *onsafe) +{ + if (fake_collections) return collections.create_collection(c, onsafe); + + char fn[200]; + get_cdir(c, fn); + + int r = ::mkdir(fn, 0755); + + if (onsafe) sync(onsafe); + return r; +} + +int FakeStore::destroy_collection(coll_t c, + Context *onsafe) +{ + if (fake_collections) return collections.destroy_collection(c, onsafe); + + char fn[200]; + get_cdir(c, fn); + char cmd[200]; + sprintf(cmd, "test -d %s && rm -r %s", fn, fn); + system(cmd); + + if (onsafe) sync(onsafe); + return 0; +} + +int FakeStore::collection_stat(coll_t c, struct stat *st) +{ + 
if (fake_collections) return collections.collection_stat(c, st); + + char fn[200]; + get_cdir(c, fn); + return ::lstat(fn, st); +} + +bool FakeStore::collection_exists(coll_t c) +{ + if (fake_collections) return collections.collection_exists(c); + + struct stat st; + return collection_stat(c, &st) == 0; +} + + +int FakeStore::collection_add(coll_t c, object_t o, + Context *onsafe) +{ + if (fake_collections) return collections.collection_add(c, o, onsafe); + + char cof[200]; + get_coname(c, o, cof); + char of[200]; + get_oname(o, of); + + int r = ::link(of, cof); + if (onsafe) sync(onsafe); + return r; +} + +int FakeStore::collection_remove(coll_t c, object_t o, + Context *onsafe) +{ + if (fake_collections) return collections.collection_remove(c, o, onsafe); + + char cof[200]; + get_coname(c, o, cof); + + int r = ::unlink(cof); + if (onsafe) sync(onsafe); + return r; +} + +int FakeStore::collection_list(coll_t c, list& ls) +{ + if (fake_collections) return collections.collection_list(c, ls); + + char fn[200]; + get_cdir(c, fn); + + DIR *dir = ::opendir(fn); + assert(dir); + + struct dirent *de; + while ((de = ::readdir(dir)) != 0) { + // parse + object_t o; + assert(sizeof(o) == 16); + *(((__uint64_t*)&o) + 0) = strtoll(de->d_name, 0, 16); + assert(de->d_name[16] == '.'); + *(((__uint64_t*)&o) + 1) = strtoll(de->d_name+17, 0, 16); + dout(0) << " got " << o << " errno " << errno << " on " << de->d_name << endl; + if (errno) continue; + ls.push_back(o); + } + + ::closedir(dir); + return 0; +} +// eof. diff --git a/branches/aleung/security1/ceph/osd/FakeStore.h b/branches/aleung/security1/ceph/osd/FakeStore.h index eaa4126e84e46..4ad2cb4a054e8 100644 --- a/branches/aleung/security1/ceph/osd/FakeStore.h +++ b/branches/aleung/security1/ceph/osd/FakeStore.h @@ -32,31 +32,34 @@ using namespace __gnu_cxx; // fake attributes in memory, if we need to. 
- -class FakeStore : public ObjectStore, - public FakeStoreAttrs, - public FakeStoreCollections { +class FakeStore : public ObjectStore { string basedir; int whoami; - - int unsync; - Mutex lock; + Mutex synclock; + Cond synccond; + int unsync; - // fns - void get_dir(string& dir); - void get_oname(object_t oid, string& fn); - void wipe_dir(string mydir); + // fake attrs? + FakeStoreAttrs attrs; + bool fake_attrs; + // fake collections? + FakeStoreCollections collections; + bool fake_collections; + + // helper fns + void get_oname(object_t oid, char *s); + void get_cdir(coll_t cid, char *s); + void get_coname(coll_t cid, object_t oid, char *s); public: - FakeStore(char *base, int whoami) : FakeStoreAttrs(this), FakeStoreCollections(this) - { - this->basedir = base; - this->whoami = whoami; - unsync = 0; - } - + FakeStore(char *base, int w) : + basedir(base), + whoami(w), + unsync(0), + attrs(this), fake_attrs(false), + collections(this), fake_collections(false) { } int mount(); int umount(); @@ -73,15 +76,35 @@ class FakeStore : public ObjectStore, int stat(object_t oid, struct stat *st); int remove(object_t oid, Context *onsafe); int truncate(object_t oid, off_t size, Context *onsafe); - int read(object_t oid, - off_t offset, size_t len, - bufferlist& bl); - int write(object_t oid, - off_t offset, size_t len, - bufferlist& bl, - Context *onsafe); + int read(object_t oid, off_t offset, size_t len, bufferlist& bl); + int write(object_t oid, off_t offset, size_t len, bufferlist& bl, Context *onsafe); + void sync(); void sync(Context *onsafe); + + // attrs + int setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe=0); + int setattrs(object_t oid, map& aset); + int getattr(object_t oid, const char *name, void *value, size_t size); + int getattrs(object_t oid, map& aset); + int rmattr(object_t oid, const char *name, Context *onsafe=0); + //int listattr(object_t oid, char *attrs, size_t size); + int collection_setattr(coll_t c, const 
char *name, void *value, size_t size, Context *onsafe=0); + int collection_rmattr(coll_t c, const char *name, Context *onsafe=0); + int collection_getattr(coll_t c, const char *name, void *value, size_t size); + //int collection_listattr(coll_t c, char *attrs, size_t size); + + + // collections + int list_collections(list& ls); + int create_collection(coll_t c, Context *onsafe=0); + int destroy_collection(coll_t c, Context *onsafe=0); + int collection_stat(coll_t c, struct stat *st); + bool collection_exists(coll_t c); + int collection_add(coll_t c, object_t o, Context *onsafe=0); + int collection_remove(coll_t c, object_t o, Context *onsafe=0); + int collection_list(coll_t c, list& o); + }; #endif diff --git a/branches/aleung/security1/ceph/osd/OSD.cc b/branches/aleung/security1/ceph/osd/OSD.cc index 838ffa4bd8fb5..2fa8fe2681a2e 100644 --- a/branches/aleung/security1/ceph/osd/OSD.cc +++ b/branches/aleung/security1/ceph/osd/OSD.cc @@ -26,6 +26,10 @@ #include "ebofs/Ebofs.h" +#ifdef USE_OSBDB +#include "osbdb/OSBDB.h" +#endif // USE_OSBDB + #include "Ager.h" @@ -101,7 +105,7 @@ void OSD::force_remount() LogType osd_logtype; -OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) +OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) : timer(osd_lock) { whoami = id; messenger = m; @@ -127,9 +131,8 @@ OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) waiting_for_no_ops = false; if (g_conf.osd_remount_at) - g_timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); + timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); - // init object store // try in this order: @@ -163,8 +166,14 @@ OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) store = new OBFSStore(whoami, NULL, dev_path); } #endif +#ifdef USE_OSBDB + else if (g_conf.bdbstore) { + store = new OSBDB(dev_path); + } +#endif // USE_OSBDB else { - store = new FakeStore(osd_base_path, whoami); + sprintf(dev_path, "osddata/osd%d", whoami); + store = new FakeStore(dev_path, whoami); } } @@ 
-273,13 +282,11 @@ int OSD::init() // announce to monitor i exist and have booted. int mon = monmap->pick_mon(); - //messenger->send_message(new MOSDBoot(superblock), MSG_ADDR_MON(mon), monmap->get_inst(mon)); // new boot message w/ public key - messenger->send_message(new MOSDBoot(superblock, key_str), MSG_ADDR_MON(mon), monmap->get_inst(mon)); + messenger->send_message(new MOSDBoot(superblock, key_str), monmap->get_inst(mon)); // start the heart - next_heartbeat = new C_Heartbeat(this); - g_timer.add_event_after(g_conf.osd_heartbeat_interval, next_heartbeat); + timer.add_event_after(g_conf.osd_heartbeat_interval, new C_Heartbeat(this)); } osd_lock.Unlock(); @@ -290,12 +297,14 @@ int OSD::init() int OSD::shutdown() { - dout(1) << "shutdown, timer has " << g_timer.num_event << endl; - - if (next_heartbeat) g_timer.cancel_event(next_heartbeat); + dout(1) << "shutdown" << endl; state = STATE_STOPPING; + // cancel timers + timer.cancel_all(); + timer.join(); + // finish ops wait_for_no_ops(); @@ -435,7 +444,7 @@ void OSD::_remove_pg(pg_t pgid) p++) t.remove(*p); t.remove_collection(pgid); - t.remove(object_t(1,pgid)); // log too + t.remove(pgid.to_object()); // log too } store->apply_transaction(t); @@ -485,8 +494,6 @@ void OSD::activate_pg(pg_t pgid, epoch_t epoch) void OSD::heartbeat() { - osd_lock.Lock(); - utime_t now = g_clock.now(); utime_t since = now; since.sec_ref() -= g_conf.osd_heartbeat_interval; @@ -523,9 +530,9 @@ void OSD::heartbeat() for (set::iterator i = pingset.begin(); i != pingset.end(); i++) { - _share_map_outgoing( MSG_ADDR_OSD(*i), osdmap->get_inst(*i) ); + _share_map_outgoing( osdmap->get_inst(*i) ); messenger->send_message(new MOSDPing(osdmap->get_epoch(), avg_qlen), - MSG_ADDR_OSD(*i), osdmap->get_inst(*i)); + osdmap->get_inst(*i)); } if (logger) logger->set("pingset", pingset.size()); @@ -536,7 +543,7 @@ void OSD::heartbeat() if ((rand() % g_conf.fake_osdmap_updates) == 0) { //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == 
whoami / g_conf.fake_osdmap_updates) { messenger->send_message(new MOSDIn(osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); + monmap->get_inst(mon)); } /* if (osdmap->is_out(whoami)) { @@ -552,11 +559,8 @@ void OSD::heartbeat() } // schedule next! randomly. - next_heartbeat = new C_Heartbeat(this); float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval; - g_timer.add_event_after(wait, next_heartbeat); - - osd_lock.Unlock(); + timer.add_event_after(wait, new C_Heartbeat(this)); } @@ -564,30 +568,30 @@ void OSD::heartbeat() // -------------------------------------- // dispatch -bool OSD::_share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch) +bool OSD::_share_map_incoming(const entity_inst_t& inst, epoch_t epoch) { bool shared = false; // does client have old map? - if (who.is_client()) { + if (inst.name.is_client()) { if (epoch < osdmap->get_epoch()) { - dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl; - send_incremental_map(epoch, who, inst, true); + dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl; + send_incremental_map(epoch, inst, true); shared = true; } } // does peer have old map? - if (who.is_osd()) { + if (inst.name.is_osd()) { // remember - if (peer_map_epoch[who] < epoch) - peer_map_epoch[who] = epoch; + if (peer_map_epoch[inst.name] < epoch) + peer_map_epoch[inst.name] = epoch; // older? - if (peer_map_epoch[who] < osdmap->get_epoch()) { - dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl; - send_incremental_map(epoch, who, inst, true); - peer_map_epoch[who] = osdmap->get_epoch(); // so we don't send it again. 
+ if (peer_map_epoch[inst.name] < osdmap->get_epoch()) { + dout(10) << inst.name << " has old map " << epoch << " < " << osdmap->get_epoch() << endl; + send_incremental_map(epoch, inst, true); + peer_map_epoch[inst.name] = osdmap->get_epoch(); // so we don't send it again. shared = true; } } @@ -596,17 +600,17 @@ bool OSD::_share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t } -void OSD::_share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst) +void OSD::_share_map_outgoing(const entity_inst_t& inst) { - assert(dest.is_osd()); + assert(inst.name.is_osd()); - if (dest.is_osd()) { + if (inst.name.is_osd()) { // send map? - if (peer_map_epoch.count(dest)) { - epoch_t pe = peer_map_epoch[dest]; + if (peer_map_epoch.count(inst.name)) { + epoch_t pe = peer_map_epoch[inst.name]; if (pe < osdmap->get_epoch()) { - send_incremental_map(pe, dest, inst, true); - peer_map_epoch[dest] = osdmap->get_epoch(); + send_incremental_map(pe, inst, true); + peer_map_epoch[inst.name] = osdmap->get_epoch(); } } else { // no idea about peer's epoch. @@ -732,9 +736,12 @@ void OSD::dispatch(Message *m) } -void OSD::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) +void OSD::ms_handle_failure(Message *m, const entity_inst_t& inst) { + entity_name_t dest = inst.name; + if (g_conf.ms_die_on_failure) { + dout(0) << "ms_handle_failure " << inst << " on " << *m << endl; exit(0); } @@ -744,8 +751,8 @@ void OSD::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& in dout(0) << "ms_handle_failure " << dest << " inst " << inst << ", dropping and reporting to mon" << mon << endl; - messenger->send_message(new MOSDFailure(dest, inst, osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); + messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()), + monmap->get_inst(mon)); delete m; } else if (dest.is_mon()) { // resend to a different monitor. 
@@ -753,7 +760,7 @@ void OSD::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& in dout(0) << "ms_handle_failure " << dest << " inst " << inst << ", resending to mon" << mon << endl; - messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon)); + messenger->send_message(m, monmap->get_inst(mon)); } else { // client? @@ -763,24 +770,13 @@ void OSD::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& in } } -bool OSD::ms_lookup(msg_addr_t dest, entity_inst_t& inst) -{ - if (dest.is_osd()) { - assert(osdmap); - return osdmap->get_inst(dest.num(), inst); - } - - assert(0); - return false; -} - void OSD::handle_osd_ping(MOSDPing *m) { dout(20) << "osdping from " << m->get_source() << endl; - _share_map_incoming(m->get_source(), m->get_source_inst(), ((MOSDPing*)m)->map_epoch); + _share_map_incoming(m->get_source_inst(), ((MOSDPing*)m)->map_epoch); int from = m->get_source().num(); peer_qlen[from] = m->avg_qlen; @@ -804,7 +800,7 @@ void OSD::wait_for_new_map(Message *m) if (waiting_for_osdmap.empty()) { int mon = monmap->pick_mon(); messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); + monmap->get_inst(mon)); } waiting_for_osdmap.push_back(m); @@ -934,7 +930,7 @@ void OSD::handle_osd_map(MOSDMap *m) i++) { int osd = i->first; if (osd == whoami) continue; - messenger->mark_down(MSG_ADDR_OSD(osd), i->second); + messenger->mark_down(i->second.addr); peer_map_epoch.erase(MSG_ADDR_OSD(osd)); // kick any replica ops @@ -966,7 +962,6 @@ void OSD::handle_osd_map(MOSDMap *m) i != inc.new_up.end(); i++) { if (i->first == whoami) continue; - messenger->mark_up(MSG_ADDR_OSD(i->first), i->second); peer_map_epoch.erase(MSG_ADDR_OSD(i->first)); } } @@ -985,7 +980,7 @@ void OSD::handle_osd_map(MOSDMap *m) else { dout(10) << "handle_osd_map missing epoch " << cur+1 << endl; int mon = monmap->pick_mon(); - messenger->send_message(new MOSDGetMap(cur), MSG_ADDR_MON(mon), monmap->get_inst(mon)); 
+ messenger->send_message(new MOSDGetMap(cur), monmap->get_inst(mon)); break; } @@ -1042,6 +1037,8 @@ void OSD::advance_map(ObjectStore::Transaction& t) //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << endl; logger->set_start( osdmap->get_ctime() ); + assert(g_conf.osd_mkfs); // make sure we did a mkfs! + // create PGs for (int nrep = 1; nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep); // for low osd counts.. hackish bleh @@ -1281,10 +1278,10 @@ void OSD::activate_map(ObjectStore::Transaction& t) } -void OSD::send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full) +void OSD::send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full) { dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() - << " to " << dest << endl; + << " to " << inst << endl; MOSDMap *m = new MOSDMap; @@ -1303,7 +1300,7 @@ void OSD::send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst } } - messenger->send_message(m, dest, inst); + messenger->send_message(m, inst); } bool OSD::get_map_bl(epoch_t e, bufferlist& bl) @@ -1550,8 +1547,8 @@ void OSD::do_notifies(map< int, list >& notify_list) } dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << endl; MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second); - _share_map_outgoing(MSG_ADDR_OSD(it->first), osdmap->get_inst(it->first)); - messenger->send_message(m, MSG_ADDR_OSD(it->first), osdmap->get_inst(it->first)); + _share_map_outgoing(osdmap->get_inst(it->first)); + messenger->send_message(m, osdmap->get_inst(it->first)); } } @@ -1570,8 +1567,8 @@ void OSD::do_queries(map< int, map >& query_map) MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(), pit->second); - _share_map_outgoing(MSG_ADDR_OSD(who), osdmap->get_inst(who)); - messenger->send_message(m, MSG_ADDR_OSD(who), osdmap->get_inst(who)); + _share_map_outgoing(osdmap->get_inst(who)); + messenger->send_message(m, 
osdmap->get_inst(who)); } } @@ -1758,7 +1755,7 @@ void OSD::handle_pg_log(MOSDPGLog *m) assert(pg->missing.num_lost() == 0); // ok activate! - pg->activate(t); + pg->activate(t); } unsigned tr = store->apply_transaction(t); @@ -1878,8 +1875,8 @@ void OSD::handle_pg_query(MOSDPGQuery *m) dout(10) << *pg << " sending " << m->log << " " << m->missing << endl; //m->log.print(cout); - _share_map_outgoing(MSG_ADDR_OSD(from), osdmap->get_inst(from)); - messenger->send_message(m, MSG_ADDR_OSD(from), osdmap->get_inst(from)); + _share_map_outgoing(osdmap->get_inst(from)); + messenger->send_message(m, osdmap->get_inst(from)); } _unlock_pg(pgid); @@ -1945,12 +1942,12 @@ void OSD::pull(PG *pg, object_t oid) // send op tid_t tid = ++last_tid; - MOSDOp *op = new MOSDOp(tid, messenger->get_myaddr(), + MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, tid, oid, pg->get_pgid(), osdmap->get_epoch(), OSD_OP_PULL); op->set_version(v); - messenger->send_message(op, MSG_ADDR_OSD(osd), osdmap->get_inst(osd)); + messenger->send_message(op, osdmap->get_inst(osd)); // take note assert(pg->objects_pulling.count(oid) == 0); @@ -1987,7 +1984,7 @@ void OSD::push(PG *pg, object_t oid, int dest) logger->inc("r_pushb", bl.length()); // send - MOSDOp *op = new MOSDOp(++last_tid, MSG_ADDR_OSD(whoami), + MOSDOp *op = new MOSDOp(messenger->get_myinst(), 0, ++last_tid, oid, pg->info.pgid, osdmap->get_epoch(), OSD_OP_PUSH); op->set_offset(0); @@ -1996,7 +1993,7 @@ void OSD::push(PG *pg, object_t oid, int dest) op->set_version(v); op->set_attrset(attrset); - messenger->send_message(op, MSG_ADDR_OSD(dest), osdmap->get_inst(dest)); + messenger->send_message(op, osdmap->get_inst(dest)); } @@ -2172,7 +2169,7 @@ void OSD::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complet << endl; MOSDOpReply *commit = new MOSDOpReply(op, 0, osdmap->get_epoch(), true); commit->set_pg_complete_thru(last_complete); - messenger->send_message(commit, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd)); + 
messenger->send_message(commit, osdmap->get_inst(ackerosd)); delete op; } @@ -2302,7 +2299,7 @@ void OSD::op_rep_modify(MOSDOp *op, PG *pg) // send ack to acker? if (g_conf.osd_rep != OSD_REP_CHAIN) { MOSDOpReply *ack = new MOSDOpReply(op, 0, osdmap->get_epoch(), false); - messenger->send_message(ack, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd)); + messenger->send_message(ack, osdmap->get_inst(ackerosd)); } // ack myself. @@ -2332,7 +2329,7 @@ void OSD::handle_op(MOSDOp *op) if (!require_same_or_newer_map(op, op->get_map_epoch())) return; // share our map with sender, if they're old - _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch()); + _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); // what kind of op? bool read = op->get_op() < 10; // read, stat. but not pull. @@ -2441,7 +2438,7 @@ void OSD::handle_op(MOSDOp *op) if (pg->acting.size() > 1) { int peer = pg->acting[1]; dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << endl; - messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer)); + messenger->send_message(op, osdmap->get_inst(peer)); return; } } @@ -2464,7 +2461,7 @@ void OSD::handle_op(MOSDOp *op) << ", fwd to peer w/ qlen " << peer_qlen[peer] << " osd" << peer << endl; - messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer)); + messenger->send_message(op, osdmap->get_inst(peer)); return; } } @@ -2539,7 +2536,7 @@ void OSD::handle_op_reply(MOSDOpReply *op) if (!require_same_or_newer_map(op, op->get_map_epoch())) return; // share our map with sender, if they're old - _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch()); + _share_map_incoming(op->get_source_inst(), op->get_map_epoch()); if (!pg) { // hmm. 
@@ -2736,8 +2733,8 @@ bool OSD::block_if_wrlocked(MOSDOp* op) { object_t oid = op->get_oid(); - msg_addr_t source; - int len = store->getattr(oid, "wrlock", &source, sizeof(msg_addr_t)); + entity_name_t source; + int len = store->getattr(oid, "wrlock", &source, sizeof(entity_name_t)); //cout << "getattr returns " << len << " on " << oid << endl; if (len == sizeof(source) && @@ -2904,7 +2901,7 @@ void OSD::op_read(MOSDOp *op)//, PG *pg) if (r >= 0) logger->inc("rdb", r); // send it - messenger->send_message(reply, op->get_client(), op->get_client_inst()); + messenger->send_message(reply, op->get_client_inst()); delete op; } @@ -2940,7 +2937,7 @@ void OSD::op_stat(MOSDOp *op)//, PG *pg) MOSDOpReply *reply = new MOSDOpReply(op, r, osdmap->get_epoch(), true); reply->set_object_size(st.st_size); - messenger->send_message(reply, op->get_client(), op->get_client_inst()); + messenger->send_message(reply, op->get_client_inst()); logger->inc("stat"); @@ -2985,7 +2982,7 @@ void OSD::put_repop_gather(PG *pg, PG::RepOpGather *repop) // send commit. 
MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), true); dout(10) << "put_repop sending commit on " << *repop << " " << reply << endl; - messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst()); + messenger->send_message(reply, repop->op->get_client_inst()); repop->sent_commit = true; } @@ -2998,7 +2995,7 @@ void OSD::put_repop_gather(PG *pg, PG::RepOpGather *repop) // send ack MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), false); dout(10) << "put_repop sending ack on " << *repop << " " << reply << endl; - messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst()); + messenger->send_message(reply, repop->op->get_client_inst()); repop->sent_ack = true; utime_t now = g_clock.now(); @@ -3049,8 +3046,7 @@ void OSD::issue_repop(PG *pg, MOSDOp *op, int osd) << endl; // forward the write/update/whatever - MOSDOp *wr = new MOSDOp(op->get_tid(), - op->get_client(), + MOSDOp *wr = new MOSDOp(op->get_client_inst(), op->get_client_inc(), op->get_reqid().tid, oid, pg->get_pgid(), osdmap->get_epoch(), @@ -3063,7 +3059,7 @@ void OSD::issue_repop(PG *pg, MOSDOp *op, int osd) wr->set_rep_tid(op->get_rep_tid()); wr->set_pg_trim_to(pg->peers_complete_thru); - messenger->send_message(wr, MSG_ADDR_OSD(osd), osdmap->get_inst(osd)); + messenger->send_message(wr, osdmap->get_inst(osd)); } PG::RepOpGather *OSD::new_repop_gather(PG *pg, @@ -3204,9 +3200,8 @@ void OSD::op_modify(MOSDOp *op, PG *pg) } // dup op? 
- reqid_t reqid(op->get_client(), op->get_tid()); - if (pg->log.logged_req(reqid)) { - dout(-3) << "op_modify " << opname << " dup op " << reqid + if (pg->log.logged_req(op->get_reqid())) { + dout(-3) << "op_modify " << opname << " dup op " << op->get_reqid() << ", doing WRNOOP" << endl; op->set_op(OSD_OP_WRNOOP); opname = MOSDOp::get_opname(op->get_op()); @@ -3305,7 +3300,7 @@ void OSD::op_modify(MOSDOp *op, PG *pg) { for (unsigned i=1; iacting.size(); i++) { int osd = pg->acting[i]; - _share_map_outgoing( MSG_ADDR_OSD(osd), osdmap->get_inst(osd) ); + _share_map_outgoing( osdmap->get_inst(osd) ); } } osd_lock.Unlock(); @@ -3388,8 +3383,7 @@ void OSD::prepare_log_transaction(ObjectStore::Transaction& t, if (crev && rev && rev > crev) { eversion_t cv = version; cv.version--; - PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, - op->get_client(), op->get_tid()); + PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, op->get_reqid()); pg->log.add(cloneentry); dout(10) << "prepare_log_transaction " << op->get_op() @@ -3400,8 +3394,7 @@ void OSD::prepare_log_transaction(ObjectStore::Transaction& t, // actual op int opcode = PG::Log::Entry::MODIFY; if (op->get_op() == OSD_OP_DELETE) opcode = PG::Log::Entry::DELETE; - PG::Log::Entry logentry(opcode, oid, version, - op->get_client(), op->get_tid()); + PG::Log::Entry logentry(opcode, oid, version, op->get_reqid()); dout(10) << "prepare_log_transaction " << op->get_op() << " " << logentry @@ -3467,7 +3460,7 @@ void OSD::prepare_op_transaction(ObjectStore::Transaction& t, case OSD_OP_WRLOCK: { // lock object //r = store->setattr(oid, "wrlock", &op->get_asker(), sizeof(msg_addr_t), oncommit); - t.setattr(oid, "wrlock", &op->get_client(), sizeof(msg_addr_t)); + t.setattr(oid, "wrlock", &op->get_client(), sizeof(entity_name_t)); } break; diff --git a/branches/aleung/security1/ceph/osd/OSD.h b/branches/aleung/security1/ceph/osd/OSD.h index 975aa84558599..c3325fb5632b0 100644 --- 
a/branches/aleung/security1/ceph/osd/OSD.h +++ b/branches/aleung/security1/ceph/osd/OSD.h @@ -18,6 +18,7 @@ #include "common/Mutex.h" #include "common/ThreadPool.h" +#include "common/Timer.h" #include "mon/MonMap.h" @@ -93,10 +94,11 @@ public: void finish(int r) { osd->heartbeat(); } - } *next_heartbeat; + }; // global lock Mutex osd_lock; + SafeTimer timer; // -- stats -- int hb_stat_ops; // ops since last heartbeat @@ -160,9 +162,9 @@ public: class OSDMap *osdmap; list waiting_for_osdmap; - hash_map peer_map_epoch; - bool _share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch); - void _share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst); + hash_map peer_map_epoch; // FIXME types + bool _share_map_incoming(const entity_inst_t& inst, epoch_t epoch); + void _share_map_outgoing(const entity_inst_t& inst); void wait_for_new_map(Message *m); void handle_osd_map(class MOSDMap *m); @@ -175,7 +177,7 @@ public: bool get_inc_map_bl(epoch_t e, bufferlist& bl); bool get_inc_map(epoch_t e, OSDMap::Incremental &inc); - void send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full); + void send_incremental_map(epoch_t since, const entity_inst_t& inst, bool full); @@ -257,8 +259,7 @@ public: // messages virtual void dispatch(Message *m); - virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst); - virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst); + virtual void ms_handle_failure(Message *m, const entity_inst_t& inst); void handle_osd_ping(class MOSDPing *m); void handle_op(class MOSDOp *m); diff --git a/branches/aleung/security1/ceph/osd/OSDMap.h b/branches/aleung/security1/ceph/osd/OSDMap.h index 1b0e956605684..aedeaaa88b0f4 100644 --- a/branches/aleung/security1/ceph/osd/OSDMap.h +++ b/branches/aleung/security1/ceph/osd/OSDMap.h @@ -22,6 +22,7 @@ */ #include "config.h" #include "include/types.h" +#include "osd_types.h" #include "msg/Message.h" #include "common/Mutex.h" 
#include "common/Clock.h" diff --git a/branches/aleung/security1/ceph/osd/ObjectStore.h b/branches/aleung/security1/ceph/osd/ObjectStore.h index d5ba667145e34..9ff94adfcae99 100644 --- a/branches/aleung/security1/ceph/osd/ObjectStore.h +++ b/branches/aleung/security1/ceph/osd/ObjectStore.h @@ -16,6 +16,7 @@ #define __OBJECTSTORE_H #include "include/types.h" +#include "osd_types.h" #include "include/Context.h" #include "include/buffer.h" @@ -84,6 +85,8 @@ public: static const int OP_RMATTR = 16; // oid, attrname static const int OP_CLONE = 17; // oid, newoid + static const int OP_TRIMCACHE = 18; // oid, offset, len + static const int OP_MKCOLL = 20; // cid static const int OP_RMCOLL = 21; // cid static const int OP_COLL_ADD = 22; // cid, oid @@ -142,6 +145,13 @@ public: lengths.push_back(len); bls.push_back(bl); } + void trim_from_cache(object_t oid, off_t off, size_t len) { + int op = OP_TRIMCACHE; + ops.push_back(op); + oids.push_back(oid); + offsets.push_back(off); + lengths.push_back(len); + } void truncate(object_t oid, off_t off) { int op = OP_TRUNCATE; ops.push_back(op); @@ -276,6 +286,15 @@ public: } break; + case Transaction::OP_TRIMCACHE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + off_t offset = t.offsets.front(); t.offsets.pop_front(); + size_t len = t.lengths.front(); t.lengths.pop_front(); + trim_from_cache(oid, offset, len); + } + break; + case Transaction::OP_TRUNCATE: { object_t oid = t.oids.front(); t.oids.pop_front(); @@ -428,6 +447,8 @@ public: off_t offset, size_t len, bufferlist& bl, Context *onsafe) = 0;//{ return -1; } + virtual void trim_from_cache(object_t oid, + off_t offset, size_t len) { } virtual int setattr(object_t oid, const char *name, const void *value, size_t size, @@ -445,7 +466,7 @@ public: return -1; } - virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; + //virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; // collections virtual int 
list_collections(list& ls) {return 0;}//= 0; @@ -468,10 +489,10 @@ public: Context *onsafe=0) {return 0;} //= 0; virtual int collection_getattr(coll_t cid, const char *name, void *value, size_t size) {return 0;} //= 0; - virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0; + //virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0; - virtual void sync(Context *onsync) {}; - virtual void sync() {}; + virtual void sync(Context *onsync) {} + virtual void sync() {} virtual void _fake_writes(bool b) {}; diff --git a/branches/aleung/security1/ceph/osd/PG.cc b/branches/aleung/security1/ceph/osd/PG.cc index 4dee6f03bd166..218f9eac36aae 100644 --- a/branches/aleung/security1/ceph/osd/PG.cc +++ b/branches/aleung/security1/ceph/osd/PG.cc @@ -808,8 +808,8 @@ void PG::peer(ObjectStore::Transaction& t, if (is_crashed()) { dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << endl; state_set(STATE_REPLAY); - g_timer.add_event_after(g_conf.osd_replay_window, - new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); + osd->timer.add_event_after(g_conf.osd_replay_window, + new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); } else if (!is_active()) { // -- ok, activate! @@ -826,7 +826,7 @@ void PG::activate(ObjectStore::Transaction& t) state_set(STATE_ACTIVE); state_clear(STATE_STRAY); if (is_crashed()) { - assert(is_replay()); + //assert(is_replay()); // HELP.. not on replica? 
state_clear(STATE_CRASHED); state_clear(STATE_REPLAY); } @@ -917,7 +917,7 @@ void PG::activate(ObjectStore::Transaction& t) dout(10) << "activate sending " << m->log << " " << m->missing << " to osd" << peer << endl; //m->log.print(cout); - osd->messenger->send_message(m, MSG_ADDR_OSD(peer), osd->osdmap->get_inst(peer)); + osd->messenger->send_message(m, osd->osdmap->get_inst(peer)); // update our missing if (peer_missing[peer].num_missing() == 0) { @@ -1115,7 +1115,7 @@ bool PG::do_recovery() ls.push_back(info); osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(), ls), - MSG_ADDR_OSD(get_primary()), osd->osdmap->get_inst(get_primary())); + osd->osdmap->get_inst(get_primary())); } return false; @@ -1164,7 +1164,7 @@ void PG::clean_replicas() set ls; ls.insert(info.pgid); MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls); - osd->messenger->send_message(m, MSG_ADDR_OSD(*p), osd->osdmap->get_inst(*p)); + osd->messenger->send_message(m, osd->osdmap->get_inst(*p)); } stray_set.clear(); @@ -1174,6 +1174,8 @@ void PG::clean_replicas() void PG::write_log(ObjectStore::Transaction& t) { + dout(10) << "write_log" << endl; + // assemble buffer bufferlist bl; @@ -1186,12 +1188,16 @@ void PG::write_log(ObjectStore::Transaction& t) if (bl.length() % 4096 == 0) ondisklog.block_map[bl.length()] = p->version; bl.append((char*)&(*p), sizeof(*p)); + if (g_conf.osd_pad_pg_log) { // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
+ bufferptr bp(4096 - sizeof(*p)); + bl.push_back(bp); + } } ondisklog.top = bl.length(); // write it - t.remove( object_t(1,info.pgid) ); - t.write( object_t(1,info.pgid) , 0, bl.length(), bl); + t.remove( info.pgid.to_object() ); + t.write( info.pgid.to_object() , 0, bl.length(), bl); t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); @@ -1234,6 +1240,8 @@ void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v) void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, eversion_t trim_to) { + dout(10) << "append_log " << ondisklog.top << " " << logentry << endl; + // write entry on disk bufferlist bl; bl.append( (char*)&logentry, sizeof(logentry) ); @@ -1241,7 +1249,7 @@ void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, bufferptr bp(4096 - sizeof(logentry)); bl.push_back(bp); } - t.write( object_t(1,info.pgid), ondisklog.top, bl.length(), bl ); + t.write( info.pgid.to_object(), ondisklog.top, bl.length(), bl ); // update block map? 
if (ondisklog.top % 4096 == 0) @@ -1263,30 +1271,43 @@ void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry, void PG::read_log(ObjectStore *store) { + int r; // load bounds ondisklog.bottom = ondisklog.top = 0; - store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); - store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); - + r = store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom)); + assert(r == sizeof(ondisklog.bottom)); + r = store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top)); + assert(r == sizeof(ondisklog.top)); + + dout(10) << "read_log [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl; + log.backlog = info.log_backlog; log.bottom = info.log_bottom; if (ondisklog.top > 0) { // read bufferlist bl; - store->read(object_t(1,info.pgid), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); + store->read(info.pgid.to_object(), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl); PG::Log::Entry e; off_t pos = ondisklog.bottom; + assert(log.log.empty()); while (pos < ondisklog.top) { bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e); + dout(10) << "read_log " << pos << " " << e << endl; + if (e.version > log.bottom || log.backlog) { // ignore items below log.bottom if (pos % 4096 == 0) - ondisklog.block_map[pos] = e.version; + ondisklog.block_map[pos] = e.version; log.log.push_back(e); + } else { + dout(10) << "read_log ignoring entry at " << pos << endl; } - pos += sizeof(e); + if (g_conf.osd_pad_pg_log) // pad to 4k, until i fix ebofs reallocation crap. FIXME. 
+ pos += 4096; + else + pos += sizeof(e); } } log.top = info.last_update; diff --git a/branches/aleung/security1/ceph/osd/PG.h b/branches/aleung/security1/ceph/osd/PG.h index 3da16b9b81b7b..f3b00cf935f91 100644 --- a/branches/aleung/security1/ceph/osd/PG.h +++ b/branches/aleung/security1/ceph/osd/PG.h @@ -34,35 +34,7 @@ using namespace __gnu_cxx; class OSD; -/* reqid_t - caller + tid to unique identify this request - */ -class reqid_t { -public: - msg_addr_t addr; - tid_t tid; - reqid_t() : tid(0) {} - reqid_t(const msg_addr_t& a, tid_t t) : addr(a), tid(t) {} -}; - -inline ostream& operator<<(ostream& out, const reqid_t& r) { - return out << r.addr << "." << r.tid; -} -inline bool operator==(const reqid_t& l, const reqid_t& r) { - return (l.addr == r.addr) && (l.tid == r.tid); -} -inline bool operator!=(const reqid_t& l, const reqid_t& r) { - return (l.addr != r.addr) || (l.tid != r.tid); -} -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const reqid_t &r) const { - static hash H; - static hash<__uint64_t> I; - return H(r.addr.type() ^ r.addr.num()) ^ I(r.tid); - } - }; -} /** PG - Replica Placement Group * @@ -241,12 +213,12 @@ public: eversion_t version; objectrev_t rev; - reqid_t reqid; // caller+tid to uniquely identify request + reqid_t reqid; // caller+tid to uniquely identify request Entry() : op(0) {} Entry(int _op, object_t _oid, const eversion_t& v, - const msg_addr_t& a, tid_t t) : - op(_op), oid(_oid), version(v), reqid(a,t) {} + const reqid_t& rid) : + op(_op), oid(_oid), version(v), reqid(rid) {} bool is_delete() const { return op == DELETE; } bool is_clone() const { return op == CLONE; } @@ -298,7 +270,7 @@ public: class IndexedLog : public Log { public: hash_map objects; // ptrs into log. be careful! 
- hash_set caller_ops; + hash_set caller_ops; // recovery pointers list::iterator requested_to; // not inclusive of referenced item @@ -316,7 +288,7 @@ public: bool logged_object(object_t oid) { return objects.count(oid); } - bool logged_req(reqid_t &r) { + bool logged_req(const reqid_t &r) { return caller_ops.count(r); } @@ -641,7 +613,7 @@ inline ostream& operator<<(ostream& out, const PG::Info::History& h) inline ostream& operator<<(ostream& out, const PG::Info& pgi) { - out << "pginfo(" << hex << pgi.pgid << dec; + out << "pginfo(" << pgi.pgid; if (pgi.is_empty()) out << " empty"; else @@ -697,8 +669,8 @@ inline ostream& operator<<(ostream& out, const PG& pg) !pg.log.backlog) || (pg.log.log.rbegin()->version.version != pg.log.top.version)) { out << " (log bound mismatch, actual=[" - << pg.log.log.begin()->version << "," - << pg.log.log.rbegin()->version << "])"; + << pg.log.log.begin()->version << "," + << pg.log.log.rbegin()->version << "] len=" << pg.log.log.size() << ")"; } } diff --git a/branches/aleung/security1/ceph/osd/osd_types.h b/branches/aleung/security1/ceph/osd/osd_types.h new file mode 100644 index 0000000000000..f8656e1f3e178 --- /dev/null +++ b/branches/aleung/security1/ceph/osd/osd_types.h @@ -0,0 +1,174 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __OSD_TYPES_H +#define __OSD_TYPES_H + +#include "include/reqid.h" + +#define PG_INO 1 + + +// osd types +typedef __uint64_t coll_t; // collection id + +// pg stuff +typedef __uint16_t ps_t; +typedef __uint8_t pruleset_t; + +// placement group id +struct pg_t { + union { + struct { + __uint32_t preferred:32; // 32 + ps_t ps:16; // 16 + __uint8_t nrep:8; // 8 + pruleset_t ruleset:8; // 8 + } fields; + __uint64_t val; // 64 + } u; + + pg_t() { u.val = 0; } + pg_t(const pg_t& o) { u.val = o.u.val; } + pg_t(ps_t s, int p, unsigned char n, pruleset_t r=0) { + u.fields.ps = s; + u.fields.preferred = p; + u.fields.nrep = n; + u.fields.ruleset = r; + } + pg_t(__uint64_t v) { u.val = v; } + /* + pg_t operator=(__uint64_t v) { u.val = v; return *this; } + pg_t operator&=(__uint64_t v) { u.val &= v; return *this; } + pg_t operator+=(pg_t o) { u.val += o.val; return *this; } + pg_t operator-=(pg_t o) { u.val -= o.val; return *this; } + pg_t operator++() { ++u.val; return *this; } + */ + operator __uint64_t() const { return u.val; } + + object_t to_object() const { return object_t(PG_INO, u.val >> 32, u.val & 0xffffffff); } +}; + +inline ostream& operator<<(ostream& out, pg_t pg) { + //return out << hex << pg.val << dec; + if (pg.u.fields.ruleset) + out << (int)pg.u.fields.ruleset << '.'; + out << (int)pg.u.fields.nrep << '.'; + if (pg.u.fields.preferred) + out << pg.u.fields.preferred << '.'; + out << hex << pg.u.fields.ps << dec; + out << "=" << hex << pg.u.val << dec; + out << "=" << hex << (__uint64_t)pg << dec; + return out; +} + +namespace __gnu_cxx { + template<> struct hash< pg_t > + { + size_t operator()( const pg_t& x ) const + { + static hash<__uint64_t> H; + return H(x); + } + }; +} + + + +// compound rados version type +class eversion_t { +public: + epoch_t epoch; + version_t version; + eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} +}; + +inline bool operator==(const eversion_t& l, const eversion_t& r) { + return (l.epoch == 
r.epoch) && (l.version == r.version); +} +inline bool operator!=(const eversion_t& l, const eversion_t& r) { + return (l.epoch != r.epoch) || (l.version != r.version); +} +inline bool operator<(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); +} +inline bool operator<=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); +} +inline bool operator>(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); +} +inline bool operator>=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version >= r.version):(l.epoch >= r.epoch); +} +inline ostream& operator<<(ostream& out, const eversion_t e) { + return out << e.epoch << "'" << e.version; +} + + + + + +// ----------------------------------------- + +class ObjectExtent { + public: + object_t oid; // object id + off_t start; // in object + size_t length; // in object + + objectrev_t rev; // which revision? + pg_t pgid; // where to find the object + + map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) + + ObjectExtent() : start(0), length(0), rev(0), pgid(0) {} + ObjectExtent(object_t o, off_t s=0, size_t l=0) : oid(o), start(s), length(l), rev(0), pgid(0) { } +}; + +inline ostream& operator<<(ostream& out, ObjectExtent &ex) +{ + return out << "extent(" + << ex.oid << " in " << hex << ex.pgid << dec + << " " << ex.start << "~" << ex.length + << ")"; +} + + + +// --------------------------------------- + +class OSDSuperblock { +public: + const static __uint64_t MAGIC = 0xeb0f505dULL; + __uint64_t magic; + __uint64_t fsid; // unique fs id (random number) + int whoami; // my role in this fs. + epoch_t current_epoch; // most recent epoch + epoch_t oldest_map, newest_map; // oldest/newest maps we have. 
+ OSDSuperblock(__uint64_t f=0, int w=0) : + magic(MAGIC), fsid(f), whoami(w), + current_epoch(0), oldest_map(0), newest_map(0) {} +}; + +inline ostream& operator<<(ostream& out, OSDSuperblock& sb) +{ + return out << "sb(fsid " << sb.fsid + << " osd" << sb.whoami + << " e" << sb.current_epoch + << " [" << sb.oldest_map << "," << sb.newest_map + << "])"; +} + + +#endif diff --git a/branches/aleung/security1/ceph/osdc/Journaler.cc b/branches/aleung/security1/ceph/osdc/Journaler.cc index 1bee1542bf906..3d9621185d998 100644 --- a/branches/aleung/security1/ceph/osdc/Journaler.cc +++ b/branches/aleung/security1/ceph/osdc/Journaler.cc @@ -100,13 +100,14 @@ void Journaler::_finish_read_head(int r, bufferlist& bl) Header h; assert(bl.length() == sizeof(h)); bl.copy(0, sizeof(h), (char*)&h); - dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." << endl; write_pos = flush_pos = ack_pos = h.write_pos; read_pos = requested_pos = received_pos = h.read_pos; expire_pos = h.expire_pos; trimmed_pos = trimming_pos = h.trimmed_pos; + dout(1) << "_finish_read_head " << h << ". probing for end of log (from " << write_pos << ")..." << endl; + // probe the log state = STATE_PROBING; C_ProbeEnd *fin = new C_ProbeEnd(this); @@ -115,14 +116,22 @@ void Journaler::_finish_read_head(int r, bufferlist& bl) void Journaler::_finish_probe_end(int r, off_t end) { - assert(r >= 0); - assert(end >= write_pos); assert(state == STATE_PROBING); - - dout(1) << "_finish_probe_end write_pos = " << end - << " (header had " << write_pos << "). recovered." - << endl; + if (end == -1) { + end = write_pos; + dout(1) << "_finish_probe_end write_pos = " << end + << " (header had " << write_pos << "). log was empty. recovered." + << endl; + assert(0); // hrm. + } else { + assert(end >= write_pos); + assert(r >= 0); + dout(1) << "_finish_probe_end write_pos = " << end + << " (header had " << write_pos << "). recovered." 
+ << endl; + } + write_pos = flush_pos = ack_pos = end; // done. diff --git a/branches/aleung/security1/ceph/osdc/ObjectCacher.cc b/branches/aleung/security1/ceph/osdc/ObjectCacher.cc index 32d6f31a4773a..be8e4ae0d2979 100644 --- a/branches/aleung/security1/ceph/osdc/ObjectCacher.cc +++ b/branches/aleung/security1/ceph/osdc/ObjectCacher.cc @@ -640,6 +640,11 @@ void ObjectCacher::flush(off_t amount) dout(10) << "flush " << amount << endl; + /* + * NOTE: we aren't actually pulling things off the LRU here, just looking at the + * tail item. Then we call bh_write, which moves it to the other LRU, so that we + * can call lru_dirty.lru_get_next_expire() again. + */ off_t did = 0; while (amount == 0 || did < amount) { BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire(); diff --git a/branches/aleung/security1/ceph/osdc/Objecter.cc b/branches/aleung/security1/ceph/osdc/Objecter.cc index df3fb4ffe2015..c1dc93f274901 100644 --- a/branches/aleung/security1/ceph/osdc/Objecter.cc +++ b/branches/aleung/security1/ceph/osdc/Objecter.cc @@ -71,11 +71,7 @@ void Objecter::handle_osd_map(MOSDMap *m) for (map::iterator i = inc.new_down.begin(); i != inc.new_down.end(); i++) - messenger->mark_down(MSG_ADDR_OSD(i->first), i->second); - for (map::iterator i = inc.new_up.begin(); - i != inc.new_up.end(); - i++) - messenger->mark_up(MSG_ADDR_OSD(i->first), i->second); + messenger->mark_down(i->second.addr); } else if (m->maps.count(e)) { @@ -86,7 +82,7 @@ void Objecter::handle_osd_map(MOSDMap *m) dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << endl; int mon = monmap->pick_mon(); messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); + monmap->get_inst(mon)); break; } @@ -275,7 +271,8 @@ tid_t Objecter::stat_submit(OSDStat *st) // send last_tid++; - MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(), + assert(client_inc >= 0); + MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, 
last_tid, ex.oid, ex.pgid, osdmap->get_epoch(), OSD_OP_STAT); dout(10) << "stat_submit " << st << " tid " << last_tid @@ -285,7 +282,7 @@ tid_t Objecter::stat_submit(OSDStat *st) << endl; if (pg.acker() >= 0) - messenger->send_message(m, MSG_ADDR_OSD(pg.acker()), osdmap->get_inst(pg.acker())); + messenger->send_message(m, osdmap->get_inst(pg.acker())); // add to gather set st->tid = last_tid; @@ -387,7 +384,8 @@ tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex) // send last_tid++; - MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(), + assert(client_inc >= 0); + MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, last_tid, ex.oid, ex.pgid, osdmap->get_epoch(), OSD_OP_READ); m->set_length(ex.length); @@ -395,7 +393,7 @@ tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex) // set ext cap // FIXME mds currently is writing without caps...so we let it // all other (client) writes should have cap - if (messenger->get_myaddr().is_client()) + if (messenger->get_myname().is_client()) m->set_capability(rd->ext_cap); dout(10) << "readx_submit " << rd << " tid " << last_tid << " oid " << ex.oid << " " << ex.start << "~" << ex.length @@ -405,7 +403,7 @@ tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex) << endl; if (pg.acker() >= 0) - messenger->send_message(m, MSG_ADDR_OSD(pg.acker()), osdmap->get_inst(pg.acker())); + messenger->send_message(m, osdmap->get_inst(pg.acker())); // add to gather set rd->ops[last_tid] = ex; @@ -657,7 +655,7 @@ tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) else tid = ++last_tid; - MOSDOp *m = new MOSDOp(tid, messenger->get_myaddr(), + MOSDOp *m = new MOSDOp(messenger->get_myinst(), client_inc, tid, ex.oid, ex.pgid, osdmap->get_epoch(), wr->op); m->set_length(ex.length); @@ -666,7 +664,7 @@ tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) // only cap for a write, fix later // FIXME mds does writes through this interface without a cap // we let it for 
now - if (wr->op == OSD_OP_WRITE && messenger->get_myaddr().is_client()) + if (wr->op == OSD_OP_WRITE && messenger->get_myname().is_client()) m->set_capability(wr->modify_cap); if (wr->tid_version.count(tid)) @@ -709,7 +707,7 @@ tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) << " osd" << pg.primary() << endl; if (pg.primary() >= 0) - messenger->send_message(m, MSG_ADDR_OSD(pg.primary()), osdmap->get_inst(pg.primary())); + messenger->send_message(m, osdmap->get_inst(pg.primary())); dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl; @@ -817,7 +815,7 @@ void Objecter::handle_osd_modify_reply(MOSDOpReply *m) -void Objecter::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) +void Objecter::ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst) { if (dest.is_mon()) { // try a new mon @@ -825,15 +823,15 @@ void Objecter::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_ dout(0) << "ms_handle_failure " << dest << " inst " << inst << ", resending to mon" << mon << endl; - messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon)); + messenger->send_message(m, monmap->get_inst(mon)); } else if (dest.is_osd()) { int mon = monmap->pick_mon(); dout(0) << "ms_handle_failure " << dest << " inst " << inst << ", dropping and reporting to mon" << mon << endl; - messenger->send_message(new MOSDFailure(dest, inst, osdmap->get_epoch()), - MSG_ADDR_MON(mon), monmap->get_inst(mon)); + messenger->send_message(new MOSDFailure(inst, osdmap->get_epoch()), + monmap->get_inst(mon)); delete m; } else { dout(0) << "ms_handle_failure " << dest << " inst " << inst diff --git a/branches/aleung/security1/ceph/osdc/Objecter.h b/branches/aleung/security1/ceph/osdc/Objecter.h index 554e3ad71e5ae..3d679ea3ccb58 100644 --- a/branches/aleung/security1/ceph/osdc/Objecter.h +++ b/branches/aleung/security1/ceph/osdc/Objecter.h @@ -39,6 +39,7 @@ class Objecter { private: 
tid_t last_tid; + int client_inc; int num_unacked; int num_uncommitted; @@ -154,7 +155,7 @@ class Objecter { public: Objecter(Messenger *m, MonMap *mm, OSDMap *om) : messenger(m), monmap(mm), osdmap(om), - last_tid(0), + last_tid(0), client_inc(-1), num_unacked(0), num_uncommitted(0) {} ~Objecter() { @@ -183,6 +184,11 @@ class Objecter { return !(op_read.empty() && op_modify.empty()); } + int get_client_incarnation() { return client_inc; } + void set_client_incarnation(int inc) { + client_inc = inc; + } + // med level tid_t readx(OSDRead *read, Context *onfinish); tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit); @@ -204,7 +210,7 @@ class Objecter { tid_t lock(int op, object_t oid, Context *onack, Context *oncommit); - void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst); + void ms_handle_failure(Message *m, entity_name_t dest, const entity_inst_t& inst); }; diff --git a/branches/aleung/security1/ceph/test/testos.cc b/branches/aleung/security1/ceph/test/testos.cc new file mode 100644 index 0000000000000..0296f05a49345 --- /dev/null +++ b/branches/aleung/security1/ceph/test/testos.cc @@ -0,0 +1,308 @@ +/* testos.cc -- simple ObjectStore test harness. + Copyright (C) 2007 Casey Marshall + +Ceph - scalable distributed file system + +This is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License version 2.1, as published by the Free Software +Foundation. See file COPYING. 
*/ + + +#include "osd/ObjectStore.h" +#include "ebofs/Ebofs.h" +#include "osbdb/OSBDB.h" +#include "include/buffer.h" + +#include +#include + +#include +#include + +using namespace std; + +static inline unsigned long long +to_usec (struct timeval &time) +{ + return (((unsigned long long) time.tv_sec * 1000000) + + ((unsigned long long) time.tv_usec)); +} + +static inline unsigned long long +to_msec (struct timeval &time) +{ + return (((unsigned long long) time.tv_sec * 1000) + + ((unsigned long long) time.tv_usec / 1000)); +} + +int main (int argc, char **argv) +{ + char *osd_name = "ebofs"; + unsigned object_size = 1024; + unsigned object_count = 1024; + unsigned write_iter = 64; + unsigned random_seed = ::time(NULL); + char *device = "/tmp/testos"; + char *mountcmd = "mount /tmp/testos"; + char *umountcmd = "umount /tmp/testos"; + + bool inhibit_remount = (getenv("TESTOS_INHIBIT_REMOUNT") != NULL); + + if (argc > 1 + && (strcmp (argv[1], "-h") == 0 + || strcmp (argv[1], "-help") == 0 + || strcmp (argv[1], "--help") == 0 + || argc > 6)) + { + cout << "usage: " << argv[0] << " [store [object-size [object-count [iterations [seed]]]]]" << endl; + cout << endl; + cout << "Where the arguments are:" << endl << endl; + cout << " store -- store type; default \"ebofs\"" << endl; + cout << " object-size -- size of objects; default 1024" << endl; + cout << " object-count -- number of objects to write; default 1024" + << endl; + cout << " iterations -- write the objects that many times; default 64" + << endl; + cout << " seed -- random seed; default current time" << endl; + exit (0); + } + + if (argc > 1) + osd_name = argv[1]; + if (argc > 2) + object_size = (unsigned) atol (argv[2]); + if (argc > 3) + object_count = (unsigned) atol (argv[3]); + if (argc > 4) + write_iter = (unsigned) atol (argv[4]); + if (argc > 5) + random_seed = (unsigned) atol (argv[5]); + + // round object size up to a multiple of sizeof (long) + object_size = ((object_size + (sizeof (long) - 1)) / sizeof (long)) * sizeof 
(long); + + char *osd_file = new char[32]; + strcpy (osd_file, "/tmp/testos/testos.XXXXXX"); + mktemp (osd_file); + + if (!inhibit_remount) + { + if (system (mountcmd) != 0) + { + cerr << "mount failed" << endl; + exit (1); + } + } + + ObjectStore *os = NULL; + if (strcasecmp (osd_name, "ebofs") == 0) + { + FILE *f = fopen (osd_file, "w"); + if (f == NULL) + { + cerr << "failed to open " << osd_file << ": " << strerror (errno) + << endl; + exit (1); + } + // 1G file. + fseek (f, 1024 * 1024 * 1024, SEEK_SET); + fputc ('\0', f); + fclose (f); + // 20K cache + g_conf.ebofs_bc_size = 5; // times 4K + os = new Ebofs (osd_file); + } + else if (strcasecmp (osd_name, "osbdb") == 0) + { + char *e = getenv ("OSBDB_FFACTOR"); + if (e != NULL) + g_conf.bdbstore_ffactor = atol(e); + e = getenv ("OSBDB_NELEM"); + if (e != NULL) + g_conf.bdbstore_nelem = atol(e); + e = getenv ("OSBDB_PAGESIZE"); + if (e != NULL) + g_conf.bdbstore_pagesize = atol(e); + g_conf.debug_bdbstore = 1; + // 20K cache + g_conf.bdbstore_cachesize = 20 * 1024; + os = new OSBDB (osd_file); + } + else if (strcasecmp (osd_name, "osbdb-btree") == 0) + { + g_conf.bdbstore_btree = true; + // 20K cache + g_conf.bdbstore_cachesize = 20 * 1024; + os = new OSBDB (osd_file); + } + else + { + cerr << "I don't know about object store \"" << osd_name << "\"" + << endl; + exit (1); + } + + cout << "Writing " << object_count << " objects of size " + << object_size << " to " << osd_name << endl; + + char *val = (char *) malloc (object_size + 1); // +1: TESTOS_UNALIGNED shifts the start by one byte + char *val2 = (char *) malloc (object_size + 1); + auto_ptr valptr (val); + auto_ptr valptr2(val2); + if (getenv ("TESTOS_UNALIGNED") != NULL) + { + val = val + 1; + val2 = val2 + 1; + } + + for (unsigned i = 0; i < object_size; i++) + { + val[i] = (char) i; + val2[i] = (char) i; + } + object_t *oids = new object_t[object_count]; + + utime_t writes[write_iter]; + utime_t total_write; + utime_t reads[write_iter]; + utime_t total_read; + for (unsigned i = 0; i < write_iter; i++) + { + cerr 
<< "Iteration " << i << endl; + + int ret = os->mkfs(); + if (ret != 0) + { + cerr << "mkfs(" << osd_file << "): " << strerror (-ret) << endl; + exit (1); + } + ret = os->mount(); + if (ret != 0) + { + cerr << "mount(): " << strerror (-ret) << endl; + exit (1); + } + + srandom (random_seed + i); + + for (unsigned j = 0; j < object_count; j++) + { + oids[j].ino = (uint64_t) random() << 32 | random(); + oids[j].bno = random(); + } + + utime_t begin = g_clock.now(); + for (unsigned o = 0; o < object_count; o++) + { + bufferptr bp (val, object_size); + bufferlist bl; + bl.push_back (bp); + int ret; + if ((ret = os->write (oids[o], 0L, object_size, bl, NULL)) < 0) + cerr << "write " << oids[o] << " failed: " + << strerror (-ret) << endl; + } + utime_t end = g_clock.now() - begin; + + cerr << "Write finished in " << end << endl; + total_write += end; + writes[i] = end; + + os->sync(); + os->umount(); + sync(); + + if (!inhibit_remount) + { + if (system (umountcmd) != 0) + { + cerr << "umount failed" << endl; + exit (1); + } + + if (system (mountcmd) != 0) + { + cerr << "mount(2) failed" << endl; + exit (1); + } + } + + os->mount(); + + begin = g_clock.now(); + for (unsigned o = 0; o < object_count; o++) + { + bufferptr bp (val2, object_size); + bufferlist bl; + bl.push_back (bp); + + if (os->read (oids[o], 0L, object_size, bl) < 0) + { + cerr << "object " << oids[o] << " not found!" 
<< endl; + } + } + end = g_clock.now() - begin; + + cerr << "Read finished in " << end << endl; + total_read += end; + reads[i] = end; + + os->umount(); + sync(); + + if (!inhibit_remount) + { + if (system (umountcmd) != 0) + { + cerr << "umount(2) failed" << endl; + exit (1); + } + + if (system (mountcmd) != 0) + { + cerr << "mount(3) failed" << endl; + exit (1); + } + } + } + + cerr << "Finished in " << (total_write + total_read) << endl; + + double write_mean = (double) total_write / write_iter; + double write_sd = 0.0; + for (unsigned i = 0; i < write_iter; i++) + { + double x = (double) writes[i] - write_mean; + write_sd += x * x; + } + write_sd = sqrt (write_sd / write_iter); + + double read_mean = (double) total_read / write_iter; + double read_sd = 0.0; + for (unsigned i = 0; i < write_iter; i++) + { + double x = (double) reads[i] - read_mean; + read_sd += x * x; // sum of squared read deviations + } + read_sd = sqrt (read_sd / write_iter); + + cout << "TESTOS: write " << osd_name << ":" << object_size << ":" + << object_count << ":" << write_iter << ":" << random_seed + << " -- " << write_mean << " " << write_sd << endl; + + cout << "TESTOS: read " << osd_name << ":" << object_size << ":" + << object_count << ":" << write_iter << ":" << random_seed + << " -- " << read_mean << " " << read_sd << endl; + + unlink (osd_file); + if (!inhibit_remount) + { + if (system (umountcmd) != 0) + { + cerr << "umount(3) failed" << endl; + exit (1); + } + } + exit (0); +} -- 2.39.5