From c80eca7fcec49a21a3da9adc3118ab0d70563165 Mon Sep 17 00:00:00 2001 From: sageweil Date: Fri, 19 Jan 2007 19:48:59 +0000 Subject: [PATCH] sage mds branch git-svn-id: https://ceph.svn.sf.net/svnroot/ceph@1019 29311d96-e01e-0410-9327-a35deaab8ce9 --- branches/sage/cephmds2/COPYING | 504 +++ branches/sage/cephmds2/Makefile | 230 ++ branches/sage/cephmds2/README | 53 + branches/sage/cephmds2/TODO | 307 ++ branches/sage/cephmds2/cfuse.cc | 91 + branches/sage/cephmds2/client/Client.cc | 2614 ++++++++++++ branches/sage/cephmds2/client/Client.h | 588 +++ branches/sage/cephmds2/client/FileCache.cc | 171 + branches/sage/cephmds2/client/FileCache.h | 65 + .../sage/cephmds2/client/SyntheticClient.cc | 1226 ++++++ .../sage/cephmds2/client/SyntheticClient.h | 198 + branches/sage/cephmds2/client/Trace.cc | 125 + branches/sage/cephmds2/client/Trace.h | 75 + branches/sage/cephmds2/client/fuse.cc | 276 ++ branches/sage/cephmds2/client/fuse.h | 23 + branches/sage/cephmds2/client/ldceph.cc | 297 ++ branches/sage/cephmds2/client/msgthread.h | 25 + branches/sage/cephmds2/common/Clock.cc | 19 + branches/sage/cephmds2/common/Clock.h | 197 + branches/sage/cephmds2/common/Cond.h | 118 + branches/sage/cephmds2/common/DecayCounter.h | 94 + branches/sage/cephmds2/common/LogType.h | 119 + branches/sage/cephmds2/common/Logger.cc | 206 + branches/sage/cephmds2/common/Logger.h | 74 + branches/sage/cephmds2/common/Mutex.h | 68 + branches/sage/cephmds2/common/Semaphore.h | 52 + branches/sage/cephmds2/common/Thread.h | 60 + branches/sage/cephmds2/common/ThreadPool.h | 138 + branches/sage/cephmds2/common/Timer.cc | 220 ++ branches/sage/cephmds2/common/Timer.h | 143 + branches/sage/cephmds2/config.cc | 718 ++++ branches/sage/cephmds2/config.h | 297 ++ branches/sage/cephmds2/cosd.cc | 118 + branches/sage/cephmds2/crush/BinaryTree.h | 271 ++ branches/sage/cephmds2/crush/Bucket.h | 618 +++ branches/sage/cephmds2/crush/Hash.h | 287 ++ branches/sage/cephmds2/crush/crush.h | 521 +++ 
.../cephmds2/crush/test/bucket_movement.cc | 166 + .../cephmds2/crush/test/bucket_variance.cc | 199 + .../cephmds2/crush/test/cluster_movement.cc | 217 + .../crush/test/cluster_movement_remove.cc | 229 ++ .../crush/test/cluster_movement_rush.cc | 218 + .../cephmds2/crush/test/creeping_failure.cc | 276 ++ .../crush/test/creeping_failure_variance.cc | 281 ++ .../cephmds2/crush/test/depth_variance.cc | 185 + branches/sage/cephmds2/crush/test/mixed.cc | 300 ++ branches/sage/cephmds2/crush/test/movement.cc | 223 ++ .../cephmds2/crush/test/movement_failed.cc | 246 ++ branches/sage/cephmds2/crush/test/overload.cc | 335 ++ .../cephmds2/crush/test/overload_variance.cc | 281 ++ branches/sage/cephmds2/crush/test/sizes.cc | 131 + .../sage/cephmds2/crush/test/smallbucket.cc | 138 + .../sage/cephmds2/crush/test/speed_bucket.cc | 86 + .../sage/cephmds2/crush/test/speed_depth.cc | 174 + .../sage/cephmds2/crush/test/speed_rush.cc | 145 + branches/sage/cephmds2/crush/test/t.cc | 25 + .../sage/cephmds2/crush/test/testbucket.cc | 61 + .../sage/cephmds2/crush/test/testnormal.cc | 51 + branches/sage/cephmds2/doc/Commitdir.txt | 22 + branches/sage/cephmds2/doc/Replication.txt | 19 + branches/sage/cephmds2/doc/caching.txt | 200 + branches/sage/cephmds2/doc/dentries.txt | 4 + branches/sage/cephmds2/doc/file_modes.txt | 66 + branches/sage/cephmds2/doc/header.txt | 12 + branches/sage/cephmds2/doc/inos.txt | 11 + branches/sage/cephmds2/doc/journal.txt | 108 + branches/sage/cephmds2/doc/lazy_posix.txt | 53 + branches/sage/cephmds2/doc/osd_outline.txt | 37 + .../sage/cephmds2/doc/osd_replication.txt | 226 ++ branches/sage/cephmds2/doc/performance.txt | 36 + .../cephmds2/doc/shared_write_states_nogo.txt | 39 + branches/sage/cephmds2/doc/shutdown.txt | 13 + branches/sage/cephmds2/ebofs/Allocator.cc | 692 ++++ branches/sage/cephmds2/ebofs/Allocator.h | 85 + branches/sage/cephmds2/ebofs/BlockDevice.cc | 769 ++++ branches/sage/cephmds2/ebofs/BlockDevice.h | 331 ++ 
branches/sage/cephmds2/ebofs/BufferCache.cc | 1045 +++++ branches/sage/cephmds2/ebofs/BufferCache.h | 681 ++++ branches/sage/cephmds2/ebofs/Cnode.h | 100 + branches/sage/cephmds2/ebofs/Ebofs.cc | 3169 +++++++++++++++ branches/sage/cephmds2/ebofs/Ebofs.h | 323 ++ branches/sage/cephmds2/ebofs/Onode.h | 390 ++ branches/sage/cephmds2/ebofs/Table.h | 897 +++++ branches/sage/cephmds2/ebofs/mkfs.ebofs.cc | 299 ++ branches/sage/cephmds2/ebofs/nodes.h | 583 +++ branches/sage/cephmds2/ebofs/test.ebofs.cc | 224 ++ branches/sage/cephmds2/ebofs/types.h | 168 + branches/sage/cephmds2/fakefuse.cc | 147 + branches/sage/cephmds2/fakemon.cc | 178 + branches/sage/cephmds2/fakesyn.cc | 176 + branches/sage/cephmds2/include/Context.h | 119 + branches/sage/cephmds2/include/Distribution.h | 74 + branches/sage/cephmds2/include/buffer.h | 999 +++++ branches/sage/cephmds2/include/error.h | 40 + branches/sage/cephmds2/include/filepath.h | 206 + branches/sage/cephmds2/include/interval_set.h | 305 ++ branches/sage/cephmds2/include/lru.h | 321 ++ branches/sage/cephmds2/include/object.h | 91 + branches/sage/cephmds2/include/oldbuffer.h | 357 ++ .../sage/cephmds2/include/oldbufferlist.h | 681 ++++ branches/sage/cephmds2/include/rangeset.h | 252 ++ branches/sage/cephmds2/include/statlite.h | 70 + branches/sage/cephmds2/include/types.h | 537 +++ branches/sage/cephmds2/include/uofs.h | 50 + branches/sage/cephmds2/jobs/alc.tp | 38 + branches/sage/cephmds2/jobs/alcdat/makedirs | 45 + .../sage/cephmds2/jobs/alcdat/makedirs.big | 45 + .../sage/cephmds2/jobs/alcdat/makedirs.tput | 46 + .../cephmds2/jobs/alcdat/makefiles.shared | 32 + branches/sage/cephmds2/jobs/alcdat/openshared | 32 + .../sage/cephmds2/jobs/alcdat/ossh.include | 45 + .../cephmds2/jobs/alcdat/ossh.include.big | 46 + branches/sage/cephmds2/jobs/alcdat/ossh.lib | 45 + .../sage/cephmds2/jobs/alcdat/ossh.lib.big | 46 + branches/sage/cephmds2/jobs/alcdat/striping | 48 + branches/sage/cephmds2/jobs/mds/log_striping | 36 + 
branches/sage/cephmds2/jobs/mds/makedir_lat | 33 + branches/sage/cephmds2/jobs/mds/makedirs | 40 + branches/sage/cephmds2/jobs/mds/opensshlib | 44 + branches/sage/cephmds2/jobs/meta1 | 19 + branches/sage/cephmds2/jobs/meta1.proc.sh | 14 + branches/sage/cephmds2/jobs/osd/ebofs | 51 + branches/sage/cephmds2/jobs/osd/mds_log | 43 + branches/sage/cephmds2/jobs/osd/osd_threads | 33 + branches/sage/cephmds2/jobs/osd/striping | 78 + branches/sage/cephmds2/jobs/osd/wr_lat2 | 44 + branches/sage/cephmds2/jobs/osd/write_sizes | 60 + branches/sage/cephmds2/jobs/rados/map_dist | 32 + branches/sage/cephmds2/jobs/rados/rep_lat | 43 + branches/sage/cephmds2/jobs/rados/wr_sizes | 50 + branches/sage/cephmds2/mds/Anchor.h | 55 + branches/sage/cephmds2/mds/AnchorClient.cc | 149 + branches/sage/cephmds2/mds/AnchorClient.h | 55 + branches/sage/cephmds2/mds/AnchorTable.cc | 347 ++ branches/sage/cephmds2/mds/AnchorTable.h | 82 + branches/sage/cephmds2/mds/CDentry.cc | 141 + branches/sage/cephmds2/mds/CDentry.h | 188 + branches/sage/cephmds2/mds/CDir.cc | 914 +++++ branches/sage/cephmds2/mds/CDir.h | 706 ++++ branches/sage/cephmds2/mds/CInode.cc | 495 +++ branches/sage/cephmds2/mds/CInode.h | 757 ++++ branches/sage/cephmds2/mds/Capability.h | 214 + branches/sage/cephmds2/mds/ClientMap.h | 74 + branches/sage/cephmds2/mds/IdAllocator.cc | 188 + branches/sage/cephmds2/mds/IdAllocator.h | 78 + branches/sage/cephmds2/mds/Lock.h | 311 ++ branches/sage/cephmds2/mds/Locker.cc | 2286 +++++++++++ branches/sage/cephmds2/mds/Locker.h | 123 + branches/sage/cephmds2/mds/LogEvent.cc | 86 + branches/sage/cephmds2/mds/LogEvent.h | 97 + branches/sage/cephmds2/mds/MDBalancer.cc | 902 +++++ branches/sage/cephmds2/mds/MDBalancer.h | 106 + branches/sage/cephmds2/mds/MDCache.cc | 2580 ++++++++++++ branches/sage/cephmds2/mds/MDCache.h | 282 ++ branches/sage/cephmds2/mds/MDLog.cc | 371 ++ branches/sage/cephmds2/mds/MDLog.h | 91 + branches/sage/cephmds2/mds/MDS.cc | 692 ++++ branches/sage/cephmds2/mds/MDS.h | 252 
++ branches/sage/cephmds2/mds/MDSMap.h | 103 + branches/sage/cephmds2/mds/MDStore.cc | 786 ++++ branches/sage/cephmds2/mds/MDStore.h | 75 + branches/sage/cephmds2/mds/Migrator.cc | 3192 +++++++++++++++ branches/sage/cephmds2/mds/Migrator.h | 199 + branches/sage/cephmds2/mds/OSDMonitor.cc | 523 +++ branches/sage/cephmds2/mds/OSDMonitor.h | 85 + branches/sage/cephmds2/mds/Renamer.cc | 915 +++++ branches/sage/cephmds2/mds/Renamer.h | 98 + branches/sage/cephmds2/mds/Server.cc | 2151 ++++++++++ branches/sage/cephmds2/mds/Server.h | 144 + branches/sage/cephmds2/mds/events/EAlloc.h | 110 + .../sage/cephmds2/mds/events/EDirUpdate.h | 97 + .../sage/cephmds2/mds/events/EInodeUpdate.h | 55 + branches/sage/cephmds2/mds/events/EMkdir.h | 62 + branches/sage/cephmds2/mds/events/EMknod.h | 60 + .../sage/cephmds2/mds/events/EPurgeFinish.h | 49 + branches/sage/cephmds2/mds/events/EString.h | 53 + branches/sage/cephmds2/mds/events/ETrace.h | 119 + branches/sage/cephmds2/mds/events/EUnlink.h | 64 + branches/sage/cephmds2/mds/journal.cc | 345 ++ branches/sage/cephmds2/mds/mdstypes.h | 135 + branches/sage/cephmds2/mds/oldcachestuff.cc | 944 +++++ .../sage/cephmds2/messages/MAnchorReply.h | 74 + .../sage/cephmds2/messages/MAnchorRequest.h | 76 + .../sage/cephmds2/messages/MCacheExpire.h | 95 + .../sage/cephmds2/messages/MClientFileCaps.h | 102 + .../messages/MClientInodeAuthUpdate.h | 46 + .../sage/cephmds2/messages/MClientMount.h | 50 + .../sage/cephmds2/messages/MClientMountAck.h | 59 + .../sage/cephmds2/messages/MClientReply.h | 302 ++ .../sage/cephmds2/messages/MClientRequest.h | 201 + .../sage/cephmds2/messages/MDentryUnlink.h | 45 + branches/sage/cephmds2/messages/MDirExpire.h | 50 + .../sage/cephmds2/messages/MDirExpireReq.h | 49 + branches/sage/cephmds2/messages/MDirUpdate.h | 71 + branches/sage/cephmds2/messages/MDiscover.h | 75 + .../sage/cephmds2/messages/MDiscoverReply.h | 266 ++ branches/sage/cephmds2/messages/MExportDir.h | 102 + .../sage/cephmds2/messages/MExportDirAck.h | 
42 + .../cephmds2/messages/MExportDirDiscover.h | 51 + .../cephmds2/messages/MExportDirDiscoverAck.h | 52 + .../sage/cephmds2/messages/MExportDirFinish.h | 43 + .../sage/cephmds2/messages/MExportDirNotify.h | 111 + .../cephmds2/messages/MExportDirNotifyAck.h | 46 + .../sage/cephmds2/messages/MExportDirPrep.h | 186 + .../cephmds2/messages/MExportDirPrepAck.h | 44 + .../cephmds2/messages/MExportDirWarning.h | 45 + branches/sage/cephmds2/messages/MFailure.h | 49 + branches/sage/cephmds2/messages/MFailureAck.h | 42 + .../sage/cephmds2/messages/MGenericMessage.h | 44 + branches/sage/cephmds2/messages/MHashDir.h | 64 + branches/sage/cephmds2/messages/MHashDirAck.h | 42 + .../sage/cephmds2/messages/MHashDirDiscover.h | 52 + .../cephmds2/messages/MHashDirDiscoverAck.h | 53 + .../sage/cephmds2/messages/MHashDirNotify.h | 50 + .../sage/cephmds2/messages/MHashDirPrep.h | 93 + .../sage/cephmds2/messages/MHashDirPrepAck.h | 43 + .../sage/cephmds2/messages/MHashReaddir.h | 44 + .../cephmds2/messages/MHashReaddirReply.h | 80 + branches/sage/cephmds2/messages/MHeartbeat.h | 81 + .../sage/cephmds2/messages/MInodeExpire.h | 50 + .../sage/cephmds2/messages/MInodeFileCaps.h | 55 + branches/sage/cephmds2/messages/MInodeLink.h | 47 + .../sage/cephmds2/messages/MInodeLinkAck.h | 47 + .../sage/cephmds2/messages/MInodeUnlink.h | 47 + .../sage/cephmds2/messages/MInodeUnlinkAck.h | 44 + .../sage/cephmds2/messages/MInodeUpdate.h | 61 + branches/sage/cephmds2/messages/MLock.h | 128 + branches/sage/cephmds2/messages/MMDSBoot.h | 38 + branches/sage/cephmds2/messages/MMDSGetMap.h | 38 + branches/sage/cephmds2/messages/MMDSMap.h | 69 + .../sage/cephmds2/messages/MMonElectionAck.h | 46 + .../cephmds2/messages/MMonElectionCollect.h | 42 + .../cephmds2/messages/MMonElectionRefresh.h | 51 + .../cephmds2/messages/MMonElectionStatus.h | 50 + .../sage/cephmds2/messages/MMonOSDMapInfo.h | 49 + .../sage/cephmds2/messages/MMonOSDMapLease.h | 49 + .../cephmds2/messages/MMonOSDMapLeaseAck.h | 44 + 
.../cephmds2/messages/MMonOSDMapUpdateAck.h | 42 + .../messages/MMonOSDMapUpdateCommit.h | 42 + .../messages/MMonOSDMapUpdatePrepare.h | 52 + branches/sage/cephmds2/messages/MNSConnect.h | 45 + .../sage/cephmds2/messages/MNSConnectAck.h | 53 + branches/sage/cephmds2/messages/MNSFailure.h | 52 + branches/sage/cephmds2/messages/MNSLookup.h | 46 + .../sage/cephmds2/messages/MNSLookupReply.h | 44 + branches/sage/cephmds2/messages/MNSRegister.h | 59 + .../sage/cephmds2/messages/MNSRegisterAck.h | 53 + branches/sage/cephmds2/messages/MOSDBoot.h | 43 + branches/sage/cephmds2/messages/MOSDFailure.h | 54 + branches/sage/cephmds2/messages/MOSDGetMap.h | 45 + branches/sage/cephmds2/messages/MOSDIn.h | 42 + branches/sage/cephmds2/messages/MOSDMap.h | 69 + branches/sage/cephmds2/messages/MOSDOp.h | 214 + branches/sage/cephmds2/messages/MOSDOpReply.h | 146 + branches/sage/cephmds2/messages/MOSDOut.h | 42 + branches/sage/cephmds2/messages/MOSDPGLog.h | 61 + .../sage/cephmds2/messages/MOSDPGNotify.h | 54 + branches/sage/cephmds2/messages/MOSDPGPeer.h | 57 + .../sage/cephmds2/messages/MOSDPGPeerAck.h | 69 + .../cephmds2/messages/MOSDPGPeerRequest.h | 50 + branches/sage/cephmds2/messages/MOSDPGQuery.h | 51 + .../sage/cephmds2/messages/MOSDPGRemove.h | 51 + .../sage/cephmds2/messages/MOSDPGSummary.h | 65 + .../sage/cephmds2/messages/MOSDPGUpdate.h | 64 + branches/sage/cephmds2/messages/MOSDPing.h | 50 + branches/sage/cephmds2/messages/MPing.h | 41 + branches/sage/cephmds2/messages/MPingAck.h | 40 + branches/sage/cephmds2/messages/MRename.h | 80 + branches/sage/cephmds2/messages/MRenameAck.h | 42 + .../sage/cephmds2/messages/MRenameNotify.h | 80 + .../sage/cephmds2/messages/MRenameNotifyAck.h | 40 + branches/sage/cephmds2/messages/MRenamePrep.h | 85 + branches/sage/cephmds2/messages/MRenameReq.h | 79 + .../sage/cephmds2/messages/MRenameWarning.h | 40 + branches/sage/cephmds2/messages/MUnhashDir.h | 42 + .../sage/cephmds2/messages/MUnhashDirAck.h | 65 + 
.../sage/cephmds2/messages/MUnhashDirNotify.h | 50 + .../cephmds2/messages/MUnhashDirNotifyAck.h | 42 + .../sage/cephmds2/messages/MUnhashDirPrep.h | 42 + .../cephmds2/messages/MUnhashDirPrepAck.h | 93 + branches/sage/cephmds2/mon/Elector.cc | 227 ++ branches/sage/cephmds2/mon/Elector.h | 163 + branches/sage/cephmds2/mon/MDSMonitor.cc | 158 + branches/sage/cephmds2/mon/MDSMonitor.h | 69 + branches/sage/cephmds2/mon/MonMap.h | 63 + branches/sage/cephmds2/mon/Monitor.cc | 260 ++ branches/sage/cephmds2/mon/Monitor.h | 114 + branches/sage/cephmds2/mon/OSDMonitor.cc | 869 ++++ branches/sage/cephmds2/mon/OSDMonitor.h | 108 + branches/sage/cephmds2/msg/Dispatcher.cc | 27 + branches/sage/cephmds2/msg/Dispatcher.h | 40 + branches/sage/cephmds2/msg/FakeMessenger.cc | 379 ++ branches/sage/cephmds2/msg/FakeMessenger.h | 81 + branches/sage/cephmds2/msg/HostMonitor.cc | 235 ++ branches/sage/cephmds2/msg/HostMonitor.h | 97 + branches/sage/cephmds2/msg/MPIMessenger.cc | 608 +++ branches/sage/cephmds2/msg/MPIMessenger.h | 56 + branches/sage/cephmds2/msg/MTMessenger.cc | 197 + branches/sage/cephmds2/msg/MTMessenger.h | 50 + branches/sage/cephmds2/msg/Message.cc | 442 +++ branches/sage/cephmds2/msg/Message.h | 463 +++ branches/sage/cephmds2/msg/Messenger.cc | 84 + branches/sage/cephmds2/msg/Messenger.h | 92 + branches/sage/cephmds2/msg/NewMessenger.cc | 1714 ++++++++ branches/sage/cephmds2/msg/NewMessenger.h | 305 ++ branches/sage/cephmds2/msg/NewerMessenger.cc | 1791 +++++++++ branches/sage/cephmds2/msg/NewerMessenger.h | 343 ++ branches/sage/cephmds2/msg/RWLock.h | 49 + branches/sage/cephmds2/msg/SerialMessenger.h | 28 + branches/sage/cephmds2/msg/TCPDirectory.cc | 178 + branches/sage/cephmds2/msg/TCPDirectory.h | 110 + branches/sage/cephmds2/msg/TCPMessenger.cc | 1454 +++++++ branches/sage/cephmds2/msg/TCPMessenger.h | 115 + branches/sage/cephmds2/msg/error.c | 77 + branches/sage/cephmds2/msg/mpistarter.cc | 62 + branches/sage/cephmds2/msg/new_mpistarter.cc | 43 + 
branches/sage/cephmds2/msg/tcp.cc | 87 + branches/sage/cephmds2/msg/tcp.h | 37 + branches/sage/cephmds2/newsyn.cc | 420 ++ branches/sage/cephmds2/osd/Ager.cc | 326 ++ branches/sage/cephmds2/osd/Ager.h | 42 + branches/sage/cephmds2/osd/BDBMap.h | 136 + branches/sage/cephmds2/osd/Fake.h | 249 ++ branches/sage/cephmds2/osd/FakeStore.cc | 364 ++ branches/sage/cephmds2/osd/FakeStore.h | 87 + .../cephmds2/osd/FakeStoreBDBCollections.h | 168 + branches/sage/cephmds2/osd/OBFSStore.cc | 244 ++ branches/sage/cephmds2/osd/OBFSStore.h | 56 + branches/sage/cephmds2/osd/OSD.cc | 3498 +++++++++++++++++ branches/sage/cephmds2/osd/OSD.h | 272 ++ branches/sage/cephmds2/osd/OSDMap.h | 515 +++ branches/sage/cephmds2/osd/ObjectStore.cc | 149 + branches/sage/cephmds2/osd/ObjectStore.h | 479 +++ branches/sage/cephmds2/osd/PG.cc | 1312 +++++++ branches/sage/cephmds2/osd/PG.h | 735 ++++ branches/sage/cephmds2/osd/rush.cc | 230 ++ branches/sage/cephmds2/osd/rush.h | 60 + branches/sage/cephmds2/osd/tp.cc | 80 + branches/sage/cephmds2/osdc/Blinker.h | 91 + branches/sage/cephmds2/osdc/Filer.cc | 235 ++ branches/sage/cephmds2/osdc/Filer.h | 158 + branches/sage/cephmds2/osdc/Journaler.cc | 601 +++ branches/sage/cephmds2/osdc/Journaler.h | 218 + branches/sage/cephmds2/osdc/ObjectCacher.cc | 1472 +++++++ branches/sage/cephmds2/osdc/ObjectCacher.h | 547 +++ branches/sage/cephmds2/osdc/Objecter.cc | 831 ++++ branches/sage/cephmds2/osdc/Objecter.h | 191 + branches/sage/cephmds2/script/add_header.pl | 29 + branches/sage/cephmds2/script/adjusttabs.pl | 24 + .../sage/cephmds2/script/clean_osd_cow.sh | 3 + branches/sage/cephmds2/script/clean_trace.pl | 8 + branches/sage/cephmds2/script/comb.pl | 113 + .../sage/cephmds2/script/find_auth_pins.pl | 46 + .../sage/cephmds2/script/find_bufferleaks.pl | 69 + .../cephmds2/script/find_lost_bdev_ops.pl | 34 + .../sage/cephmds2/script/find_lost_commit.pl | 38 + .../cephmds2/script/find_lost_objecter.pl | 34 + .../sage/cephmds2/script/find_pathpins.pl | 41 + 
.../sage/cephmds2/script/find_requests.pl | 42 + branches/sage/cephmds2/script/find_waiters.pl | 46 + branches/sage/cephmds2/script/grepblock | 15 + .../sage/cephmds2/script/merge_trace_rw.pl | 42 + branches/sage/cephmds2/script/profonly.pl | 12 + branches/sage/cephmds2/script/runset.pl | 380 ++ branches/sage/cephmds2/script/sum.pl | 148 + branches/sage/cephmds2/tcpfuse.cc | 80 + branches/sage/cephmds2/tcpsyn.cc | 292 ++ branches/sage/cephmds2/test/fakemds.cc | 104 + branches/sage/cephmds2/test/gprof-helper.c | 120 + branches/sage/cephmds2/test/makedirs.cc | 38 + branches/sage/cephmds2/test/mpitest.cc | 111 + branches/sage/cephmds2/test/mttest.cc | 140 + branches/sage/cephmds2/test/rushconfig | 7 + branches/sage/cephmds2/test/rushtest.cc | 49 + branches/sage/cephmds2/test/rushtest.cc~ | 49 + branches/sage/cephmds2/test/testbucket.cc | 67 + branches/sage/cephmds2/test/testbuffers.cc | 40 + branches/sage/cephmds2/test/testcrush.cc | 266 ++ branches/sage/cephmds2/test/testfilepath.cc | 22 + branches/sage/cephmds2/test/testmpi.cc | 53 + branches/sage/cephmds2/test/testnewbuffers.cc | 91 + branches/sage/cephmds2/test/testtree.cc | 46 + branches/sage/cephmds2/test/testxattr.cc | 31 + 383 files changed, 88774 insertions(+) create mode 100644 branches/sage/cephmds2/COPYING create mode 100644 branches/sage/cephmds2/Makefile create mode 100644 branches/sage/cephmds2/README create mode 100644 branches/sage/cephmds2/TODO create mode 100644 branches/sage/cephmds2/cfuse.cc create mode 100644 branches/sage/cephmds2/client/Client.cc create mode 100644 branches/sage/cephmds2/client/Client.h create mode 100644 branches/sage/cephmds2/client/FileCache.cc create mode 100644 branches/sage/cephmds2/client/FileCache.h create mode 100644 branches/sage/cephmds2/client/SyntheticClient.cc create mode 100644 branches/sage/cephmds2/client/SyntheticClient.h create mode 100644 branches/sage/cephmds2/client/Trace.cc create mode 100644 branches/sage/cephmds2/client/Trace.h create mode 100644 
branches/sage/cephmds2/client/fuse.cc create mode 100644 branches/sage/cephmds2/client/fuse.h create mode 100644 branches/sage/cephmds2/client/ldceph.cc create mode 100644 branches/sage/cephmds2/client/msgthread.h create mode 100644 branches/sage/cephmds2/common/Clock.cc create mode 100644 branches/sage/cephmds2/common/Clock.h create mode 100644 branches/sage/cephmds2/common/Cond.h create mode 100644 branches/sage/cephmds2/common/DecayCounter.h create mode 100644 branches/sage/cephmds2/common/LogType.h create mode 100644 branches/sage/cephmds2/common/Logger.cc create mode 100644 branches/sage/cephmds2/common/Logger.h create mode 100755 branches/sage/cephmds2/common/Mutex.h create mode 100644 branches/sage/cephmds2/common/Semaphore.h create mode 100644 branches/sage/cephmds2/common/Thread.h create mode 100644 branches/sage/cephmds2/common/ThreadPool.h create mode 100644 branches/sage/cephmds2/common/Timer.cc create mode 100644 branches/sage/cephmds2/common/Timer.h create mode 100644 branches/sage/cephmds2/config.cc create mode 100644 branches/sage/cephmds2/config.h create mode 100644 branches/sage/cephmds2/cosd.cc create mode 100644 branches/sage/cephmds2/crush/BinaryTree.h create mode 100644 branches/sage/cephmds2/crush/Bucket.h create mode 100644 branches/sage/cephmds2/crush/Hash.h create mode 100644 branches/sage/cephmds2/crush/crush.h create mode 100644 branches/sage/cephmds2/crush/test/bucket_movement.cc create mode 100644 branches/sage/cephmds2/crush/test/bucket_variance.cc create mode 100644 branches/sage/cephmds2/crush/test/cluster_movement.cc create mode 100644 branches/sage/cephmds2/crush/test/cluster_movement_remove.cc create mode 100644 branches/sage/cephmds2/crush/test/cluster_movement_rush.cc create mode 100644 branches/sage/cephmds2/crush/test/creeping_failure.cc create mode 100644 branches/sage/cephmds2/crush/test/creeping_failure_variance.cc create mode 100644 branches/sage/cephmds2/crush/test/depth_variance.cc create mode 100644 
branches/sage/cephmds2/crush/test/mixed.cc create mode 100644 branches/sage/cephmds2/crush/test/movement.cc create mode 100644 branches/sage/cephmds2/crush/test/movement_failed.cc create mode 100644 branches/sage/cephmds2/crush/test/overload.cc create mode 100644 branches/sage/cephmds2/crush/test/overload_variance.cc create mode 100644 branches/sage/cephmds2/crush/test/sizes.cc create mode 100644 branches/sage/cephmds2/crush/test/smallbucket.cc create mode 100644 branches/sage/cephmds2/crush/test/speed_bucket.cc create mode 100644 branches/sage/cephmds2/crush/test/speed_depth.cc create mode 100644 branches/sage/cephmds2/crush/test/speed_rush.cc create mode 100644 branches/sage/cephmds2/crush/test/t.cc create mode 100644 branches/sage/cephmds2/crush/test/testbucket.cc create mode 100644 branches/sage/cephmds2/crush/test/testnormal.cc create mode 100644 branches/sage/cephmds2/doc/Commitdir.txt create mode 100644 branches/sage/cephmds2/doc/Replication.txt create mode 100644 branches/sage/cephmds2/doc/caching.txt create mode 100644 branches/sage/cephmds2/doc/dentries.txt create mode 100644 branches/sage/cephmds2/doc/file_modes.txt create mode 100644 branches/sage/cephmds2/doc/header.txt create mode 100644 branches/sage/cephmds2/doc/inos.txt create mode 100644 branches/sage/cephmds2/doc/journal.txt create mode 100644 branches/sage/cephmds2/doc/lazy_posix.txt create mode 100644 branches/sage/cephmds2/doc/osd_outline.txt create mode 100644 branches/sage/cephmds2/doc/osd_replication.txt create mode 100644 branches/sage/cephmds2/doc/performance.txt create mode 100644 branches/sage/cephmds2/doc/shared_write_states_nogo.txt create mode 100644 branches/sage/cephmds2/doc/shutdown.txt create mode 100644 branches/sage/cephmds2/ebofs/Allocator.cc create mode 100644 branches/sage/cephmds2/ebofs/Allocator.h create mode 100644 branches/sage/cephmds2/ebofs/BlockDevice.cc create mode 100644 branches/sage/cephmds2/ebofs/BlockDevice.h create mode 100644 
branches/sage/cephmds2/ebofs/BufferCache.cc create mode 100644 branches/sage/cephmds2/ebofs/BufferCache.h create mode 100644 branches/sage/cephmds2/ebofs/Cnode.h create mode 100644 branches/sage/cephmds2/ebofs/Ebofs.cc create mode 100644 branches/sage/cephmds2/ebofs/Ebofs.h create mode 100644 branches/sage/cephmds2/ebofs/Onode.h create mode 100644 branches/sage/cephmds2/ebofs/Table.h create mode 100644 branches/sage/cephmds2/ebofs/mkfs.ebofs.cc create mode 100644 branches/sage/cephmds2/ebofs/nodes.h create mode 100644 branches/sage/cephmds2/ebofs/test.ebofs.cc create mode 100644 branches/sage/cephmds2/ebofs/types.h create mode 100644 branches/sage/cephmds2/fakefuse.cc create mode 100644 branches/sage/cephmds2/fakemon.cc create mode 100644 branches/sage/cephmds2/fakesyn.cc create mode 100644 branches/sage/cephmds2/include/Context.h create mode 100644 branches/sage/cephmds2/include/Distribution.h create mode 100644 branches/sage/cephmds2/include/buffer.h create mode 100644 branches/sage/cephmds2/include/error.h create mode 100644 branches/sage/cephmds2/include/filepath.h create mode 100644 branches/sage/cephmds2/include/interval_set.h create mode 100644 branches/sage/cephmds2/include/lru.h create mode 100644 branches/sage/cephmds2/include/object.h create mode 100644 branches/sage/cephmds2/include/oldbuffer.h create mode 100644 branches/sage/cephmds2/include/oldbufferlist.h create mode 100644 branches/sage/cephmds2/include/rangeset.h create mode 100644 branches/sage/cephmds2/include/statlite.h create mode 100644 branches/sage/cephmds2/include/types.h create mode 100644 branches/sage/cephmds2/include/uofs.h create mode 100644 branches/sage/cephmds2/jobs/alc.tp create mode 100644 branches/sage/cephmds2/jobs/alcdat/makedirs create mode 100644 branches/sage/cephmds2/jobs/alcdat/makedirs.big create mode 100644 branches/sage/cephmds2/jobs/alcdat/makedirs.tput create mode 100644 branches/sage/cephmds2/jobs/alcdat/makefiles.shared create mode 100644 
branches/sage/cephmds2/jobs/alcdat/openshared create mode 100644 branches/sage/cephmds2/jobs/alcdat/ossh.include create mode 100644 branches/sage/cephmds2/jobs/alcdat/ossh.include.big create mode 100644 branches/sage/cephmds2/jobs/alcdat/ossh.lib create mode 100644 branches/sage/cephmds2/jobs/alcdat/ossh.lib.big create mode 100644 branches/sage/cephmds2/jobs/alcdat/striping create mode 100644 branches/sage/cephmds2/jobs/mds/log_striping create mode 100644 branches/sage/cephmds2/jobs/mds/makedir_lat create mode 100644 branches/sage/cephmds2/jobs/mds/makedirs create mode 100644 branches/sage/cephmds2/jobs/mds/opensshlib create mode 100644 branches/sage/cephmds2/jobs/meta1 create mode 100755 branches/sage/cephmds2/jobs/meta1.proc.sh create mode 100644 branches/sage/cephmds2/jobs/osd/ebofs create mode 100644 branches/sage/cephmds2/jobs/osd/mds_log create mode 100644 branches/sage/cephmds2/jobs/osd/osd_threads create mode 100644 branches/sage/cephmds2/jobs/osd/striping create mode 100644 branches/sage/cephmds2/jobs/osd/wr_lat2 create mode 100644 branches/sage/cephmds2/jobs/osd/write_sizes create mode 100644 branches/sage/cephmds2/jobs/rados/map_dist create mode 100644 branches/sage/cephmds2/jobs/rados/rep_lat create mode 100644 branches/sage/cephmds2/jobs/rados/wr_sizes create mode 100644 branches/sage/cephmds2/mds/Anchor.h create mode 100644 branches/sage/cephmds2/mds/AnchorClient.cc create mode 100644 branches/sage/cephmds2/mds/AnchorClient.h create mode 100644 branches/sage/cephmds2/mds/AnchorTable.cc create mode 100644 branches/sage/cephmds2/mds/AnchorTable.h create mode 100644 branches/sage/cephmds2/mds/CDentry.cc create mode 100644 branches/sage/cephmds2/mds/CDentry.h create mode 100644 branches/sage/cephmds2/mds/CDir.cc create mode 100644 branches/sage/cephmds2/mds/CDir.h create mode 100644 branches/sage/cephmds2/mds/CInode.cc create mode 100644 branches/sage/cephmds2/mds/CInode.h create mode 100644 branches/sage/cephmds2/mds/Capability.h create mode 100644 
branches/sage/cephmds2/mds/ClientMap.h create mode 100644 branches/sage/cephmds2/mds/IdAllocator.cc create mode 100644 branches/sage/cephmds2/mds/IdAllocator.h create mode 100644 branches/sage/cephmds2/mds/Lock.h create mode 100644 branches/sage/cephmds2/mds/Locker.cc create mode 100644 branches/sage/cephmds2/mds/Locker.h create mode 100644 branches/sage/cephmds2/mds/LogEvent.cc create mode 100644 branches/sage/cephmds2/mds/LogEvent.h create mode 100644 branches/sage/cephmds2/mds/MDBalancer.cc create mode 100644 branches/sage/cephmds2/mds/MDBalancer.h create mode 100644 branches/sage/cephmds2/mds/MDCache.cc create mode 100644 branches/sage/cephmds2/mds/MDCache.h create mode 100644 branches/sage/cephmds2/mds/MDLog.cc create mode 100644 branches/sage/cephmds2/mds/MDLog.h create mode 100644 branches/sage/cephmds2/mds/MDS.cc create mode 100644 branches/sage/cephmds2/mds/MDS.h create mode 100644 branches/sage/cephmds2/mds/MDSMap.h create mode 100644 branches/sage/cephmds2/mds/MDStore.cc create mode 100644 branches/sage/cephmds2/mds/MDStore.h create mode 100644 branches/sage/cephmds2/mds/Migrator.cc create mode 100644 branches/sage/cephmds2/mds/Migrator.h create mode 100644 branches/sage/cephmds2/mds/OSDMonitor.cc create mode 100644 branches/sage/cephmds2/mds/OSDMonitor.h create mode 100644 branches/sage/cephmds2/mds/Renamer.cc create mode 100644 branches/sage/cephmds2/mds/Renamer.h create mode 100644 branches/sage/cephmds2/mds/Server.cc create mode 100644 branches/sage/cephmds2/mds/Server.h create mode 100644 branches/sage/cephmds2/mds/events/EAlloc.h create mode 100644 branches/sage/cephmds2/mds/events/EDirUpdate.h create mode 100644 branches/sage/cephmds2/mds/events/EInodeUpdate.h create mode 100644 branches/sage/cephmds2/mds/events/EMkdir.h create mode 100644 branches/sage/cephmds2/mds/events/EMknod.h create mode 100644 branches/sage/cephmds2/mds/events/EPurgeFinish.h create mode 100644 branches/sage/cephmds2/mds/events/EString.h create mode 100644 
branches/sage/cephmds2/mds/events/ETrace.h create mode 100644 branches/sage/cephmds2/mds/events/EUnlink.h create mode 100644 branches/sage/cephmds2/mds/journal.cc create mode 100644 branches/sage/cephmds2/mds/mdstypes.h create mode 100644 branches/sage/cephmds2/mds/oldcachestuff.cc create mode 100644 branches/sage/cephmds2/messages/MAnchorReply.h create mode 100644 branches/sage/cephmds2/messages/MAnchorRequest.h create mode 100644 branches/sage/cephmds2/messages/MCacheExpire.h create mode 100644 branches/sage/cephmds2/messages/MClientFileCaps.h create mode 100644 branches/sage/cephmds2/messages/MClientInodeAuthUpdate.h create mode 100644 branches/sage/cephmds2/messages/MClientMount.h create mode 100644 branches/sage/cephmds2/messages/MClientMountAck.h create mode 100644 branches/sage/cephmds2/messages/MClientReply.h create mode 100644 branches/sage/cephmds2/messages/MClientRequest.h create mode 100644 branches/sage/cephmds2/messages/MDentryUnlink.h create mode 100644 branches/sage/cephmds2/messages/MDirExpire.h create mode 100644 branches/sage/cephmds2/messages/MDirExpireReq.h create mode 100644 branches/sage/cephmds2/messages/MDirUpdate.h create mode 100644 branches/sage/cephmds2/messages/MDiscover.h create mode 100644 branches/sage/cephmds2/messages/MDiscoverReply.h create mode 100644 branches/sage/cephmds2/messages/MExportDir.h create mode 100644 branches/sage/cephmds2/messages/MExportDirAck.h create mode 100644 branches/sage/cephmds2/messages/MExportDirDiscover.h create mode 100644 branches/sage/cephmds2/messages/MExportDirDiscoverAck.h create mode 100644 branches/sage/cephmds2/messages/MExportDirFinish.h create mode 100644 branches/sage/cephmds2/messages/MExportDirNotify.h create mode 100644 branches/sage/cephmds2/messages/MExportDirNotifyAck.h create mode 100644 branches/sage/cephmds2/messages/MExportDirPrep.h create mode 100644 branches/sage/cephmds2/messages/MExportDirPrepAck.h create mode 100644 branches/sage/cephmds2/messages/MExportDirWarning.h create 
mode 100644 branches/sage/cephmds2/messages/MFailure.h create mode 100644 branches/sage/cephmds2/messages/MFailureAck.h create mode 100644 branches/sage/cephmds2/messages/MGenericMessage.h create mode 100644 branches/sage/cephmds2/messages/MHashDir.h create mode 100644 branches/sage/cephmds2/messages/MHashDirAck.h create mode 100644 branches/sage/cephmds2/messages/MHashDirDiscover.h create mode 100644 branches/sage/cephmds2/messages/MHashDirDiscoverAck.h create mode 100644 branches/sage/cephmds2/messages/MHashDirNotify.h create mode 100644 branches/sage/cephmds2/messages/MHashDirPrep.h create mode 100644 branches/sage/cephmds2/messages/MHashDirPrepAck.h create mode 100644 branches/sage/cephmds2/messages/MHashReaddir.h create mode 100644 branches/sage/cephmds2/messages/MHashReaddirReply.h create mode 100644 branches/sage/cephmds2/messages/MHeartbeat.h create mode 100644 branches/sage/cephmds2/messages/MInodeExpire.h create mode 100644 branches/sage/cephmds2/messages/MInodeFileCaps.h create mode 100644 branches/sage/cephmds2/messages/MInodeLink.h create mode 100644 branches/sage/cephmds2/messages/MInodeLinkAck.h create mode 100644 branches/sage/cephmds2/messages/MInodeUnlink.h create mode 100644 branches/sage/cephmds2/messages/MInodeUnlinkAck.h create mode 100644 branches/sage/cephmds2/messages/MInodeUpdate.h create mode 100644 branches/sage/cephmds2/messages/MLock.h create mode 100644 branches/sage/cephmds2/messages/MMDSBoot.h create mode 100644 branches/sage/cephmds2/messages/MMDSGetMap.h create mode 100644 branches/sage/cephmds2/messages/MMDSMap.h create mode 100644 branches/sage/cephmds2/messages/MMonElectionAck.h create mode 100644 branches/sage/cephmds2/messages/MMonElectionCollect.h create mode 100644 branches/sage/cephmds2/messages/MMonElectionRefresh.h create mode 100644 branches/sage/cephmds2/messages/MMonElectionStatus.h create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapInfo.h create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapLease.h 
create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapLeaseAck.h create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapUpdateAck.h create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapUpdateCommit.h create mode 100644 branches/sage/cephmds2/messages/MMonOSDMapUpdatePrepare.h create mode 100644 branches/sage/cephmds2/messages/MNSConnect.h create mode 100644 branches/sage/cephmds2/messages/MNSConnectAck.h create mode 100644 branches/sage/cephmds2/messages/MNSFailure.h create mode 100644 branches/sage/cephmds2/messages/MNSLookup.h create mode 100644 branches/sage/cephmds2/messages/MNSLookupReply.h create mode 100644 branches/sage/cephmds2/messages/MNSRegister.h create mode 100644 branches/sage/cephmds2/messages/MNSRegisterAck.h create mode 100644 branches/sage/cephmds2/messages/MOSDBoot.h create mode 100644 branches/sage/cephmds2/messages/MOSDFailure.h create mode 100644 branches/sage/cephmds2/messages/MOSDGetMap.h create mode 100644 branches/sage/cephmds2/messages/MOSDIn.h create mode 100644 branches/sage/cephmds2/messages/MOSDMap.h create mode 100644 branches/sage/cephmds2/messages/MOSDOp.h create mode 100644 branches/sage/cephmds2/messages/MOSDOpReply.h create mode 100644 branches/sage/cephmds2/messages/MOSDOut.h create mode 100644 branches/sage/cephmds2/messages/MOSDPGLog.h create mode 100644 branches/sage/cephmds2/messages/MOSDPGNotify.h create mode 100644 branches/sage/cephmds2/messages/MOSDPGPeer.h create mode 100644 branches/sage/cephmds2/messages/MOSDPGPeerAck.h create mode 100644 branches/sage/cephmds2/messages/MOSDPGPeerRequest.h create mode 100644 branches/sage/cephmds2/messages/MOSDPGQuery.h create mode 100644 branches/sage/cephmds2/messages/MOSDPGRemove.h create mode 100644 branches/sage/cephmds2/messages/MOSDPGSummary.h create mode 100644 branches/sage/cephmds2/messages/MOSDPGUpdate.h create mode 100644 branches/sage/cephmds2/messages/MOSDPing.h create mode 100644 branches/sage/cephmds2/messages/MPing.h create mode 100644 
branches/sage/cephmds2/messages/MPingAck.h create mode 100644 branches/sage/cephmds2/messages/MRename.h create mode 100644 branches/sage/cephmds2/messages/MRenameAck.h create mode 100644 branches/sage/cephmds2/messages/MRenameNotify.h create mode 100644 branches/sage/cephmds2/messages/MRenameNotifyAck.h create mode 100644 branches/sage/cephmds2/messages/MRenamePrep.h create mode 100644 branches/sage/cephmds2/messages/MRenameReq.h create mode 100644 branches/sage/cephmds2/messages/MRenameWarning.h create mode 100644 branches/sage/cephmds2/messages/MUnhashDir.h create mode 100644 branches/sage/cephmds2/messages/MUnhashDirAck.h create mode 100644 branches/sage/cephmds2/messages/MUnhashDirNotify.h create mode 100644 branches/sage/cephmds2/messages/MUnhashDirNotifyAck.h create mode 100644 branches/sage/cephmds2/messages/MUnhashDirPrep.h create mode 100644 branches/sage/cephmds2/messages/MUnhashDirPrepAck.h create mode 100644 branches/sage/cephmds2/mon/Elector.cc create mode 100644 branches/sage/cephmds2/mon/Elector.h create mode 100644 branches/sage/cephmds2/mon/MDSMonitor.cc create mode 100644 branches/sage/cephmds2/mon/MDSMonitor.h create mode 100644 branches/sage/cephmds2/mon/MonMap.h create mode 100644 branches/sage/cephmds2/mon/Monitor.cc create mode 100644 branches/sage/cephmds2/mon/Monitor.h create mode 100644 branches/sage/cephmds2/mon/OSDMonitor.cc create mode 100644 branches/sage/cephmds2/mon/OSDMonitor.h create mode 100644 branches/sage/cephmds2/msg/Dispatcher.cc create mode 100644 branches/sage/cephmds2/msg/Dispatcher.h create mode 100644 branches/sage/cephmds2/msg/FakeMessenger.cc create mode 100644 branches/sage/cephmds2/msg/FakeMessenger.h create mode 100644 branches/sage/cephmds2/msg/HostMonitor.cc create mode 100644 branches/sage/cephmds2/msg/HostMonitor.h create mode 100644 branches/sage/cephmds2/msg/MPIMessenger.cc create mode 100644 branches/sage/cephmds2/msg/MPIMessenger.h create mode 100644 branches/sage/cephmds2/msg/MTMessenger.cc create mode 
100644 branches/sage/cephmds2/msg/MTMessenger.h create mode 100644 branches/sage/cephmds2/msg/Message.cc create mode 100644 branches/sage/cephmds2/msg/Message.h create mode 100644 branches/sage/cephmds2/msg/Messenger.cc create mode 100644 branches/sage/cephmds2/msg/Messenger.h create mode 100644 branches/sage/cephmds2/msg/NewMessenger.cc create mode 100644 branches/sage/cephmds2/msg/NewMessenger.h create mode 100644 branches/sage/cephmds2/msg/NewerMessenger.cc create mode 100644 branches/sage/cephmds2/msg/NewerMessenger.h create mode 100644 branches/sage/cephmds2/msg/RWLock.h create mode 100644 branches/sage/cephmds2/msg/SerialMessenger.h create mode 100644 branches/sage/cephmds2/msg/TCPDirectory.cc create mode 100644 branches/sage/cephmds2/msg/TCPDirectory.h create mode 100644 branches/sage/cephmds2/msg/TCPMessenger.cc create mode 100644 branches/sage/cephmds2/msg/TCPMessenger.h create mode 100644 branches/sage/cephmds2/msg/error.c create mode 100644 branches/sage/cephmds2/msg/mpistarter.cc create mode 100644 branches/sage/cephmds2/msg/new_mpistarter.cc create mode 100644 branches/sage/cephmds2/msg/tcp.cc create mode 100644 branches/sage/cephmds2/msg/tcp.h create mode 100644 branches/sage/cephmds2/newsyn.cc create mode 100644 branches/sage/cephmds2/osd/Ager.cc create mode 100644 branches/sage/cephmds2/osd/Ager.h create mode 100644 branches/sage/cephmds2/osd/BDBMap.h create mode 100644 branches/sage/cephmds2/osd/Fake.h create mode 100644 branches/sage/cephmds2/osd/FakeStore.cc create mode 100644 branches/sage/cephmds2/osd/FakeStore.h create mode 100644 branches/sage/cephmds2/osd/FakeStoreBDBCollections.h create mode 100644 branches/sage/cephmds2/osd/OBFSStore.cc create mode 100644 branches/sage/cephmds2/osd/OBFSStore.h create mode 100644 branches/sage/cephmds2/osd/OSD.cc create mode 100644 branches/sage/cephmds2/osd/OSD.h create mode 100644 branches/sage/cephmds2/osd/OSDMap.h create mode 100644 branches/sage/cephmds2/osd/ObjectStore.cc create mode 100644 
branches/sage/cephmds2/osd/ObjectStore.h create mode 100644 branches/sage/cephmds2/osd/PG.cc create mode 100644 branches/sage/cephmds2/osd/PG.h create mode 100644 branches/sage/cephmds2/osd/rush.cc create mode 100644 branches/sage/cephmds2/osd/rush.h create mode 100644 branches/sage/cephmds2/osd/tp.cc create mode 100644 branches/sage/cephmds2/osdc/Blinker.h create mode 100644 branches/sage/cephmds2/osdc/Filer.cc create mode 100644 branches/sage/cephmds2/osdc/Filer.h create mode 100644 branches/sage/cephmds2/osdc/Journaler.cc create mode 100644 branches/sage/cephmds2/osdc/Journaler.h create mode 100644 branches/sage/cephmds2/osdc/ObjectCacher.cc create mode 100644 branches/sage/cephmds2/osdc/ObjectCacher.h create mode 100644 branches/sage/cephmds2/osdc/Objecter.cc create mode 100644 branches/sage/cephmds2/osdc/Objecter.h create mode 100755 branches/sage/cephmds2/script/add_header.pl create mode 100755 branches/sage/cephmds2/script/adjusttabs.pl create mode 100755 branches/sage/cephmds2/script/clean_osd_cow.sh create mode 100755 branches/sage/cephmds2/script/clean_trace.pl create mode 100755 branches/sage/cephmds2/script/comb.pl create mode 100755 branches/sage/cephmds2/script/find_auth_pins.pl create mode 100755 branches/sage/cephmds2/script/find_bufferleaks.pl create mode 100755 branches/sage/cephmds2/script/find_lost_bdev_ops.pl create mode 100755 branches/sage/cephmds2/script/find_lost_commit.pl create mode 100755 branches/sage/cephmds2/script/find_lost_objecter.pl create mode 100755 branches/sage/cephmds2/script/find_pathpins.pl create mode 100755 branches/sage/cephmds2/script/find_requests.pl create mode 100755 branches/sage/cephmds2/script/find_waiters.pl create mode 100755 branches/sage/cephmds2/script/grepblock create mode 100644 branches/sage/cephmds2/script/merge_trace_rw.pl create mode 100755 branches/sage/cephmds2/script/profonly.pl create mode 100755 branches/sage/cephmds2/script/runset.pl create mode 100755 branches/sage/cephmds2/script/sum.pl create 
mode 100644 branches/sage/cephmds2/tcpfuse.cc create mode 100644 branches/sage/cephmds2/tcpsyn.cc create mode 100644 branches/sage/cephmds2/test/fakemds.cc create mode 100644 branches/sage/cephmds2/test/gprof-helper.c create mode 100644 branches/sage/cephmds2/test/makedirs.cc create mode 100644 branches/sage/cephmds2/test/mpitest.cc create mode 100644 branches/sage/cephmds2/test/mttest.cc create mode 100644 branches/sage/cephmds2/test/rushconfig create mode 100644 branches/sage/cephmds2/test/rushtest.cc create mode 100644 branches/sage/cephmds2/test/rushtest.cc~ create mode 100644 branches/sage/cephmds2/test/testbucket.cc create mode 100644 branches/sage/cephmds2/test/testbuffers.cc create mode 100644 branches/sage/cephmds2/test/testcrush.cc create mode 100644 branches/sage/cephmds2/test/testfilepath.cc create mode 100644 branches/sage/cephmds2/test/testmpi.cc create mode 100644 branches/sage/cephmds2/test/testnewbuffers.cc create mode 100644 branches/sage/cephmds2/test/testtree.cc create mode 100644 branches/sage/cephmds2/test/testxattr.cc diff --git a/branches/sage/cephmds2/COPYING b/branches/sage/cephmds2/COPYING new file mode 100644 index 0000000000000..5ab7695ab8cab --- /dev/null +++ b/branches/sage/cephmds2/COPYING @@ -0,0 +1,504 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. 
+ + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. 
However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". 
+ + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. 
+ + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms of the +ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
It is +safest to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the library, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! + + diff --git a/branches/sage/cephmds2/Makefile b/branches/sage/cephmds2/Makefile new file mode 100644 index 0000000000000..1681ac16698a8 --- /dev/null +++ b/branches/sage/cephmds2/Makefile @@ -0,0 +1,230 @@ + +# mpicxx must be on your path; on googoo, this means that +# /usr/local/mpich2-1.0.2/bin must be on your path. + +# For now, use g++ most of the time. +# When compiling MPI stuff, specify myfile.cc instead of myfile.o so that ${MPICC} is +# invoked instead of the generic .o rule (or it'll use g++).
+# This makes it less annoying to build on non-mpi hosts for dev work, and seems to +# behave just fine... change ${CC} back to mpicxx if you get paranoid. + +CC = g++ +CFLAGS = -g -Wall -I. -D_FILE_OFFSET_BITS=64 -DMPICH_IGNORE_CXX_SEEK -D_REENTRANT -D_THREAD_SAFE +LIBS = -lpthread + +#for normal mpich2 machines +MPICC = mpicxx +MPICFLAGS = ${CFLAGS} +MPILIBS = ${LIBS} + +#for LLNL boxes without mpicxx +#MPICC = g++ +#MPICFLAGS = ${CFLAGS} -I/usr/lib/mpi/include -L/usr/lib/mpi/mpi_gnu/lib +#MPILIBS = ${LIBS} -lelan -lmpi + +EBOFS_OBJS= \ + ebofs/BlockDevice.o\ + ebofs/BufferCache.o\ + ebofs/Ebofs.o\ + ebofs/Allocator.o + +MDS_OBJS= \ + mds/MDS.o\ + mds/journal.o\ + mds/Server.o\ + mds/MDCache.o\ + mds/Locker.o\ + mds/Migrator.o\ + mds/Renamer.o\ + mds/MDBalancer.o\ + mds/CDentry.o\ + mds/CDir.o\ + mds/CInode.o\ + mds/AnchorTable.o\ + mds/AnchorClient.o\ + mds/MDStore.o\ + mds/LogEvent.o\ + mds/IdAllocator.o\ + mds/MDLog.o + +OSD_OBJS= \ + osd/PG.o\ + osd/Ager.o\ + osd/FakeStore.o\ + osd/OSD.o + +OSDC_OBJS= \ + osdc/Objecter.o\ + osdc/ObjectCacher.o\ + osdc/Filer.o\ + osdc/Journaler.o + +MON_OBJS= \ + mon/Monitor.o\ + mon/OSDMonitor.o\ + mon/MDSMonitor.o\ + mon/Elector.o + +COMMON_OBJS= \ + msg/Messenger.o\ + msg/Message.o\ + msg/HostMonitor.o\ + common/Logger.o\ + common/Clock.o\ + common/Timer.o\ + config.o + + +CLIENT_OBJS= \ + client/FileCache.o\ + client/Client.o\ + client/SyntheticClient.o\ + client/Trace.o + +TCP_OBJS = \ + msg/TCPMessenger.o\ + msg/TCPDirectory.o + +TARGETS = cosd cfuse newsyn fakesyn + +SRCS=*.cc */*.cc *.h */*.h */*/*.h + +all: depend ${TARGETS} + +test: depend ${TEST_TARGETS} + +obfs: depend obfstest + + +# real bits +cmon: cmon.cc mon.o ebofs.o msg/NewerMessenger.o common.o + ${CC} ${CFLAGS} ${MPILIBS} $^ -o $@ + +cosd: cosd.cc osd.o ebofs.o msg/NewerMessenger.o common.o + ${CC} ${CFLAGS} ${MPILIBS} $^ -o $@ + +cmds: cmds.cc mds.o osdc.o msg/NewerMessenger.o common.o + ${CC} ${CFLAGS} ${MPILIBS} $^ -o $@ + +cfuse: cfuse.cc client.o 
osdc.o client/fuse.o msg/NewerMessenger.o common.o + ${CC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ + + +# misc +gprof-helper.so: test/gprof-helper.c + gcc -shared -fPIC test/gprof-helper.c -o gprof-helper.so -lpthread -ldl + + + +# fuse +fakefuse: fakefuse.cc mon.o mds.o client.o osd.o osdc.o ebofs.o client/fuse.o msg/FakeMessenger.cc common.o + ${CC} -pg ${CFLAGS} ${LIBS} -lfuse $^ -o $@ + +tcpfuse: tcpfuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o + ${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ + +mpifuse: mpifuse.cc mds.o client.o client/fuse.o ${TCP_OBJS} common.o + ${MPICC} ${CFLAGS} ${LIBS} -lfuse $^ -o $@ + + +# synthetic workload +fakesyn: fakesyn.o mon.o mds.o client.o osd.o ebofs.o osdc.o msg/FakeMessenger.o common.o + ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ + +tcpsyn: tcpsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o ${TCP_OBJS} common.o + ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ + +newsyn: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/NewerMessenger.o common.o + ${MPICC} -pg ${MPICFLAGS} ${MPILIBS} $^ -o $@ + +newsyn.nopg: newsyn.cc mon.o mds.o client.o osd.o ebofs.o osdc.o msg/NewerMessenger.o common.o + ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ + +# + obfs +fakesynobfs: fakesyn.cc mds.o client.o osd_obfs.o msg/FakeMessenger.o common.o + ${CC} -DUSE_OBFS ${CFLAGS} ${LIBS} $^ -o $@ + +tcpsynobfs: tcpsyn.cc mds.o client.o osd_obfs.o ${TCP_OBJS} common.o + ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ + + +# ebofs + +mkfs.ebofs: ebofs/mkfs.ebofs.cc config.cc common/Clock.o ebofs.o + ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ + +test.ebofs: ebofs/test.ebofs.cc config.cc common/Clock.o ebofs.o + ${CC} -pg ${CFLAGS} ${LIBS} $^ -o $@ + + + + +# libceph +libceph.o: client/ldceph.o client/Client.o ${TCP_OBJS} ${COMMON_OBJS} ${SYN_OBJS} ${OSDC_OBJS} + ld -i $^ -o $@ + +bench/mdtest/mdtest.o: bench/mdtest/mdtest.c + mpicc -c $^ -o $@ + +mdtest: bench/mdtest/mdtest.o + ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ + +mdtest.ceph: bench/mdtest/mdtest.o 
libceph.o + ${MPICC} ${MPICFLAGS} ${MPILIBS} $^ -o $@ + +# + +%.so: %.cc + ${CC} -shared -fPIC ${CFLAGS} $< -o $@ + + +testmpi: test/testmpi.cc msg/MPIMessenger.cc config.o common/Timer.o common/clock.o msg/Messenger.o msg/Dispatcher.o msg/error.o + ${MPICC} ${CFLAGS} ${LIBS} $^ -o $@ + + +clean: + rm -f *.o */*.o ${TARGETS} ${TEST_TARGETS} + +common.o: ${COMMON_OBJS} + ld -i -o $@ $^ + +ebofs.o: ${EBOFS_OBJS} + ld -i -o $@ $^ + +client.o: ${CLIENT_OBJS} + ld -i -o $@ $^ + +osd.o: ${OSD_OBJS} + ld -i -o $@ $^ + +osdc.o: ${OSDC_OBJS} + ld -i -o $@ $^ + +osd_obfs.o: osd/OBFSStore.o osd/OSD.cc osd/PG.o osd/ObjectStore.o osd/FakeStore.o + ${MPICC} -DUSE_OBFS ${MPICFLAGS} ${MPILIBS} $^ -o $@ ../uofs/uofs.a + +mds.o: ${MDS_OBJS} + ld -i -o $@ $^ + +mon.o: ${MON_OBJS} + ld -i -o $@ $^ + +%.o: %.cc + ${CC} ${CFLAGS} -c $< -o $@ + +%.po: %.cc + ${CC} -fPIC ${CFLAGS} -c $< -o $@ + +count: + cat ${SRCS} | wc -l + cat ${SRCS} | grep -c \; + +.depend: + touch .depend + +depend: + $(RM) .depend + makedepend -f- -- $(CFLAGS) -- $(SRCS) > .depend 2>/dev/null + +# now add a line to include the dependency list. +include .depend diff --git a/branches/sage/cephmds2/README b/branches/sage/cephmds2/README new file mode 100644 index 0000000000000..97008e49ffe75 --- /dev/null +++ b/branches/sage/cephmds2/README @@ -0,0 +1,53 @@ +pmds = parallel metadata server/system + +'test' is a standalone process that runs all clients, OSDs, and MDSs +in a single process with a basic message passer (FakeMessenger). +Useful for debugging. + +'pmds' uses MPI for communication. + +'import' builds a metadata store on ./osddata/ by taking find output +from stdin. Make sure find is run from the current directory so that +import can stat the files it's fed. The find root becomes the file +system root; feel free to use relative paths. + +This is all GPL, etc. + + +Getting started: + + 1- Comment out the LEAKTRACER= line in the Makefile if you don't have + LeakTracer installed (you probably don't).
+ + 2- make (test and import targets are testing ones; pmds uses MPI) + + 3- Build an OSD metadata store: + # mkdir osddata + # find /some/big/dir | ./import root + + 4- Single proc sim: + # ./test + or more likely, + # ./test > out + + 5- Change parameters in config.cc. + + 6- If you want stats logged, mkdir log (make sure you have enough + file handles; there's one open file per client). + + +Notes on pmds (MPI version): + + - On mcr/alc I have to + # setenv LD_LIBRARY_PATH /usr/lib/mpi/mpi_gnu/lib + for the GNU runtime MPI libs (otherwise you get the Intel ones, + which segfault). + + - Each MDS and OSD gets its own node. Clients are divided over + whatever is left over. So make sure you tell MPI to give you at + least num_mds+num_osd+1 processes (num_mds etc defined in + config.cc). + + + +2004.08.25 sage@newdream.net diff --git a/branches/sage/cephmds2/TODO b/branches/sage/cephmds2/TODO new file mode 100644 index 0000000000000..3c1e1f62b437c --- /dev/null +++ b/branches/sage/cephmds2/TODO @@ -0,0 +1,307 @@ + +- paxos for monitor +- lnet? +- crush + - xml import/export? + - crush tools + +== todo + +1- pipelining writes? +2- intervening reads? + +inode ops + utime -- no concurrency issues + chown/chmod -- should lock + truncate -- should lock + 1-> no. multiple process concurrency on a single inode is not important. + 2-> maybe... intervening stats? probably not important. + +directory ops. parent inode mtime, + dirent xlocks? + mknod + open+create + symlink + unlink + rmdir + rename + 1-> yes. but mtime updates are independent (mtime monotonically increasing), so it's easy. + 2-> yes. + +--> so, let's make file/hard wrlock exclusive.
+ +locks + namespace + path pins -- read lock + dentry xlock -- write lock + inode + hard/file rd start/stop -- read lock + hard/file wr start/stop -- write lock + + + + +- integrate revisions into ObjectCacher +- clean up oid.rev vs op.rev in osd+osdc + +rados paper todo +- better experiments +- flush log only in response to subsequent read or write? +- better behaving recovery +- justify use of splay. + - dynamic replication +- snapshots + +rados snapshots +- attr.crev is rev we were created in. +- oid.rev=0 is "live". defined for attr.crev <= rev. +- otherwise, defined for attr.crev <= rev < oid.rev (i.e. oid.rev is upper bound, non-inclusive.) + +- write|delete is tagged with op.rev + - if attr.crev < op.rev + - we clone to oid.rev=rev (clone keeps old crev) + - change live attr.crev=rev. + - apply update +- read is tagged with op.rev + - if 0, we read from 0 (if it exists). + - otherwise we choose object rev based on op.rev vs oid.rev, and then verifying attr.crev <= op.rev. + +- how to get usage feedback to monitor? + +- change messenger entity_inst_t + - no more rank! make it a uniquish nonce? + +- clean up mds caps release in exporter +- figure out client failure modes +- clean up messenger failure modes. +- add connection retry. + +mds recovery +- multiple passes? + 1- establish import/export map + ?- + 2- replay inode, dir, dentry updates +- single pass + - each event needs to embed inode for trace up to the import + - second stage will reconcile cached items with other active mds nodes + - cached items will be shared with the primary to repopulate its non-dirty cache + - query clients for their state too? + - mds must journal list of clients with whom we share state? + + +journaler +- should we pad with zeros to avoid splitting individual entries? + - make it a g_conf flag?
+ - have to fix reader to skip over zeros (either <4 bytes for size, or zeroed sizes) +- need to truncate at detected (valid) write_pos to clear out any other partial trailing writes + + +monitor +?- monitor user lib that handles resending, redirection of mon requests. +- elector +/- organize monitor store + +osdmon +- distribute +- recovery: store elector epochs with maps.. +- monitor needs to monitor some osds... +- monitor pgs, notify on out +- watch osd utilization; adjust overload in cluster map + +mdsmon + +osd/rados +- efficiently replicate clone() objects +- pg_num instead of pg_bits +- flag missing log entries on crash recovery --> WRNOOP? or WRLOST? +- consider implications of nvram writeahead logs +- fix heartbeat wrt new replication +- mark residual pgs obsolete ??? +- rdlocks +- optimize remove wrt recovery pushes +- pg_bit/pg_num changes +- report crashed pgs? + +messenger +/- share same tcp socket for sender and receiver +/- graceful connection teardown +- close idle connections +- generalize out a transport layer? + - eg reliable tcp for most things, connectionless unreliable datagrams for monitors? + - or, aggressive connection closing on monitors? or just max_connections and an lru? +- osds: forget idle client addrs + +objecter + +objectcacher +- ocacher caps transitions vs locks +- test read locks + +reliability +- heartbeat vs ping +- osdmonitor, filter + +ebofs +- verify proper behavior of conflicting/overlapping reads of clones +- test(fix) sync() +- combine inodes and/or cnodes into same blocks +- allow btree sets instead of maps +- eliminate nodepools +- nonblocking write on missing onodes? +- fix bug in node rotation on insert (and reenable) +- fix NEAR_LAST_FWD (?) +- journaling? in NVRAM? +- metadata in nvram? flash? 
+ + + +bugs/stability +- figure out weird 40ms latency with double log entries + + +general +- timer needs cancel sets, schedulers need to cancel outstanding events on shutdown +- well, just figure out general timer cancellation strategy that avoids races + - use updated Timer as a model? + + +remaining hard problems +- how to cope with file size changes and read/write sharing +- mds failure recovery (of course) + + +crush +- more efficient failure when all/too many osds are down +- allow forcefeed for more complicated rule structures. (e.g. make force_stack a list< set >) + + +mds +- distributed client management +- anchormgr + - 2pc + - independent journal + - distributed? +- link count management + - also 2pc +- chdir (directory opens!) +- rewrite logstream + - clean up + - be smart about rados ack vs reread + - log locking? root log object + - trimming, rotation + +- efficient stat for single writers +- lstat vs stat +- add FILE_CAP_EXTEND capability bit +- only share osdmap updates with clients holding capabilities +- delayed replica caps release... we need to set a timer event? (and cancel it when appropriate?) +- finish hard links! + - reclaim danglers from inode file on discover... + - fix rename wrt hard links +- interactive hash/unhash interface +- test hashed readdir +- make logstream.flush align itself to stripes + +- carefully define/document frozen wrt dir_auth vs hashing + + + +client +- mixed lazy and non-lazy io will clobber each others' caps in the buffer cache + +- test client caps with meta exports +- some heuristic behavior to consolidate caps to inode auth +- client will re-tx anything it needed to say upon rx of new mds notification (?) + + + + + + +MDS TODO +- fix hashed readdir: should (optionally) do a lock on dir namespace? 
+- fix hard links + - they mostly work, but they're fragile +- sync clients on stat + - will need to ditch 10s client metadata caching before this is useful + - implement truncate +- implement hashed directories +- statfs? +- rewrite journal + recovery +- figure out online failure recovery +- more distributed fh management? +- btree directories (for efficient large directories) +- consistency points/snapshots + +- fix MExportAck and others to use dir+dentry, not inode + (otherwise this all breaks with hard links.. altho it probably needs reworking already?) + + + + + +why qsync could be wrong (for very strict POSIX) : varying mds -> client message transit or processing times. +- mds -> 1,2 : qsync +- client1 writes at byte 100 +- client1 -> mds : qsync reply (size=100) +- client1 writes at byte 300 +- client1 -> client2 (outside channel) +- client2 writes at byte 200 +- client2 -> mds : qsync reply (size=200) +-> stat results in size 200, even though at no single point in time was the max size 500. +-> for correct result, need to _stop_ client writers while gathering metadata. + + +SAGE: + +- string table? + +- hard links + - fix MExportAck and others to use dir+dentry, not inode + (otherwise this all breaks with hard links.. altho it probably needs reworking already!) + +- do real permission checks? + + + +CLIENT TODO + +- statfs + + + + + +ISSUES + + +- discover + - soft: authority selectively replicates, or sets a 'forward' flag in reply + - hard: authority always replicates (eg. discover for export) + - forward flag (see soft) + - error flag (if file not found, etc.) + - [what was i talking about?] make sure waiters are properly triggered, either upon dir_rep update, or (empty!) discover reply + + + +DOCUMENT +- cache, distributed cache structure and invariants +- export process +- hash/unhash process + + +TEST +- hashing + - test hash/unhash operation + - hash+export: encode list of replicated dir inodes so they can be discovered before import is processed.
+ - test nauthitems (wrt hashing?) + + +IMPLEMENT + +- smarter balancing + - popularity calculation and management is inconsistent/wrong. + - does it work? + +- dump active config in run output somewhere + + diff --git a/branches/sage/cephmds2/cfuse.cc b/branches/sage/cephmds2/cfuse.cc new file mode 100644 index 0000000000000..b260c4bd3c3f8 --- /dev/null +++ b/branches/sage/cephmds2/cfuse.cc @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include +#include +#include +using namespace std; + +#include "config.h" + +#include "mds/MDS.h" +#include "osd/OSD.h" +#include "client/Client.h" +#include "client/fuse.h" + +#include "msg/NewMessenger.h" + +#include "common/Timer.h" + +#include + +#include +#include +#include + +int main(int argc, char **argv, char *envp[]) { + + //cerr << "cfuse starting " << myrank << "/" << world << endl; + vector args; + argv_to_vec(argc, argv, args); + parse_config_options(args); + + // args for fuse + vec_to_argv(args, argc, argv); + + // load monmap + bufferlist bl; + int fd = ::open(".ceph_monmap", O_RDONLY); + assert(fd >= 0); + struct stat st; + ::fstat(fd, &st); + bufferptr bp(st.st_size); + bl.append(bp); + ::read(fd, (void*)bl.c_str(), bl.length()); + ::close(fd); + + MonMap *monmap = new MonMap; + monmap->decode(bl); + + // start up network + rank.set_namer(monmap->get_inst(0).addr); + rank.start_rank(); + + // start client + Client *client = new Client(rank.register_entity(MSG_ADDR_CLIENT_NEW), monmap); + client->init(); + + // start up fuse + // use my argc, argv (make sure you pass a mount point!) 
+ cout << "mounting" << endl; + client->mount(); + + cerr << "starting fuse on pid " << getpid() << endl; + ceph_fuse_main(client, argc, argv); + cerr << "fuse finished on pid " << getpid() << endl; + + client->unmount(); + cout << "unmounted" << endl; + client->shutdown(); + + delete client; + + // wait for messenger to finish + rank.wait(); + + + return 0; +} + diff --git a/branches/sage/cephmds2/client/Client.cc b/branches/sage/cephmds2/client/Client.cc new file mode 100644 index 0000000000000..cb3cc2622bae4 --- /dev/null +++ b/branches/sage/cephmds2/client/Client.cc @@ -0,0 +1,2614 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +// unix-ey fs stuff +#include +#include +#include +#include +#include +#include + + +#include +using namespace std; + + +// ceph stuff +#include "Client.h" + + +#include "messages/MClientMount.h" +#include "messages/MClientMountAck.h" +#include "messages/MClientFileCaps.h" + +#include "messages/MGenericMessage.h" + +#include "messages/MMDSGetMap.h" +#include "messages/MMDSMap.h" + +#include "osdc/Filer.h" +#include "osdc/Objecter.h" +#include "osdc/ObjectCacher.h" + +#include "common/Cond.h" +#include "common/Mutex.h" +#include "common/Logger.h" + + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "client" << whoami << "." 
<< pthread_self() << " " + +#define tout if (g_conf.client_trace) cout << "trace: " + + +// static logger +LogType client_logtype; +Logger *client_logger = 0; + + + +class C_Client_CloseRelease : public Context { + Client *cl; + Inode *in; +public: + C_Client_CloseRelease(Client *c, Inode *i) : cl(c), in(i) {} + void finish(int) { + cl->close_release(in); + } +}; + +class C_Client_CloseSafe : public Context { + Client *cl; + Inode *in; +public: + C_Client_CloseSafe(Client *c, Inode *i) : cl(c), in(i) {} + void finish(int) { + cl->close_safe(in); + } +}; + + + + + + +// cons/des + +Client::Client(Messenger *m, MonMap *mm) +{ + // which client am i? + whoami = MSG_ADDR_NUM(m->get_myaddr()); + monmap = mm; + + mounted = false; + unmounting = false; + + last_tid = 0; + unsafe_sync_write = 0; + + mdsmap = 0; + + // + root = 0; + + set_cache_size(g_conf.client_cache_size); + + // file handles + free_fh_set.insert(10, 1<<30); + + // set up messengers + messenger = m; + messenger->set_dispatcher(this); + + // osd interfaces + osdmap = new OSDMap(); // initially blank.. see mount() + objecter = new Objecter(messenger, monmap, osdmap); + objectcacher = new ObjectCacher(objecter, client_lock); + filer = new Filer(objecter); +} + + +Client::~Client() +{ + if (messenger) { delete messenger; messenger = 0; } + if (filer) { delete filer; filer = 0; } + if (objectcacher) { delete objectcacher; objectcacher = 0; } + if (objecter) { delete objecter; objecter = 0; } + if (osdmap) { delete osdmap; osdmap = 0; } + + tear_down_cache(); +} + + +void Client::tear_down_cache() +{ + // fh's + for (hash_map::iterator it = fh_map.begin(); + it != fh_map.end(); + it++) { + Fh *fh = it->second; + dout(1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->inode.ino << endl; + put_inode(fh->inode); + delete fh; + } + fh_map.clear(); + + // caps! 
+ // *** FIXME *** + + // empty lru + lru.lru_set_max(0); + trim_cache(); + assert(lru.lru_get_size() == 0); + + // close root ino + assert(inode_map.size() <= 1); + if (root && inode_map.size() == 1) { + delete root; + root = 0; + inode_map.clear(); + } + + assert(inode_map.empty()); +} + + + +// debug crapola + +void Client::dump_inode(Inode *in, set& did) +{ + dout(1) << "dump_inode: inode " << in->ino() << " ref " << in->ref << " dir " << in->dir << endl; + + if (in->dir) { + dout(1) << " dir size " << in->dir->dentries.size() << endl; + //for (hash_map, eqstr>::iterator it = in->dir->dentries.begin(); + for (hash_map::iterator it = in->dir->dentries.begin(); + it != in->dir->dentries.end(); + it++) { + dout(1) << " dn " << it->first << " ref " << it->second->ref << endl; + dump_inode(it->second->inode, did); + } + } +} + +void Client::dump_cache() +{ + set did; + + if (root) dump_inode(root, did); + + for (hash_map::iterator it = inode_map.begin(); + it != inode_map.end(); + it++) { + if (did.count(it->second)) continue; + + dout(1) << "dump_cache: inode " << it->first + << " ref " << it->second->ref + << " dir " << it->second->dir << endl; + if (it->second->dir) { + dout(1) << " dir size " << it->second->dir->dentries.size() << endl; + } + } + +} + + +void Client::init() { + +} + +void Client::shutdown() { + dout(1) << "shutdown" << endl; + messenger->shutdown(); +} + + + + +// =================== +// metadata cache stuff + +void Client::trim_cache() +{ + unsigned last = 0; + while (lru.lru_get_size() != last) { + last = lru.lru_get_size(); + + if (lru.lru_get_size() <= lru.lru_get_max()) break; + + // trim! + Dentry *dn = (Dentry*)lru.lru_expire(); + if (!dn) break; // done + + //dout(10) << "trim_cache unlinking dn " << dn->name << " in dir " << hex << dn->dir->inode->inode.ino << endl; + unlink(dn); + } + + // hose root? 
+ if (lru.lru_get_size() == 0 && root && inode_map.size() == 1) { + delete root; + root = 0; + inode_map.clear(); + } +} + +/** insert_inode + * + * insert + link a single dentry + inode into the metadata cache. + */ +Inode* Client::insert_inode(Dir *dir, InodeStat *st, const string& dname) +{ + Dentry *dn = NULL; + if (dir->dentries.count(dname)) + dn = dir->dentries[dname]; + + dout(12) << "insert_inode " << dname << " ino " << st->inode.ino + << " size " << st->inode.size + << " mtime " << st->inode.mtime + << " hashed " << st->hashed + << endl; + + if (dn) { + if (dn->inode->inode.ino == st->inode.ino) { + touch_dn(dn); + dout(12) << " had dentry " << dname + << " with correct ino " << dn->inode->inode.ino + << endl; + } else { + dout(12) << " had dentry " << dname + << " with WRONG ino " << dn->inode->inode.ino + << endl; + unlink(dn); + dn = NULL; + } + } + + if (!dn) { + // have inode linked elsewhere? -> unlink and relink! + if (inode_map.count(st->inode.ino)) { + Inode *in = inode_map[st->inode.ino]; + assert(in); + + if (in->dn) { + dout(12) << " had ino " << in->inode.ino + << " linked at wrong position, unlinking" + << endl; + dn = relink(in->dn, dir, dname); + } else { + // link + dout(12) << " had ino " << in->inode.ino + << " unlinked, linking" << endl; + dn = link(dir, dname, in); + } + } + } + + if (!dn) { + Inode *in = new Inode(st->inode, objectcacher); + inode_map[st->inode.ino] = in; + dn = link(dir, dname, in); + dout(12) << " new dentry+node with ino " << st->inode.ino << endl; + } else { + // actually update info + dout(12) << " stat inode mask is " << st->inode.mask << endl; + dn->inode->inode = st->inode; + + // ...but don't clobber our mtime, size! 
+ if ((dn->inode->inode.mask & INODE_MASK_SIZE) == 0 && + dn->inode->file_wr_size > dn->inode->inode.size) + dn->inode->inode.size = dn->inode->file_wr_size; + if ((dn->inode->inode.mask & INODE_MASK_MTIME) == 0 && + dn->inode->file_wr_mtime > dn->inode->inode.mtime) + dn->inode->inode.mtime = dn->inode->file_wr_mtime; + } + + // OK, we found it! + assert(dn && dn->inode); + + // or do we have newer size/mtime from writing? + if (dn->inode->file_caps() & CAP_FILE_WR) { + if (dn->inode->file_wr_size > dn->inode->inode.size) + dn->inode->inode.size = dn->inode->file_wr_size; + if (dn->inode->file_wr_mtime > dn->inode->inode.mtime) + dn->inode->inode.mtime = dn->inode->file_wr_mtime; + } + + // symlink? + if ((dn->inode->inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) { + if (!dn->inode->symlink) + dn->inode->symlink = new string; + *(dn->inode->symlink) = st->symlink; + } + + return dn->inode; +} + +/** update_inode_dist + * + * update MDS location cache for a single inode + */ +void Client::update_inode_dist(Inode *in, InodeStat *st) +{ + // dir info + in->dir_auth = st->dir_auth; + in->dir_hashed = st->hashed; + in->dir_replicated = st->replicated; + + // dir replication + if (st->spec_defined) { + if (st->dist.empty() && !in->dir_contacts.empty()) + dout(9) << "lost dist spec for " << in->inode.ino + << " " << st->dist << endl; + if (!st->dist.empty() && in->dir_contacts.empty()) + dout(9) << "got dist spec for " << in->inode.ino + << " " << st->dist << endl; + in->dir_contacts = st->dist; + } +} + + +/** insert_trace + * + * insert a trace from a MDS reply into the cache. 
+ */ +Inode* Client::insert_trace(MClientReply *reply) +{ + Inode *cur = root; + time_t now = time(NULL); + + dout(10) << "insert_trace got " << reply->get_trace_in().size() << " inodes" << endl; + + list::const_iterator pdn = reply->get_trace_dn().begin(); + + for (list::const_iterator pin = reply->get_trace_in().begin(); + pin != reply->get_trace_in().end(); + ++pin) { + + if (pin == reply->get_trace_in().begin()) { + // root + dout(10) << "insert_trace root" << endl; + if (!root) { + // create + cur = root = new Inode((*pin)->inode, objectcacher); + inode_map[root->inode.ino] = root; + } + } else { + // not root. + dout(10) << "insert_trace dn " << *pdn << " ino " << (*pin)->inode.ino << endl; + Dir *dir = cur->open_dir(); + cur = this->insert_inode(dir, *pin, *pdn); + ++pdn; + + // move to top of lru! + if (cur->dn) + lru.lru_touch(cur->dn); + } + + // update dist info + update_inode_dist(cur, *pin); + + // set cache ttl + if (g_conf.client_cache_stat_ttl) + cur->valid_until = now + g_conf.client_cache_stat_ttl; + } + + return cur; +} + + + + +Dentry *Client::lookup(filepath& path) +{ + dout(14) << "lookup " << path << endl; + + Inode *cur = root; + if (!cur) return NULL; + + Dentry *dn = 0; + for (unsigned i=0; iinode << " valid_until " << dn->inode->valid_until << endl; + } else { + dout(14) << " dentry " << path[i] << " dne" << endl; + return NULL; + } + cur = dn->inode; + assert(cur); + } else { + return NULL; // not a dir + } + } + + if (dn) { + dout(11) << "lookup '" << path << "' found " << dn->name << " inode " << dn->inode->inode.ino << " valid_until " << dn->inode->valid_until<< endl; + } + + return dn; +} + +// ------- + +MClientReply *Client::make_request(MClientRequest *req, + bool auth_best, + int use_mds) // this param is icky, debug weirdness! +{ + // assign a unique tid + req->set_tid(++last_tid); + + // find deepest known prefix + Inode *diri = root; // the deepest known containing dir + Inode *item = 0; // the actual item... 
if we know it + int missing_dn = -1; // which dn we miss on (if we miss) + + unsigned depth = req->get_filepath().depth(); + for (unsigned i=0; iinode.mode & INODE_MODE_DIR && diri->dir) { + Dir *dir = diri->dir; + + // do we have the next dentry? + if (dir->dentries.count( req->get_filepath()[i] ) == 0) { + missing_dn = i; // no. + break; + } + + dout(7) << " have path seg " << i << " on " << diri->dir_auth << " ino " << diri->inode.ino << " " << req->get_filepath()[i] << endl; + + if (i == depth-1) { // last one! + item = dir->dentries[ req->get_filepath()[i] ]->inode; + break; + } + + // continue.. + diri = dir->dentries[ req->get_filepath()[i] ]->inode; + assert(diri); + } else { + missing_dn = i; + break; + } + } + + // choose an mds + int mds = 0; + if (diri) { + if (auth_best) { + // pick the actual auth (as best we can) + if (item) { + mds = item->authority(mdsmap); + } else if (diri->dir_hashed && missing_dn >= 0) { + mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), + mdsmap); + } else { + mds = diri->authority(mdsmap); + } + } else { + // balance our traffic! + if (diri->dir_hashed && missing_dn >= 0) + mds = diri->dentry_authority(req->get_filepath()[missing_dn].c_str(), + mdsmap); + else + mds = diri->pick_replica(mdsmap); + } + } else { + // no root info, pick a random MDS + mds = rand() % mdsmap->get_num_mds(); + } + dout(20) << "mds is " << mds << endl; + + // force use of a particular mds? 
+ if (use_mds >= 0) mds = use_mds; + + + // time the call + utime_t start = g_clock.now(); + + bool nojournal = false; + int op = req->get_op(); + if (op == MDS_OP_STAT || + op == MDS_OP_LSTAT || + op == MDS_OP_READDIR || + op == MDS_OP_OPEN || + op == MDS_OP_RELEASE) + nojournal = true; + + MClientReply *reply = sendrecv(req, mds); + + if (client_logger) { + utime_t lat = g_clock.now(); + lat -= start; + dout(20) << "lat " << lat << endl; + client_logger->finc("lsum",(double)lat); + client_logger->inc("lnum"); + + if (nojournal) { + client_logger->finc("lrsum",(double)lat); + client_logger->inc("lrnum"); + } else { + client_logger->finc("lwsum",(double)lat); + client_logger->inc("lwnum"); + } + + if (op == MDS_OP_STAT) { + client_logger->finc("lstatsum",(double)lat); + client_logger->inc("lstatnum"); + } + else if (op == MDS_OP_READDIR) { + client_logger->finc("ldirsum",(double)lat); + client_logger->inc("ldirnum"); + } + + } + + return reply; +} + + +MClientReply* Client::sendrecv(MClientRequest *req, int mds) +{ + // NEW way. + Cond cond; + tid_t tid = req->get_tid(); + mds_rpc_cond[tid] = &cond; + + messenger->send_message(req, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), MDS_PORT_SERVER); + + // wait + while (mds_rpc_reply.count(tid) == 0) { + dout(20) << "sendrecv awaiting reply kick on " << &cond << endl; + cond.Wait(client_lock); + } + + // got it! + MClientReply *reply = mds_rpc_reply[tid]; + + // kick dispatcher (we've got it!) + assert(mds_rpc_dispatch_cond.count(tid)); + mds_rpc_dispatch_cond[tid]->Signal(); + dout(20) << "sendrecv kickback on tid " << tid << " " << mds_rpc_dispatch_cond[tid] << endl; + + // clean up. 
+ mds_rpc_cond.erase(tid); + mds_rpc_reply.erase(tid); + + return reply; +} + +void Client::handle_client_reply(MClientReply *reply) +{ + tid_t tid = reply->get_tid(); + + // store reply + mds_rpc_reply[tid] = reply; + + // wake up waiter + assert(mds_rpc_cond.count(tid)); + dout(20) << "handle_client_reply kicking caller on " << mds_rpc_cond[tid] << endl; + mds_rpc_cond[tid]->Signal(); + + // wake for kick back + assert(mds_rpc_dispatch_cond.count(tid) == 0); + Cond cond; + mds_rpc_dispatch_cond[tid] = &cond; + while (mds_rpc_cond.count(tid)) { + dout(20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << endl; + cond.Wait(client_lock); + } + + // ok, clean up! + mds_rpc_dispatch_cond.erase(tid); +} + + +// ------------------------ +// incoming messages + +void Client::dispatch(Message *m) +{ + client_lock.Lock(); + + switch (m->get_type()) { + // osd + case MSG_OSD_OPREPLY: + objecter->handle_osd_op_reply((MOSDOpReply*)m); + break; + + case MSG_OSD_MAP: + objecter->handle_osd_map((class MOSDMap*)m); + break; + + // client + case MSG_MDS_MAP: + handle_mds_map((MMDSMap*)m); + break; + + case MSG_CLIENT_REPLY: + handle_client_reply((MClientReply*)m); + break; + + case MSG_CLIENT_FILECAPS: + handle_file_caps((MClientFileCaps*)m); + break; + + case MSG_CLIENT_MOUNTACK: + handle_mount_ack((MClientMountAck*)m); + break; + case MSG_CLIENT_UNMOUNT: + handle_unmount_ack(m); + break; + + + default: + cout << "dispatch doesn't recognize message type " << m->get_type() << endl; + assert(0); // fail loudly + break; + } + + // unmounting? 
+ if (unmounting) { + dout(10) << "unmounting: trim pass, size was " << lru.lru_get_size() + << "+" << inode_map.size() << endl; + trim_cache(); + if (lru.lru_get_size() == 0 && inode_map.empty()) { + dout(10) << "unmounting: trim pass, cache now empty, waking unmount()" << endl; + mount_cond.Signal(); + } else { + dout(10) << "unmounting: trim pass, size still " << lru.lru_get_size() + << "+" << inode_map.size() << endl; + dump_cache(); + } + } + + client_lock.Unlock(); +} + +void Client::handle_mount_ack(MClientMountAck *m) +{ + // mdsmap! + if (!mdsmap) mdsmap = new MDSMap; + mdsmap->decode(m->get_mds_map_state()); + + // we got osdmap! + osdmap->decode(m->get_osd_map_state()); + + dout(2) << "mounted" << endl; + mounted = true; + mount_cond.Signal(); + + delete m; +} + + +void Client::handle_unmount_ack(Message* m) +{ + dout(1) << "got unmount ack" << endl; + mounted = false; + mount_cond.Signal(); + delete m; +} + + +void Client::handle_mds_map(MMDSMap* m) +{ + if (mdsmap == 0) + mdsmap = new MDSMap; + + map::reverse_iterator p = m->maps.rbegin(); + + dout(1) << "handle_mds_map epoch " << p->first << endl; + mdsmap->decode(p->second); + + delete m; + + mount_cond.Signal(); // mount might be waiting for this. +} + + +/**** + * caps + */ + + +class C_Client_ImplementedCaps : public Context { + Client *client; + MClientFileCaps *msg; + Inode *in; +public: + C_Client_ImplementedCaps(Client *c, MClientFileCaps *m, Inode *i) : client(c), msg(m), in(i) {} + void finish(int r) { + client->implemented_caps(msg,in); + } +}; + +/** handle_file_caps + * handle caps update from mds. including mds to mds caps transitions. + * do not block. + */ +void Client::handle_file_caps(MClientFileCaps *m) +{ + int mds = MSG_ADDR_NUM(m->get_source()); + Inode *in = 0; + if (inode_map.count(m->get_ino())) in = inode_map[ m->get_ino() ]; + + m->clear_payload(); // for if/when we send back to MDS + + // reap? 
+ if (m->get_special() == MClientFileCaps::FILECAP_REAP) { + int other = m->get_mds(); + + if (in && in->stale_caps.count(other)) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " reap on mds" << other << endl; + + // fresh from new mds? + if (!in->caps.count(mds)) { + if (in->caps.empty()) in->get(); + in->caps[mds].seq = m->get_seq(); + in->caps[mds].caps = m->get_caps(); + } + + assert(in->stale_caps.count(other)); + in->stale_caps.erase(other); + if (in->stale_caps.empty()) put_inode(in); // note: this will never delete *in + + // fall-thru! + } else { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " premature (!!) reap on mds" << other << endl; + // delay! + cap_reap_queue[in->ino()][other] = m; + return; + } + } + + assert(in); + + // stale? + if (m->get_special() == MClientFileCaps::FILECAP_STALE) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " seq " << m->get_seq() << " from mds" << mds << " now stale" << endl; + + // move to stale list + assert(in->caps.count(mds)); + if (in->stale_caps.empty()) in->get(); + in->stale_caps[mds] = in->caps[mds]; + + assert(in->caps.count(mds)); + in->caps.erase(mds); + if (in->caps.empty()) in->put(); + + // delayed reap? + if (cap_reap_queue.count(in->ino()) && + cap_reap_queue[in->ino()].count(mds)) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " delayed reap on mds" << m->get_mds() << endl; + + // process delayed reap + handle_file_caps( cap_reap_queue[in->ino()][mds] ); + + cap_reap_queue[in->ino()].erase(mds); + if (cap_reap_queue[in->ino()].empty()) + cap_reap_queue.erase(in->ino()); + } + return; + } + + // release? 
+ if (m->get_special() == MClientFileCaps::FILECAP_RELEASE) { + dout(5) << "handle_file_caps on ino " << m->get_ino() << " from mds" << mds << " release" << endl; + assert(in->caps.count(mds)); + in->caps.erase(mds); + for (map::iterator p = in->caps.begin(); + p != in->caps.end(); + p++) + dout(20) << " left cap " << p->first << " " + << cap_string(p->second.caps) << " " + << p->second.seq << endl; + for (map::iterator p = in->stale_caps.begin(); + p != in->stale_caps.end(); + p++) + dout(20) << " left stale cap " << p->first << " " + << cap_string(p->second.caps) << " " + << p->second.seq << endl; + + if (in->caps.empty()) { + //dout(0) << "did put_inode" << endl; + put_inode(in); + } else { + //dout(0) << "didn't put_inode" << endl; + } + + return; + } + + + // don't want? + if (in->file_caps_wanted() == 0) { + dout(5) << "handle_file_caps on ino " << m->get_ino() + << " seq " << m->get_seq() + << " " << cap_string(m->get_caps()) + << ", which we don't want caps for, releasing." << endl; + m->set_caps(0); + m->set_wanted(0); + entity_inst_t srcinst = m->get_source_inst(); + messenger->send_message(m, m->get_source(), srcinst, m->get_source_port()); + return; + } + + assert(in->caps.count(mds)); + + // update per-mds caps + const int old_caps = in->caps[mds].caps; + const int new_caps = m->get_caps(); + in->caps[mds].caps = new_caps; + in->caps[mds].seq = m->get_seq(); + dout(5) << "handle_file_caps on in " << m->get_ino() + << " mds" << mds << " seq " << m->get_seq() + << " caps now " << cap_string(new_caps) + << " was " << cap_string(old_caps) << endl; + + // did file size decrease? + if ((old_caps & new_caps & CAP_FILE_RDCACHE) && + in->inode.size > m->get_inode().size) { + dout(10) << "**** file size decreased from " << in->inode.size << " to " << m->get_inode().size << " FIXME" << endl; + // must have been a truncate() by someone. + // trim the buffer cache + // ***** fixme write me **** + + in->file_wr_size = m->get_inode().size; //?? 
+ } + + // update inode + in->inode = m->get_inode(); // might have updated size... FIXME this is overkill! + + // preserve our (possibly newer) file size, mtime + if (in->file_wr_size > in->inode.size) + m->get_inode().size = in->inode.size = in->file_wr_size; + if (in->file_wr_mtime > in->inode.mtime) + m->get_inode().mtime = in->inode.mtime = in->file_wr_mtime; + + if (g_conf.client_oc) { + // caching on, use FileCache. + Context *onimplement = 0; + if (old_caps & ~new_caps) { // this mds is revoking caps + if (in->fc.get_caps() & ~(in->file_caps())) // net revocation + onimplement = new C_Client_ImplementedCaps(this, m, in); + else { + implemented_caps(m, in); // ack now. + } + } + in->fc.set_caps(new_caps, onimplement); + + } else { + // caching off. + + // wake up waiters? + if (new_caps & CAP_FILE_RD) { + for (list::iterator it = in->waitfor_read.begin(); + it != in->waitfor_read.end(); + it++) { + dout(5) << "signaling read waiter " << *it << endl; + (*it)->Signal(); + } + in->waitfor_read.clear(); + } + if (new_caps & CAP_FILE_WR) { + for (list::iterator it = in->waitfor_write.begin(); + it != in->waitfor_write.end(); + it++) { + dout(5) << "signaling write waiter " << *it << endl; + (*it)->Signal(); + } + in->waitfor_write.clear(); + } + if (new_caps & CAP_FILE_LAZYIO) { + for (list::iterator it = in->waitfor_lazy.begin(); + it != in->waitfor_lazy.end(); + it++) { + dout(5) << "signaling lazy waiter " << *it << endl; + (*it)->Signal(); + } + in->waitfor_lazy.clear(); + } + + // ack? 
+ if (old_caps & ~new_caps) { + if (in->sync_writes) { + // wait for sync writes to finish + dout(5) << "sync writes in progress, will ack on finish" << endl; + in->waitfor_no_write.push_back(new C_Client_ImplementedCaps(this, m, in)); + } else { + // ok now + implemented_caps(m, in); + } + } else { + // discard + delete m; + } + } +} + +void Client::implemented_caps(MClientFileCaps *m, Inode *in) +{ + dout(5) << "implemented_caps " << cap_string(m->get_caps()) + << ", acking to " << m->get_source() << endl; + + if (in->file_caps() == 0) { + in->file_wr_mtime = 0; + in->file_wr_size = 0; + } + + messenger->send_message(m, m->get_source(), m->get_source_port()); +} + + +void Client::release_caps(Inode *in, + int retain) +{ + dout(5) << "releasing caps on ino " << in->inode.ino << dec + << " had " << cap_string(in->file_caps()) + << " retaining " << cap_string(retain) + << endl; + + for (map::iterator it = in->caps.begin(); + it != in->caps.end(); + it++) { + //if (it->second.caps & ~retain) { + if (1) { + // release (some of?) these caps + it->second.caps = retain & it->second.caps; + // note: tell mds _full_ wanted; it'll filter/behave based on what it is allowed to do + MClientFileCaps *m = new MClientFileCaps(in->inode, + it->second.seq, + it->second.caps, + in->file_caps_wanted()); + messenger->send_message(m, MSG_ADDR_MDS(it->first), mdsmap->get_inst(it->first), MDS_PORT_LOCKER); + } + } + + if (in->file_caps() == 0) { + in->file_wr_mtime = 0; + in->file_wr_size = 0; + } +} + +void Client::update_caps_wanted(Inode *in) +{ + dout(5) << "updating caps wanted on ino " << in->inode.ino + << " to " << cap_string(in->file_caps_wanted()) + << endl; + + // FIXME: pick a single mds and let the others off the hook.. 
+ for (map::iterator it = in->caps.begin(); + it != in->caps.end(); + it++) { + MClientFileCaps *m = new MClientFileCaps(in->inode, + it->second.seq, + it->second.caps, + in->file_caps_wanted()); + messenger->send_message(m, + MSG_ADDR_MDS(it->first), mdsmap->get_inst(it->first), MDS_PORT_LOCKER); + } +} + + + +// ------------------- +// fs ops + +int Client::mount(int mkfs) +{ + client_lock.Lock(); + + assert(!mounted); // caller is confused? + + // FIXME mds map update race with mount. + + dout(2) << "fetching latest mds map" << endl; + if (mdsmap) + delete mdsmap; + int mon = monmap->pick_mon(); + messenger->send_message(new MMDSGetMap(), + MSG_ADDR_MON(mon), monmap->get_inst(mon)); + + while (!mdsmap) + mount_cond.Wait(client_lock); + + dout(2) << "mounting" << endl; + MClientMount *m = new MClientMount(); + if (mkfs) m->set_mkfs(mkfs); + + messenger->send_message(m, MSG_ADDR_MDS(0), mdsmap->get_inst(0), MDS_PORT_SERVER); + + while (!mounted) + mount_cond.Wait(client_lock); + + client_lock.Unlock(); + + /* + dout(3) << "op: // client trace data structs" << endl; + dout(3) << "op: struct stat st;" << endl; + dout(3) << "op: struct utimbuf utim;" << endl; + dout(3) << "op: int readlinkbuf_len = 1000;" << endl; + dout(3) << "op: char readlinkbuf[readlinkbuf_len];" << endl; + dout(3) << "op: map dir_contents;" << endl; + dout(3) << "op: map open_files;" << endl; + dout(3) << "op: fh_t fh;" << endl; + */ + return 0; +} + +int Client::unmount() +{ + client_lock.Lock(); + + assert(mounted); // caller is confused? + + dout(2) << "unmounting" << endl; + unmounting = true; + + // NOTE: i'm assuming all caches are already flushing (because all files are closed). 
+ assert(fh_map.empty()); + + // empty lru cache + lru.lru_set_max(0); + trim_cache(); + + if (g_conf.client_oc) { + // release any/all caps + for (hash_map::iterator p = inode_map.begin(); + p != inode_map.end(); + p++) { + Inode *in = p->second; + if (!in->caps.empty()) { + in->fc.release_clean(); + if (in->fc.is_dirty()) { + dout(10) << "unmount residual caps on " << in->ino() << ", flushing" << endl; + in->fc.empty(new C_Client_CloseRelease(this, in)); + } else { + dout(10) << "unmount residual caps on " << in->ino() << ", releasing" << endl; + release_caps(in); + } + } + } + } + + while (lru.lru_get_size() > 0 || + !inode_map.empty()) { + dout(2) << "cache still has " << lru.lru_get_size() + << "+" << inode_map.size() << " items" + << ", waiting (presumably for safe or for caps to be released?)" + << endl; + dump_cache(); + mount_cond.Wait(client_lock); + } + assert(lru.lru_get_size() == 0); + assert(inode_map.empty()); + + // unsafe writes + if (!g_conf.client_oc) { + while (unsafe_sync_write > 0) { + dout(0) << unsafe_sync_write << " unsafe_sync_writes, waiting" + << endl; + mount_cond.Wait(client_lock); + } + } + + // send unmount! 
+ Message *req = new MGenericMessage(MSG_CLIENT_UNMOUNT); + messenger->send_message(req, MSG_ADDR_MDS(0), mdsmap->get_inst(0), MDS_PORT_SERVER); + + while (mounted) + mount_cond.Wait(client_lock); + + dout(2) << "unmounted" << endl; + + client_lock.Unlock(); + return 0; +} + + + +// namespace ops + +int Client::link(const char *existing, const char *newname) +{ + client_lock.Lock(); + dout(3) << "op: client->link(\"" << existing << "\", \"" << newname << "\");" << endl; + tout << "link" << endl; + tout << existing << endl; + tout << newname << endl; + + + // main path arg is new link name + // sarg is target (existing file) + + + MClientRequest *req = new MClientRequest(MDS_OP_LINK, whoami); + req->set_path(newname); + req->set_sarg(existing); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + + insert_trace(reply); + delete reply; + dout(10) << "link result is " << res << endl; + + trim_cache(); + client_lock.Unlock(); + return res; +} + + +int Client::unlink(const char *relpath) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->unlink\(\"" << path << "\");" << endl; + tout << "unlink" << endl; + tout << path << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_UNLINK, whoami); + req->set_path(path); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + //FIXME enforce caller uid rights? 
+ + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + if (res == 0) { + // remove from local cache + filepath fp(path); + Dentry *dn = lookup(fp); + if (dn) { + assert(dn->inode); + unlink(dn); + } + } + insert_trace(reply); + delete reply; + dout(10) << "unlink result is " << res << endl; + + trim_cache(); + client_lock.Unlock(); + return res; +} + +int Client::rename(const char *relfrom, const char *relto) +{ + client_lock.Lock(); + + string absfrom; + mkabspath(relfrom, absfrom); + const char *from = absfrom.c_str(); + string absto; + mkabspath(relto, absto); + const char *to = absto.c_str(); + + dout(3) << "op: client->rename(\"" << from << "\", \"" << to << "\");" << endl; + tout << "rename" << endl; + tout << from << endl; + tout << to << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_RENAME, whoami); + req->set_path(from); + req->set_sarg(to); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + //FIXME enforce caller uid rights? + + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + insert_trace(reply); + delete reply; + dout(10) << "rename result is " << res << endl; + + trim_cache(); + client_lock.Unlock(); + return res; +} + +// dirs + +int Client::mkdir(const char *relpath, mode_t mode) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->mkdir(\"" << path << "\", " << mode << ");" << endl; + tout << "mkdir" << endl; + tout << path << endl; + tout << mode << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_MKDIR, whoami); + req->set_path(path); + req->set_iarg( (int)mode ); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + //FIXME enforce caller uid rights? 
+ + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + insert_trace(reply); + delete reply; + dout(10) << "mkdir result is " << res << endl; + + trim_cache(); + client_lock.Unlock(); + return res; +} + +int Client::rmdir(const char *relpath) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->rmdir(\"" << path << "\");" << endl; + tout << "rmdir" << endl; + tout << path << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_RMDIR, whoami); + req->set_path(path); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + //FIXME enforce caller uid rights? + + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + if (res == 0) { + // remove from local cache + filepath fp(path); + Dentry *dn = lookup(fp); + if (dn) { + if (dn->inode->dir && dn->inode->dir->is_empty()) + close_dir(dn->inode->dir); // FIXME: maybe i shoudl proactively hose the whole subtree from cache? 
+ unlink(dn); + } + } + insert_trace(reply); + delete reply; + dout(10) << "rmdir result is " << res << endl; + + trim_cache(); + client_lock.Unlock(); + return res; +} + +// symlinks + +int Client::symlink(const char *reltarget, const char *rellink) +{ + client_lock.Lock(); + + string abstarget; + mkabspath(reltarget, abstarget); + const char *target = abstarget.c_str(); + string abslink; + mkabspath(rellink, abslink); + const char *link = abslink.c_str(); + + dout(3) << "op: client->symlink(\"" << target << "\", \"" << link << "\");" << endl; + tout << "symlink" << endl; + tout << target << endl; + tout << link << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_SYMLINK, whoami); + req->set_path(link); + req->set_sarg(target); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + //FIXME enforce caller uid rights? + + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + insert_trace(reply); //FIXME assuming trace of link, not of target + delete reply; + dout(10) << "symlink result is " << res << endl; + + trim_cache(); + client_lock.Unlock(); + return res; +} + +int Client::readlink(const char *relpath, char *buf, off_t size) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->readlink(\"" << path << "\", readlinkbuf, readlinkbuf_len);" << endl; + tout << "readlink" << endl; + tout << path << endl; + client_lock.Unlock(); + + // stat first (FIXME, PERF access cache directly) **** + struct stat stbuf; + int r = this->lstat(path, &stbuf); + if (r != 0) return r; + + client_lock.Lock(); + + // pull symlink content from cache + Inode *in = inode_map[stbuf.st_ino]; + assert(in); // i just did a stat + + // copy into buf (at most size bytes) + unsigned res = in->symlink->length(); + if (res > size) res = size; + memcpy(buf, in->symlink->c_str(), res); + + trim_cache(); + 
client_lock.Unlock(); + return res; // return length in bytes (to mimic the system call) +} + + + +// inode stuff + +int Client::_lstat(const char *path, int mask, Inode **in) +{ + MClientRequest *req = 0; + filepath fpath(path); + + // check whether cache content is fresh enough + int res = 0; + + Dentry *dn = lookup(fpath); + inode_t inode; + time_t now = time(NULL); + if (dn && + now <= dn->inode->valid_until && + ((dn->inode->inode.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT)) { + inode = dn->inode->inode; + dout(10) << "lstat cache hit w/ sufficient inode.mask, valid until " << dn->inode->valid_until << endl; + + if (g_conf.client_cache_stat_ttl == 0) + dn->inode->valid_until = 0; // only one stat allowed after each readdir + + *in = dn->inode; + } else { + // FIXME where does FUSE maintain user information + //struct fuse_context *fc = fuse_get_context(); + //req->set_caller_uid(fc->uid); + //req->set_caller_gid(fc->gid); + + req = new MClientRequest(MDS_OP_LSTAT, whoami); + req->set_iarg(mask); + req->set_path(fpath); + + MClientReply *reply = make_request(req); + res = reply->get_result(); + dout(10) << "lstat res is " << res << endl; + if (res == 0) { + //Transfer information from reply to stbuf + inode = reply->get_inode(); + + //Update metadata cache + *in = insert_trace(reply); + } + + delete reply; + + if (res != 0) + *in = 0; // not a success. + } + + return res; +} + + +void Client::fill_stat(inode_t& inode, struct stat *st) +{ + memset(st, 0, sizeof(struct stat)); + st->st_ino = inode.ino; + st->st_mode = inode.mode; + st->st_nlink = inode.nlink; + st->st_uid = inode.uid; + st->st_gid = inode.gid; + st->st_ctime = inode.ctime; + st->st_atime = inode.atime; + st->st_mtime = inode.mtime; + st->st_size = inode.size; + st->st_blocks = inode.size ? 
((inode.size - 1) / 4096 + 1):0; + st->st_blksize = 4096; +} + +void Client::fill_statlite(inode_t& inode, struct statlite *st) +{ + memset(st, 0, sizeof(struct stat)); + st->st_ino = inode.ino; + st->st_mode = inode.mode; + st->st_nlink = inode.nlink; + st->st_uid = inode.uid; + st->st_gid = inode.gid; + st->st_ctime = inode.ctime; + st->st_atime = inode.atime; + st->st_mtime = inode.mtime; + st->st_size = inode.size; + st->st_blocks = inode.size ? ((inode.size - 1) / 4096 + 1):0; + st->st_blksize = 4096; + + /* + S_REQUIREBLKSIZE(st->st_litemask); + if (inode.mask & INODE_MASK_BASE) S_REQUIRECTIME(st->st_litemask); + if (inode.mask & INODE_MASK_SIZE) { + S_REQUIRESIZE(st->st_litemask); + S_REQUIREBLOCKS(st->st_litemask); + } + if (inode.mask & INODE_MASK_MTIME) S_REQUIREMTIME(st->st_litemask); + if (inode.mask & INODE_MASK_ATIME) S_REQUIREATIME(st->st_litemask); + */ +} + + +int Client::lstat(const char *relpath, struct stat *stbuf) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->lstat(\"" << path << "\", &st);" << endl; + tout << "lstat" << endl; + tout << path << endl; + + Inode *in = 0; + + int res = _lstat(path, INODE_MASK_ALL_STAT, &in); + if (res == 0) { + assert(in); + fill_stat(in->inode,stbuf); + dout(10) << "stat sez size = " << in->inode.size << " ino = " << stbuf->st_ino << endl; + } + + trim_cache(); + client_lock.Unlock(); + return res; +} + + +int Client::lstatlite(const char *relpath, struct statlite *stl) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->lstatlite(\"" << path << "\", &st);" << endl; + tout << "lstatlite" << endl; + tout << path << endl; + + // make mask + int mask = INODE_MASK_BASE | INODE_MASK_PERM; + if (S_ISVALIDSIZE(stl->st_litemask) || + S_ISVALIDBLOCKS(stl->st_litemask)) mask |= INODE_MASK_SIZE; + if (S_ISVALIDMTIME(stl->st_litemask)) mask 
|= INODE_MASK_MTIME; + if (S_ISVALIDATIME(stl->st_litemask)) mask |= INODE_MASK_ATIME; + + Inode *in = 0; + int res = _lstat(path, mask, &in); + + if (res == 0) { + fill_statlite(in->inode,stl); + dout(10) << "stat sez size = " << in->inode.size << " ino = " << in->inode.ino << endl; + } + + trim_cache(); + client_lock.Unlock(); + return res; +} + + + +int Client::chmod(const char *relpath, mode_t mode) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->chmod(\"" << path << "\", " << mode << ");" << endl; + tout << "chmod" << endl; + tout << path << endl; + tout << mode << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_CHMOD, whoami); + req->set_path(path); + req->set_iarg( (int)mode ); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + insert_trace(reply); + delete reply; + dout(10) << "chmod result is " << res << endl; + + trim_cache(); + client_lock.Unlock(); + return res; +} + +int Client::chown(const char *relpath, uid_t uid, gid_t gid) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->chown(\"" << path << "\", " << uid << ", " << gid << ");" << endl; + tout << "chown" << endl; + tout << path << endl; + tout << uid << endl; + tout << gid << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_CHOWN, whoami); + req->set_path(path); + req->set_iarg( (int)uid ); + req->set_iarg2( (int)gid ); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + //FIXME enforce caller uid rights? 
+ + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + insert_trace(reply); + delete reply; + dout(10) << "chown result is " << res << endl; + + trim_cache(); + client_lock.Unlock(); + return res; +} + +int Client::utime(const char *relpath, struct utimbuf *buf) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: utim.actime = " << buf->actime << "; utim.modtime = " << buf->modtime << ";" << endl; + dout(3) << "op: client->utime(\"" << path << "\", &utim);" << endl; + tout << "utime" << endl; + tout << path << endl; + tout << buf->actime << endl; + tout << buf->modtime << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_UTIME, whoami); + req->set_path(path); + req->set_targ( buf->modtime ); + req->set_targ2( buf->actime ); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + //FIXME enforce caller uid rights? + + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + insert_trace(reply); + delete reply; + dout(10) << "utime result is " << res << endl; + + trim_cache(); + client_lock.Unlock(); + return res; +} + + + +int Client::mknod(const char *relpath, mode_t mode) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->mknod(\"" << path << "\", " << mode << ");" << endl; + tout << "mknod" << endl; + tout << path << endl; + tout << mode << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_MKNOD, whoami); + req->set_path(path); + req->set_iarg( mode ); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + //FIXME enforce caller uid rights? 
+ + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + insert_trace(reply); + + dout(10) << "mknod result is " << res << endl; + + delete reply; + + trim_cache(); + client_lock.Unlock(); + return res; +} + + + + +//readdir usually include inode info for each entry except of locked entries + +// +// getdir + +// fyi: typedef int (*dirfillerfunc_t) (void *handle, const char *name, int type, inodeno_t ino); + +int Client::getdir(const char *relpath, map& contents) +{ + client_lock.Lock(); + + string abspath; + mkabspath(relpath, abspath); + const char *path = abspath.c_str(); + + dout(3) << "op: client->getdir(\"" << path << "\", dir_contents);" << endl; + tout << "getdir" << endl; + tout << path << endl; + + + MClientRequest *req = new MClientRequest(MDS_OP_READDIR, whoami); + req->set_path(path); + + // FIXME where does FUSE maintain user information + req->set_caller_uid(getuid()); + req->set_caller_gid(getgid()); + + //FIXME enforce caller uid rights? + + MClientReply *reply = make_request(req, true); + int res = reply->get_result(); + insert_trace(reply); + + if (res == 0) { + + // dir contents to cache! + inodeno_t ino = reply->get_ino(); + Inode *diri = inode_map[ ino ]; + assert(diri); + assert(diri->inode.mode & INODE_MODE_DIR); + + if (!reply->get_dir_in().empty()) { + // only open dir if we're actually adding stuff to it! + Dir *dir = diri->open_dir(); + assert(dir); + time_t now = time(NULL); + + list::const_iterator pdn = reply->get_dir_dn().begin(); + for (list::const_iterator pin = reply->get_dir_in().begin(); + pin != reply->get_dir_in().end(); + ++pin, ++pdn) { + // count entries + res++; + + // put in cache + Inode *in = this->insert_inode(dir, *pin, *pdn); + + if (g_conf.client_cache_stat_ttl) + in->valid_until = now + g_conf.client_cache_stat_ttl; + else if (g_conf.client_cache_readdir_ttl) + in->valid_until = now + g_conf.client_cache_readdir_ttl; + + // contents to caller too! 
+ contents[*pdn] = in->inode; + } + } + + // add .. too? + if (diri != root && diri->dn && diri->dn->dir) { + Inode *parent = diri->dn->dir->parent_inode; + contents[".."] = parent->inode; + } + + // FIXME: remove items in cache that weren't in my readdir? + // *** + } + + delete reply; //fix thing above first + + client_lock.Unlock(); + return res; +} + + +/** POSIX stubs **/ + +DIR *Client::opendir(const char *name) +{ + DirResult *d = new DirResult; + d->size = getdir(name, d->contents); + d->p = d->contents.begin(); + d->off = 0; + return (DIR*)d; +} + +int Client::closedir(DIR *dir) +{ + DirResult *d = (DirResult*)dir; + delete d; + return 0; +} + +//struct dirent { +// ino_t d_ino; /* inode number */ +// off_t d_off; /* offset to the next dirent */ +// unsigned short d_reclen; /* length of this record */ +// unsigned char d_type; /* type of file */ +// char d_name[256]; /* filename */ +//}; + +struct dirent *Client::readdir(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + + // end of dir? + if (d->p == d->contents.end()) + return 0; + + // fill the dirent + d->dp.d_dirent.d_ino = d->p->second.ino; +#ifndef __CYGWIN__ + if (d->p->second.is_symlink()) + d->dp.d_dirent.d_type = DT_LNK; + else if (d->p->second.is_dir()) + d->dp.d_dirent.d_type = DT_DIR; + else if (d->p->second.is_file()) + d->dp.d_dirent.d_type = DT_REG; + else + d->dp.d_dirent.d_type = DT_UNKNOWN; + + d->dp.d_dirent.d_off = d->off; + d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) 
+#endif + + strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); + + // move up + ++d->off; + ++d->p; + + return &d->dp.d_dirent; +} + +void Client::rewinddir(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + d->p = d->contents.begin(); + d->off = 0; +} + +off_t Client::telldir(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + return d->off; +} + +void Client::seekdir(DIR *dirp, off_t offset) +{ + DirResult *d = (DirResult*)dirp; + + d->p = d->contents.begin(); + d->off = 0; + + if (offset >= d->size) offset = d->size-1; + while (offset > 0) { + ++d->p; + ++d->off; + --offset; + } +} + +struct dirent_plus *Client::readdirplus(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + + // end of dir? + if (d->p == d->contents.end()) + return 0; + + // fill the dirent + d->dp.d_dirent.d_ino = d->p->second.ino; +#ifndef __CYGWIN__ + if (d->p->second.is_symlink()) + d->dp.d_dirent.d_type = DT_LNK; + else if (d->p->second.is_dir()) + d->dp.d_dirent.d_type = DT_DIR; + else if (d->p->second.is_file()) + d->dp.d_dirent.d_type = DT_REG; + else + d->dp.d_dirent.d_type = DT_UNKNOWN; + + d->dp.d_dirent.d_off = d->off; + d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.) +#endif + + strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256); + + // plus + if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) { + // have it + fill_stat(d->p->second, &d->dp.d_stat); + d->dp.d_stat_err = 0; + } else { + // don't have it, stat it + string path = d->path; + path += "/"; + path += d->p->first; + d->dp.d_stat_err = lstat(path.c_str(), &d->dp.d_stat); + } + + // move up + ++d->off; + ++d->p; + + return &d->dp; +} + +/* +struct dirent_lite *Client::readdirlite(DIR *dirp) +{ + DirResult *d = (DirResult*)dirp; + + // end of dir? 
+  if (d->p == d->contents.end())
+    return 0;
+
+  // fill the dirent
+  d->dp.d_dirent.d_ino = d->p->second.ino;
+  if (d->p->second.is_symlink())
+    d->dp.d_dirent.d_type = DT_LNK;
+  else if (d->p->second.is_dir())
+    d->dp.d_dirent.d_type = DT_DIR;
+  else if (d->p->second.is_file())
+    d->dp.d_dirent.d_type = DT_REG;
+  else
+    d->dp.d_dirent.d_type = DT_UNKNOWN;
+  strncpy(d->dp.d_dirent.d_name, d->p->first.c_str(), 256);
+
+  d->dp.d_dirent.d_off = d->off;
+  d->dp.d_dirent.d_reclen = 1; // all records are length 1 (wrt offset, seekdir, telldir, etc.)
+
+  // plus
+  if ((d->p->second.mask & INODE_MASK_ALL_STAT) == INODE_MASK_ALL_STAT) {
+    // have it
+    fill_statlite(d->p->second,d->dp.d_stat);
+    d->dp.d_stat_err = 0;
+  } else {
+    // don't have it, stat it
+    string path = p->path;
+    path += "/";
+    path += p->first;
+    d->dp.d_statlite
+    d->dp.d_stat_err = lstatlite(path.c_str(), &d->dp.d_statlite);
+  }
+
+  // move up
+  ++d->off;
+  ++d->p;
+
+  return &d->dp;
+}
+*/
+
+
+
+
+
+
+/****** file i/o **********/
+
+// open
+//  Open the file at relpath (resolved against cwd by mkabspath) and
+//  register a new Fh for it.
+//
+//  flags is the open(2)-style flag word plus O_LAZY; it is decoded into
+//  one of FILE_MODE_{LAZY,W,RW,R} and sent to the MDS with the request.
+//
+//  Returns the new file handle on success, or the MDS result code when
+//  that result is < 0.  Takes and releases client_lock; the request to
+//  the MDS blocks.
+int Client::open(const char *relpath, int flags)
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(relpath, abspath);
+  const char *path = abspath.c_str();
+
+  dout(3) << "op: fh = client->open(\"" << path << "\", " << flags << ");" << endl;
+  tout << "open" << endl;
+  tout << path << endl;
+  tout << flags << endl;
+
+  // decode the flag word into a file mode.  writers ask make_request()
+  // for the authoritative mds (tryauth); readers and lazy opens do not.
+  int cmode = 0;
+  bool tryauth = false;
+  if (flags & O_LAZY)
+    cmode = FILE_MODE_LAZY;
+  else if (flags & O_WRONLY) {
+    cmode = FILE_MODE_W;
+    tryauth = true;
+  } else if (flags & O_RDWR) {
+    cmode = FILE_MODE_RW;
+    tryauth = true;
+  } else if (flags & O_APPEND) {
+    cmode = FILE_MODE_W;
+    tryauth = true;
+  } else
+    cmode = FILE_MODE_R;
+
+  // go
+  MClientRequest *req = new MClientRequest(MDS_OP_OPEN, whoami);
+  req->set_path(path);
+  req->set_iarg(flags);
+  req->set_iarg2(cmode);
+
+  // FIXME where does FUSE maintain user information
+  req->set_caller_uid(getuid());
+  req->set_caller_gid(getgid());
+
+  MClientReply *reply = make_request(req, tryauth); // try auth if writer
+
+  assert(reply);
+  dout(3) << "op: open_files[" << reply->get_result() << "] = fh; // fh = " << reply->get_result() << endl;
+  tout << reply->get_result() << endl;
+
+  insert_trace(reply);
+  int result = reply->get_result();
+
+  // success?
+  fh_t fh = 0;
+  if (result >= 0) {
+    // yay
+    Fh *f = new Fh;
+    f->mode = cmode;
+
+    // inode (insert_trace() above is expected to have put it in inode_map)
+    f->inode = inode_map[reply->get_ino()];
+    assert(f->inode);
+    f->inode->get();     // ref held by the Fh; dropped in close()
+
+    // per-mode open counts; close() must undo exactly these.
+    if (cmode & FILE_MODE_R) f->inode->num_open_rd++;
+    if (cmode & FILE_MODE_W) f->inode->num_open_wr++;
+    if (cmode & FILE_MODE_LAZY) f->inode->num_open_lazy++;
+
+    // caps included?
+    int mds = MSG_ADDR_NUM(reply->get_source());
+
+    if (f->inode->caps.empty()) {// first caps?
+      dout(7) << " first caps on " << f->inode->inode.ino << endl;
+      f->inode->get();   // extra ref while we hold any caps on this inode
+    }
+
+    int new_caps = reply->get_file_caps();
+
+    // note: caps[mds] creates a zeroed InodeCap (caps 0, seq 0) on first use.
+    assert(reply->get_file_caps_seq() >= f->inode->caps[mds].seq);
+    if (reply->get_file_caps_seq() > f->inode->caps[mds].seq) {
+      dout(7) << "open got caps " << cap_string(new_caps)
+              << " for " << f->inode->ino()
+              << " seq " << reply->get_file_caps_seq()
+              << " from mds" << mds << endl;
+
+      int old_caps = f->inode->caps[mds].caps;
+      f->inode->caps[mds].caps = new_caps;
+      f->inode->caps[mds].seq = reply->get_file_caps_seq();
+
+      // we shouldn't ever lose caps at this point.
+      // actually, we might...?
+      assert((old_caps & ~f->inode->caps[mds].caps) == 0);
+
+      if (g_conf.client_oc)
+        f->inode->fc.set_caps(new_caps);
+
+    } else {
+      dout(7) << "open got SAME caps " << cap_string(new_caps)
+              << " for " << f->inode->ino()
+              << " seq " << reply->get_file_caps_seq()
+              << " from mds" << mds << endl;
+    }
+
+    // put in map
+    result = fh = get_fh();
+    assert(fh_map.count(fh) == 0);
+    fh_map[fh] = f;
+
+    dout(3) << "open success, fh is " << fh << " combined caps " << cap_string(f->inode->file_caps()) << endl;
+  } else {
+    dout(0) << "open failure result " << result << endl;
+  }
+
+  delete reply;
+
+  trim_cache();
+  client_lock.Unlock();
+
+  return result;
+}
+
+
+
+
+
+// close_release
+//  Drop the caps we no longer need on an inode, keeping the ones that
+//  remaining openers or still-cached data depend on.
+void Client::close_release(Inode *in)
+{
+  dout(10) << "close_release on " << in->ino() << endl;
+
+  if (!in->num_open_rd)
+    in->fc.release_clean();
+
+  // writers (or dirty buffers) keep the write caps; readers (or cached
+  // data) keep the read caps.  the reader case used to retain the write
+  // caps as well, a copy of the line above; Inode::file_caps_wanted()
+  // pairs num_open_rd with CAP_FILE_RD|CAP_FILE_RDCACHE.
+  int retain = 0;
+  if (in->num_open_wr || in->fc.is_dirty()) retain |= CAP_FILE_WR | CAP_FILE_WRBUFFER;
+  if (in->num_open_rd || in->fc.is_cached()) retain |= CAP_FILE_RD | CAP_FILE_RDCACHE;
+
+  release_caps(in, retain); // release caps now.
+}
+
+// close_safe
+//  Drop the inode ref that close() took while waiting for the inode's
+//  data to become safe, and wake a pending unmount.
+void Client::close_safe(Inode *in)
+{
+  dout(10) << "close_safe on " << in->ino() << endl;
+  put_inode(in);
+  if (unmounting)
+    mount_cond.Signal();
+}
+
+// close
+//  Close file handle fh: undo the per-mode open counts taken in open(),
+//  destroy the Fh, recycle the handle number, and release or flush caps
+//  depending on who still has the inode open.
+//
+//  fh must be a live handle (asserted).  Always returns 0.
+int Client::close(fh_t fh)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->close(open_files[ " << fh << " ]);" << endl;
+  dout(3) << "op: open_files.erase( " << fh << " );" << endl;
+  tout << "close" << endl;
+  tout << fh << endl;
+
+  // get Fh, Inode
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  // update inode rd/wr/lazy counts (mirror of the increments in open()).
+  // the lazy count was never dropped before, so file_caps_wanted() kept
+  // asking for CAP_FILE_LAZYIO after the last lazy handle was closed.
+  int before = in->file_caps_wanted();
+  if (f->mode & FILE_MODE_R)
+    in->num_open_rd--;
+  if (f->mode & FILE_MODE_W)
+    in->num_open_wr--;
+  if (f->mode & FILE_MODE_LAZY)
+    in->num_open_lazy--;
+  int after = in->file_caps_wanted();
+
+  // does this change what caps we want?
+  if (before != after && after)
+    update_caps_wanted(in);
+
+  // hose fh, and hand the number back to the free set that get_fh()
+  // took it from (it used to leak).
+  fh_map.erase(fh);
+  delete f;
+  put_fh(fh);
+
+  // release caps right away?
+  dout(10) << "num_open_rd " << in->num_open_rd << " num_open_wr " << in->num_open_wr << endl;
+
+  if (g_conf.client_oc) {
+    // caching on.
+    if (in->num_open_rd == 0 && in->num_open_wr == 0) {
+      in->fc.empty(new C_Client_CloseRelease(this, in));
+    }
+    else if (in->num_open_rd == 0) {
+      in->fc.release_clean();
+      close_release(in);
+    }
+    else if (in->num_open_wr == 0) {
+      in->fc.flush_dirty(new C_Client_CloseRelease(this,in));
+    }
+
+    // pin until safe?
+    if (in->num_open_wr == 0 && !in->fc.all_safe()) {
+      dout(10) << "pinning ino " << in->ino() << " until safe" << endl;
+      in->get();      // dropped again by close_safe()
+      in->fc.add_safe_waiter(new C_Client_CloseSafe(this, in));
+    }
+  } else {
+    // caching off.
+    if (in->num_open_rd == 0 && in->num_open_wr == 0) {
+      dout(10) << " releasing caps on " << in->ino() << endl;
+      release_caps(in); // release caps now.
+    }
+  }
+
+  put_inode( in );    // the ref the Fh held
+  int result = 0;
+
+  client_lock.Unlock();
+  return result;
+}
+
+
+
+// ------------
+// read, write
+
+// blocking osd interface
+
+// read
+//  Read up to size bytes at offset from open handle fh into buf.
+//
+//  offset < 0 (the declared default is -1) means "continue from the
+//  handle's current position".  Blocks until the needed read (or lazy)
+//  cap is held and the data has arrived.  On return the handle position
+//  is offset plus the number of bytes placed in buf.
+//
+//  Returns the result reported by the file cache / filer read; 0 when
+//  the buffered-i/o size check finds nothing to read.
+int Client::read(fh_t fh, char *buf, off_t size, off_t offset)
+{
+  client_lock.Lock();
+
+  dout(3) << "op: client->read(" << fh << ", buf, " << size << ", " << offset << "); // that's " << offset << "~" << size << endl;
+  tout << "read" << endl;
+  tout << fh << endl;
+  tout << size << endl;
+  tout << offset << endl;
+
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  // resolve the default offset first, then sanity check.  the check used
+  // to run before this fallback, so the default offset=-1 could only abort.
+  if (offset < 0)
+    offset = f->pos;
+  assert(offset >= 0);
+
+  bool lazy = f->mode == FILE_MODE_LAZY;
+
+  // do we have read file cap?
+  while (!lazy && (in->file_caps() & CAP_FILE_RD) == 0) {
+    dout(7) << " don't have read cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_read.push_back(&cond);
+    cond.Wait(client_lock);
+  }
+  // lazy cap?
+  while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+    dout(7) << " don't have lazy cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_lazy.push_back(&cond);
+    cond.Wait(client_lock);
+  }
+
+  // determine whether read range overlaps with file
+  // ...ONLY if we're doing async io
+  if (!lazy && (in->file_caps() & (CAP_FILE_WRBUFFER|CAP_FILE_RDCACHE))) {
+    // we're doing buffered i/o. make sure we're inside the file.
+    // we can trust size info bc we get accurate info when buffering/caching caps are issued.
+    dout(10) << "file size: " << in->inode.size << endl;
+    if (offset > 0 && offset >= in->inode.size) {
+      client_lock.Unlock();
+      return 0;
+    }
+    // clamp to end of file.  compare in off_t: the old (unsigned) cast
+    // dropped the high bits of the size for files of 4GB and up.
+    off_t file_size = (off_t)in->inode.size;
+    if (offset + size > file_size) size = file_size - offset;
+
+    if (size == 0) {
+      dout(10) << "read is size=0, returning 0" << endl;
+      client_lock.Unlock();
+      return 0;
+    }
+  } else {
+    // unbuffered, synchronous file i/o.
+    // or lazy.
+    // defer to OSDs for file bounds.
+  }
+
+  bufferlist blist; // data will go here
+  int rvalue = 0;
+  int r = 0;
+
+  if (g_conf.client_oc) {
+    // object cache ON
+    rvalue = r = in->fc.read(offset, size, blist, client_lock); // may block.
+  } else {
+    // object cache OFF -- legacy inconsistent way.
+    Cond cond;
+    bool done = false;
+    C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue);
+
+    r = filer->read(in->inode, offset, size, &blist, onfinish);
+
+    assert(r >= 0);
+
+    // wait!
+    while (!done)
+      cond.Wait(client_lock);
+  }
+
+  // adjust fd pos
+  f->pos = offset+blist.length();
+
+  // copy data into caller's char* buf
+  blist.copy(0, blist.length(), buf);
+
+  //dout(10) << "i read '" << blist.c_str() << "'" << endl;
+  dout(10) << "read rvalue " << rvalue << ", r " << r << endl;
+
+  // done!
+  client_lock.Unlock();
+  return rvalue;
+}
+
+
+
+/*
+ * hack --
+ *  until we properly implement synchronous writes wrt buffer cache,
+ *  make sure we delay shutdown until they're all safe on disk!
+ */
+// completion context handed to the filer as the "safe" callback of a
+// synchronous write; it reports back to Client::hack_sync_write_safe().
+class C_Client_HackUnsafe : public Context {
+  Client *cl;
+public:
+  C_Client_HackUnsafe(Client *c) : cl(c) {}
+  void finish(int) {
+    cl->hack_sync_write_safe();
+  }
+};
+
+// hack_sync_write_safe
+//  One synchronous write has become safe: drop the unsafe-write count
+//  and, if that was the last one and we are unmounting, wake the
+//  unmount waiter.  Takes client_lock itself.
+void Client::hack_sync_write_safe()
+{
+  client_lock.Lock();
+  assert(unsafe_sync_write > 0);
+  unsafe_sync_write--;
+  if (unsafe_sync_write == 0 && unmounting) {
+    dout(10) << "hack_sync_write_safe -- no more unsafe writes, unmount can proceed" << endl;
+    mount_cond.Signal();
+  }
+  client_lock.Unlock();
+}
+
+// write
+//  Write size bytes from buf at offset to open handle fh.
+//
+//  offset < 0 (the declared default is -1) means "continue from the
+//  handle's current position".  Blocks until the needed write (or lazy)
+//  cap is held; with the object cache off it also blocks until the
+//  filer acks the write.  Updates the handle position, and the inode's
+//  size and mtime as seen by this client.
+//
+//  Returns size (success is assumed -- see FIXME below).
+int Client::write(fh_t fh, const char *buf, off_t size, off_t offset)
+{
+  client_lock.Lock();
+
+  //dout(7) << "write fh " << fh << " size " << size << " offset " << offset << endl;
+  dout(3) << "op: client->write(" << fh << ", buf, " << size << ", " << offset << ");" << endl;
+  tout << "write" << endl;
+  tout << fh << endl;
+  tout << size << endl;
+  tout << offset << endl;
+
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  // resolve the default offset first, then sanity check.  the check used
+  // to run before this fallback, so the default offset=-1 could only abort.
+  if (offset < 0)
+    offset = f->pos;
+  assert(offset >= 0);
+
+  bool lazy = f->mode == FILE_MODE_LAZY;
+
+  dout(10) << "cur file size is " << in->inode.size << " wr size " << in->file_wr_size << endl;
+
+  // do we have write file cap?
+  while (!lazy && (in->file_caps() & CAP_FILE_WR) == 0) {
+    dout(7) << " don't have write cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_write.push_back(&cond);
+    cond.Wait(client_lock);
+  }
+  while (lazy && (in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+    dout(7) << " don't have lazy cap, waiting" << endl;
+    Cond cond;
+    in->waitfor_lazy.push_back(&cond);
+    cond.Wait(client_lock);
+  }
+
+  // adjust fd pos
+  f->pos = offset+size;
+
+  // time it.
+  utime_t start = g_clock.now();
+
+  // copy into fresh buffer (since our write may be resub, async)
+  bufferptr bp = buffer::copy(buf, size);
+  bufferlist blist;
+  blist.push_back( bp );
+
+  if (g_conf.client_oc) { // buffer cache ON?
+    assert(objectcacher);
+
+    // write (this may block!)
+    in->fc.write(offset, size, blist, client_lock);
+
+  } else {
+    // legacy, inconsistent synchronous write.
+    dout(7) << "synchronous write" << endl;
+
+    // prepare write
+    Cond cond;
+    bool done = false;
+    C_Cond *onfinish = new C_Cond(&cond, &done);
+    C_Client_HackUnsafe *onsafe = new C_Client_HackUnsafe(this);
+    unsafe_sync_write++;   // undone by hack_sync_write_safe()
+    in->sync_writes++;
+
+    dout(20) << " sync write start " << onfinish << endl;
+
+    filer->write(in->inode, offset, size, blist, 0,
+                 onfinish, onsafe
+                 //, 1+((int)g_clock.now()) / 10 //f->pos // hack hack test osd revision snapshots
+                 );
+
+    while (!done) {
+      cond.Wait(client_lock);
+      dout(20) << " sync write bump " << onfinish << endl;
+    }
+
+    // last sync write in flight on this inode?  kick anyone waiting for that.
+    // NOTE(review): the waiters are finished but not deleted here -- confirm
+    // who owns them.  element type restored as Context* from the finish() call.
+    in->sync_writes--;
+    if (in->sync_writes == 0 &&
+        !in->waitfor_no_write.empty()) {
+      for (list<Context*>::iterator i = in->waitfor_no_write.begin();
+           i != in->waitfor_no_write.end();
+           ++i)
+        (*i)->finish(0);
+      in->waitfor_no_write.clear();
+    }
+
+    dout(20) << " sync write done " << onfinish << endl;
+  }
+
+  // time
+  utime_t lat = g_clock.now();
+  lat -= start;
+  if (client_logger) {
+    client_logger->finc("wrlsum",(double)lat);
+    client_logger->inc("wrlnum");
+  }
+
+  // assume success for now. FIXME.
+  off_t totalwritten = size;
+
+  // extend file?
+  if (totalwritten + offset > in->inode.size) {
+    in->inode.size = in->file_wr_size = totalwritten + offset;
+    dout(7) << "wrote to " << totalwritten+offset << ", extending file size" << endl;
+  } else {
+    dout(7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->inode.size << endl;
+  }
+
+  // mtime
+  in->file_wr_mtime = in->inode.mtime = g_clock.gettime();
+
+  // ok!
+  client_lock.Unlock();
+  return totalwritten;
+}
+
+
+// truncate
+//  Ask the MDS to truncate the named file to size bytes.
+//
+//  The path is resolved against cwd with mkabspath(), as getdir() and
+//  open() do; it used to be sent as given, so a relative path ignored
+//  the directory set by chdir().
+//
+//  Returns the MDS result code.
+int Client::truncate(const char *file, off_t size)
+{
+  client_lock.Lock();
+
+  string abspath;
+  mkabspath(file, abspath);
+  const char *path = abspath.c_str();
+
+  dout(3) << "op: client->truncate(\"" << path << "\", " << size << ");" << endl;
+  tout << "truncate" << endl;
+  tout << path << endl;
+  tout << size << endl;
+
+
+  MClientRequest *req = new MClientRequest(MDS_OP_TRUNCATE, whoami);
+  req->set_path(path);
+  req->set_sizearg( size );
+
+  // FIXME where does FUSE maintain user information
+  req->set_caller_uid(getuid());
+  req->set_caller_gid(getgid());
+
+  MClientReply *reply = make_request(req, true);
+  int res = reply->get_result();
+  insert_trace(reply);
+  delete reply;
+
+  dout(10) << " truncate result is " << res << endl;
+
+  client_lock.Unlock();
+  return res;
+}
+
+
+// fsync
+//  Block until the data written through fh's inode has been committed
+//  by the object cacher.  Metadata sync (syncdataonly == false) is not
+//  implemented yet and only logs.  Always returns 0.
+int Client::fsync(fh_t fh, bool syncdataonly)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->fsync(open_files[ " << fh << " ], " << syncdataonly << ");" << endl;
+  tout << "fsync" << endl;
+  tout << fh << endl;
+  tout << syncdataonly << endl;
+
+  int r = 0;
+
+  assert(fh_map.count(fh));
+  Fh *f = fh_map[fh];
+  Inode *in = f->inode;
+
+  dout(3) << "fsync fh " << fh << " ino " << in->inode.ino << " syncdataonly " << syncdataonly << endl;
+
+  // metadata?
+  if (!syncdataonly) {
+    dout(0) << "fsync - not syncing metadata yet.. implement me" << endl;
+  }
+
+  // data?
+  Cond cond;
+  bool done = false;
+  C_Cond *oncommit = new C_Cond(&cond, &done);
+  if (objectcacher->commit_set(in->ino(), oncommit)) {
+    // already safe.  the callback stays ours in that case (compare
+    // FileCache::add_safe_waiter), so free it instead of leaking it.
+    delete oncommit;
+  } else {
+    // wait for callback
+    while (!done) cond.Wait(client_lock);
+  }
+
+  client_lock.Unlock();
+  return r;
+}
+
+
+// not written yet, but i want to link!
+
+// chdir
+//  Set the working directory that mkabspath() resolves relative paths
+//  against.  Nothing is checked with the MDS.  Always returns 0.
+int Client::chdir(const char *path)
+{
+  // fake it for now!
+  string abs;
+  mkabspath(path, abs);
+  dout(3) << "chdir " << path << " -> cwd now " << abs << endl;
+  cwd = abs;
+  return 0;
+}
+
+// statfs
+//  Not implemented: asserts.
+int Client::statfs(const char *path, struct statfs *stbuf)
+{
+  assert(0); // implement me
+  return 0;
+}
+
+
+
+// lazyio_propogate
+//  For a handle opened in lazy mode, wait for the lazy cap and then
+//  (object cache on) block until the inode's dirty data is flushed.
+//  offset and count are only logged.  No-op for non-lazy handles.
+//  Always returns 0.
+int Client::lazyio_propogate(int fd, off_t offset, size_t count)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->lazyio_propogate(" << fd
+          << ", " << offset << ", " << count << ")" << endl;
+
+  assert(fh_map.count(fd));
+  Fh *f = fh_map[fd];
+  Inode *in = f->inode;
+
+  if (f->mode & FILE_MODE_LAZY) {
+    // wait for lazy cap
+    while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+      dout(7) << " don't have lazy cap, waiting" << endl;
+      Cond cond;
+      in->waitfor_lazy.push_back(&cond);
+      cond.Wait(client_lock);
+    }
+
+    if (g_conf.client_oc) {
+      // NOTE(review): FileCache::flush_dirty() runs the callback inline when
+      // nothing is dirty, i.e. while we hold client_lock -- confirm that
+      // C_SafeCond copes with that.
+      Cond cond;
+      bool done = false;
+      in->fc.flush_dirty(new C_SafeCond(&client_lock, &cond, &done));
+
+      while (!done)
+        cond.Wait(client_lock);
+
+    } else {
+      // mmm, nothin to do.
+    }
+  }
+
+  client_lock.Unlock();
+  return 0;
+}
+
+// completion that does nothing, for flushes nobody waits on.
+class C_Client_LazyioNoop : public Context {
+public:
+  void finish(int) {}
+};
+
+// lazyio_synchronize
+//  For a handle opened in lazy mode, wait for the lazy cap and then
+//  (object cache on) start a flush of dirty data and drop clean cached
+//  data.  Does not wait for the flush.  offset and count are only
+//  logged.  No-op for non-lazy handles.  Always returns 0.
+int Client::lazyio_synchronize(int fd, off_t offset, size_t count)
+{
+  client_lock.Lock();
+  dout(3) << "op: client->lazyio_synchronize(" << fd
+          << ", " << offset << ", " << count << ")" << endl;
+
+  assert(fh_map.count(fd));
+  Fh *f = fh_map[fd];
+  Inode *in = f->inode;
+
+  if (f->mode & FILE_MODE_LAZY) {
+    // wait for lazy cap
+    while ((in->file_caps() & CAP_FILE_LAZYIO) == 0) {
+      dout(7) << " don't have lazy cap, waiting" << endl;
+      Cond cond;
+      in->waitfor_lazy.push_back(&cond);
+      cond.Wait(client_lock);
+    }
+
+    if (g_conf.client_oc) {
+      // flush to invalidate.  FileCache::flush_dirty() calls finish() on
+      // its argument whenever the set is already clean, so a null context
+      // (what used to be passed here) is dereferenced; give it a no-op.
+      in->fc.flush_dirty(new C_Client_LazyioNoop);
+      in->fc.release_clean();
+    } else {
+      // mm, nothin to do.
+    }
+  }
+
+  client_lock.Unlock();
+  return 0;
+}
+
+
+// ms_handle_failure
+//  Messenger callback for a message m that could not be delivered to
+//  dest.  Monitor traffic is resent to a monitor picked by the monmap,
+//  osd traffic is handed to the objecter, a failed mds send asserts,
+//  and anything else is dropped (and the message deleted).
+void Client::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst)
+{
+  if (dest.is_mon()) {
+    // resend to a different monitor.
+    int mon = monmap->pick_mon(true);
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst
+            << ", resending to mon" << mon
+            << endl;
+    messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon));
+  }
+  else if (dest.is_osd()) {
+    objecter->ms_handle_failure(m, dest, inst);
+  }
+  else if (dest.is_mds()) {
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst << endl;
+    // help!
+    assert(0);
+  }
+  else {
+    // client?
+    dout(0) << "ms_handle_failure " << dest << " inst " << inst
+            << ", dropping" << endl;
+    delete m;
+  }
+}
+
diff --git a/branches/sage/cephmds2/client/Client.h b/branches/sage/cephmds2/client/Client.h
new file mode 100644
index 0000000000000..626176f9f9f47
--- /dev/null
+++ b/branches/sage/cephmds2/client/Client.h
@@ -0,0 +1,588 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + + +#ifndef __CLIENT_H +#define __CLIENT_H + + +#include "mds/MDSMap.h" +#include "osd/OSDMap.h" +#include "mon/MonMap.h" + +#include "msg/Message.h" +#include "msg/Dispatcher.h" +#include "msg/Messenger.h" +#include "msg/SerialMessenger.h" + +#include "messages/MClientRequest.h" +#include "messages/MClientReply.h" + +//#include "msgthread.h" + +#include "include/types.h" +#include "include/lru.h" +#include "include/filepath.h" +#include "include/interval_set.h" + +#include "common/Mutex.h" + +#include "FileCache.h" + +// stl +#include +#include +using namespace std; + +#include +using namespace __gnu_cxx; + +#define O_LAZY 01000000 + + +class Filer; +class Objecter; +class ObjectCacher; + +extern class LogType client_logtype; +extern class Logger *client_logger; + + + +// ============================================ +// types for my local metadata cache +/* basic structure: + + - Dentries live in an LRU loop. they get expired based on last access. + see include/lru.h. items can be bumped to "mid" or "top" of list, etc. + - Inode has ref count for each Fh, Dir, or Dentry that points to it. + - when Inode ref goes to 0, it's expired. + - when Dir is empty, it's removed (and it's Inode ref--) + +*/ + +typedef int fh_t; + +class Dir; +class Inode; + +class Dentry : public LRUObject { + public: + string name; // sort of lame + //const char *name; + Dir *dir; + Inode *inode; + int ref; // 1 if there's a dir beneath me. 
+ + void get() { assert(ref == 0); ref++; lru_pin(); } + void put() { assert(ref == 1); ref--; lru_unpin(); } + + Dentry() : dir(0), inode(0), ref(0) { } + + /*Dentry() : name(0), dir(0), inode(0), ref(0) { } + Dentry(string& n) : name(0), dir(0), inode(0), ref(0) { + name = new char[n.length()+1]; + strcpy((char*)name, n.c_str()); + } + ~Dentry() { + delete[] name; + }*/ +}; + +class Dir { + public: + Inode *parent_inode; // my inode + //hash_map, eqstr> dentries; + hash_map dentries; + + Dir(Inode* in) { parent_inode = in; } + + bool is_empty() { return dentries.empty(); } +}; + + +class InodeCap { + public: + int caps; + long seq; + InodeCap() : caps(0), seq(0) {} +}; + + +class Inode { + public: + inode_t inode; // the actual inode + time_t valid_until; + + // about the dir (if this is one!) + int dir_auth; + set dir_contacts; + bool dir_hashed, dir_replicated; + + // per-mds caps + map caps; // mds -> InodeCap + map stale_caps; // mds -> cap .. stale + + time_t file_wr_mtime; // [writers] time of last write + off_t file_wr_size; // [writers] largest offset we've written to + int num_open_rd, num_open_wr, num_open_lazy; // num readers, writers + + int ref; // ref count. 1 for each dentry, fh that links to me. + Dir *dir; // if i'm a dir. + Dentry *dn; // if i'm linked to a dentry. 
+ string *symlink; // symlink content, if it's a symlink + + // for caching i/o mode + FileCache fc; + + // for sync i/o mode + int sync_reads; // sync reads in progress + int sync_writes; // sync writes in progress + + list waitfor_write; + list waitfor_read; + list waitfor_lazy; + list waitfor_no_read, waitfor_no_write; + + void get() { + ref++; + //cout << "inode.get on " << hex << inode.ino << dec << " now " << ref << endl; + } + void put() { + ref--; assert(ref >= 0); + //cout << "inode.put on " << hex << inode.ino << dec << " now " << ref << endl; + } + + Inode(inode_t _inode, ObjectCacher *_oc) : + inode(_inode), + valid_until(0), + dir_auth(-1), dir_hashed(false), dir_replicated(false), + file_wr_mtime(0), file_wr_size(0), + num_open_rd(0), num_open_wr(0), num_open_lazy(0), + ref(0), dir(0), dn(0), symlink(0), + fc(_oc, _inode), + sync_reads(0), sync_writes(0) + { } + ~Inode() { + if (symlink) { delete symlink; symlink = 0; } + } + + inodeno_t ino() { return inode.ino; } + + bool is_dir() { + return (inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR; + } + + int file_caps() { + int c = 0; + for (map::iterator it = caps.begin(); + it != caps.end(); + it++) + c |= it->second.caps; + for (map::iterator it = stale_caps.begin(); + it != stale_caps.end(); + it++) + c |= it->second.caps; + return c; + } + + int file_caps_wanted() { + int w = 0; + if (num_open_rd) w |= CAP_FILE_RD|CAP_FILE_RDCACHE; + if (num_open_wr) w |= CAP_FILE_WR|CAP_FILE_WRBUFFER; + if (num_open_lazy) w |= CAP_FILE_LAZYIO; + return w; + } + + int authority(MDSMap *mdsmap) { + //cout << "authority on " << inode.ino << " .. dir_auth is " << dir_auth<< endl; + // parent? + if (dn && dn->dir && dn->dir->parent_inode) { + // parent hashed? 
+ if (dn->dir->parent_inode->dir_hashed) { + // hashed + assert(0); + // fixme + //return mdcluster->hash_dentry( dn->dir->parent_inode->ino(), + //dn->name ); + } + + if (dir_auth >= 0) + return dir_auth; + else + return dn->dir->parent_inode->authority(mdsmap); + } + + if (dir_auth >= 0) + return dir_auth; + + assert(0); // !!! + return 0; + } + int dentry_authority(const char *dn, + MDSMap *mdsmap) { + assert(0); + return 0; + //return ->hash_dentry( ino(), + //dn ); + } + int pick_replica(MDSMap *mdsmap) { + // replicas? + if (ino() > 1ULL && dir_contacts.size()) { + //cout << "dir_contacts if " << dir_contacts << endl; + set::iterator it = dir_contacts.begin(); + if (dir_contacts.size() == 1) + return *it; + else { + int r = rand() % dir_contacts.size(); + while (r--) it++; + return *it; + } + } + + if (dir_replicated || ino() == 1) { + //cout << "num_mds is " << mdcluster->get_num_mds() << endl; + return rand() % mdsmap->get_num_mds(); // huh.. pick a random mds! + } + else + return authority(mdsmap); + } + + + // open Dir for an inode. if it's not open, allocated it (and pin dentry in memory). 
+ Dir *open_dir() { + if (!dir) { + if (dn) dn->get(); // pin dentry + get(); + dir = new Dir(this); + } + return dir; + } + +}; + + + + +// file handle for any open file state + +struct Fh { + Inode *inode; + off_t pos; + int mds; // have to talk to mds we opened with (for now) + int mode; // the mode i opened the file with + + bool is_lazy() { return mode & O_LAZY; } + + Fh() : inode(0), pos(0), mds(0), mode(0) {} +}; + + + + + +// ======================================================== +// client interface + +class Client : public Dispatcher { + public: + + /* getdir result */ + struct DirResult { + string path; + map contents; + map::iterator p; + int off; + int size; + struct dirent_plus dp; + struct dirent_lite dl; + DirResult() : p(contents.end()), off(-1), size(0) {} + }; + + + protected: + Messenger *messenger; + int whoami; + MonMap *monmap; + + // mds fake RPC + tid_t last_tid; + map mds_rpc_cond; + map mds_rpc_reply; + map mds_rpc_dispatch_cond; + + // cluster descriptors + MDSMap *mdsmap; + OSDMap *osdmap; + + bool mounted; + bool unmounting; + Cond mount_cond; + + int unsafe_sync_write; +public: + msg_addr_t get_myaddr() { return messenger->get_myaddr(); } + void hack_sync_write_safe(); + +protected: + Filer *filer; + ObjectCacher *objectcacher; + Objecter *objecter; // (non-blocking) osd interface + + // cache + hash_map inode_map; + Inode* root; + LRU lru; // lru list of Dentry's in our local metadata cache. + + // cap weirdness + map > cap_reap_queue; // ino -> mds -> msg .. set of (would-be) stale caps to reap + + + // file handles, etc. 
+ string cwd; + interval_set free_fh_set; // unused fh's + hash_map fh_map; + + fh_t get_fh() { + fh_t fh = free_fh_set.start(); + free_fh_set.erase(fh, 1); + return fh; + } + void put_fh(fh_t fh) { + free_fh_set.insert(fh, 1); + } + + void mkabspath(const char *rel, string& abs) { + if (rel[0] == '/') { + abs = rel; + } else { + abs = cwd; + abs += "/"; + abs += rel; + } + } + + + // global client lock + // - protects Client and buffer cache both! + Mutex client_lock; + + + // -- metadata cache stuff + + // decrease inode ref. delete if dangling. + void put_inode(Inode *in) { + in->put(); + if (in->ref == 0) { + inode_map.erase(in->inode.ino); + if (in == root) root = 0; + delete in; + } + } + + void close_dir(Dir *dir) { + assert(dir->is_empty()); + + Inode *in = dir->parent_inode; + if (in->dn) in->dn->put(); // unpin dentry + + delete in->dir; + in->dir = 0; + put_inode(in); + } + + int get_cache_size() { return lru.lru_get_size(); } + void set_cache_size(int m) { lru.lru_set_max(m); } + + Dentry* link(Dir *dir, const string& name, Inode *in) { + Dentry *dn = new Dentry; + dn->name = name; + + // link to dir + dn->dir = dir; + dir->dentries[dn->name] = dn; + + // link to inode + dn->inode = in; + in->dn = dn; + in->get(); + + lru.lru_insert_mid(dn); // mid or top? 
+ return dn; + } + + void unlink(Dentry *dn) { + Inode *in = dn->inode; + + // unlink from inode + dn->inode = 0; + in->dn = 0; + put_inode(in); + + // unlink from dir + dn->dir->dentries.erase(dn->name); + if (dn->dir->is_empty()) + close_dir(dn->dir); + dn->dir = 0; + + // delete den + lru.lru_remove(dn); + delete dn; + } + + Dentry *relink(Dentry *dn, Dir *dir, const string& name) { + // first link new dn to dir + /* + char *oldname = (char*)dn->name; + dn->name = new char[name.length()+1]; + strcpy((char*)dn->name, name.c_str()); + dir->dentries[dn->name] = dn; + */ + dir->dentries[name] = dn; + + // unlink from old dir + dn->dir->dentries.erase(dn->name); + //delete[] oldname; + if (dn->dir->is_empty()) + close_dir(dn->dir); + + // fix up dn + dn->name = name; + dn->dir = dir; + + return dn; + } + + // move dentry to top of lru + void touch_dn(Dentry *dn) { lru.lru_touch(dn); } + + // trim cache. + void trim_cache(); + void dump_inode(Inode *in, set& did); + void dump_cache(); // debug + + // find dentry based on filepath + Dentry *lookup(filepath& path); + + // make blocking mds request + MClientReply *make_request(MClientRequest *req, bool auth_best=false, int use_auth=-1); + MClientReply* sendrecv(MClientRequest *req, int mds); + void handle_client_reply(MClientReply *reply); + + void fill_stat(inode_t& inode, struct stat *st); + void fill_statlite(inode_t& inode, struct statlite *st); + + + // friends + friend class SyntheticClient; + + public: + Client(Messenger *m, MonMap *mm); + ~Client(); + void tear_down_cache(); + + int get_nodeid() { return whoami; } + + void init(); + void shutdown(); + + // messaging + void dispatch(Message *m); + + void handle_mount_ack(class MClientMountAck*); + void handle_unmount_ack(Message*); + void handle_mds_map(class MMDSMap *m); + + // file caps + void handle_file_caps(class MClientFileCaps *m); + void implemented_caps(class MClientFileCaps *m, Inode *in); + void release_caps(Inode *in, int retain=0); + void 
update_caps_wanted(Inode *in); + + void close_release(Inode *in); + void close_safe(Inode *in); + + // metadata cache + Inode* insert_inode(Dir *dir, InodeStat *in_info, const string& dn); + void update_inode_dist(Inode *in, InodeStat *st); + Inode* insert_trace(MClientReply *reply); + + // ---------------------- + // fs ops. + int mount(int mkfs=0); + int unmount(); + + // these shoud (more or less) mirror the actual system calls. + int statfs(const char *path, struct statfs *stbuf); + + // crap + int chdir(const char *s); + + // namespace ops + int getdir(const char *path, list& contents); + int getdir(const char *path, map& contents); + + DIR *opendir(const char *name); + int closedir(DIR *dir); + struct dirent *readdir(DIR *dir); + void rewinddir(DIR *dir); + off_t telldir(DIR *dir); + void seekdir(DIR *dir, off_t offset); + + struct dirent_plus *readdirplus(DIR *dirp); + int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); + struct dirent_lite *readdirlite(DIR *dirp); + int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); + + + int link(const char *existing, const char *newname); + int unlink(const char *path); + int rename(const char *from, const char *to); + + // dirs + int mkdir(const char *path, mode_t mode); + int rmdir(const char *path); + + // symlinks + int readlink(const char *path, char *buf, off_t size); + int symlink(const char *existing, const char *newname); + + // inode stuff + int _lstat(const char *path, int mask, Inode **in); + int lstat(const char *path, struct stat *stbuf); + int lstatlite(const char *path, struct statlite *buf); + + int chmod(const char *path, mode_t mode); + int chown(const char *path, uid_t uid, gid_t gid); + int utime(const char *path, struct utimbuf *buf); + + // file ops + int mknod(const char *path, mode_t mode); + int open(const char *path, int mode); + int close(fh_t fh); + int read(fh_t fh, char *buf, off_t size, off_t offset=-1); + int write(fh_t 
fh, const char *buf, off_t size, off_t offset=-1); + int truncate(const char *file, off_t size); + //int truncate(fh_t fh, long long size); + int fsync(fh_t fh, bool syncdataonly); + + // hpc lazyio + int lazyio_propogate(int fd, off_t offset, size_t count); + int lazyio_synchronize(int fd, off_t offset, size_t count); + + int describe_layout(char *fn, list& result); + + void ms_handle_failure(Message*, msg_addr_t dest, const entity_inst_t& inst); +}; + +#endif diff --git a/branches/sage/cephmds2/client/FileCache.cc b/branches/sage/cephmds2/client/FileCache.cc new file mode 100644 index 0000000000000..36b28dc600391 --- /dev/null +++ b/branches/sage/cephmds2/client/FileCache.cc @@ -0,0 +1,171 @@ + +#include "config.h" +#include "include/types.h" + +#include "FileCache.h" +#include "osdc/ObjectCacher.h" + +#include "msg/Messenger.h" + +#undef dout +#define dout(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache " +#define derr(x) if (x <= g_conf.debug_client) cout << g_clock.now() << " " << oc->objecter->messenger->get_myaddr() << ".filecache " + + +// flush/release/clean + +void FileCache::flush_dirty(Context *onflush) +{ + if (oc->flush_set(inode.ino, onflush)) { + onflush->finish(0); + delete onflush; + } +} + +off_t FileCache::release_clean() +{ + return oc->release_set(inode.ino); +} + +bool FileCache::is_cached() +{ + return oc->set_is_cached(inode.ino); +} + +bool FileCache::is_dirty() +{ + return oc->set_is_dirty_or_committing(inode.ino); +} + +void FileCache::empty(Context *onempty) +{ + off_t unclean = release_clean(); + bool clean = oc->flush_set(inode.ino, onempty); + assert(!unclean == clean); + + if (clean) { + onempty->finish(0); + delete onempty; + } +} + + +// caps + +void FileCache::set_caps(int caps, Context *onimplement) +{ + if (onimplement) { + assert(latest_caps & ~caps); // we should be losing caps. 
+ caps_callbacks[caps].push_back(onimplement); + } + + latest_caps = caps; + check_caps(); +} + + +void FileCache::check_caps() +{ + int used = 0; + if (num_reading) used |= CAP_FILE_RD; + if (oc->set_is_cached(inode.ino)) used |= CAP_FILE_RDCACHE; + if (num_writing) used |= CAP_FILE_WR; + if (oc->set_is_dirty_or_committing(inode.ino)) used |= CAP_FILE_WRBUFFER; + dout(10) << "check_caps used " << cap_string(used) << endl; + + // check callbacks + map >::iterator p = caps_callbacks.begin(); + while (p != caps_callbacks.end()) { + if (used == 0 || (~(p->first) & used)) { + // implemented. + dout(10) << "used is " << cap_string(used) + << ", caps " << cap_string(p->first) << " implemented, doing callback(s)" << endl; + finish_contexts(p->second); + map >::iterator o = p; + p++; + caps_callbacks.erase(o); + } else { + dout(10) << "used is " << cap_string(used) + << ", caps " << cap_string(p->first) << " not yet implemented" << endl; + p++; + } + } +} + + + +// read/write + +int FileCache::read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) +{ + int r = 0; + + // inc reading counter + num_reading++; + + if (latest_caps & CAP_FILE_RDCACHE) { + // read (and block) + Cond cond; + bool done = false; + int rvalue = 0; + C_Cond *onfinish = new C_Cond(&cond, &done, &rvalue); + + r = oc->file_read(inode, offset, size, &blist, onfinish); + + if (r == 0) { + // block + while (!done) + cond.Wait(client_lock); + r = rvalue; + } else { + // it was cached. + delete onfinish; + } + } else { + r = oc->file_atomic_sync_read(inode, offset, size, &blist, client_lock); + } + + // dec reading counter + num_reading--; + + if (num_reading == 0 && !caps_callbacks.empty()) + check_caps(); + + return r; +} + +void FileCache::write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock) +{ + // inc writing counter + num_writing++; + + if (latest_caps & CAP_FILE_WRBUFFER) { // caps buffered write? + // wait? (this may block!) 
+ oc->wait_for_write(size, client_lock); + + // async, caching, non-blocking. + oc->file_write(inode, offset, size, blist); + } else { + // atomic, synchronous, blocking. + oc->file_atomic_sync_write(inode, offset, size, blist, client_lock); + } + + // dec writing counter + num_writing--; + if (num_writing == 0 && !caps_callbacks.empty()) + check_caps(); +} + +bool FileCache::all_safe() +{ + return !oc->set_is_dirty_or_committing(inode.ino); +} + +void FileCache::add_safe_waiter(Context *c) +{ + bool safe = oc->commit_set(inode.ino, c); + if (safe) { + c->finish(0); + delete c; + } +} diff --git a/branches/sage/cephmds2/client/FileCache.h b/branches/sage/cephmds2/client/FileCache.h new file mode 100644 index 0000000000000..742ec98733d9b --- /dev/null +++ b/branches/sage/cephmds2/client/FileCache.h @@ -0,0 +1,65 @@ +#ifndef __FILECACHE_H +#define __FILECACHE_H + +#include +using namespace std; + +#include "common/Cond.h" +#include "mds/Capability.h" + +class ObjectCacher; + +class FileCache { + ObjectCacher *oc; + inode_t inode; + + // caps + int latest_caps; + map > caps_callbacks; + + int num_reading; + int num_writing; + //int num_unsafe; + + // waiters + list waitfor_read; + list waitfor_write; + //list waitfor_safe; + bool waitfor_release; + + public: + FileCache(ObjectCacher *_oc, inode_t _inode) : + oc(_oc), + inode(_inode), + latest_caps(0), + num_reading(0), num_writing(0),// num_unsafe(0), + waitfor_release(false) {} + + // waiters/waiting + bool can_read() { return latest_caps & CAP_FILE_RD; } + bool can_write() { return latest_caps & CAP_FILE_WR; } + bool all_safe();// { return num_unsafe == 0; } + + void add_read_waiter(Cond *c) { waitfor_read.push_back(c); } + void add_write_waiter(Cond *c) { waitfor_write.push_back(c); } + void add_safe_waiter(Context *c);// { waitfor_safe.push_back(c); } + + // ... 
+ void flush_dirty(Context *onflush=0); + off_t release_clean(); + void empty(Context *onempty=0); + bool is_empty() { return !(is_cached() || is_dirty()); } + bool is_cached(); + bool is_dirty(); + + int get_caps() { return latest_caps; } + void set_caps(int caps, Context *onimplement=0); + void check_caps(); + + int read(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. + void write(off_t offset, size_t size, bufferlist& blist, Mutex& client_lock); // may block. + +}; + + +#endif diff --git a/branches/sage/cephmds2/client/SyntheticClient.cc b/branches/sage/cephmds2/client/SyntheticClient.cc new file mode 100644 index 0000000000000..b0569d52e553e --- /dev/null +++ b/branches/sage/cephmds2/client/SyntheticClient.cc @@ -0,0 +1,1226 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include +using namespace std; + + + +#include "SyntheticClient.h" + +#include "include/filepath.h" +#include "mds/MDS.h" + +#include +#include +#include +#include +#include +#include + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_client) cout << "synthetic" << client->get_nodeid() << " " + +// traces +//void trace_include(SyntheticClient *syn, Client *cl, string& prefix); +//void trace_openssh(SyntheticClient *syn, Client *cl, string& prefix); + + +list syn_modes; +list syn_iargs; +list syn_sargs; + +void parse_syn_options(vector& args) +{ + vector nargs; + + for (unsigned i=0; iclient = client; + thread_id = 0; + + did_readdir = false; + + run_only = -1; + + this->modes = syn_modes; + this->iargs = syn_iargs; + this->sargs = syn_sargs; + + run_start = g_clock.now(); +} + + + + +#define DBL 2 + +void *synthetic_client_thread_entry(void *ptr) +{ + SyntheticClient *sc = (SyntheticClient*)ptr; + //int r = + sc->run(); + return 0;//(void*)r; +} + +string SyntheticClient::get_sarg(int seq) +{ + string a; + if (!sargs.empty()) { + a = sargs.front(); + sargs.pop_front(); + } + if (a.length() == 0 || a == "~") { + char s[20]; + sprintf(s,"syn.%d.%d", client->whoami, seq); + a = s; + } + //cout << "a is " << a << endl; + return a; +} + +int SyntheticClient::run() +{ + //run_start = g_clock.now(); + run_until = utime_t(0,0); + dout(5) << "run" << endl; + + for (list::iterator it = modes.begin(); + it != modes.end(); + it++) { + int mode = *it; + dout(3) << "mode " << mode << endl; + + switch (mode) { + case SYNCLIENT_MODE_RANDOMSLEEP: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (run_me()) { + srand(time(0) + getpid() + client->whoami); + sleep(rand() % iarg1); + } + } + break; + + case SYNCLIENT_MODE_SLEEP: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (run_me()) { + dout(2) << "sleep " << iarg1 << endl; + sleep(iarg1); + } + } + break; + + case SYNCLIENT_MODE_ONLY: + { + run_only = 
iargs.front(); + iargs.pop_front(); + if (run_only == client->get_nodeid()) + dout(2) << "only " << run_only << endl; + } + break; + + case SYNCLIENT_MODE_UNTIL: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (iarg1) { + dout(2) << "until " << iarg1 << endl; + utime_t dur(iarg1,0); + run_until = run_start + dur; + } else { + dout(2) << "until " << iarg1 << " (no limit)" << endl; + run_until = utime_t(0,0); + } + } + break; + + case SYNCLIENT_MODE_SLEEPUNTIL: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (iarg1) { + dout(2) << "sleepuntil " << iarg1 << endl; + utime_t at = g_clock.now() - run_start; + if (at.sec() < iarg1) + sleep(iarg1 - at.sec()); + } + } + break; + + case SYNCLIENT_MODE_RANDOMWALK: + { + int iarg1 = iargs.front(); + iargs.pop_front(); + if (run_me()) { + dout(2) << "randomwalk " << iarg1 << endl; + random_walk(iarg1); + } + } + break; + + case SYNCLIENT_MODE_MAKEDIRS: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makedirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; + make_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); + } + } + break; + case SYNCLIENT_MODE_STATDIRS: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "statdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; + stat_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); + } + } + break; + case SYNCLIENT_MODE_READDIRS: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "readdirs " << sarg1 << " " << iarg1 << " " << iarg2 << " " << iarg3 << endl; + 
read_dirs(sarg1.c_str(), iarg1, iarg2, iarg3); + } + } + break; + + + case SYNCLIENT_MODE_MAKEFILES: + { + int num = iargs.front(); iargs.pop_front(); + int count = iargs.front(); iargs.pop_front(); + int priv = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makefiles " << num << " " << count << " " << priv << endl; + make_files(num, count, priv, false); + } + } + break; + case SYNCLIENT_MODE_MAKEFILES2: + { + int num = iargs.front(); iargs.pop_front(); + int count = iargs.front(); iargs.pop_front(); + int priv = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "makefiles2 " << num << " " << count << " " << priv << endl; + make_files(num, count, priv, true); + } + } + break; + case SYNCLIENT_MODE_CREATESHARED: + { + string sarg1 = get_sarg(0); + int num = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "createshared " << num << endl; + create_shared(num); + } + } + break; + case SYNCLIENT_MODE_OPENSHARED: + { + string sarg1 = get_sarg(0); + int num = iargs.front(); iargs.pop_front(); + int count = iargs.front(); iargs.pop_front(); + if (run_me()) { + dout(2) << "openshared " << num << endl; + open_shared(num, count); + } + } + break; + + case SYNCLIENT_MODE_FULLWALK: + { + string sarg1 = get_sarg(0); + if (run_me()) { + dout(2) << "fullwalk" << sarg1 << endl; + full_walk(sarg1); + } + } + break; + case SYNCLIENT_MODE_REPEATWALK: + { + string sarg1 = get_sarg(0); + if (run_me()) { + dout(2) << "repeatwalk " << sarg1 << endl; + while (full_walk(sarg1) == 0) ; + } + } + break; + + case SYNCLIENT_MODE_WRITEFILE: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + if (run_me()) + write_file(sarg1, iarg1, iarg2); + } + break; + case SYNCLIENT_MODE_WRSHARED: + { + string sarg1 = "shared"; + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + if (run_me()) + write_file(sarg1, iarg1, iarg2); + } + 
break; + case SYNCLIENT_MODE_WRITEBATCH: + { + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + int iarg3 = iargs.front(); iargs.pop_front(); + + if (run_me()) + write_batch(iarg1, iarg2, iarg3); + } + break; + + case SYNCLIENT_MODE_READFILE: + { + string sarg1 = get_sarg(0); + int iarg1 = iargs.front(); iargs.pop_front(); + int iarg2 = iargs.front(); iargs.pop_front(); + if (run_me()) + read_file(sarg1, iarg1, iarg2); + } + break; + + case SYNCLIENT_MODE_TRACE: + { + string tfile = get_sarg(0); + sargs.push_front(string("~")); + int iarg1 = iargs.front(); iargs.pop_front(); + string prefix = get_sarg(0); + + if (run_me()) { + dout(2) << "trace " << tfile << " prefix " << prefix << " ... " << iarg1 << " times" << endl; + + Trace t(tfile.c_str()); + + client->mkdir(prefix.c_str(), 0755); + + for (int i=0; i 0 + && i < iarg1-1 + ) { + client_logger->finc("trsum", (double)lat); + client_logger->inc("trnum"); + } + } + } + } + break; + + + case SYNCLIENT_MODE_OPENTEST: + { + int count = iargs.front(); iargs.pop_front(); + if (run_me()) { + for (int i=0; iopen("test", rand()%2 ? 
(O_WRONLY|O_CREAT):O_RDONLY); + if (fd > 0) client->close(fd); + } + } + } + break; + + case SYNCLIENT_MODE_OPTEST: + { + int count = iargs.front(); iargs.pop_front(); + if (run_me()) { + client->mknod("test",0777); + struct stat st; + for (int i=0; ilstat("test", &st); + client->chmod("test", 0777); + } + } + } + break; + + default: + assert(0); + } + } + return 0; +} + + +int SyntheticClient::start_thread() +{ + assert(!thread_id); + + pthread_create(&thread_id, NULL, synthetic_client_thread_entry, this); + assert(thread_id); + return 0; +} + +int SyntheticClient::join_thread() +{ + assert(thread_id); + void *rv; + pthread_join(thread_id, &rv); + return 0; +} + + +bool roll_die(float p) +{ + float r = (float)(rand() % 100000) / 100000.0; + if (r < p) + return true; + else + return false; +} + +void SyntheticClient::init_op_dist() +{ + op_dist.clear(); + op_dist.add( MDS_OP_STAT, g_conf.fakeclient_op_stat ); + op_dist.add( MDS_OP_UTIME, g_conf.fakeclient_op_utime ); + op_dist.add( MDS_OP_CHMOD, g_conf.fakeclient_op_chmod ); + op_dist.add( MDS_OP_CHOWN, g_conf.fakeclient_op_chown ); + + op_dist.add( MDS_OP_READDIR, g_conf.fakeclient_op_readdir ); + op_dist.add( MDS_OP_MKNOD, g_conf.fakeclient_op_mknod ); + op_dist.add( MDS_OP_LINK, g_conf.fakeclient_op_link ); + op_dist.add( MDS_OP_UNLINK, g_conf.fakeclient_op_unlink ); + op_dist.add( MDS_OP_RENAME, g_conf.fakeclient_op_rename ); + + op_dist.add( MDS_OP_MKDIR, g_conf.fakeclient_op_mkdir ); + op_dist.add( MDS_OP_RMDIR, g_conf.fakeclient_op_rmdir ); + op_dist.add( MDS_OP_SYMLINK, g_conf.fakeclient_op_symlink ); + + op_dist.add( MDS_OP_OPEN, g_conf.fakeclient_op_openrd ); + //op_dist.add( MDS_OP_READ, g_conf.fakeclient_op_read ); + //op_dist.add( MDS_OP_WRITE, g_conf.fakeclient_op_write ); + op_dist.add( MDS_OP_TRUNCATE, g_conf.fakeclient_op_truncate ); + op_dist.add( MDS_OP_FSYNC, g_conf.fakeclient_op_fsync ); + op_dist.add( MDS_OP_RELEASE, g_conf.fakeclient_op_close ); // actually, close() + op_dist.normalize(); +} 
+ +void SyntheticClient::up() +{ + cwd = cwd.prefixpath(cwd.depth()-1); + dout(DBL) << "cd .. -> " << cwd << endl; + clear_dir(); +} + + +int SyntheticClient::play_trace(Trace& t, string& prefix) +{ + dout(4) << "play trace" << endl; + t.start(); + + utime_t start = g_clock.now(); + + const char *p = prefix.c_str(); + + map<__int64_t, __int64_t> open_files; + + while (!t.end()) { + + if (time_to_stop()) break; + + // op + const char *op = t.get_string(); + dout(4) << "trace op " << op << endl; + if (strcmp(op, "link") == 0) { + const char *a = t.get_string(p); + const char *b = t.get_string(p); + client->link(a,b); + } else if (strcmp(op, "unlink") == 0) { + const char *a = t.get_string(p); + client->unlink(a); + } else if (strcmp(op, "rename") == 0) { + const char *a = t.get_string(p); + const char *b = t.get_string(p); + client->rename(a,b); + } else if (strcmp(op, "mkdir") == 0) { + const char *a = t.get_string(p); + __int64_t b = t.get_int(); + client->mkdir(a, b); + } else if (strcmp(op, "rmdir") == 0) { + const char *a = t.get_string(p); + client->rmdir(a); + } else if (strcmp(op, "symlink") == 0) { + const char *a = t.get_string(p); + const char *b = t.get_string(p); + client->symlink(a,b); + } else if (strcmp(op, "readlink") == 0) { + const char *a = t.get_string(p); + char buf[100]; + client->readlink(a, buf, 100); + } else if (strcmp(op, "lstat") == 0) { + struct stat st; + const char *a = t.get_string(p); + client->lstat(a, &st); + } else if (strcmp(op, "chmod") == 0) { + const char *a = t.get_string(p); + __int64_t b = t.get_int(); + client->chmod(a, b); + } else if (strcmp(op, "chown") == 0) { + const char *a = t.get_string(p); + __int64_t b = t.get_int(); + __int64_t c = t.get_int(); + client->chown(a, b, c); + } else if (strcmp(op, "utime") == 0) { + const char *a = t.get_string(p); + __int64_t b = t.get_int(); + __int64_t c = t.get_int(); + struct utimbuf u; + u.actime = b; + u.modtime = c; + client->utime(a, &u); + } else if (strcmp(op, "mknod") == 
0) { + const char *a = t.get_string(p); + __int64_t b = t.get_int(); + client->mknod(a, b); + } else if (strcmp(op, "getdir") == 0) { + const char *a = t.get_string(p); + map contents; + client->getdir(a, contents); + } else if (strcmp(op, "open") == 0) { + const char *a = t.get_string(p); + __int64_t b = t.get_int(); + __int64_t id = t.get_int(); + __int64_t fh = client->open(a, b); + open_files[id] = fh; + } else if (strcmp(op, "close") == 0) { + __int64_t id = t.get_int(); + __int64_t fh = open_files[id]; + if (fh > 0) client->close(fh); + open_files.erase(id); + } else if (strcmp(op, "truncate") == 0) { + const char *a = t.get_string(p); + __int64_t b = t.get_int(); + client->truncate(a,b); + } else if (strcmp(op, "read") == 0) { + __int64_t id = t.get_int(); + __int64_t fh = open_files[id]; + int size = t.get_int(); + int off = t.get_int(); + char *buf = new char[size]; + client->read(fh, buf, size, off); + delete[] buf; + } else if (strcmp(op, "write") == 0) { + __int64_t id = t.get_int(); + __int64_t fh = open_files[id]; + int size = t.get_int(); + int off = t.get_int(); + char *buf = new char[size]; + memset(buf, 1, size); // let's write 1's! 
+ client->write(fh, buf, size, off); + delete[] buf; + } else if (strcmp(op, "fsync") == 0) { + assert(0); + } else + assert(0); + } + + // close open files + for (map<__int64_t, __int64_t>::iterator fi = open_files.begin(); + fi != open_files.end(); + fi++) { + dout(1) << "leftover close " << fi->second << endl; + if (fi->second > 0) client->close(fi->second); + } + + return 0; +} + + +int SyntheticClient::clean_dir(string& basedir) +{ + // read dir + map contents; + int r = client->getdir(basedir.c_str(), contents); + if (r < 0) { + dout(1) << "readdir on " << basedir << " returns " << r << endl; + return r; + } + + for (map::iterator it = contents.begin(); + it != contents.end(); + it++) { + string file = basedir + "/" + it->first; + + if (time_to_stop()) break; + + struct stat st; + int r = client->lstat(file.c_str(), &st); + if (r < 0) { + dout(1) << "stat error on " << file << " r=" << r << endl; + continue; + } + + if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) { + clean_dir(file); + client->rmdir(file.c_str()); + } else { + client->unlink(file.c_str()); + } + } + + return 0; + +} + + +int SyntheticClient::full_walk(string& basedir) +{ + if (time_to_stop()) return -1; + + // read dir + map contents; + int r = client->getdir(basedir.c_str(), contents); + if (r < 0) { + dout(1) << "readdir on " << basedir << " returns " << r << endl; + return r; + } + + for (map::iterator it = contents.begin(); + it != contents.end(); + it++) { + string file = basedir + "/" + it->first; + + struct stat st; + int r = client->lstat(file.c_str(), &st); + if (r < 0) { + dout(1) << "stat error on " << file << " r=" << r << endl; + continue; + } + + if ((st.st_mode & INODE_TYPE_MASK) == INODE_MODE_DIR) full_walk(file); + } + + return 0; +} + +int SyntheticClient::make_dirs(const char *basedir, int dirs, int files, int depth) +{ + if (time_to_stop()) return 0; + + // make sure base dir exists + int r = client->mkdir(basedir, 0755); + if (r != 0) { + dout(1) << "can't make base 
dir? " << basedir << endl; + return -1; + } + + // children + char d[500]; + dout(3) << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; + for (int i=0; imknod(d, 0644); + } + + if (depth == 0) return 0; + + for (int i=0; ilstat(basedir, &st); + if (r != 0) { + dout(1) << "can't make base dir? " << basedir << endl; + return -1; + } + + // children + char d[500]; + dout(3) << "stat_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; + for (int i=0; ilstat(d, &st); + } + + if (depth == 0) return 0; + + for (int i=0; i contents; + utime_t s = g_clock.now(); + int r = client->getdir(basedir, contents); + utime_t e = g_clock.now(); + e -= s; + if (client_logger) client_logger->finc("readdir", e); + if (r < 0) { + dout(0) << "read_dirs couldn't readdir " << basedir << ", stopping" << endl; + return -1; + } + + for (int i=0; ilstat(d, &st) < 0) { + dout(2) << "read_dirs failed stat on " << d << ", stopping" << endl; + return -1; + } + utime_t e = g_clock.now(); + e -= s; + if (client_logger) client_logger->finc("stat", e); + } + + if (depth > 0) + for (int i=0; iget_nodeid(); + char d[255]; + + if (priv) { + for (int c=0; cmkdir(d, 0755); + } + } else { + // shared + if (whoami == 0) { + for (int c=0; cmkdir(d, 0755); + } + } else { + sleep(5); + } + } + + // files + struct stat st; + for (int c=0; cmknod(d, 0644); + + if (more) { + client->lstat(d, &st); + int fd = client->open(d, O_RDONLY); + client->unlink(d); + client->close(fd); + } + + if (time_to_stop()) return 0; + } + } + + return 0; +} + + +int SyntheticClient::create_shared(int num) +{ + // files + char d[255]; + for (int n=0; nmknod(d, 0644); + } + + return 0; +} + +int SyntheticClient::open_shared(int num, int count) +{ + // files + char d[255]; + for (int c=0; c fds; + for (int n=0; nopen(d,O_RDONLY); + fds.push_back(fd); + } + + while (!fds.empty()) { + int fd = fds.front(); + fds.pop_front(); + client->close(fd); + } + } 
+ + return 0; +} + + +int SyntheticClient::write_file(string& fn, int size, int wrsize) // size is in MB, wrsize in bytes +{ + //__uint64_t wrsize = 1024*256; + char *buf = new char[wrsize+100]; // 1 MB + memset(buf, 7, wrsize); + __uint64_t chunks = (__uint64_t)size * (__uint64_t)(1024*1024) / (__uint64_t)wrsize; + + int fd = client->open(fn.c_str(), O_RDWR|O_CREAT); + dout(5) << "writing to " << fn << " fd " << fd << endl; + if (fd < 0) return fd; + + for (unsigned i=0; iget_nodeid(); + p++; + *p = 0; + p++; + } + + client->write(fd, buf, wrsize, i*wrsize); + } + + client->close(fd); + delete[] buf; + + return 0; +} + +int SyntheticClient::write_batch(int nfile, int size, int wrsize) +{ + for (int i=0; iopen(fn.c_str(), O_RDONLY); + dout(5) << "reading from " << fn << " fd " << fd << endl; + if (fd < 0) return fd; + + for (unsigned i=0; iread(fd, buf, rdsize, i*rdsize); + + // verify fingerprint + int *p = (int*)buf; + int bad = 0; + int boff, bgoff, bchunk, bclient, bzero; + while ((char*)p + 32 < buf + rdsize) { + boff = *p; + bgoff = (int)((char*)p - buf); + p++; + bchunk = *p; + p++; + bclient = *p; + p++; + bzero = *p; + p++; + if (boff != bgoff || + bchunk != (int)i || + bclient != client->get_nodeid() || + bzero != 0) { + if (!bad) + dout(0) << "WARNING: wrong data from OSD, it should be " + << "(block=" << i + << " offset=" << bgoff + << " client=" << client->get_nodeid() << ")" + << " .. but i read back .. 
" + << "(block=" << bchunk + << " offset=" << boff + << " client=" << bclient << " zero=" << bzero << ")" << endl; + + bad++; + } + } + if (bad) + dout(0) << " + " << (bad-1) << " other bad 16-byte bits in this block" << endl; + + } + + client->close(fd); + delete[] buf; + + return 0; +} + + + +int SyntheticClient::random_walk(int num_req) +{ + int left = num_req; + + //dout(1) << "random_walk() will do " << left << " ops" << endl; + + init_op_dist(); // set up metadata op distribution + + while (left > 0) { + left--; + + if (time_to_stop()) break; + + // ascend? + if (cwd.depth() && !roll_die(::pow((double).9, (double)cwd.depth()))) { + dout(DBL) << "die says up" << endl; + up(); + continue; + } + + // descend? + if (.9*roll_die(::pow((double).9,(double)cwd.depth())) && subdirs.size()) { + string s = get_random_subdir(); + cwd.add_dentry( s ); + dout(DBL) << "cd " << s << " -> " << cwd << endl; + clear_dir(); + continue; + } + + int op = 0; + filepath path; + + if (contents.empty() && roll_die(.3)) { + if (did_readdir) { + dout(DBL) << "empty dir, up" << endl; + up(); + } else + op = MDS_OP_READDIR; + } else { + op = op_dist.sample(); + } + //dout(DBL) << "op is " << op << endl; + + int r = 0; + + // do op + if (op == MDS_OP_UNLINK) { + if (contents.empty()) + op = MDS_OP_READDIR; + else + r = client->unlink( get_random_sub() ); // will fail on dirs + } + + if (op == MDS_OP_RENAME) { + if (contents.empty()) + op = MDS_OP_READDIR; + else { + r = client->rename( get_random_sub(), make_sub("ren") ); + } + } + + if (op == MDS_OP_MKDIR) { + r = client->mkdir( make_sub("mkdir"), 0755); + } + + if (op == MDS_OP_RMDIR) { + if (!subdirs.empty()) + r = client->rmdir( get_random_subdir() ); + else + r = client->rmdir( cwd.c_str() ); // will pbly fail + } + + if (op == MDS_OP_SYMLINK) { + } + + if (op == MDS_OP_CHMOD) { + if (contents.empty()) + op = MDS_OP_READDIR; + else + r = client->chmod( get_random_sub(), rand() & 0755 ); + } + + if (op == MDS_OP_CHOWN) { + if 
(contents.empty()) r = client->chown( cwd.c_str(), rand(), rand() ); + else + r = client->chown( get_random_sub(), rand(), rand() ); + } + + if (op == MDS_OP_LINK) { + } + + if (op == MDS_OP_UTIME) { + struct utimbuf b; + memset(&b, 1, sizeof(b)); + if (contents.empty()) + r = client->utime( cwd.c_str(), &b ); + else + r = client->utime( get_random_sub(), &b ); + } + + if (op == MDS_OP_MKNOD) { + r = client->mknod( make_sub("mknod"), 0644); + } + + if (op == MDS_OP_OPEN) { + if (contents.empty()) + op = MDS_OP_READDIR; + else { + r = client->open( get_random_sub(), O_RDONLY ); + if (r > 0) { + assert(open_files.count(r) == 0); + open_files.insert(r); + } + } + } + + if (op == MDS_OP_RELEASE) { // actually, close + if (open_files.empty()) + op = MDS_OP_STAT; + else { + int fh = get_random_fh(); + r = client->close( fh ); + if (r == 0) open_files.erase(fh); + } + } + + if (op == MDS_OP_STAT) { + struct stat st; + if (contents.empty()) { + if (did_readdir) { + if (roll_die(.1)) { + dout(DBL) << "stat in empty dir, up" << endl; + up(); + } else { + op = MDS_OP_MKNOD; + } + } else + op = MDS_OP_READDIR; + } else + r = client->lstat(get_random_sub(), &st); + } + + if (op == MDS_OP_READDIR) { + clear_dir(); + + map c; + r = client->getdir( cwd.c_str(), c ); + + for (map::iterator it = c.begin(); + it != c.end(); + it++) { + //dout(DBL) << " got " << it->first << endl; + contents[it->first] = it->second; + if (it->second.is_dir()) + subdirs.insert(it->first); + } + + did_readdir = true; + } + + // errors? + if (r < 0) { + // reevaluate cwd. + //while (cwd.depth()) { + //if (client->lookup(cwd)) break; // it's in the cache + + //dout(DBL) << "r = " << r << ", client doesn't have " << cwd << ", cd .." << endl; + dout(DBL) << "r = " << r << ", client may not have " << cwd << ", cd .." 
<< endl; + up(); + //} + } + } + + // close files + dout(DBL) << "closing files" << endl; + while (!open_files.empty()) { + int fh = get_random_fh(); + int r = client->close( fh ); + if (r == 0) open_files.erase(fh); + } + + dout(DBL) << "done" << endl; + return 0; +} + + diff --git a/branches/sage/cephmds2/client/SyntheticClient.h b/branches/sage/cephmds2/client/SyntheticClient.h new file mode 100644 index 0000000000000..14720bdd412b2 --- /dev/null +++ b/branches/sage/cephmds2/client/SyntheticClient.h @@ -0,0 +1,198 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __SYNTHETICCLIENT_H +#define __SYNTHETICCLIENT_H + +#include + +#include "Client.h" +#include "include/Distribution.h" + +#include "Trace.h" + +#define SYNCLIENT_MODE_RANDOMWALK 1 +#define SYNCLIENT_MODE_FULLWALK 2 +#define SYNCLIENT_MODE_REPEATWALK 7 + +#define SYNCLIENT_MODE_MAKEDIRS 8 // dirs files depth +#define SYNCLIENT_MODE_STATDIRS 9 // dirs files depth +#define SYNCLIENT_MODE_READDIRS 10 // dirs files depth + +#define SYNCLIENT_MODE_MAKEFILES 11 // num count private +#define SYNCLIENT_MODE_MAKEFILES2 12 // num count private +#define SYNCLIENT_MODE_CREATESHARED 13 // num +#define SYNCLIENT_MODE_OPENSHARED 14 // num count + +#define SYNCLIENT_MODE_WRITEFILE 20 +#define SYNCLIENT_MODE_READFILE 21 +#define SYNCLIENT_MODE_WRITEBATCH 22 +#define SYNCLIENT_MODE_WRSHARED 23 + +#define SYNCLIENT_MODE_TRACE 30 + +#define SYNCLIENT_MODE_OPENTEST 40 +#define SYNCLIENT_MODE_OPTEST 41 + +#define SYNCLIENT_MODE_ONLY 50 +#define SYNCLIENT_MODE_UNTIL 51 +#define SYNCLIENT_MODE_SLEEPUNTIL 52 + +#define SYNCLIENT_MODE_RANDOMSLEEP 61 +#define SYNCLIENT_MODE_SLEEP 62 
+ + + + +void parse_syn_options(vector& args); + +class SyntheticClient { + Client *client; + + pthread_t thread_id; + + Distribution op_dist; + + void init_op_dist(); + int get_op(); + + + filepath cwd; + map contents; + set subdirs; + bool did_readdir; + set open_files; + + void up(); + + void clear_dir() { + contents.clear(); + subdirs.clear(); + did_readdir = false; + } + + int get_random_fh() { + int r = rand() % open_files.size(); + set::iterator it = open_files.begin(); + while (r--) it++; + return *it; + } + + + filepath n1; + const char *get_random_subdir() { + assert(!subdirs.empty()); + int r = ((rand() % subdirs.size()) + (rand() % subdirs.size())) / 2; // non-uniform distn + set::iterator it = subdirs.begin(); + while (r--) it++; + + n1 = cwd; + n1.add_dentry( *it ); + return n1.get_path().c_str(); + } + filepath n2; + const char *get_random_sub() { + assert(!contents.empty()); + int r = ((rand() % contents.size()) + (rand() % contents.size())) / 2; // non-uniform distn + if (cwd.depth() && cwd.last_bit().length()) + r += cwd.last_bit().c_str()[0]; // slightly permuted + r %= contents.size(); + + map::iterator it = contents.begin(); + while (r--) it++; + + n2 = cwd; + n2.add_dentry( it->first ); + return n2.get_path().c_str(); + } + + filepath sub; + char sub_s[50]; + const char *make_sub(char *base) { + sprintf(sub_s, "%s.%d", base, rand() % 100); + string f = sub_s; + sub = cwd; + sub.add_dentry(f); + return sub.c_str(); + } + + public: + SyntheticClient(Client *client); + + int start_thread(); + int join_thread(); + + int run(); + + bool run_me() { + if (run_only >= 0) { + if (run_only == client->get_nodeid()) { + run_only = -1; + return true; + } + run_only = -1; + return false; + } + return true; + } + + // run() will do one of these things: + list modes; + list sargs; + list iargs; + utime_t run_start; + utime_t run_until; + + int run_only; + + string get_sarg(int seq); + + bool time_to_stop() { + utime_t now = g_clock.now(); + if (0) cout << 
"time_to_stop .. now " << now + << " until " << run_until + << " start " << run_start + << endl; + if (run_until.sec() && now > run_until) + return true; + else + return false; + } + + string compose_path(string& prefix, char *rest) { + return prefix + rest; + } + + int full_walk(string& fromdir); + int random_walk(int n); + + int make_dirs(const char *basedir, int dirs, int files, int depth); + int stat_dirs(const char *basedir, int dirs, int files, int depth); + int read_dirs(const char *basedir, int dirs, int files, int depth); + int make_files(int num, int count, int priv, bool more); + + int create_shared(int num); + int open_shared(int num, int count); + + int write_file(string& fn, int mb, int chunk); + int write_batch(int nfile, int mb, int chunk); + int read_file(string& fn, int mb, int chunk); + + int clean_dir(string& basedir); + + int play_trace(Trace& t, string& prefix); + +}; + +#endif diff --git a/branches/sage/cephmds2/client/Trace.cc b/branches/sage/cephmds2/client/Trace.cc new file mode 100644 index 0000000000000..43459653011a1 --- /dev/null +++ b/branches/sage/cephmds2/client/Trace.cc @@ -0,0 +1,125 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+
+
+#include "Trace.h"
+
+#include <iostream>
+#include <cassert>
+#include <map>
+#include <ext/rope>
+using namespace __gnu_cxx;
+
+#include "common/Mutex.h"
+
+#include "config.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>   // read(), close()
+#include <string.h>   // memcpy()
+
+
+// Serializes access to the global trace table below and to the
+// reference counts of the TokenLists it holds.
+Mutex trace_lock;
+
+/*
+ * TokenList
+ *
+ * The contents of one trace file.  It is loaded once and shared
+ * (reference counted) by every Trace constructed on that filename.
+ */
+class TokenList {
+public:
+  string filename;
+  char *data;                  // file contents; each '\n' is overwritten with a NUL
+  int len;                     // bytes in data, including the trailing sentinel
+  list<const char*> tokens;    // one pointer into data per non-empty line
+
+  int ref;                     // number of Trace objects sharing this list
+
+  // len is initialised as well, so a TokenList never carries an
+  // indeterminate length before the file has been read.
+  TokenList() : data(0), len(0), ref(0) {}
+  ~TokenList() {
+    delete[] data;
+  }
+};
+
+// filename -> loaded token list.  Protected by trace_lock.
+map<string, TokenList*> traces;
+
+
+/*
+ * Attach to the token list for file f.  The file is read and split
+ * into lines the first time it is requested; later requests for the
+ * same name share the already loaded list.
+ */
+Trace::Trace(const char* f)
+{
+  string filename = f;
+
+  trace_lock.Lock();
+
+  if (traces.count(filename))
+    tl = traces[filename];
+  else {
+    tl = new TokenList;
+    tl->filename = filename;
+
+    // open file
+    crope cr;
+    int fd = open(filename.c_str(), O_RDONLY);
+    // open() reports failure with a negative value; 0 is a perfectly
+    // valid descriptor (e.g. when stdin has been closed).
+    assert(fd >= 0);
+    char buf[100];
+    while (1) {
+      int r = read(fd, buf, sizeof(buf));
+      if (r == 0) break;       // end of file
+      assert(r > 0);
+      cr.append(buf, r);
+    }
+    close(fd);
+
+    // copy, leaving room for one extra byte at the end
+    tl->len = cr.length()+1;
+    tl->data = new char[tl->len];
+    memcpy(tl->data, cr.c_str(), cr.length());
+    // sentinel newline: guarantees the scan below terminates even if
+    // the file does not end with a newline.
+    tl->data[tl->len-1] = '\n';
+
+    // index!
+    int o = 0;
+    while (o < tl->len) {
+      char *n = tl->data + o;
+
+      // find newline
+      while (tl->data[o] != '\n') o++;
+      assert(tl->data[o] == '\n');
+      tl->data[o] = 0;
+
+      // skip empty lines
+      if (tl->data + o > n) tl->tokens.push_back(n);
+      o++;
+    }
+
+    dout(1) << "trace " << filename << " loaded with " << tl->tokens.size() << " tokens" << endl;
+    traces[filename] = tl;
+  }
+
+  tl->ref++;
+
+  trace_lock.Unlock();
+}
+
+/*
+ * Drop our reference; the last Trace to go away removes the list from
+ * the table and frees it.
+ */
+Trace::~Trace()
+{
+  trace_lock.Lock();
+
+  tl->ref--;
+  if (tl->ref == 0) {
+    traces.erase(tl->filename);
+    delete tl;
+  }
+
+  trace_lock.Unlock();
+}
+
+
+// The shared token list.  Callers iterate over it; they do not modify it.
+list<const char*>& Trace::get_list()
+{
+  return tl->tokens;
+}
diff --git a/branches/sage/cephmds2/client/Trace.h b/branches/sage/cephmds2/client/Trace.h
new file mode 100644
index 0000000000000..08b1fa8ff2722
--- /dev/null
+++ b/branches/sage/cephmds2/client/Trace.h
@@ -0,0 +1,75 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+
* This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __CLIENT_TRACE_H
+#define __CLIENT_TRACE_H
+
+#include <cassert>
+#include <cstdio>    // snprintf
+#include <cstdlib>   // atoll
+#include <cstring>   // strstr, strlen
+#include <list>
+#include <string>
+using namespace std;
+
+/*
+
+ this class is more like an iterator over a constant tokenlist (which
+ is protected by a mutex, see Trace.cc)
+
+ */
+
+class Trace {
+  class TokenList *tl;
+
+ public:
+  Trace(const char* filename);
+  ~Trace();
+
+  list<const char*>& get_list();
+
+  list<const char*>::iterator _cur;   // next token to hand out
+  list<const char*>::iterator _end;   // end of the token list
+
+  // Rewind to the first token.  Must be called before get_string(),
+  // get_int() or end(): it is what initialises _cur, _end and ns.
+  void start() {
+    _cur = get_list().begin();
+    _end = get_list().end();
+    ns = 0;
+  }
+
+  // Ring of scratch buffers for prefix-substituted strings.  A pointer
+  // returned by get_string() stays valid for the next 9 substitutions.
+  char strings[10][200];
+  int ns;
+
+  /*
+   * Return the current token and advance.
+   *
+   * If prefix is given and the token contains "/prefix" at offset 0
+   * or 1, the result is prefix followed by whatever comes after the
+   * "/prefix" tag, built in one of the scratch buffers.  Any other
+   * token is returned unchanged (a pointer into the shared list).
+   */
+  const char *get_string(const char *prefix = 0) {
+    assert(_cur != _end);
+    const char *s = *_cur;
+    _cur++;
+    if (prefix) {
+      const char *tag = strstr(s, "/prefix");
+      if (tag == s || tag == s+1) {
+        // Take the tail from the end of the match, not from a fixed
+        // offset off s; otherwise a match at s+1 keeps the final 'x'
+        // of the tag.
+        const char *rest = tag + strlen("/prefix");
+        // Bounded write: an over-long path must not run past the
+        // 200 byte slot.
+        assert(strlen(prefix) + strlen(rest) < sizeof(strings[ns]));
+        snprintf(strings[ns], sizeof(strings[ns]), "%s%s", prefix, rest);
+        s = (const char*)strings[ns];
+        ns++;
+        if (ns == 10) ns = 0;
+      }
+    }
+    return s;
+  }
+
+  // Current token parsed as a decimal integer (atoll semantics); advances.
+  __int64_t get_int() {
+    return atoll(get_string());
+  }
+
+  // True once every token has been consumed.
+  bool end() {
+    return _cur == _end;
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/client/fuse.cc b/branches/sage/cephmds2/client/fuse.cc
new file mode 100644
index 0000000000000..560a515a95240
--- /dev/null
+++ b/branches/sage/cephmds2/client/fuse.cc
@@ -0,0 +1,276 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + + +/* + FUSE: Filesystem in Userspace + Copyright (C) 2001-2005 Miklos Szeredi + + This program can be distributed under the terms of the GNU GPL. + See the file COPYING. +*/ + + +// fuse crap +#ifdef linux +/* For pread()/pwrite() */ +#define _XOPEN_SOURCE 500 +#endif + +#define FUSE_USE_VERSION 22 + +#include +#include +#include +#include +#include +#include +#include +#include + + +// ceph stuff +#include "include/types.h" + +#include "Client.h" + +#include "config.h" + +// stl +#include +using namespace std; + + +// globals +Client *client; // the ceph client + + + +// ------ +// fuse hooks + +static int ceph_getattr(const char *path, struct stat *stbuf) +{ + return client->lstat(path, stbuf); +} + +static int ceph_readlink(const char *path, char *buf, size_t size) +{ + int res; + + res = client->readlink(path, buf, size - 1); + if (res < 0) return res; + + buf[res] = '\0'; + return 0; +} + + +static int ceph_getdir(const char *path, fuse_dirh_t h, fuse_dirfil_t filler) +{ + map contents; + + int res = client->getdir(path, contents); + if (res < 0) return res; + + // return contents to fuse via callback + for (map::iterator it = contents.begin(); + it != contents.end(); + it++) { + // (immutable) inode contents too. + res = filler(h, // fuse's handle + it->first.c_str(), // dentry as char* + it->second.mode & INODE_TYPE_MASK, // mask type bits from mode + it->second.ino); // ino.. 64->32 bit issue here? 
FIXME + if (res != 0) break; // fuse has had enough + } + return res; +} + +static int ceph_mknod(const char *path, mode_t mode, dev_t rdev) +{ + return client->mknod(path, mode); +} + +static int ceph_mkdir(const char *path, mode_t mode) +{ + return client->mkdir(path, mode); +} + +static int ceph_unlink(const char *path) +{ + return client->unlink(path); +} + +static int ceph_rmdir(const char *path) +{ + return client->rmdir(path); +} + +static int ceph_symlink(const char *from, const char *to) +{ + return client->symlink(from, to); +} + +static int ceph_rename(const char *from, const char *to) +{ + return client->rename(from, to); +} + +static int ceph_link(const char *from, const char *to) +{ + return client->link(from, to); +} + +static int ceph_chmod(const char *path, mode_t mode) +{ + return client->chmod(path, mode); +} + +static int ceph_chown(const char *path, uid_t uid, gid_t gid) +{ + return client->chown(path, uid, gid); +} + +static int ceph_truncate(const char *path, off_t size) +{ + return client->truncate(path, size); +} + +static int ceph_utime(const char *path, struct utimbuf *buf) +{ + return client->utime(path, buf); +} + + +static int ceph_open(const char *path, struct fuse_file_info *fi) +{ + int res; + + res = client->open(path, fi->flags); + if (res < 0) return res; + fi->fh = res; + return 0; // fuse wants 0 onsucess +} + +static int ceph_read(const char *path, char *buf, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + return client->read(fh, buf, size, offset); +} + +static int ceph_write(const char *path, const char *buf, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + return client->write(fh, buf, size, offset); +} + +/* +static int ceph_flush(const char *path, struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + return client->flush(fh); +} +*/ + +static int ceph_statfs(const char *path, struct statfs *stbuf) +{ + return client->statfs(path, stbuf); +} + + + +static int 
ceph_release(const char *path, struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + int r = client->close(fh); // close the file + return r; +} + +static int ceph_fsync(const char *path, int isdatasync, + struct fuse_file_info *fi) +{ + fh_t fh = fi->fh; + return client->fsync(fh, isdatasync ? true:false); +} + + +static struct fuse_operations ceph_oper = { + getattr: ceph_getattr, + readlink: ceph_readlink, + getdir: ceph_getdir, + mknod: ceph_mknod, + mkdir: ceph_mkdir, + unlink: ceph_unlink, + rmdir: ceph_rmdir, + symlink: ceph_symlink, + rename: ceph_rename, + link: ceph_link, + chmod: ceph_chmod, + chown: ceph_chown, + truncate: ceph_truncate, + utime: ceph_utime, + open: ceph_open, + read: ceph_read, + write: ceph_write, + statfs: ceph_statfs, + flush: 0, //ceph_flush, + release: ceph_release, + fsync: ceph_fsync +}; + + +int ceph_fuse_main(Client *c, int argc, char *argv[]) +{ + // init client + client = c; + + // set up fuse argc/argv + int newargc = 0; + char **newargv = (char **) malloc((argc + 10) * sizeof(char *)); + newargv[newargc++] = argv[0]; + + // allow other (all!) users to see my file system + // NOTE: echo user_allow_other >> /etc/fuse.conf + newargv[newargc++] = "-o"; + newargv[newargc++] = "allow_other"; + + // use inos + newargv[newargc++] = "-o"; + newargv[newargc++] = "use_ino"; + + // large reads, direct_io (no kernel cachine) + //newargv[newargc++] = "-o"; + //newargv[newargc++] = "large_read"; + if (g_conf.fuse_direct_io) { + newargv[newargc++] = "-o"; + newargv[newargc++] = "direct_io"; + } + + // disable stupid fuse unlink hiding thing + newargv[newargc++] = "-o"; + newargv[newargc++] = "hard_remove"; + + // force into foreground + // -> we can watch stdout this way!! + newargv[newargc++] = "-f"; + + // copy rest of cmdline (hopefully, the mount point!) 
+ for (int argctr = 1; argctr < argc; argctr++) newargv[newargc++] = argv[argctr]; + + // go fuse go + cout << "ok, calling fuse_main" << endl; + return fuse_main(newargc, newargv, &ceph_oper); +} diff --git a/branches/sage/cephmds2/client/fuse.h b/branches/sage/cephmds2/client/fuse.h new file mode 100644 index 0000000000000..d0b8dcb1154f5 --- /dev/null +++ b/branches/sage/cephmds2/client/fuse.h @@ -0,0 +1,23 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +/* ceph_fuse_main + * - start up fuse glue, attached to Client* cl. + * - argc, argv should include a mount point, and + * any weird fuse options you want. by default, + * we will put fuse in the foreground so that it + * won't fork and we can see stdout. + */ +int ceph_fuse_main(Client *cl, int argc, char *argv[]); diff --git a/branches/sage/cephmds2/client/ldceph.cc b/branches/sage/cephmds2/client/ldceph.cc new file mode 100644 index 0000000000000..9706fd49cad99 --- /dev/null +++ b/branches/sage/cephmds2/client/ldceph.cc @@ -0,0 +1,297 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include +using namespace std; + +// ceph stuff +#include "config.h" +#include "client/Client.h" +#include "msg/TCPMessenger.h" + +// syscall fun +#include +#include +#include +//#include + +#define _FCNTL_H +#include + +#define CEPH_FD_OFF 50000 + + +/****** startup etc *******/ + +class LdCeph { +public: + // globals + bool started; + char *mount_point; + char *mount_point_parent; + int mount_point_len; + + Client *client; + + filepath fp_mount_point; + filepath cwd; + bool cwd_above_mp, cwd_in_mp; + + const char *get_ceph_path(const char *orig, char *buf) { + if (!started) return 0; + + // relative path? BUG: this won't catch "blah/../../asdf" + if (orig[0] && + orig[0] != '/' && + !(orig[0] == '.' && orig[1] == '.')) { + + if (cwd_in_mp) return orig; // inside mount point, definitely ceph + if (!cwd_above_mp) return 0; // not above mount point, definitely not ceph + + // relative, above mp. + filepath o = orig; + filepath p = cwd; + for (unsigned b = 0; b < o.depth(); b++) { + if (o[b] == "..") + p.pop_dentry(); + else + p.add_dentry(o[b]); + } + + // FIXME rewrite + if (strncmp(p.c_str(), mount_point, mount_point_len) == 0) { + if (p.c_str()[mount_point_len] == 0) + return "/"; + if (p.c_str()[mount_point_len] == '/') { + strcpy(buf, p.c_str() + mount_point_len); + return buf; + } + } + return 0; + } else { + // absolute + if (strncmp(orig, mount_point, mount_point_len) == 0) { + if (orig[mount_point_len] == 0) + return "/"; + if (orig[mount_point_len] == '/') + return orig + mount_point_len; + } + return 0; + } + } + + void refresh_cwd() { + char buf[255]; + syscall(SYS_getcwd, buf, 255); + cwd = buf; + + if (strncmp(buf, mount_point, mount_point_len) == 0 && + (buf[mount_point_len] == 0 || + buf[mount_point_len] == '/')) + cwd_in_mp = true; + else { + if (cwd.depth() > fp_mount_point.depth()) + cwd_above_mp = false; + else { + cwd_above_mp = true; + for (unsigned i=0; iget_myaddr() << endl; + + refresh_cwd(); + } + } + ~LdCeph() { + cout << 
"ldceph fini" << endl; + if (false && client) { + client->unmount(); + client->shutdown(); + delete client; + client = 0; + tcpmessenger_wait(); + tcpmessenger_shutdown(); + } + } + +} ldceph; + + + +/****** original functions ****/ + + + +/****** captured functions ****/ + + +#define MYFD(f) ((fd) > CEPH_FD_OFF && ldceph.started) +#define TO_FD(fd) (fd > 0 ? fd+CEPH_FD_OFF:fd) +#define FROM_FD(fd) (fd - CEPH_FD_OFF) + +extern "C" { + + // open/close + //int open(const char *pathname, int flags) { + int open(const char *pathname, int flags, mode_t mode) { + char buf[255]; + if (const char *c = ldceph.get_ceph_path(pathname, buf)) + return TO_FD(ldceph.client->open(c, flags)); + else + return syscall(SYS_open, pathname, flags, mode); + } + + int creat(const char *pathname, mode_t mode) { + return open(pathname, O_CREAT|O_WRONLY|O_TRUNC, mode); + } + int close(int fd) { + if (MYFD(fd)) + return ldceph.client->close(FROM_FD(fd)); + else + return syscall(SYS_close, fd); + } + + + // read/write + ssize_t write(int fd, const void *buf, size_t count) { + if (MYFD(fd)) + return ldceph.client->write(FROM_FD(fd), (char*)buf, count); + else + return syscall(SYS_write, fd, buf, count); + } + + ssize_t read(int fd, void *buf, size_t count) { + if (MYFD(fd)) + return ldceph.client->read(FROM_FD(fd), (char*)buf, count); + else + return syscall(SYS_read, fd, buf, count); + } + + //int fsync(int fd); + //int fdatasync(int fd); + + + // namespace + int rmdir(const char *pathname) { + char buf[255]; + if (const char *c = ldceph.get_ceph_path(pathname, buf)) + return ldceph.client->rmdir(c); + else + return syscall(SYS_rmdir, pathname); + } + int mkdir(const char *pathname, mode_t mode) { + char buf[255]; + if (const char *c = ldceph.get_ceph_path(pathname, buf)) + return ldceph.client->mkdir(c, mode); + else + return syscall(SYS_mkdir, pathname, mode); + } + int unlink(const char *pathname) { + char buf[255]; + if (const char *c = ldceph.get_ceph_path(pathname, buf)) + return 
ldceph.client->unlink(c); + else + return syscall(SYS_unlink, pathname); + } + + int stat(const char *pathname, struct stat *st) { + //int __xstat64(int __ver, const char *pathname, struct stat64 *st64) { // stoopid GLIBC + //struct stat *st = (struct stat*)st64; + char buf[255]; + if (const char *c = ldceph.get_ceph_path(pathname, buf)) + return ldceph.client->lstat(c, st); // FIXME + else + return syscall(SYS_stat, pathname, st); + } + //int fstat(int filedes, struct stat *buf); + //int lstat(const char *file_name, struct stat *buf); + + int chdir(const char *pathname) { + char buf[255]; + if (const char *c = ldceph.get_ceph_path(pathname, buf)) { + int r = ldceph.client->chdir(c); + if (r == 0) { + if (!ldceph.cwd_in_mp) + syscall(SYS_chdir, ldceph.mount_point_parent); + ldceph.cwd_in_mp = true; + ldceph.cwd_above_mp = false; + ldceph.cwd = ldceph.mount_point; + filepath fpc = c; + ldceph.cwd.append(fpc); + } + return r; + } else { + int r = syscall(SYS_chdir, pathname); + if (r) { + ldceph.refresh_cwd(); + } + return r; + } + } + char *getcwd(char *buf, size_t size) { + strncpy(buf, ldceph.cwd.c_str(), size); + return buf; + } + //int fchdir(int fd); + + + + +} diff --git a/branches/sage/cephmds2/client/msgthread.h b/branches/sage/cephmds2/client/msgthread.h new file mode 100644 index 0000000000000..69d10be9f6a56 --- /dev/null +++ b/branches/sage/cephmds2/client/msgthread.h @@ -0,0 +1,25 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "msg/Message.h" + +// send the message, expecting no response. 
threads other than the
+// MPI thread use this function; if the MPI thread uses this function
+// it could deadlock: this function could wait for the out queue to be
+// emptied, but only the MPI thread can empty it.
+void obfsmpi_send(Message *m);
+
+// send the message to a server and wait for the response. threads
+// other than the MPI thread use this function.
+Message *obfsmpi_sendrecv(Message *m);
diff --git a/branches/sage/cephmds2/common/Clock.cc b/branches/sage/cephmds2/common/Clock.cc
new file mode 100644
index 0000000000000..c970a337826b6
--- /dev/null
+++ b/branches/sage/cephmds2/common/Clock.cc
@@ -0,0 +1,19 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "Clock.h"
+
+// public
+// the single process-wide clock instance (declared extern in Clock.h)
+Clock g_clock;
+
diff --git a/branches/sage/cephmds2/common/Clock.h b/branches/sage/cephmds2/common/Clock.h
new file mode 100644
index 0000000000000..c1789dedc2461
--- /dev/null
+++ b/branches/sage/cephmds2/common/Clock.h
@@ -0,0 +1,197 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __CLOCK_H
+#define __CLOCK_H
+
+#include <iostream>
+#include <iomanip>
+
+#include <sys/time.h>
+#include <time.h>
+#include <math.h>
+#include <string.h>   // memset()
+
+#include "Mutex.h"
+
+
+// --------
+// utime_t
+//
+// A point in time or an interval, held as seconds plus microseconds
+// in a struct timeval.
+
+class utime_t {
+ private:
+  struct timeval tv;
+
+  // raw access for Clock, which fills tv via gettimeofday()
+  struct timeval& timeval() { return tv; }
+  friend class Clock;
+
+
+ public:
+  // Bring tv_usec back into [0, 1000000), carrying into / borrowing
+  // from tv_sec.
+  void normalize() {
+    // >= rather than >: exactly 1000000 usec is one whole second too.
+    if (tv.tv_usec >= 1000*1000) {
+      tv.tv_sec += tv.tv_usec / (1000*1000);
+      tv.tv_usec %= 1000*1000;
+    }
+    // a negative amount may have been added; borrow from the seconds
+    while (tv.tv_usec < 0) {
+      tv.tv_usec += 1000*1000;
+      tv.tv_sec--;
+    }
+  }
+
+  // cons
+  utime_t() { tv.tv_sec = 0; tv.tv_usec = 0; normalize(); }
+  utime_t(time_t s, int u) { tv.tv_sec = s; tv.tv_usec = u; normalize(); }
+
+  // accessors
+  time_t sec() const { return tv.tv_sec; }
+  long usec() const { return tv.tv_usec; }
+  int nsec() const { return tv.tv_usec*1000; }
+
+  // ref accessors/modifiers
+  time_t& sec_ref() { return tv.tv_sec; }
+  long& usec_ref() { return tv.tv_usec; }
+
+  // cast to double (seconds, with the microseconds as the fraction)
+  operator double() {
+    return (double)sec() + ((double)usec() / 1000000.0L);
+  }
+};
+
+// arithmetic operators
+inline utime_t operator+(const utime_t& l, const utime_t& r) {
+  return utime_t( l.sec() + r.sec() + (l.usec()+r.usec())/1000000L,
+                  (l.usec()+r.usec())%1000000L );
+}
+inline utime_t& operator+=(utime_t& l, const utime_t& r) {
+  // the carry is computed before usec is modified
+  l.sec_ref() += r.sec() + (l.usec()+r.usec())/1000000L;
+  l.usec_ref() += r.usec();
+  l.usec_ref() %= 1000000L;
+  return l;
+}
+// Add f seconds, where f may carry a fractional part.
+inline utime_t& operator+=(utime_t& l, double f) {
+  double fs = trunc(f);
+  // fraction of a second -> microseconds: multiply by 1e6.  (Dividing
+  // always gives less than one microsecond, i.e. the fraction is lost.)
+  double us = (f - fs) * (double)1000000.0;
+  l.sec_ref() += (long)fs;
+  l.usec_ref() += (long)us;
+  l.normalize();
+  return l;
+}
+
+// NOTE(review): the body of operator- and the first lines of operator-=
+// follow the usual borrow pattern and match the surviving else-branch
+// below; confirm against the upstream file.
+inline utime_t operator-(const utime_t& l, const utime_t& r) {
+  return utime_t( l.sec() - r.sec() - (l.usec() < r.usec() ? 1:0),
+                  l.usec() - r.usec() + (l.usec() < r.usec() ? 1000000:0) );
+}
+inline utime_t& operator-=(utime_t& l, const utime_t& r) {
+  l.sec_ref() -= r.sec();
+  if (l.usec() >= r.usec())
+    l.usec_ref() -= r.usec();
+  else {
+    // borrow one second
+    l.usec_ref() += 1000000L - r.usec();
+    l.sec_ref()--;
+  }
+  return l;
+}
+
+inline bool operator>(const utime_t& a, const utime_t& b)
+{
+  return (a.sec() > b.sec()) || (a.sec() == b.sec() && a.usec() > b.usec());
+}
+inline bool operator<(const utime_t& a, const utime_t& b)
+{
+  return (a.sec() < b.sec()) || (a.sec() == b.sec() && a.usec() < b.usec());
+}
+
+// ostream
+// prints "<sec>.<usec>" with the microseconds zero-padded to 6 digits
+inline std::ostream& operator<<(std::ostream& out, const utime_t& t)
+{
+  //return out << t.sec() << "." << t.usec();
+  out << (long)t.sec() << ".";
+  out.setf(std::ios::right);
+  out.fill('0');
+  out << std::setw(6) << t.usec();
+  out.unsetf(std::ios::right);
+  return out;
+
+  //return out << (long)t.sec << "." << ios::setf(ios::right) << ios::fill('0') << t.usec() << ios::usetf();
+}
+
+
+
+
+// -- clock --
+// Hands out times relative to the moment tare() was last called (the
+// constructor calls it), and never lets now() go backwards.
+class Clock {
+ protected:
+  //utime_t start_offset;
+  //utime_t abs_last;
+  utime_t last;      // last value returned by now()
+  utime_t zero;      // wall-clock time captured by tare()
+
+  Mutex lock;        // currently unused: the Lock/Unlock calls in now() are commented out
+
+ public:
+  Clock() {
+    // set offset
+    tare();
+  }
+
+  // real time.
+  utime_t real_now() {
+    utime_t realnow = now();
+    realnow += zero;
+    //gettimeofday(&realnow.timeval(), NULL);
+    return realnow;
+  }
+
+  // relative time (from startup)
+  void tare() {
+    gettimeofday(&zero.timeval(), NULL);
+  }
+  utime_t now() {
+    //lock.Lock();
+    utime_t n;
+    gettimeofday(&n.timeval(), NULL);
+    n -= zero;
+    if (n < last) {
+      //std::cerr << "WARNING: clock jumped backwards from " << last << " to " << n << std::endl;
+      n = last; // clock jumped backwards!
+    } else
+      last = n;
+    //lock.Unlock();
+    return n;
+  }
+  // the value of the most recent now() call, without reading the system clock
+  utime_t recent_now() {
+    return last;
+  }
+
+  // convert a relative time to wall-clock time, in place
+  void realify(utime_t& t) {
+    t += zero;
+  }
+
+  // fill *ts with the wall-clock equivalent of the relative time t
+  void make_timespec(utime_t& t, struct timespec *ts) {
+    utime_t real = t;
+    realify(real);
+
+    memset(ts, 0, sizeof(*ts));
+    ts->tv_sec = real.sec();
+    ts->tv_nsec = real.nsec();
+  }
+
+
+
+  // absolute time
+  time_t gettime() {
+    return real_now().sec();
+  }
+
+};
+
+extern Clock g_clock;
+
+#endif
diff --git a/branches/sage/cephmds2/common/Cond.h b/branches/sage/cephmds2/common/Cond.h
new file mode 100644
index 0000000000000..ed465ce3762d6
--- /dev/null
+++ b/branches/sage/cephmds2/common/Cond.h
@@ -0,0 +1,118 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __COND_H
+#define __COND_H
+
+#include <time.h>
+
+#include "Mutex.h"
+#include "Clock.h"
+
+#include "include/Context.h"
+
+#include <pthread.h>
+#include <cassert>
+
+// Thin wrapper around a pthread condition variable, used together
+// with Mutex (which grants Cond access to its pthread mutex).
+class Cond {
+  // my bits
+  pthread_cond_t _c;
+
+  // don't allow copying.
+  void operator=(Cond &C) {}
+  Cond( const Cond &C ) {}
+
+ public:
+  Cond() {
+    int r = pthread_cond_init(&_c,NULL);
+    assert(r == 0);
+  }
+  virtual ~Cond() {
+    pthread_cond_destroy(&_c);
+  }
+
+  // Block until signalled.  Returns the pthread_cond_wait() result.
+  int Wait(Mutex &mutex) {
+    int r = pthread_cond_wait(&_c, &mutex._m);
+    return r;
+  }
+
+  // Same as Wait(mutex); s is only a label for the (disabled) debug print.
+  int Wait(Mutex &mutex, char* s) {
+    //cout << "Wait: " << s << endl;
+    int r = pthread_cond_wait(&_c, &mutex._m);
+    return r;
+  }
+
+  // Wait until signalled or until the g_clock-relative time 'when'.
+  // Returns the pthread_cond_timedwait() result.
+  int WaitUntil(Mutex &mutex, utime_t when) {
+    struct timespec ts;
+    g_clock.make_timespec(when, &ts);
+    //cout << "timedwait for " << ts.tv_sec << " sec " << ts.tv_nsec << " nsec" << endl;
+    int r = pthread_cond_timedwait(&_c, &mutex._m, &ts);
+    return r;
+  }
+  // Wait until signalled or until 'interval' has elapsed from now.
+  int WaitInterval(Mutex &mutex, utime_t interval) {
+    utime_t when = g_clock.now();
+    when += interval;
+    return WaitUntil(mutex, when);
+  }
+
+  // Note: Signal() wakes ALL waiters (broadcast); use SignalOne() to wake one.
+  int Signal() {
+    //int r = pthread_cond_signal(&_c);
+    int r = pthread_cond_broadcast(&_c);
+    return r;
+  }
+  int SignalOne() {
+    int r = pthread_cond_signal(&_c);
+    return r;
+  }
+  int SignalAll() {
+    //int r = pthread_cond_signal(&_c);
+    int r = pthread_cond_broadcast(&_c);
+    return r;
+  }
+};
+
+// Completion callback: stores the result, sets *done and signals the
+// Cond.  Takes no lock itself.
+class C_Cond : public Context {
+  Cond *cond;
+  bool *done;
+  int *rval;
+public:
+  C_Cond(Cond *c, bool *d, int *r=0) : cond(c), done(d), rval(r) {
+    *done = false;
+  }
+  void finish(int r) {
+    if (rval) *rval = r;
+    *done = true;
+    cond->Signal();
+  }
+};
+
+// Like C_Cond, but updates the flags and signals while holding *lock.
+class C_SafeCond : public Context {
+  Mutex *lock;
+  Cond *cond;
+  bool *done;
+  int *rval;
+public:
+  C_SafeCond(Mutex *l, Cond *c, bool *d, int *r=0) : lock(l), cond(c), done(d), rval(r) {
+    *done = false;
+  }
+  void finish(int r) {
+    lock->Lock();
+    if (rval) *rval = r;
+    *done = true;
+    cond->Signal();
+    lock->Unlock();
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/common/DecayCounter.h b/branches/sage/cephmds2/common/DecayCounter.h
new file mode 100644
index 0000000000000..b95ebea815b7c
--- /dev/null
+++ b/branches/sage/cephmds2/common/DecayCounter.h
@@ -0,0 +1,94 @@
+// -*- mode:C++;
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#ifndef __DECAYCOUNTER_H
+#define __DECAYCOUNTER_H
+
+#include <math.h>
+#include "Clock.h"
+
+#include "config.h"
+
+/*
+ * DecayCounter
+ *
+ * A value that decays exponentially with a configurable half life.
+ * Decay is applied lazily, whenever the counter is read or adjusted,
+ * using the time g_clock last reported (recent_now()).
+ */
+class DecayCounter {
+ protected:
+  double val;           // value
+
+  double half_life;     // in seconds
+  double k;             // k = ln(.5)/half_life
+
+  utime_t last_decay;   // time of last decay
+
+ public:
+  // starts at zero, with the half life taken from g_conf.mds_decay_halflife
+  DecayCounter() : val(0) {
+    set_halflife( g_conf.mds_decay_halflife );
+    reset();
+  }
+  /*
+  DecayCounter(double hl) : val(0) {
+    set_halflife(hl);
+    reset();
+  }
+  */
+
+  // bring the value up to date, then add a to it
+  void adjust(double a) {
+    decay();
+    val += a;
+  }
+
+  // subtract another counter's current value; no decay is applied here
+  void adjust_down(const DecayCounter& other) {
+    // assume other has same time stamp as us...
+    val -= other.val;
+  }
+
+  // set the half life (seconds) and the derived decay constant
+  void set_halflife(double hl) {
+    half_life = hl;
+    k = log(.5) / hl;
+  }
+
+  // become a copy of other, and leave other reset
+  void take(DecayCounter& other) {
+    *this = other;
+    other.reset();
+  }
+
+  // zero both the value and the last-decay timestamp
+  void reset() {
+    last_decay.sec_ref() = 0;
+    last_decay.usec_ref() = 0;
+    val = 0;
+  }
+
+  // Apply the decay accumulated since last_decay.  Nothing happens
+  // until at least one whole second has passed.
+  void decay() {
+    utime_t elapsed = g_clock.recent_now();
+    elapsed -= last_decay;
+    if (elapsed.sec() < 1)
+      return;
+
+    val = val * exp((double)elapsed * k);
+    if (val < .01)
+      val = 0;       // small enough to count as nothing
+    last_decay = g_clock.recent_now();
+  }
+
+  // current (decayed) value
+  double get() {
+    decay();
+    return val;
+  }
+
+  // add v to the decayed value and return the new total
+  double hit(double v = 1.0) {
+    adjust(v);
+    return val;
+  }
+
+};
+
+
+#endif
diff --git a/branches/sage/cephmds2/common/LogType.h b/branches/sage/cephmds2/common/LogType.h
new file mode 100644
index 0000000000000..3de17751ec2f8
--- /dev/null
+++ b/branches/sage/cephmds2/common/LogType.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+
* + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __LOGTYPE_H +#define __LOGTYPE_H + +#include "include/types.h" + +#include +#include +using namespace std; +#include +#include +using namespace __gnu_cxx; + +#include "Mutex.h" + + +class LogType { + protected: + hash_map<__uint64_t, int> keymap; + vector keys; + set inc_keys; + + int version; + + // HACK to avoid the hash table as often as possible... + // cache recent key name lookups in a small ring buffer + const static int cache_keys = 10; + __uint64_t kc_ptr[cache_keys]; + int kc_val[cache_keys]; + int kc_pos; + + friend class Logger; + + public: + LogType() { + version = 1; + + for (int i=0;i= 0) return i; + + i = keys.size(); + keys.push_back(key); + +#ifdef __LP64__ + __uint64_t p = (__uint64_t)key; +#else + __uint64_t p = (__uint32_t)key; +#endif + keymap[p] = i; + if (is_inc) inc_keys.insert(i); + + version++; + return i; + } + int add_inc(const char* key) { + return add_key(key, true); + } + int add_set(const char *key) { + return add_key(key, false); + } + + bool have_key(const char* key) { + return lookup_key(key) < 0; + } + + int lookup_key(const char* key) { +#ifdef __LP64__ + __uint64_t p = (__uint64_t)key; +#else + __uint64_t p = (__uint32_t)key; +#endif + + if (keymap.count(p)) + return keymap[p]; + + // try kc ringbuffer + int pos = kc_pos-1; + for (int j=0; j + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include + +#include "LogType.h" +#include "Logger.h" + +#include +#include "Clock.h" + +#include "config.h" + +#include +#include + + +// per-process lock. lame, but this way I protect LogType too! 
+Mutex logger_lock; + +Logger::Logger(string fn, LogType *type) +{ + logger_lock.Lock(); + { + filename = "log/"; + if (g_conf.log_name) { + filename += g_conf.log_name; + ::mkdir( filename.c_str(), 0755 ); // make sure dir exists + filename += "/"; + } + filename += fn; + //cout << "log " << filename << endl; + interval = g_conf.log_interval; + + start = g_clock.now(); // time 0! + last_logged = 0; + wrote_header = -1; + open = false; + this->type = type; + wrote_header_last = 0; + + version = 0; + } + logger_lock.Unlock(); + flush(false); +} + +Logger::~Logger() +{ + flush(true); + out.close(); +} + +long Logger::inc(const char *key, long v) +{ + if (!g_conf.log) return 0; + logger_lock.Lock(); + int i = type->lookup_key(key); + if (i < 0) i = type->add_inc(key); + flush(); + vals[i] += v; + long r = vals[i]; + logger_lock.Unlock(); + return r; +} + +double Logger::finc(const char *key, double v) +{ + if (!g_conf.log) return 0; + logger_lock.Lock(); + int i = type->lookup_key(key); + if (i < 0) i = type->add_inc(key); + flush(); + fvals[i] += v; + double r = fvals[i]; + logger_lock.Unlock(); + return r; +} + +long Logger::set(const char *key, long v) +{ + if (!g_conf.log) return 0; + logger_lock.Lock(); + int i = type->lookup_key(key); + if (i < 0) i = type->add_set(key); + flush(); + long r = vals[i] = v; + logger_lock.Unlock(); + return r; +} + + +double Logger::fset(const char *key, double v) +{ + if (!g_conf.log) return 0; + logger_lock.Lock(); + int i = type->lookup_key(key); + if (i < 0) i = type->add_set(key); + flush(); + double r = fvals[i] = v; + logger_lock.Unlock(); + return r; +} + +long Logger::get(const char* key) +{ + if (!g_conf.log) return 0; + logger_lock.Lock(); + int i = type->lookup_key(key); + long r = 0; + if (i >= 0 && (int)vals.size() > i) + r = vals[i]; + logger_lock.Unlock(); + return r; +} + +void Logger::flush(bool force) +{ + if (!g_conf.log) return; + logger_lock.Lock(); + + if (version != type->version) { + while 
(type->keys.size() > vals.size()) + vals.push_back(0); + while (type->keys.size() > fvals.size()) + fvals.push_back(0); + version = type->version; + } + + if (!open) { + out.open(filename.c_str(), ofstream::out); + open = true; + //cout << "opening log file " << filename << endl; + } + + utime_t fromstart = g_clock.now(); + if (fromstart < start) { + cerr << "logger time jumped backwards from " << start << " to " << fromstart << endl; + assert(0); + start = fromstart; + } + fromstart -= start; + + while (force || + ((fromstart.sec() > last_logged) && + (fromstart.sec() - last_logged >= interval))) { + last_logged += interval; + force = false; + + //cout << "logger " << this << " advancing from " << last_logged << " now " << now << endl; + + if (!open) { + out.open(filename.c_str(), ofstream::out); + open = true; + //cout << "opening log file " << filename << endl; + } + + // header? + wrote_header_last++; + if (wrote_header != type->version || + wrote_header_last > 10) { + out << "#" << type->keymap.size(); + for (unsigned i=0; ikeys.size(); i++) + out << "\t" << type->keys[i]; + out << endl; //out << "\t (" << type->keymap.size() << ")" << endl; + wrote_header = type->version; + wrote_header_last = 0; + } + + // write line to log + out << last_logged; + for (unsigned i=0; ikeys.size(); i++) { + if (fvals[i] > 0 && vals[i] == 0) + out << "\t" << fvals[i]; + else + out << "\t" << vals[i]; + } + out << endl; + + // reset the counters + for (unsigned i=0; ikeys.size(); i++) { + if (type->inc_keys.count(i)) { + this->vals[i] = 0; + this->fvals[i] = 0; + } + } + } + + logger_lock.Unlock(); +} + + + + diff --git a/branches/sage/cephmds2/common/Logger.h b/branches/sage/cephmds2/common/Logger.h new file mode 100644 index 0000000000000..85102acd90370 --- /dev/null +++ b/branches/sage/cephmds2/common/Logger.h @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 
Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __LOGGER_H
+#define __LOGGER_H
+
+#include "include/types.h"
+#include "Clock.h"
+#include "Mutex.h"
+
+#include   // NOTE(review): header names on these #include lines were lost in extraction;
+#include   // the class below needs string, vector and ofstream -- restore from upstream.
+using namespace std;
+
+#include   // NOTE(review): name lost in extraction; presumably the __gnu_cxx hash_map header
+using namespace __gnu_cxx;
+
+#include "LogType.h"
+
+
+// Logger: per-key counters (integer in vals, floating point in fvals) that
+// flush() writes to the file named by `filename`; the key set comes from LogType.
+class Logger {
+ protected:
+  //hash_map, eqstr> vals;
+  //hash_map, eqstr> fvals;
+  vector<long> vals;     // element type lost in extraction; long assumed from inc()/set()/get()
+  vector<double> fvals;  // element type lost in extraction; double assumed from fset()/finc()
+
+  //Mutex lock;
+  LogType *type;
+
+  utime_t start;          // time origin; see set_start()/get_start()
+  int last_logged;
+  int interval;
+  int wrote_header;
+  int wrote_header_last;
+
+  string filename;
+
+  int version;
+
+  ofstream out;
+  bool open;              // true once `out` has been opened
+
+ public:
+  Logger(string fn, LogType *type);
+  ~Logger();
+
+  void set_start(const utime_t& a) { start = a; }
+  utime_t& get_start() { return start; }
+
+  long inc(const char *s, long v = 1);
+  long set(const char *s, long v);
+  long get(const char *s);
+
+  double fset(const char *s, double v);
+  double finc(const char *s, double v);
+
+  void flush(bool force = false);
+};
+
+#endif
diff --git a/branches/sage/cephmds2/common/Mutex.h b/branches/sage/cephmds2/common/Mutex.h
new file mode 100755
index 0000000000000..c4615a3ff4c6e
--- /dev/null
+++ b/branches/sage/cephmds2/common/Mutex.h
@@ -0,0 +1,68 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MUTEX_H
+#define __MUTEX_H
+
+#include   // NOTE(review): header names lost in extraction; the code below
+#include   // uses pthread_mutex_* and assert -- restore from upstream.
+// Thin wrapper over pthread_mutex_t; recursive unless constructed with r=false.
+class Mutex {
+private:
+  pthread_mutex_t _m;
+  int nlock;        // current hold depth, maintained by Lock()/Unlock()
+  bool recursive;
+
+  // don't allow copying.
+  void operator=(Mutex &M) {}
+  Mutex( const Mutex &M ) {}
+
+public:
+  Mutex(bool r = true) : nlock(0), recursive(r) {
+    if (recursive) {
+      pthread_mutexattr_t attr;
+      pthread_mutexattr_init(&attr);
+      pthread_mutexattr_settype(&attr,PTHREAD_MUTEX_RECURSIVE);
+      pthread_mutex_init(&_m,&attr);
+      pthread_mutexattr_destroy(&attr);
+    } else {
+      pthread_mutex_init(&_m,NULL);
+    }
+  }
+  virtual ~Mutex() {
+    assert(nlock == 0);   // must not be destroyed while held
+    pthread_mutex_destroy(&_m);
+  }
+
+  bool is_locked() {
+    return (nlock > 0);
+  }
+
+  void Lock() {
+    int r = pthread_mutex_lock(&_m);
+    assert(r == 0);
+    nlock++;
+    assert(nlock == 1 || recursive);   // depth > 1 is only legal for a recursive mutex
+  }
+
+  void Unlock() {
+    assert(nlock > 0);
+    --nlock;                           // updated while the mutex is still held
+    int r = pthread_mutex_unlock(&_m);
+    assert(r == 0);
+  }
+
+  friend class Cond;
+};
+
+#endif
diff --git a/branches/sage/cephmds2/common/Semaphore.h b/branches/sage/cephmds2/common/Semaphore.h
new file mode 100644
index 0000000000000..7526f5c1ec9c8
--- /dev/null
+++ b/branches/sage/cephmds2/common/Semaphore.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef _Sem_Posix_
+#define _Sem_Posix_
+
+#include   // NOTE(review): header name lost in extraction; Mutex and Cond are used below
+
+class Semaphore
+{
+  Mutex m;
+  Cond c;
+  int count;   // number of Put()s not yet consumed by a Get()
+
+  public:
+  // starts with no tokens available
+  Semaphore()
+  {
+    count = 0;
+  }
+  // add one token and signal the condition
+  void Put()
+  {
+    m.Lock();
+    count++;
+    c.Signal();
+    m.Unlock();
+  }
+  // wait until a token is available, then consume it
+  void Get()
+  {
+    m.Lock();
+    while(count <= 0) {
+      c.Wait(m);
+    }
+    count--;
+    m.Unlock();
+  }
+};
+
+#endif // !_Sem_Posix_
diff --git a/branches/sage/cephmds2/common/Thread.h b/branches/sage/cephmds2/common/Thread.h
new file mode 100644
index 0000000000000..43e2942e84c5f
--- /dev/null
+++ b/branches/sage/cephmds2/common/Thread.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __THREAD_H
+#define __THREAD_H
+
+#include   // NOTE(review): header name lost in extraction; pthread_*, cout and assert are used below
+// Base class for a pthread whose body is the subclass's entry().
+class Thread {
+ private:
+  pthread_t thread_id;   // 0 means "not started" (see is_started())
+
+ public:
+  Thread() : thread_id(0) {}
+  virtual ~Thread() {}
+
+  pthread_t &get_thread_id() { return thread_id; }
+  bool is_started() { return thread_id != 0; }
+
+  virtual void *entry() = 0;
+
+ private:
+  static void *_entry_func(void *arg) {   // trampoline handed to pthread_create
+    return ((Thread*)arg)->entry();
+  }
+
+ public:
+  int create() {
+    return pthread_create( &thread_id, NULL, _entry_func, (void*)this );
+  }
+
+  bool am_self() {
+    return (pthread_self() == thread_id);
+  }
+  // join the thread; returns -1 if it was never started, asserts if pthread_join fails
+  int join(void **prval = 0) {
+    if (thread_id == 0) return -1; // never started.
+    int status = pthread_join(thread_id, prval);
+    if (status == 0)
+      thread_id = 0;   // joined: mark as not started again
+    else {
+      cout << "join status = " << status << endl;
+      assert(0);
+    }
+    return status;
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/common/ThreadPool.h b/branches/sage/cephmds2/common/ThreadPool.h
new file mode 100644
index 0000000000000..674053bfe1087
--- /dev/null
+++ b/branches/sage/cephmds2/common/ThreadPool.h
@@ -0,0 +1,138 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef THREADPOOL
+#define THREADPOOL
+
+#include   // NOTE(review): header name lost in extraction (list/vector/string are used below)
+using namespace std;
+
+
+#include   // NOTE(review): these four header names were lost in extraction;
+#include   // the class needs pthread_*, Mutex, Semaphore and assert.
+#include
+#include
+
+
+// debug output
+#include "config.h"
+#define tpdout(x) if (x <= g_conf.debug) cout << myname
+#define DBLVL 15
+
+
+using namespace std;
+
+#define MAX_THREADS 1000
+// NOTE(review): template parameter list lost in extraction; <U, T> order is assumed -- confirm against callers.
+template <class U, class T>
+class ThreadPool {
+
+ private:
+  list<T> q;               // pending ops, guarded by q_lock
+  Mutex q_lock;
+  Semaphore q_sem;         // one token per queued op, plus one per worker at shutdown
+
+  int num_ops;
+  int num_threads;
+  vector<pthread_t> thread;
+
+  U u;
+  void (*func)(U,T);       // called for each op outside q_lock
+  void (*prefunc)(U,T);    // optional; called under q_lock when the op is dequeued
+  string myname;
+
+  static void *foo(void *arg)
+  {
+    ThreadPool *t = (ThreadPool *)arg;
+    t->do_ops(arg);
+    return 0;
+  }
+
+  void *do_ops(void *nothing)
+  {
+    tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " starting" << endl;
+    while (1) {
+      q_sem.Get();
+      q_lock.Lock(); bool drained = q.empty(); q_lock.Unlock();  // q is only touched under q_lock
+      if (drained) break;   // token without an op: the destructor is shutting us down
+      T op = get_op();
+      tpdout(DBLVL) << ".func thread "<< pthread_self() << " on " << op << endl;
+      func(u, op);
+    }
+    tpdout(DBLVL) << ".do_ops thread " << pthread_self() << " exiting" << endl;
+    return 0;
+  }
+
+  // pop the front op under q_lock, running prefunc on it first if one was given
+  T get_op()
+  {
+    T op;
+    q_lock.Lock();
+    {
+      op = q.front();
+      q.pop_front();
+      num_ops--;
+
+      if (prefunc && op) {
+        tpdout(DBLVL) << ".prefunc thread "<< pthread_self() << " on " << op << endl;
+        prefunc(u, op);
+      }
+    }
+    q_lock.Unlock();
+
+    return op;
+  }
+
+ public:
+  // starts `howmany` worker threads immediately
+  ThreadPool(char *myname, int howmany, void (*f)(U,T), U obj, void (*pf)(U,T) = 0) :
+    num_ops(0), num_threads(howmany),
+    thread(num_threads),
+    u(obj),
+    func(f), prefunc(pf),
+    myname(myname) {
+    tpdout(DBLVL) << ".cons num_threads " << num_threads << endl;
+
+    // start threads
+    int status;
+    for(int i = 0; i < howmany; i++) {
+      status = pthread_create(&thread[i], NULL, (void*(*)(void *))&ThreadPool::foo, this);
+      assert(status == 0);
+    }
+  }
+
+  ~ThreadPool() {
+    // bump sem to make threads exit cleanly
+    for(int i = 0; i < num_threads; i++)
+      q_sem.Put();
+
+    // wait for them to die
+    for(int i = 0; i < num_threads; i++) {
+      tpdout(DBLVL) << ".des joining thread " << thread[i] << endl;
+      void *rval = 0; // we don't actually care
+      pthread_join(thread[i], &rval);
+    }
+  }
+
+  void put_op(T op) {
+    tpdout(DBLVL) << ".put_op " << op << endl;
+    q_lock.Lock();
+    q.push_back(op);
+    num_ops++;
+    q_sem.Put();
+    q_lock.Unlock();
+  }
+
+};
+#endif
diff --git a/branches/sage/cephmds2/common/Timer.cc b/branches/sage/cephmds2/common/Timer.cc
new file mode 100644
index 0000000000000..d70259c3e0a08
--- /dev/null
+++ b/branches/sage/cephmds2/common/Timer.cc
@@ -0,0 +1,220 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+
+#include "Timer.h"
+#include "Cond.h"
+
+#include "config.h"
+#include "include/Context.h"
+
+#undef dout
+#define dout(x) if (x <= g_conf.debug) cout << "Timer: "
+
+#define DBL 10
+
+#include   // NOTE(review): these three header names were lost in extraction -- restore from upstream
+#include
+#include
+
+// single global instance
+Timer g_timer;
+
+
+/**** thread solution *****/
+// Timer thread body: run due callbacks with the lock dropped, otherwise sleep until kicked or until the next event.
+void Timer::timer_entry()
+{
+  lock.Lock();
+
+  while (!thread_stop) {
+
+    // now
+    utime_t now = g_clock.now();
+
+    // any events due?
+    utime_t next;
+    Context *event = get_next_scheduled(next);
+
+    list<Context*> pending;
+
+    if (event && now >= next) {
+      // move to pending list
+      map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+      while (it != scheduled.end()) {
+        if (it->first > now) break;
+
+        utime_t t = it->first;
+        dout(DBL) << "queueing event(s) scheduled at " << t << endl;
+
+        for (multiset<Context*>::iterator cit = it->second.begin();
+             cit != it->second.end();
+             cit++) {
+          pending.push_back(*cit);
+          event_times.erase(*cit);
+          num_event--;
+        }
+
+        map< utime_t, multiset<Context*> >::iterator previt = it;
+        it++;                       // advance before erasing the entry we just drained
+        scheduled.erase(previt);
+      }
+
+      if (!pending.empty()) {
+        sleeping = false;
+        lock.Unlock();
+        { // make sure we're not holding any locks while we do callbacks
+          // make the callbacks myself.
+          for (list<Context*>::iterator cit = pending.begin();
+               cit != pending.end();
+               cit++) {
+            dout(DBL) << "doing callback " << *cit << endl;
+            (*cit)->finish(0);
+          }
+          pending.clear();
+          assert(pending.empty());
+        }
+        lock.Lock();
+      }
+
+    }
+
+    else {
+      // sleep
+      if (event) {
+        dout(DBL) << "sleeping until " << next << endl;
+        timed_sleep = true;
+        sleeping = true;
+        timeout_cond.WaitUntil(lock, next); // wait for waker or time
+        utime_t now = g_clock.now();
+        dout(DBL) << "kicked or timed out at " << now << endl;
+      } else {
+        dout(DBL) << "sleeping" << endl;
+        timed_sleep = false;
+        sleeping = true;
+        sleep_cond.Wait(lock); // wait for waker
+        utime_t now = g_clock.now();
+        dout(DBL) << "kicked at " << now << endl;
+      }
+    }
+  }
+
+  lock.Unlock();
+}
+
+
+
+/**
+ * Timer bits
+ */
+// Wake the timer thread if it is sleeping; create it if it has not been started.
+void Timer::register_timer()
+{
+  if (timer_thread.is_started()) {
+    if (sleeping) {
+      dout(DBL) << "register_timer kicking thread" << endl;
+      if (timed_sleep)
+        timeout_cond.SignalAll();
+      else
+        sleep_cond.SignalAll();
+    } else {
+      dout(DBL) << "register_timer doing nothing; thread is alive but not sleeping" << endl;
+      // it's probably doing callbacks.
+    }
+  } else {
+    dout(DBL) << "register_timer starting thread" << endl;
+    timer_thread.create();
+  }
+}
+// Set thread_stop, wake the timer thread, and join it (no-op if never started).
+void Timer::cancel_timer()
+{
+  // only a started thread needs stopping
+  if (timer_thread.is_started()) {
+    dout(10) << "setting thread_stop flag" << endl;
+    lock.Lock();
+    thread_stop = true;
+    if (timed_sleep)
+      timeout_cond.SignalAll();
+    else
+      sleep_cond.SignalAll();
+    lock.Unlock();
+
+    dout(10) << "waiting for thread to finish" << endl;
+    void *ptr;
+    timer_thread.join(&ptr);
+
+    dout(10) << "thread finished, exit code " << ptr << endl;
+  }
+}
+
+
+/*
+ * schedule
+ */
+
+// Schedule `callback` to run `seconds` from now.
+void Timer::add_event_after(float seconds,
+                            Context *callback)
+{
+  utime_t when = g_clock.now();
+  when.sec_ref() += (int)seconds;   // NOTE(review): the int cast drops any fractional part of `seconds`
+  add_event_at(when, callback);
+}
+// Schedule `callback` at absolute time `when`; a callback may only be scheduled once at a time.
+void Timer::add_event_at(utime_t when,
+                         Context *callback)
+{
+  // insert
+  dout(DBL) << "add_event " << callback << " at " << when << endl;
+
+  lock.Lock();
+  scheduled[ when ].insert(callback);
+  assert(event_times.count(callback) == 0); // err.. there can be only one (for now!)
+  event_times[callback] = when;
+
+  num_event++;
+
+  // make sure i wake up
+  register_timer();
+
+  lock.Unlock();
+}
+// Remove a scheduled callback; asserts (and returns false) if it was not scheduled.
+bool Timer::cancel_event(Context *callback)
+{
+  lock.Lock();
+
+  dout(DBL) << "cancel_event " << callback << endl;
+
+  if (!event_times.count(callback)) {
+    dout(DBL) << "cancel_event " << callback << " wasn't scheduled?" << endl;
+    lock.Unlock();
+    assert(0);
+    return false; // wasn't scheduled.
+  }
+
+  utime_t tp = event_times[callback];
+  assert(scheduled.count(tp));
+
+  multiset<Context*>::iterator p = scheduled[tp].find(callback); // there may be more than one?
+  assert(p != scheduled[tp].end());
+  scheduled[tp].erase(p);
+  if (scheduled[tp].empty()) scheduled.erase(tp);  // get_next_scheduled() dereferences begin() of the first slot
+  event_times.erase(callback);
+  num_event--;                                     // mirror the increment done in add_event_at()
+  lock.Unlock();
+  return true;
+}
diff --git a/branches/sage/cephmds2/common/Timer.h b/branches/sage/cephmds2/common/Timer.h
new file mode 100644
index 0000000000000..bd63d7173a3d3
--- /dev/null
+++ b/branches/sage/cephmds2/common/Timer.h
@@ -0,0 +1,143 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __TIMER_H
+#define __TIMER_H
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "Clock.h"
+
+#include "Mutex.h"
+#include "Cond.h"
+#include "Thread.h"
+
+#include   // NOTE(review): header names lost in extraction; map and multiset are used below
+#include
+using namespace std;
+
+#include   // NOTE(review): name lost in extraction; presumably the __gnu_cxx hash_map header
+using namespace __gnu_cxx;
+
+
+/*** Timer
+ *  schedule callbacks
+ */
+
+//class Messenger;
+
+// hash a Context* by its address so it can key a hash_map
+namespace __gnu_cxx {
+  template<> struct hash<Context*> {
+    size_t operator()(const Context *p) const {
+      static hash<unsigned long> H;
+      return H((unsigned long)p);
+    }
+  };
+}
+
+
+class Timer {
+ private:
+  map< utime_t, multiset<Context*> > scheduled; // time -> (context ...)
+  hash_map< Context*, utime_t > event_times; // event -> time
+
+  // get time of the next event
+  Context* get_next_scheduled(utime_t& when) {
+    if (scheduled.empty()) return 0;
+    map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+    when = it->first;
+    multiset<Context*>::iterator sit = it->second.begin();   // assumes the slot is non-empty
+    return *sit;
+  }
+
+  void register_timer(); // make sure i get a callback
+  void cancel_timer(); // stop and join the timer thread
+
+  //pthread_t thread_id;
+  bool thread_stop;
+  Mutex lock;
+  bool timed_sleep;     // which condition the thread is waiting on: timeout_cond if true, else sleep_cond
+  bool sleeping;
+  Cond sleep_cond;
+  Cond timeout_cond;
+
+ public:
+  void timer_entry(); // waiter thread (that wakes us up)
+
+  class TimerThread : public Thread {
+    Timer *t;
+  public:
+    void *entry() {
+      t->timer_entry();
+      return 0;
+    }
+    TimerThread(Timer *_t) : t(_t) {}
+  } timer_thread;
+
+
+  int num_event;        // events currently scheduled
+
+
+ public:
+  Timer() :
+    thread_stop(false),
+    timed_sleep(false),
+    sleeping(false),
+    timer_thread(this),
+    num_event(0)
+  {
+  }
+  ~Timer() {
+    // stop.
+    cancel_timer();
+
+    // scheduled
+    for (map< utime_t, multiset<Context*> >::iterator it = scheduled.begin();
+         it != scheduled.end();
+         it++) {
+      for (multiset<Context*>::iterator sit = it->second.begin();
+           sit != it->second.end();
+           sit++)
+        delete *sit;   // callbacks that never fired are deleted here
+    }
+    scheduled.clear();
+  }
+
+  void init() {
+    register_timer();
+  }
+  void shutdown() {
+    cancel_timer();
+  }
+
+  // schedule events
+  void add_event_after(float seconds,
+                       Context *callback);
+  void add_event_at(utime_t when,
+                    Context *callback);
+  bool cancel_event(Context *callback);
+
+  // execute pending events
+  void execute_pending();
+
+};
+
+
+// single global instance
+extern Timer g_timer;
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/config.cc b/branches/sage/cephmds2/config.cc
new file mode 100644
index 0000000000000..fe7261f703cf0
--- /dev/null
+++ b/branches/sage/cephmds2/config.cc
@@ -0,0 +1,718 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright
(C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "config.h" +#include "include/types.h" + +//#define MDS_CACHE_SIZE 4*10000 -> <20mb +//#define MDS_CACHE_SIZE 80000 62mb + +#define AVG_PER_INODE_SIZE 450 +#define MDS_CACHE_MB_TO_INODES(x) ((x)*1000000/AVG_PER_INODE_SIZE) + +//#define MDS_CACHE_SIZE MDS_CACHE_MB_TO_INODES( 50 ) +//#define MDS_CACHE_SIZE 1500000 +#define MDS_CACHE_SIZE 150000 + + +// hack hack hack ugly FIXME +#include "common/Mutex.h" +long buffer_total_alloc = 0; +Mutex bufferlock; + + + +FileLayout g_OSD_FileLayout( 1<<20, 1, 1<<20, 2 ); // stripe over 1M objects, 2x replication +//FileLayout g_OSD_FileLayout( 1<<17, 4, 1<<20 ); // 128k stripes over sets of 4 + +// ?? +//FileLayout g_OSD_MDDirLayout( 1<<8, 1<<2, 1<<19, 3 ); // this is stupid, but can bring out an ebofs table bug? +FileLayout g_OSD_MDDirLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects, 2x replication + +// stripe mds log over 128 byte bits (see mds_log_pad_entry below to match!) 
+FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20, 2 ); // 1M objects +//FileLayout g_OSD_MDLogLayout( 1<<8, 1<<2, 1<<19, 3 ); // 256 byte bits +//FileLayout g_OSD_MDLogLayout( 1<<7, 32, 1<<20, 3 ); // 128 byte stripes over 32 1M objects +//FileLayout g_OSD_MDLogLayout( 57, 32, 1<<20 ); // pathological case to test striping buffer mapping +//FileLayout g_OSD_MDLogLayout( 1<<20, 1, 1<<20 ); // old way + +// fake osd failures: osd -> time +std::map g_fake_osd_down; +std::map g_fake_osd_out; + +md_config_t g_debug_after_conf; + +md_config_t g_conf = { + num_mon: 1, + num_mds: 1, + num_osd: 4, + num_client: 1, + + mkfs: false, + + // profiling and debugging + log: true, + log_interval: 1, + log_name: (char*)0, + + log_messages: true, + log_pins: true, + + fake_clock: false, + fakemessenger_serialize: true, + + fake_osdmap_expand: 0, + fake_osdmap_updates: 0, + fake_osd_mttf: 0, + fake_osd_mttr: 0, + + osd_remount_at: 0, + + kill_after: 0, + + tick: 0, + + debug: 0, + debug_mds: 1, + debug_mds_balancer: 1, + debug_mds_log: 1, + debug_buffer: 0, + debug_filer: 0, + debug_objecter: 0, + debug_objectcacher: 0, + debug_client: 0, + debug_osd: 0, + debug_ebofs: 1, + debug_bdev: 1, // block device + debug_ns: 0, + debug_ms: 0, + debug_mon: 0, + + debug_after: 0, + + // --- clock --- + clock_lock: false, + + // --- messenger --- + ms_single_dispatch: false, + ms_requeue_on_sender_fail: false, + + ms_stripe_osds: false, + ms_skip_rank0: false, + ms_overlay_clients: false, + + ms_die_on_failure: false, + + /*tcp_skip_rank0: false, + tcp_overlay_clients: false, // over osds! 
+ tcp_log: false, + tcp_serial_marshall: true, + tcp_serial_out: false, + tcp_multi_out: true, + tcp_multi_dispatch: false, // not fully implemented yet + */ + + // --- mon --- + mon_tick_interval: 5, + mon_osd_down_out_interval: 5, // seconds + mon_lease: 2.000, // seconds + + // --- client --- + client_cache_size: 300, + client_cache_mid: .5, + client_cache_stat_ttl: 0, // seconds until cached stat results become invalid + client_cache_readdir_ttl: 1, // 1 second only + client_use_random_mds: false, + + client_sync_writes: 0, + + client_oc: true, + client_oc_size: 1024*1024* 5, // MB * n + client_oc_max_dirty: 1024*1024* 5, // MB * n + client_oc_max_sync_write: 128*1024, // writes >= this use wrlock + + client_trace: 0, + fuse_direct_io: 0, + + // --- objecter --- + objecter_buffer_uncommitted: true, + + // --- journaler --- + journaler_allow_split_entries: false, + + // --- mds --- + mds_cache_size: MDS_CACHE_SIZE, + mds_cache_mid: .7, + + mds_decay_halflife: 30, + + mds_log: true, + mds_log_max_len: MDS_CACHE_SIZE / 3, + mds_log_max_trimming: 10000, + mds_log_read_inc: 1<<20, + mds_log_pad_entry: 128,//256,//64, + mds_log_before_reply: true, + mds_log_flush_on_shutdown: true, + + mds_bal_replicate_threshold: 2000, + mds_bal_unreplicate_threshold: 0,//500, + mds_bal_hash_rd: 10000, + mds_bal_unhash_rd: 1000, + mds_bal_hash_wr: 10000, + mds_bal_unhash_wr: 1000, + mds_bal_interval: 30, // seconds + mds_bal_hash_interval: 5, // seconds + mds_bal_idle_threshold: .1, + mds_bal_max: -1, + mds_bal_max_until: -1, + + mds_bal_mode: 0, + mds_bal_min_start: .2, // if we need less than this, we don't do anything + mds_bal_need_min: .8, // take within this range of what we need + mds_bal_need_max: 1.2, + mds_bal_midchunk: .3, // any sub bigger than this taken in full + mds_bal_minchunk: .001, // never take anything smaller than this + + mds_commit_on_shutdown: true, + mds_shutdown_check: 0, //30, + + mds_verify_export_dirauth: true, + + mds_local_osd: false, + + + // --- osd 
--- + osd_rep: OSD_REP_PRIMARY, + osd_balance_reads: false, + osd_pg_bits: 0, // 0 == let osdmonitor decide + osd_object_layout: OBJECT_LAYOUT_HASHINO, + osd_pg_layout: PG_LAYOUT_CRUSH, + osd_max_rep: 4, + osd_maxthreads: 2, // 0 == no threading + osd_max_opq: 10, + osd_mkfs: false, + osd_age: .8, + osd_age_time: 0, + osd_heartbeat_interval: 5, // shut up while i'm debugging + osd_replay_window: 5, + osd_max_pull: 2, + osd_pad_pg_log: false, + + // --- fakestore --- + fakestore_fake_sync: 2, // 2 seconds + fakestore_fsync: false,//true, + fakestore_writesync: false, + fakestore_syncthreads: 4, + fakestore_fakeattr: true, + fakestore_dev: 0, + + // --- ebofs --- + ebofs: 1, + ebofs_cloneable: false, + ebofs_verify: false, + ebofs_commit_ms: 2000, // 0 = no forced commit timeout (for debugging/tracing) + ebofs_idle_commit_ms: 100, // 0 = no idle detection. use this -or- bdev_idle_kick_after_ms + ebofs_oc_size: 10000, // onode cache + ebofs_cc_size: 10000, // cnode cache + ebofs_bc_size: (80 *256), // 4k blocks, *256 for MB + ebofs_bc_max_dirty: (60 *256), // before write() will block + ebofs_max_prefetch: 1000, // 4k blocks + ebofs_realloc: true, + + ebofs_abp_zero: false, // zero newly allocated buffers (may shut up valgrind) + ebofs_abp_max_alloc: 4096*16, // max size of new buffers (larger -> more memory fragmentation) + + // --- obfs --- + uofs: 0, + uofs_fake_sync: 2, // 2 seconds + uofs_cache_size: 1 << 28, //256MB + uofs_onode_size: (int)1024, + uofs_small_block_size: (int)4096, //4KB + uofs_large_block_size: (int)524288, //512KB + uofs_segment_size: (int)268435456, //256MB + uofs_block_meta_ratio: (int)10, + uofs_sync_write: (int)0, + uofs_nr_hash_buckets: (int)1023, + uofs_flush_interval: (int)5, //seconds + uofs_min_flush_pages: (int)1024, //4096 4k-pages + uofs_delay_allocation: (int)1, //true + + // --- block device --- + bdev_lock: true, + bdev_iothreads: 1, // number of ios to queue with kernel + bdev_idle_kick_after_ms: 0,//100, // ms ** FIXME ** this 
seems to break things, not sure why yet ** + bdev_el_fw_max_ms: 10000, // restart elevator at least once every 1000 ms + bdev_el_bw_max_ms: 3000, // restart elevator at least once every 300 ms + bdev_el_bidir: true, // bidirectional elevator? + bdev_iov_max: 512, // max # iov's to collect into a single readv()/writev() call + bdev_debug_check_io_overlap: true, // [DEBUG] check for any pending io overlaps + bdev_fake_mb: 0, + bdev_fake_max_mb: 0, + + // --- fakeclient (mds regression testing) (ancient history) --- + num_fakeclient: 100, + fakeclient_requests: 100, + fakeclient_deterministic: false, + + fakeclient_op_statfs: false, + + // loosely based on Roselli workload paper numbers + fakeclient_op_stat: 610, + fakeclient_op_lstat: false, + fakeclient_op_utime: 0, + fakeclient_op_chmod: 1, + fakeclient_op_chown: 1, + + fakeclient_op_readdir: 2, + fakeclient_op_mknod: 30, + fakeclient_op_link: false, + fakeclient_op_unlink: 20, + fakeclient_op_rename: 0,//40, + + fakeclient_op_mkdir: 10, + fakeclient_op_rmdir: 20, + fakeclient_op_symlink: 20, + + fakeclient_op_openrd: 200, + fakeclient_op_openwr: 0, + fakeclient_op_openwrc: 0, + fakeclient_op_read: false, // osd! + fakeclient_op_write: false, // osd! 
+ fakeclient_op_truncate: false, + fakeclient_op_fsync: false, + fakeclient_op_close: 200 +}; + + +#include +#include + + +void env_to_vec(std::vector& args) +{ + const char *p = getenv("CEPH_ARGS"); + if (!p) return; + + static char buf[1000]; + int len = strlen(p); + memcpy(buf, p, len); + buf[len] = 0; + //cout << "CEPH_ARGS " << buf << endl; + + int l = 0; + for (int i=0; i& args) +{ + for (int i=1; i& args, + int& argc, char **&argv) +{ + argv = (char**)malloc(sizeof(char*) * argc); + argc = 1; + argv[0] = "asdf"; + + for (unsigned i=0; i& args) +{ + std::vector nargs; + + for (unsigned i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __CONFIG_H +#define __CONFIG_H + +extern class FileLayout g_OSD_FileLayout; +extern class FileLayout g_OSD_MDDirLayout; +extern class FileLayout g_OSD_MDLogLayout; + +#include +#include + +extern std::map g_fake_osd_down; +extern std::map g_fake_osd_out; + +#define OSD_REP_PRIMARY 0 +#define OSD_REP_SPLAY 1 +#define OSD_REP_CHAIN 2 + +struct md_config_t { + int num_mon; + int num_mds; + int num_osd; + int num_client; + + bool mkfs; + + // profiling + bool log; + int log_interval; + char *log_name; + + bool log_messages; + bool log_pins; + + bool fake_clock; + bool fakemessenger_serialize; + + int fake_osdmap_expand; + int fake_osdmap_updates; + int fake_osd_mttf; + int fake_osd_mttr; + + int osd_remount_at; + + int kill_after; + + int tick; + + int debug; + int debug_mds; + int debug_mds_balancer; + int debug_mds_log; + int debug_buffer; + int debug_filer; + int debug_objecter; + int debug_objectcacher; + int debug_client; + int debug_osd; + int debug_ebofs; + int debug_bdev; + int debug_ns; + int debug_ms; + int debug_mon; + + int debug_after; + + // clock + bool clock_lock; + + // messenger + + /*bool tcp_skip_rank0; + bool 
tcp_overlay_clients; + bool tcp_log; + bool tcp_serial_marshall; + bool tcp_serial_out; + bool tcp_multi_out; + bool tcp_multi_dispatch; + */ + + bool ms_single_dispatch; + bool ms_requeue_on_sender_fail; + + bool ms_stripe_osds; + bool ms_skip_rank0; + bool ms_overlay_clients; + bool ms_die_on_failure; + + // mon + int mon_tick_interval; + int mon_osd_down_out_interval; + float mon_lease; + + // client + int client_cache_size; + float client_cache_mid; + int client_cache_stat_ttl; + int client_cache_readdir_ttl; + bool client_use_random_mds; // debug flag + + bool client_sync_writes; + + bool client_oc; + int client_oc_size; + int client_oc_max_dirty; + size_t client_oc_max_sync_write; + + + + /* + bool client_bcache; + int client_bcache_alloc_minsize; + int client_bcache_alloc_maxsize; + int client_bcache_ttl; + off_t client_bcache_size; + int client_bcache_lowater; + int client_bcache_hiwater; + size_t client_bcache_align; + */ + + int client_trace; + int fuse_direct_io; + + // objecter + bool objecter_buffer_uncommitted; + + // journaler + bool journaler_allow_split_entries; + + // mds + int mds_cache_size; + float mds_cache_mid; + + float mds_decay_halflife; + + bool mds_log; + int mds_log_max_len; + int mds_log_max_trimming; + int mds_log_read_inc; + int mds_log_pad_entry; + bool mds_log_before_reply; + bool mds_log_flush_on_shutdown; + + float mds_bal_replicate_threshold; + float mds_bal_unreplicate_threshold; + float mds_bal_hash_rd; + float mds_bal_unhash_rd; + float mds_bal_hash_wr; + float mds_bal_unhash_wr; + int mds_bal_interval; + int mds_bal_hash_interval; + float mds_bal_idle_threshold; + int mds_bal_max; + int mds_bal_max_until; + + int mds_bal_mode; + float mds_bal_min_start; + float mds_bal_need_min; + float mds_bal_need_max; + float mds_bal_midchunk; + float mds_bal_minchunk; + + bool mds_commit_on_shutdown; + int mds_shutdown_check; + bool mds_verify_export_dirauth; // debug flag + + bool mds_local_osd; + + + // osd + int osd_rep; + bool 
osd_balance_reads; + int osd_pg_bits; + int osd_object_layout; + int osd_pg_layout; + int osd_max_rep; + int osd_maxthreads; + int osd_max_opq; + bool osd_mkfs; + float osd_age; + int osd_age_time; + int osd_heartbeat_interval; + int osd_replay_window; + int osd_max_pull; + bool osd_pad_pg_log; + + int fakestore_fake_sync; + bool fakestore_fsync; + bool fakestore_writesync; + int fakestore_syncthreads; // such crap + bool fakestore_fakeattr; + char *fakestore_dev; + + // ebofs + int ebofs; + bool ebofs_cloneable; + bool ebofs_verify; + int ebofs_commit_ms; + int ebofs_idle_commit_ms; + int ebofs_oc_size; + int ebofs_cc_size; + off_t ebofs_bc_size; + off_t ebofs_bc_max_dirty; + unsigned ebofs_max_prefetch; + bool ebofs_realloc; + + bool ebofs_abp_zero; + size_t ebofs_abp_max_alloc; + + int uofs; + int uofs_fake_sync; + int uofs_cache_size; + int uofs_onode_size; + int uofs_small_block_size; + int uofs_large_block_size; + int uofs_segment_size; + int uofs_block_meta_ratio; + int uofs_sync_write; + + int uofs_nr_hash_buckets; + int uofs_flush_interval; + int uofs_min_flush_pages; + int uofs_delay_allocation; + + // block device + bool bdev_lock; + int bdev_iothreads; + int bdev_idle_kick_after_ms; + int bdev_el_fw_max_ms; + int bdev_el_bw_max_ms; + bool bdev_el_bidir; + int bdev_iov_max; + bool bdev_debug_check_io_overlap; + int bdev_fake_mb; + int bdev_fake_max_mb; + + // fake client + int num_fakeclient; + unsigned fakeclient_requests; + bool fakeclient_deterministic; // debug flag + + int fakeclient_op_statfs; + + int fakeclient_op_stat; + int fakeclient_op_lstat; + int fakeclient_op_utime; + int fakeclient_op_chmod; + int fakeclient_op_chown; + + int fakeclient_op_readdir; + int fakeclient_op_mknod; + int fakeclient_op_link; + int fakeclient_op_unlink; + int fakeclient_op_rename; + + int fakeclient_op_mkdir; + int fakeclient_op_rmdir; + int fakeclient_op_symlink; + + int fakeclient_op_openrd; + int fakeclient_op_openwr; + int fakeclient_op_openwrc; + int 
fakeclient_op_read; + int fakeclient_op_write; + int fakeclient_op_truncate; + int fakeclient_op_fsync; + int fakeclient_op_close; + +}; + +extern md_config_t g_conf; +extern md_config_t g_debug_after_conf; + +#define dout(x) if ((x) <= g_conf.debug) std::cout +#define dout2(x) if ((x) <= g_conf.debug) std::cout + +void env_to_vec(std::vector& args); +void argv_to_vec(int argc, char **argv, + std::vector& args); +void vec_to_argv(std::vector& args, + int& argc, char **&argv); + +void parse_config_options(std::vector& args); + +#endif diff --git a/branches/sage/cephmds2/cosd.cc b/branches/sage/cephmds2/cosd.cc new file mode 100644 index 0000000000000..cb60ed492515b --- /dev/null +++ b/branches/sage/cephmds2/cosd.cc @@ -0,0 +1,118 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#include   // NOTE(review): the six header names below were lost in extraction;
+#include   // the code uses open/fstat/read/close, cerr/cout, memcpy, exit and assert.
+#include
+
+#include
+#include
+#include
+using namespace std;
+
+#include "config.h"
+
+#include "mon/MonMap.h"
+
+#include "osd/OSD.h"
+#include "ebofs/Ebofs.h"
+
+#include "msg/NewMessenger.h"
+
+#include "common/Timer.h"
+
+// Timer callback for g_conf.kill_after: print "die" and exit with status 1.
+class C_Die : public Context {
+public:
+  void finish(int) {
+    cerr << "die" << endl;
+    exit(1);
+  }
+};
+// Timer callback for g_conf.debug_after: overwrite the debug..debug_after fields of g_conf from g_debug_after_conf.
+class C_Debug : public Context {
+ public:
+  void finish(int) {
+    int size = (char*)&g_conf.debug_after - (char*)&g_conf.debug;   // byte span; memcpy takes bytes, not int counts
+    memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size);
+    dout(0) << "debug_after flipping debug settings" << endl;
+  }
+};
+
+// cosd entry point: read the OSD id from the superblock on the given device, load .ceph_monmap, start the OSD.
+int main(int argc, char **argv)
+{
+  vector<char*> args;
+  argv_to_vec(argc, argv, args);
+
+  parse_config_options(args);
+
+  if (g_conf.kill_after)
+    g_timer.add_event_after(g_conf.kill_after, new C_Die);
+  if (g_conf.debug_after)
+    g_timer.add_event_after(g_conf.debug_after, new C_Debug);
+
+
+  assert(args.size() == 1);   // exactly one argument must remain: the device
+  char *dev = args[0];
+  cerr << "dev " << dev << endl;
+
+  // who am i?  peek at superblock!
+  OSDSuperblock sb;
+  ObjectStore *store = new Ebofs(dev);
+  bufferlist bl;
+  store->mount();
+  int r = store->read(object_t(0,0), 0, sizeof(sb), bl);
+  if (r < 0) {
+    cerr << "couldn't read superblock object on " << dev << endl;
+    exit(1);   // failure must not report success to the caller
+  }
+  bl.copy(0, sizeof(sb), (char*)&sb);
+  store->umount();
+  delete store;
+
+  cout << "osd fs says i am osd" << sb.whoami << endl;
+
+  // load monmap
+  bl.clear();
+  int fd = ::open(".ceph_monmap", O_RDONLY);
+  assert(fd >= 0);
+  struct stat st;
+  ::fstat(fd, &st);
+  bufferptr bp(st.st_size);
+  bl.append(bp);
+  ::read(fd, (void*)bl.c_str(), bl.length());   // NOTE(review): return value unchecked; a short read goes unnoticed
+  ::close(fd);
+
+  MonMap *monmap = new MonMap;
+  monmap->decode(bl);
+
+  // start up network
+  rank.set_namer(monmap->get_inst(0).addr);
+  rank.start_rank();
+
+  // start osd
+  Messenger *m = rank.register_entity(MSG_ADDR_OSD(sb.whoami));
+  assert(m);
+  OSD *osd = new OSD(sb.whoami, m, monmap, dev);
+  osd->init();
+
+  // wait
+  rank.wait();
+
+  // done
+  delete osd;
+
+  return 0;
+}
+
diff --git a/branches/sage/cephmds2/crush/BinaryTree.h b/branches/sage/cephmds2/crush/BinaryTree.h
new file mode 100644
index 0000000000000..4f8524bf4ddce
--- /dev/null
+++ b/branches/sage/cephmds2/crush/BinaryTree.h
@@ -0,0 +1,271 @@
+#ifndef __crush_BINARYTREE_H
+#define __crush_BINARYTREE_H
+
+#include
+#include
+#include
+#include
+//#include
+using namespace std;
+
+#include "include/buffer.h"
+
+namespace crush {
+
+  class BinaryTree {
+  private:
+    // tree def
+    int root_node; // 0 for empty tree.
+ int alloc; + vector node_nested; // all existing nodes in this map + vector node_weight; // and this one + vector node_complete; // only nodes with all possible children + + public: + BinaryTree() : root_node(0), alloc(0) {} + + void _encode(bufferlist& bl) { + bl.append((char*)&root_node, sizeof(root_node)); + bl.append((char*)&alloc, sizeof(alloc)); + ::_encode(node_nested, bl); + ::_encode(node_weight, bl); + ::_encode(node_complete, bl); + } + void _decode(bufferlist& bl, int& off) { + bl.copy(off, sizeof(root_node), (char*)&root_node); + off += sizeof(root_node); + bl.copy(off, sizeof(alloc), (char*)&alloc); + off += sizeof(alloc); + ::_decode(node_nested, bl, off); + ::_decode(node_weight, bl, off); + ::_decode(node_complete, bl, off); + } + + // accessors + bool empty() const { return root_node == 0; } + bool exists(int n) const { return n < alloc && node_nested[n]; } + int nested(int n) const { return exists(n) ? node_nested[n]:0; } + float weight(int n) const { return exists(n) ? node_weight[n]:0; } + bool complete(int n) const { return exists(n) ? node_complete[n]:false; } + + int root() const { return root_node; } + + void realloc(int n) { + /* + while (alloc <= n) { + node_nested.push_back(0); + node_weight.push_back(0); + node_complete.push_back(0); + alloc++; + } + */ + if (alloc <= n) { + int add = n - alloc + 1; + node_nested.insert(node_nested.end(), add, 0); + node_weight.insert(node_weight.end(), add, 0); + node_complete.insert(node_complete.end(), add, 0); + alloc = n+1; + } + } + + // tree navigation + bool terminal(int n) const { return n & 1; } // odd nodes are leaves. 
+ int height(int n) const { + assert(n); + int h = 0; + while ((n & 1) == 0) { + assert(n > 0); + h++; n = n >> 1; + } + return h; + } + int left(int n) const { + int h = height(n); + //cout << "left of " << n << " is " << (n - (1 << h)) << endl; + return n - (1 << (h-1)); + } + int right(int n) const { + int h = height(n); + //cout << "right of " << n << " is " << (n + (1 << h)) << endl; + return n + (1 << (h-1)); + } + bool on_right(int n, int h = -1) const { + if (h < 0) h = height(n); + return n & (1 << (h+1)); + } + bool on_left(int n) const { return !on_right(n); } + int parent(int n) const { + int h = height(n); + if (on_right(n, h)) + return n - (1<0; t--) out << " "; + if (tree.root() == n) + out << "root "; + else { + if (tree.on_left(n)) + out << "left "; + else + out << "right "; + } + out << n << " : nested " << tree.nested(n) << " weight " << tree.weight(n); + if (tree.complete(n)) out << " complete"; + out << endl; + if (!tree.terminal(n)) { + if (tree.exists(tree.left(n))) + print_binary_tree_node(out, tree, tree.left(n), i+2); + if (tree.exists(tree.right(n))) + print_binary_tree_node(out, tree, tree.right(n), i+2); + } + } + + inline ostream& operator<<(ostream& out, const BinaryTree& tree) { + if (tree.empty()) + return out << "tree is empty"; + print_binary_tree_node(out, tree, tree.root(), 0); + return out; + } + +} + +#endif diff --git a/branches/sage/cephmds2/crush/Bucket.h b/branches/sage/cephmds2/crush/Bucket.h new file mode 100644 index 0000000000000..cdae5bfce8ae4 --- /dev/null +++ b/branches/sage/cephmds2/crush/Bucket.h @@ -0,0 +1,618 @@ +#ifndef __crush_BUCKET_H +#define __crush_BUCKET_H + +#include "BinaryTree.h" +#include "Hash.h" + +#include +#include +#include +#include +using namespace std; + +#include + +#include "include/buffer.h" + +namespace crush { + + + const int CRUSH_BUCKET_UNIFORM = 1; + const int CRUSH_BUCKET_TREE = 2; + const int CRUSH_BUCKET_LIST = 3; + const int CRUSH_BUCKET_STRAW = 4; + + /** abstract bucket **/ + 
class Bucket { + protected: + int id; + int parent; + int type; + float weight; + + public: + Bucket(int _type, + float _weight) : + id(0), parent(0), + type(_type), + weight(_weight) { } + + Bucket(bufferlist& bl, int& off) { + bl.copy(off, sizeof(id), (char*)&id); + off += sizeof(id); + bl.copy(off, sizeof(parent), (char*)&parent); + off += sizeof(parent); + bl.copy(off, sizeof(type), (char*)&type); + off += sizeof(type); + bl.copy(off, sizeof(weight), (char*)&weight); + off += sizeof(weight); + } + + virtual ~Bucket() { } + + virtual const char *get_bucket_type() const = 0; + virtual bool is_uniform() const = 0; + + int get_id() const { return id; } + int get_type() const { return type; } + float get_weight() const { return weight; } + int get_parent() const { return parent; } + virtual int get_size() const = 0; + + void set_id(int i) { id = i; } + void set_parent(int p) { parent = p; } + void set_weight(float w) { weight = w; } + + virtual void get_items(vector& i) const = 0; + virtual float get_item_weight(int item) const = 0; + virtual void add_item(int item, float w, bool back=false) = 0; + virtual void adjust_item_weight(int item, float w) = 0; + virtual void set_item_weight(int item, float w) { + adjust_item_weight(item, w - get_item_weight(item)); + } + + virtual int choose_r(int x, int r, Hash& h) const = 0; + + virtual void _encode(bufferlist& bl) = 0; + }; + + + + + /** uniform bucket **/ + class UniformBucket : public Bucket { + protected: + public: + vector items; + int item_type; + float item_weight; + + // primes + vector primes; + + int get_prime(int j) const { + return primes[ j % primes.size() ]; + } + void make_primes() { + if (items.empty()) return; + + //cout << "make_primes " << get_id() << " " << items.size() << endl; + Hash h(123+get_id()); + primes.clear(); + + // start with odd number > num_items + unsigned x = items.size() + 1; // this is the minimum! 
+ x += h(items.size()) % (3*items.size()); // bump it up some + x |= 1; // make it odd + + while (primes.size() < items.size()) { + unsigned j; + for (j=2; j*j<=x; j++) + if (x % j == 0) break; + if (j*j > x) { + primes.push_back(x); + //cout << "prime " << x << endl; + } + x += 2; + } + } + + public: + UniformBucket(int _type, int _item_type) : + Bucket(_type, 0), + item_type(_item_type) { } + UniformBucket(int _type, int _item_type, + float _item_weight, vector& _items) : + Bucket(_type, _item_weight*_items.size()), + item_type(_item_type), + item_weight(_item_weight) { + items = _items; + make_primes(); + } + + UniformBucket(bufferlist& bl, int& off) : Bucket(bl, off) { + bl.copy(off, sizeof(item_type), (char*)&item_type); + off += sizeof(item_type); + bl.copy(off, sizeof(item_weight), (char*)&item_weight); + off += sizeof(item_weight); + ::_decode(items, bl, off); + make_primes(); + } + + void _encode(bufferlist& bl) { + char t = CRUSH_BUCKET_UNIFORM; + bl.append((char*)&t, sizeof(t)); + bl.append((char*)&id, sizeof(id)); + bl.append((char*)&parent, sizeof(parent)); + bl.append((char*)&type, sizeof(type)); + bl.append((char*)&weight, sizeof(weight)); + + bl.append((char*)&item_type, sizeof(item_type)); + bl.append((char*)&item_weight, sizeof(item_weight)); + + ::_encode(items, bl); + } + + const char *get_bucket_type() const { return "uniform"; } + bool is_uniform() const { return true; } + + int get_size() const { return items.size(); } + + // items + void get_items(vector& i) const { + i = items; + } + int get_item_type() const { return item_type; } + float get_item_weight(int item) const { return item_weight; } + + void add_item(int item, float w, bool back=false) { + if (items.empty()) + item_weight = w; + items.push_back(item); + weight += item_weight; + make_primes(); + } + + void adjust_item_weight(int item, float w) { + assert(0); + } + + int choose_r(int x, int r, Hash& hash) const { + //cout << "uniformbucket.choose_r(" << x << ", " << r << ")" << 
endl; + //if (r >= get_size()) cout << "warning: r " << r << " >= " << get_size() << " uniformbucket.size" << endl; + + unsigned v = hash(x, get_id());// % get_size(); + unsigned p = get_prime( hash(get_id(), x) ); // choose a prime based on hash(x, get_id(), 2) + unsigned s = (x + v + (r+1)*p) % get_size(); + return items[s]; + } + + }; + + + + + + // list bucket.. RUSH_P sorta + + class ListBucket : public Bucket { + protected: + list items; + list item_weight; + list sum_weight; + + public: + ListBucket(int _type) : Bucket(_type, 0) { } + + ListBucket(bufferlist& bl, int& off) : Bucket(bl, off) { + ::_decode(items, bl, off); + ::_decode(item_weight, bl, off); + ::_decode(sum_weight, bl, off); + } + + void _encode(bufferlist& bl) { + char t = CRUSH_BUCKET_LIST; + bl.append((char*)&t, sizeof(t)); + bl.append((char*)&id, sizeof(id)); + bl.append((char*)&parent, sizeof(parent)); + bl.append((char*)&type, sizeof(type)); + bl.append((char*)&weight, sizeof(weight)); + + ::_encode(items, bl); + ::_encode(item_weight, bl); + ::_encode(sum_weight, bl); + } + + const char *get_bucket_type() const { return "list"; } + bool is_uniform() const { return false; } + + int get_size() const { return items.size(); } + + void get_items(vector& i) const { + for (list::const_iterator it = items.begin(); + it != items.end(); + it++) + i.push_back(*it); + } + float get_item_weight(int item) const { + list::const_iterator i = items.begin(); + list::const_iterator w = item_weight.begin(); + while (i != items.end()) { + if (*i == item) return *w; + i++; w++; + } + assert(0); + return 0; + } + + void add_item(int item, float w, bool back=false) { + if (back) { + items.push_back(item); + item_weight.push_back(w); + sum_weight.clear(); + float s = 0.0; + for (list::reverse_iterator i = item_weight.rbegin(); + i != item_weight.rend(); + i++) { + s += *i; + sum_weight.push_front(s); + } + weight += w; + assert(weight == s); + } else { + items.push_front(item); + item_weight.push_front(w); + 
weight += w; + sum_weight.push_front(weight); + } + } + + void adjust_item_weight(int item, float dw) { + // find it + list::iterator p = items.begin(); + list::iterator pw = item_weight.begin(); + list::iterator ps = sum_weight.begin(); + + while (*p != item) { + *ps += dw; + p++; pw++; ps++; // next! + assert(p != items.end()); + } + + assert(*p == item); + *pw += dw; + *ps += dw; + } + + + int choose_r(int x, int r, Hash& h) const { + //cout << "linearbucket.choose_r(" << x << ", " << r << ")" << endl; + + list::const_iterator p = items.begin(); + list::const_iterator pw = item_weight.begin(); + list::const_iterator ps = sum_weight.begin(); + + while (p != items.end()) { + const int item = *p; + const float iw = *pw; + const float tw = *ps; + const float f = (float)(h(x, item, r, get_id()) % 10000) * tw / 10000.0; + //cout << "item " << item << " iw = " << iw << " tw = " << tw << " f = " << f << endl; + if (f < iw) { + //cout << "linearbucket.choose_r(" << x << ", " << r << ") = " << item << endl; + return item; + } + p++; pw++; ps++; // next! 
+ } + assert(0); + return 0; + } + + + }; + + + + + // mixed bucket, based on RUSH_T type binary tree + + class TreeBucket : public Bucket { + protected: + //vector item_weight; + + // public: + BinaryTree tree; + map node_item; // node id -> item + vector node_item_vec; // fast version of above + map item_node; // item -> node id + map item_weight; + + public: + TreeBucket(int _type) : Bucket(_type, 0) { } + + TreeBucket(bufferlist& bl, int& off) : Bucket(bl, off) { + tree._decode(bl, off); + + ::_decode(node_item, bl, off); + ::_decode(node_item_vec, bl, off); + ::_decode(item_node, bl, off); + ::_decode(item_weight, bl, off); + } + + void _encode(bufferlist& bl) { + char t = CRUSH_BUCKET_TREE; + bl.append((char*)&t, sizeof(t)); + bl.append((char*)&id, sizeof(id)); + bl.append((char*)&parent, sizeof(parent)); + bl.append((char*)&type, sizeof(type)); + bl.append((char*)&weight, sizeof(weight)); + + tree._encode(bl); + + ::_encode(node_item, bl); + ::_encode(node_item_vec, bl); + ::_encode(item_node, bl); + ::_encode(item_weight, bl); + } + + const char *get_bucket_type() const { return "tree"; } + bool is_uniform() const { return false; } + + int get_size() const { return node_item.size(); } + + // items + void get_items(vector& i) const { + for (map::const_iterator it = node_item.begin(); + it != node_item.end(); + it++) + i.push_back(it->second); + } + float get_item_weight(int i) const { + assert(item_weight.count(i)); + return ((map)item_weight)[i]; + } + + + void add_item(int item, float w, bool back=false) { + item_weight[item] = w; + weight += w; + + unsigned n = tree.add_node(w); + node_item[n] = item; + item_node[item] = n; + + while (node_item_vec.size() <= n) + node_item_vec.push_back(0); + node_item_vec[n] = item; + } + + void adjust_item_weight(int item, float dw) { + // adjust my weight + weight += dw; + item_weight[item] += dw; + + // adjust tree weights + tree.adjust_node_weight(item_node[item], dw); + } + + int choose_r(int x, int r, Hash& h) 
const { + //cout << "mixedbucket.choose_r(" << x << ", " << r << ")" << endl; + int n = tree.root(); + while (!tree.terminal(n)) { + // pick a point in [0,w) + float w = tree.weight(n); + float f = (float)(h(x, n, r, get_id()) % 10000) * w / 10000.0; + + // left or right? + int l = tree.left(n); + if (tree.exists(l) && + f < tree.weight(l)) + n = l; + else + n = tree.right(n); + } + //assert(node_item.count(n)); + //return ((map)node_item)[n]; + return node_item_vec[n]; + } + }; + + + + + + // straw bucket.. new thing! + + class StrawBucket : public Bucket { + protected: + map item_weight; + map item_straw; + + list _items; + list _straws; + + public: + StrawBucket(int _type) : Bucket(_type, 0) { } + + StrawBucket(bufferlist& bl, int& off) : Bucket(bl, off) { + ::_decode(item_weight, bl, off); + calc_straws(); + } + + void _encode(bufferlist& bl) { + char t = CRUSH_BUCKET_TREE; + bl.append((char*)&t, sizeof(t)); + bl.append((char*)&id, sizeof(id)); + bl.append((char*)&parent, sizeof(parent)); + bl.append((char*)&type, sizeof(type)); + bl.append((char*)&weight, sizeof(weight)); + + ::_encode(item_weight, bl); + } + + const char *get_bucket_type() const { return "straw"; } + bool is_uniform() const { return false; } + + int get_size() const { return item_weight.size(); } + + + // items + void get_items(vector& i) const { + for (map::const_iterator it = item_weight.begin(); + it != item_weight.end(); + it++) + i.push_back(it->first); + } + float get_item_weight(int item) const { + assert(item_weight.count(item)); + return ((map)item_weight)[item]; + } + + void add_item(int item, float w, bool back=false) { + item_weight[item] = w; + weight += w; + calc_straws(); + } + + void adjust_item_weight(int item, float dw) { + //cout << "adjust " << item << " " << dw << endl; + weight += dw; + item_weight[item] += dw; + calc_straws(); + } + + + /* calculate straw lengths. + this is kind of ugly. not sure if there's a closed form way to calculate this or not! 
+ */ + void calc_straws() { + //cout << get_id() << ": calc_straws ============" << endl; + + item_straw.clear(); + _items.clear(); + _straws.clear(); + + // reverse sort by weight; skip zero weight items + map > reverse; + for (map::iterator p = item_weight.begin(); + p != item_weight.end(); + p++) { + //cout << get_id() << ":" << p->first << " " << p->second << endl; + if (p->second > 0) { + //p->second /= minw; + reverse[p->second].insert(p->first); + } + } + + /* 1:2:7 + item_straw[0] = 1.0; + item_straw[1] = item_straw[0]*sqrt(1.0/.6); + item_straw[2] = item_straw[1]*2.0; + */ + + // work from low to high weights + float straw = 1.0; + float numleft = item_weight.size(); + float wbelow = 0.0; + float lastw = 0.0; + + map >::iterator next = reverse.begin(); + //while (next != reverse.end()) { + while (1) { + //cout << "hi " << next->first << endl; + map >::iterator cur = next; + + // set straw length for this set + for (set::iterator s = cur->second.begin(); + s != cur->second.end(); + s++) { + item_straw[*s] = straw; + //cout << "straw " << *s << " w " << item_weight[*s] << " -> " << straw << endl; + _items.push_back(*s); + _straws.push_back(straw); + } + + next++; + if (next == reverse.end()) break; + + wbelow += (cur->first-lastw) * numleft; + //cout << "wbelow " << wbelow << endl; + + numleft -= 1.0 * (float)cur->second.size(); + //cout << "numleft now " << numleft << endl; + + float wnext = numleft * (next->first - cur->first); + //cout << "wnext " << wnext << endl; + + float pbelow = wbelow / (wbelow+wnext); + //cout << "pbelow " << pbelow << endl; + + straw *= pow((double)(1.0/pbelow), (double)1.0/numleft); + + lastw = cur->first; + } + //cout << "============" << endl; + } + + int choose_r(int x, int r, Hash& h) const { + //cout << "strawbucket.choose_r(" << x << ", " << r << ")" << endl; + + float high_draw = -1; + int high = 0; + + list::const_iterator pi = _items.begin(); + list::const_iterator ps = _straws.begin(); + while (pi != _items.end()) { + 
namespace crush {

  /**
   * Seeded integer hash built on Robert Jenkins' 32-bit mixing function
   * (http://burtleburtle.net/bob/hash/evahash.html).
   *
   * operator() is overloaded for one to five int inputs and always returns
   * a non-negative int (the top bit is masked off).
   *
   * The mixing is carried out on unsigned words so that wrap-around and
   * left shifts are well defined; the macro-based version did the same
   * steps on plain ints and so relied on signed overflow.  The results are
   * meant to be bit-for-bit the same as the macro produced on a
   * two's-complement machine.  That includes its one quirk: the three
   * "c ^= b >> k" steps shift a *signed* value, i.e. they sign-extend.
   */
  class Hash {
    int seed;

    // Sign-extending right shift of v by k bits, expressed on an unsigned
    // word.  Mirrors what "b >> k" did while b was an int.
    static unsigned sar(unsigned v, int k) {
      return static_cast<unsigned>(static_cast<int>(v) >> k);
    }

    // One round of the Jenkins mix.  All three words are updated in place;
    // callers depend on that, since mixed words are fed into later rounds.
    static void mix(unsigned& a, unsigned& b, unsigned& c) {
      a -= b; a -= c; a ^= (c >> 13);
      b -= c; b -= a; b ^= (a << 8);
      c -= a; c -= b; c ^= sar(b, 13);
      a -= b; a -= c; a ^= (c >> 12);
      b -= c; b -= a; b ^= (a << 16);
      c -= a; c -= b; c ^= sar(b, 5);
      a -= b; a -= c; a ^= (c >> 3);
      b -= c; b -= a; b ^= (a << 10);
      c -= a; c -= b; c ^= sar(b, 15);
    }

  public:
    int get_seed() const { return seed; }
    void set_seed(int s) { seed = s; }

    // The stored seed is not s itself but s after two mixing rounds.
    Hash(int s) {
      unsigned hash = 1315423911;
      unsigned v = s;
      unsigned x = 231232;
      unsigned y = 1232;
      mix(v, x, hash);
      mix(y, v, hash);
      seed = static_cast<int>(v);
    }

    int operator()(int a) const {
      unsigned hash = static_cast<unsigned>(seed ^ a);
      unsigned ua = a;
      unsigned b = a;
      unsigned x = 231232;
      unsigned y = 1232;
      mix(b, x, hash);
      mix(y, ua, hash);
      return static_cast<int>(hash & 0x7FFFFFFF);
    }

    int operator()(int a, int b) const {
      unsigned hash = static_cast<unsigned>(seed ^ a ^ b);
      unsigned ua = a, ub = b;
      unsigned x = 231232;
      unsigned y = 1232;
      mix(ua, ub, hash);
      mix(x, ua, hash);
      mix(ub, y, hash);
      return static_cast<int>(hash & 0x7FFFFFFF);
    }

    int operator()(int a, int b, int c) const {
      unsigned hash = static_cast<unsigned>(seed ^ a ^ b ^ c);
      unsigned ua = a, ub = b, uc = c;
      unsigned x = 231232;
      unsigned y = 1232;
      mix(ua, ub, hash);
      mix(uc, x, hash);
      mix(y, ua, hash);
      mix(ub, x, hash);
      mix(y, uc, hash);
      return static_cast<int>(hash & 0x7FFFFFFF);
    }

    int operator()(int a, int b, int c, int d) const {
      unsigned hash = static_cast<unsigned>(seed ^ a ^ b ^ c ^ d);
      unsigned ua = a, ub = b, uc = c, ud = d;
      unsigned x = 231232;
      unsigned y = 1232;
      mix(ua, ub, hash);
      mix(uc, ud, hash);
      mix(ua, x, hash);
      mix(y, ub, hash);
      mix(uc, x, hash);
      mix(y, ud, hash);
      return static_cast<int>(hash & 0x7FFFFFFF);
    }

    int operator()(int a, int b, int c, int d, int e) const {
      unsigned hash = static_cast<unsigned>(seed ^ a ^ b ^ c ^ d ^ e);
      unsigned ua = a, ub = b, uc = c, ud = d, ue = e;
      unsigned x = 231232;
      unsigned y = 1232;
      mix(ua, ub, hash);
      mix(uc, ud, hash);
      mix(ue, x, hash);
      mix(y, ua, hash);
      mix(ub, x, hash);
      mix(y, uc, hash);
      mix(ud, x, hash);
      mix(y, ue, hash);
      return static_cast<int>(hash & 0x7FFFFFFF);
    }
  };

}
+ if (0) + return (n ^ 0xdead1234) * (884811920 * 3 + 1); + + if (1) { + + // before + hash ^= ((hash << 5) + (n&255) + (hash >> 2)); + hashmix(a, b, hash); + n = n >> 8; + hash ^= ((hash << 5) + (n&255) + (hash >> 2)); + hashmix(a, b, hash); + n = n >> 8; + hash ^= ((hash << 5) + (n&255) + (hash >> 2)); + hashmix(a, b, hash); + n = n >> 8; + hash ^= ((hash << 5) + (n&255) + (hash >> 2)); + hashmix(a, b, hash); + n = n >> 8; + + //return hash; + return (hash & 0x7FFFFFFF); + } + + // JS + // a little better than RS + // + jenkin's mixing thing (which sucks on its own but helps tons here) + // best so far + if (1) { + unsigned int hash = 1315423911; + int a = 231232; + int b = 1232; + + for(unsigned int i = 0; i < 4; i++) + { + hash ^= ((hash << 5) + (n&255) + (hash >> 2)); + hashmix(a, b, hash); + n = n >> 8; + } + + return (hash & 0x7FFFFFFF); + } + + + // Robert jenkins' 96 bit mix + // sucks + if (0) { + int c = n; + int a = 12378912; + int b = 2982827; + a=a-b; a=a-c; a=a^(c>>13); + b=b-c; b=b-a; b=b^(a<<8); + c=c-a; c=c-b; c=c^(b>>13); + a=a-b; a=a-c; a=a^(c>>12); + b=b-c; b=b-a; b=b^(a<<16); + c=c-a; c=c-b; c=c^(b>>5); + a=a-b; a=a-c; a=a^(c>>3); + b=b-c; b=b-a; b=b^(a<<10); + c=c-a; c=c-b; c=c^(b>>15); + return c; + } + // robert jenkins 32-bit + // sucks + if (0) { + n += (n << 12); + n ^= (n >> 22); + n += (n << 4); + n ^= (n >> 9); + n += (n << 10); + n ^= (n >> 2); + n += (n << 7); + n ^= (n >> 12); + return n; + } + + // djb2 + if (0) { + unsigned int hash = 5381; + for (int i=0; i<4; i++) { + hash = ((hash << 5) + hash) + ((n&255) ^ 123); + n = n >> 8; + } + return hash; + } + + + // SDBM + if (1) { + unsigned int hash = 0; + + for(unsigned int i = 0; i < 4; i++) + { + hash = (n&255) + (hash << 6) + (hash << 16) - hash; + n = n >> 8; + } + + return (hash & 0x7FFFFFFF); + } + + // PJW + // horrid + if (0) { + unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8); + unsigned int ThreeQuarters = (unsigned int)((BitsInUnsignedInt * 3) / 
4); + unsigned int OneEighth = (unsigned int)(BitsInUnsignedInt / 8); + unsigned int HighBits = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth); + unsigned int hash = 0; + unsigned int test = 0; + + for(unsigned int i = 0; i < 4; i++) + { + hash = (hash << OneEighth) + (n&255); + + if((test = hash & HighBits) != 0) + { + hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits)); + } + n = n >> 8; + } + + return (hash & 0x7FFFFFFF); + } + + // RS Hash function, from Robert Sedgwicks Algorithms in C book, w/ some changes. + if (0) { + unsigned int b = 378551; + unsigned int a = 63689; + unsigned int hash = 0; + + for(unsigned int i=0; i<4; i++) + { + hash = hash * a + (n&0xff); + a = a * b; + n = n >> 8; + } + + return (hash & 0x7FFFFFFF); + } + + // DJB + // worse than rs + if (0) { + unsigned int hash = 5381; + + for(unsigned int i = 0; i < 4; i++) + { + hash = ((hash << 5) + hash) + (n&255); + n = n >> 8; + } + + return (hash & 0x7FFFFFFF); + } + + // AP + // even worse + if (1) { + unsigned int hash = 0; + + for(unsigned int i = 0; i < 4; i++) + { + hash ^= ((i & 1) == 0) ? 
( (hash << 7) ^ (n&255) ^ (hash >> 3)) : + (~((hash << 11) ^ (n&255) ^ (hash >> 5))); + n = n >> 8; + } + + return (hash & 0x7FFFFFFF); + } + + +#endif diff --git a/branches/sage/cephmds2/crush/crush.h b/branches/sage/cephmds2/crush/crush.h new file mode 100644 index 0000000000000..b1e245f1b6af6 --- /dev/null +++ b/branches/sage/cephmds2/crush/crush.h @@ -0,0 +1,521 @@ +#ifndef __crush_CRUSH_H +#define __crush_CRUSH_H + +#include +#include +#include +#include +#include +using namespace std; +#include +#include +using namespace __gnu_cxx; + + +#include "Bucket.h" + +#include "include/buffer.h" + + +namespace crush { + + + // *** RULES *** + + class RuleStep { + public: + int cmd; + vector args; + + RuleStep(int c) : cmd(c) {} + RuleStep(int c, int a) : cmd(c) { + args.push_back(a); + } + RuleStep(int c, int a, int b) : cmd(c) { + args.push_back(a); + args.push_back(b); + } + RuleStep(int o, int a, int b, int c) : cmd(o) { + args.push_back(a); + args.push_back(b); + args.push_back(c); + } + + void _encode(bufferlist& bl) { + bl.append((char*)&cmd, sizeof(cmd)); + ::_encode(args, bl); + } + void _decode(bufferlist& bl, int& off) { + bl.copy(off, sizeof(cmd), (char*)&cmd); + off += sizeof(cmd); + ::_decode(args, bl, off); + } + }; + + + // Rule operations + const int CRUSH_RULE_TAKE = 0; + const int CRUSH_RULE_CHOOSE = 1; // first n by default + const int CRUSH_RULE_CHOOSE_FIRSTN = 1; + const int CRUSH_RULE_CHOOSE_INDEP = 2; + const int CRUSH_RULE_EMIT = 3; + + class Rule { + public: + vector< RuleStep > steps; + + void _encode(bufferlist& bl) { + int n = steps.size(); + bl.append((char*)&n, sizeof(n)); + for (int i=0; i buckets; + int bucketno; + Hash h; + + hash_map parent_map; // what bucket each leaf/bucket lives in + + public: + map rules; + + //map collisions; + //map bumps; + + void _encode(bufferlist& bl) { + // buckets + int n = buckets.size(); + bl.append((char*)&n, sizeof(n)); + for (map::const_iterator it = buckets.begin(); + it != buckets.end(); + it++) { 
+ bl.append((char*)&it->first, sizeof(it->first)); + it->second->_encode(bl); + } + bl.append((char*)&bucketno, sizeof(bucketno)); + + // hash + int s = h.get_seed(); + bl.append((char*)&s, sizeof(s)); + + //::_encode(out, bl); + //::_encode(overload, bl); + + // rules + n = rules.size(); + bl.append((char*)&n, sizeof(n)); + for(map::iterator it = rules.begin(); + it != rules.end(); + it++) { + bl.append((char*)&it->first, sizeof(it->first)); + it->second._encode(bl); + } + + } + + void _decode(bufferlist& bl, int& off) { + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i::iterator bp = buckets.begin(); + bp != buckets.end(); + ++bp) { + // index bucket items + vector items; + bp->second->get_items(items); + for (vector::iterator ip = items.begin(); + ip != items.end(); + ++ip) + parent_map[*ip] = bp->first; + } + } + + + + public: + Crush(int seed=123) : bucketno(-1), h(seed) {} + ~Crush() { + // hose buckets + for (map::iterator it = buckets.begin(); + it != buckets.end(); + it++) { + delete it->second; + } + } + + int print(ostream& out, int root, int indent=0) { + for (int i=0; iget_weight() << "\t" << b->get_id() << "\t"; + for (int i=0; iget_bucket_type() << ": "; + + vector items; + b->get_items(items); + + if (buckets.count(items[0])) { + out << endl; + for (unsigned i=0; iset_id(n); + buckets[n] = b; + return n; + } + + void add_item(int parent, int item, float w, bool back=false) { + // add item + assert(!buckets[parent]->is_uniform()); + Bucket *p = buckets[parent]; + + p->add_item(item, w, back); + + // set item's parent + Bucket *n = buckets[item]; + if (n) + n->set_parent(parent); + + // update weights + while (buckets.count(p->get_parent())) { + int child = p->get_id(); + p = buckets[p->get_parent()]; + p->adjust_item_weight(child, w); + } + } + + + /* + this is a hack, fix me! weights should be consistent throughout hierarchy! 
+ + */ + void set_bucket_weight(int item, float w) { + Bucket *b = buckets[item]; + float adj = w - b->get_weight(); + + while (buckets.count(b->get_parent())) { + Bucket *p = buckets[b->get_parent()]; + p->adjust_item_weight(b->get_id(), adj); + b = p; + } + } + + + /* + * choose numrep distinct items of type type + */ + void choose(int x, + int numrep, + int type, + Bucket *inbucket, + vector& outvec, + bool firstn, + set& outset, map& overloadmap, + bool forcefeed=false, + int forcefeedval=-1) { + int off = outvec.size(); + + // for each replica + for (int rep=0; repis_uniform()) { + // uniform bucket; be careful! + if (firstn || numrep >= in->get_size()) { + // uniform bucket is too small; just walk thru elements + r += ftotal; // r' = r + f_total (first n) + } else { + // make sure numrep is not a multple of bucket size + int add = numrep*flocal; // r' = r + n*f_local + if (in->get_size() % numrep == 0) { + add += add/in->get_size(); // shift seq once per pass through the bucket + } + r += add; + } + } else { + // mixed bucket; just make a distinct-ish r sequence + if (firstn) + r += ftotal; // r' = r + f_total + else + r += numrep * flocal; // r' = r + n*f_local + } + + // choose + outv = in->choose_r(x, r, h); + + // did we get the type we want? + int itemtype = 0; // 0 is terminal type + Bucket *newin = 0; // remember bucket we hit + if (in->is_uniform()) { + itemtype = ((UniformBucket*)in)->get_item_type(); + } else { + if (buckets.count(outv)) { // another bucket + newin = buckets[outv]; + itemtype = newin->get_type(); + } + } + if (itemtype == type) { // this is what we want! + // collision? + bool collide = false; + for (int prep=0; prep overloadmap[outv]) + bad = true; + } + + if (collide || bad) { + ftotal++; + flocal++; + + if (collide && flocal < 3) + continue; // try locally a few times! + + if (ftotal >= 10) { + // ok fine, just ignore dup. FIXME. + skip_rep = true; + break; + } + + retry_rep = true; + } + + break; // ok then! 
+ } + + // next + in = newin; + } + + if (retry_rep) continue; // try again + + break; + } + + // skip this rep? (e.g. too many collisions, we give up) + if (skip_rep) continue; + + // output this value + outvec.push_back(outv); + } // for rep + + // double check! + if (0) { + for (unsigned i=1; i& result, + set& outset, map& overloadmap, + int forcefeed=-1) { + //int numresult = 0; + result.clear(); + + // determine hierarchical context for first. + list force_stack; + if (forcefeed >= 0) { + int t = forcefeed; + while (1) { + force_stack.push_front(t); + if (parent_map.count(t) == 0) break; // reached root, presumably. + //cout << " " << t << " parent is " << parent_map[t] << endl; + t = parent_map[t]; + } + } + + // working vector + vector w; // working variable + + // go through each statement + for (vector::iterator pc = rule.steps.begin(); + pc != rule.steps.end(); + pc++) { + // move input? + + // do it + switch (pc->cmd) { + case CRUSH_RULE_TAKE: + { + const int arg = pc->args[0]; + //cout << "take " << arg << endl; + + if (!force_stack.empty()) { + int forceval = force_stack.front(); + force_stack.pop_front(); + assert(arg == forceval); + } + + w.clear(); + w.push_back(arg); + } + break; + + case CRUSH_RULE_CHOOSE_FIRSTN: + case CRUSH_RULE_CHOOSE_INDEP: + { + const bool firstn = pc->cmd == CRUSH_RULE_CHOOSE_FIRSTN; + const int numrep = pc->args[0]; + const int type = pc->args[1]; + + //cout << "choose " << numrep << " of type " << type << endl; + + assert(!w.empty()); + + // reset output + vector out; + + // forcefeeding? 
+ bool forcing = false; + int forceval; + if (!force_stack.empty()) { + forceval = force_stack.front(); + force_stack.pop_front(); + //cout << "priming out with " << forceval << endl; + forcing = true; + } + + // do each row independently + for (vector::iterator i = w.begin(); + i != w.end(); + i++) { + assert(buckets.count(*i)); + Bucket *b = buckets[*i]; + choose(x, numrep, type, b, out, firstn, + outset, overloadmap, + forcing, + forceval); + forcing = false; // only once + } // for inrow + + // put back into w + w.swap(out); + out.clear(); + } + break; + + case CRUSH_RULE_EMIT: + { + for (unsigned i=0; i + +#include +#include +using namespace std; + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int n, float f, int buckettype) +{ + Hash h(73232313); + + // crush + Crush c; + + int ndisks = 0; + + // bucket + Bucket *b; + if (buckettype == 0) + b = new TreeBucket(1); + else if (buckettype == 1 || buckettype == 2) + b = new ListBucket(1); + else if (buckettype == 3) + b = new StrawBucket(1); + else if (buckettype == 4) + b = new UniformBucket(0,0); + + for (int i=0; iadd_item(ndisks++,1); + + c.add_bucket(b); + int root = b->get_id(); + + //c.print(cout,root); + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 1000; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " disks, " << endl; + 
cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + + // ORIGINAL + place(c, rule, numpg, numrep, placement1); + + int olddisks = ndisks; + + // add item + if (buckettype == 2) { + // start over! + ndisks = 0; + b = new ListBucket(1); + for (int i=0; i<=n; i++) + b->add_item(ndisks++,1); + c.add_bucket(b); + root = b->get_id(); + + rule.steps.clear(); + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + } + else + b->add_item(ndisks++, 1); + + + // ADDED + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + int moved = 0; + for (int x=1; x<=numpg; x++) + if (placement1[x] != placement2[x]) + for (int j=0; j + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + //Bucket *b = new MixedBucket(h+1); + Bucket *b = new StrawBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + +float go(int dep) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + if (0) { + for (int d=0; dadd_item(ndisks++, 10); + root = c.add_bucket(b); + } + if (0) { + vector disks; + for (int i=0; i<10000; i++) + disks.push_back(ndisks++); + UniformBucket *b = new UniformBucket(1, 0, 10000, disks); + Hash h(123); + 
b->make_primes(h); + root = c.add_bucket(b); + } + + + + // rule + int numrep = 1; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + //cout << ndisks << " disks, " << endl; + //cout << pg_per << " pgs per disk" << endl; + // cout << numpg << " logical pgs" << endl; + //cout << "numrep is " << numrep << endl; + + + int place = 100000; + int times = place / numpg; + if (!times) times = 1; + + cout << "#looping " << times << " times" << endl; + + float tvar = 0; + int tvarnum = 0; + + int x = 0; + for (int t=0; t v(numrep); + + for (int z=0; z + +#include +#include +using namespace std; + +int buckettype = 0; + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + //Bucket *b = new TreeBucket(h+1); + //Bucket *b = new ListBucket(h+1); + //Bucket *b = new StrawBucket(h+1); + Bucket *b; + if (buckettype == 0) + b = new TreeBucket(h+1); + else if (buckettype == 1 || buckettype == 2) + b = new ListBucket(h+1); + else if (buckettype == 3) + b = new StrawBucket(h+1); + + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map 
>& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int depth, int branching, int udisks, int add, int modifydepth) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + wid.push_back(udisks); + for (int d=1; d > buckets; + + root = make_hierarchy(c, wid, buckets, ndisks); + + //c.print(cout,root); + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " disks, " << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + + // ORIGINAL + place(c, rule, numpg, numrep, placement1); + + int olddisks = ndisks; + + // add disks + //cout << " adding " << add << " disks" << endl; + vector disks; + for (int i=0; imake_primes(h); + + //Bucket *o = buckets[2].back(); + Bucket *o; + if (buckettype == 2) + o = buckets[modifydepth].front(); + else + o = buckets[modifydepth].back(); + + c.add_bucket(b); + //cout << " adding under " << o->get_id() << endl; + c.add_item(o->get_id(), b->get_id(), b->get_weight()); + //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); + //newbucket = b; + + + // ADDED + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + int moved = 0; + for (int x=1; 
x<=numpg; x++) + if (placement1[x] != placement2[x]) + for (int j=0; j + +#include +#include +using namespace std; + + +int buckettype = 2; // 0 = mixed, 1 = linear, 2 = straw + +int big_one_skip = 255; +int big_one_size; +Bucket *big_one = 0; + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + + int s = wid[h]; + if (big_one_skip > 0) + big_one_skip--; + if (!big_one_skip && !big_one) + s = big_one_size; + + + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks " << disks.size()<< endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + Bucket *b; + if (buckettype == 0) + b = new TreeBucket(h+1); + else if (buckettype == 1) + b = new ListBucket(h+1); + else if (buckettype == 2) + b = new StrawBucket(h+1); + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int depth, int branching, int udisks, int add) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + wid.push_back(udisks); + for 
(int d=1; d > buckets; + + big_one_size = add; + big_one = 0; + + //cout << "making tree" << endl; + root = make_hierarchy(c, wid, buckets, ndisks); + + //c.print(cout, root); + + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " disks, " << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + int olddisks = ndisks; + + + place(c, rule, numpg, numrep, placement1); + + if (1) { + // remove disks + assert(big_one); + c.adjust_item(big_one->get_id(), 0); + } + + int newdisks = ndisks - add; + + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + int moved = 0; + for (int x=1; x<=numpg; x++) + if (placement1[x] != placement2[x]) + for (int j=0; j >::iterator i = r.begin(); + i != r.end(); + i++) { + cout << i->first; + for (map::iterator j = i->second.begin(); + j != i->second.end(); + j++) + cout << "\t" << j->first << "\t" << j->second; + cout << endl; + } + */ +} + diff --git a/branches/sage/cephmds2/crush/test/cluster_movement_rush.cc b/branches/sage/cephmds2/crush/test/cluster_movement_rush.cc new file mode 100644 index 0000000000000..90cc197c24f65 --- /dev/null +++ b/branches/sage/cephmds2/crush/test/cluster_movement_rush.cc @@ -0,0 +1,218 @@ + + +#include "../crush.h" +using namespace crush; + +#include + +#include +#include +using namespace std; + +int buckettype = 0; + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h 
<< " uniformbucket with " << wid[h] << " disks" << endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + //Bucket *b = new TreeBucket(h+1); + //Bucket *b = new ListBucket(h+1); + //Bucket *b = new StrawBucket(h+1); + Bucket *b; + if (buckettype == 0) + b = new TreeBucket(h+1); + else if (buckettype == 1 || buckettype == 2) + b = new ListBucket(h+1); + else if (buckettype == 3) + b = new StrawBucket(h+1); + + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int depth, int branching, int udisks, int add, int modifydepth) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + wid.push_back(udisks); + for (int d=1; d > buckets; + + root = make_hierarchy(c, wid, buckets, ndisks); + + //c.print(cout,root); + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " 
disks, " << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + + // ORIGINAL + place(c, rule, numpg, numrep, placement1); + + int olddisks = ndisks; + + // add disks + //cout << " adding " << add << " disks" << endl; + vector disks; + for (int i=0; imake_primes(h); + + //Bucket *o = buckets[2].back(); + Bucket *o; + if (buckettype == 2) + o = buckets[modifydepth].front(); + else + o = buckets[modifydepth].back(); + + c.add_bucket(b); + //cout << " adding under " << o->get_id() << endl; + c.add_item(o->get_id(), b->get_id(), b->get_weight(), buckettype == 2); + //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); + //newbucket = b; + + + // ADDED + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + int moved = 0; + for (int x=1; x<=numpg; x++) + if (placement1[x] != placement2[x]) + for (int j=0; j + +#include +#include +using namespace std; + + +Clock g_clock; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; + return b; + } else { + // mixed + Bucket *b = new TreeBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + + +float go(int dep, int failpc) +{ + Hash h(73232313); + + //int overloadcutoff = (int)((float)10000.0 / (float)utilization); + + //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + 
for (int d=0; d ocount(ndisks); + //cout << ndisks << " disks, " << endl; + //cout << pg_per << " pgs per disk" << endl; + // cout << numpg << " logical pgs" << endl; + //cout << "numrep is " << numrep << endl; + + + int place = 1000000; + int times = place / numpg; + if (!times) times = 1; + + + //cout << "looping " << times << " times" << endl; + + float tavg[10]; + float tvar[10]; + for (int j=0;j<10;j++) { + tvar[j] = 0; + tavg[j] = 0; + } + int tvarnum = 0; + float trvar = 0.0; + + float overloadsum = 0.0; + float adjustsum = 0.0; + float afteroverloadsum = 0.0; + float aslowdown = 0.0; + int chooses = 0; + int xs = 1; + for (int t=0; t v(numrep); + + c.out.clear(); + + for (int z=0; z= ndisks) cout << "v[i] " << i << " is " << v[i] << " .. x = " << x << endl; + //assert(v[i] < ndisks); + ocount[v[i]]++; + } + } + utime_t t1b = g_clock.now(); + + // add in numf failed disks + for (int f = 0; f < numf; f++) { + int d = rand() % ndisks; + while (c.out.count(d)) d = rand() % ndisks; + c.out.insert(d); + } + + utime_t t3a = g_clock.now(); + for (int x=xs; x + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + MixedBucket *b = new MixedBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + +Bucket *make_random(Crush& c, int wid, int height, int& ndisks) +{ + int w = rand() % (wid-1) + 2; + + if (height == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << 
" uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + int h = rand() % height + 1; + MixedBucket *b = new MixedBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } + +} + + +float go(int dep, int overloadcutoff) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + for (int d=0; dget_id(); + //c.print(cout, root); + } + if (0) { + MixedBucket *b = new MixedBucket(1); + for (int i=0; i<10000; i++) + b->add_item(ndisks++, 10); + root = c.add_bucket(b); + } + if (0) { + vector disks; + for (int i=0; i<10000; i++) + disks.push_back(ndisks++); + UniformBucket *b = new UniformBucket(1, 0, 10000, disks); + Hash h(123); + b->make_primes(h); + root = c.add_bucket(b); + } + //cout << ndisks << " disks" << endl; + + + + // rule + int numrep = 1; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + //cout << ndisks << " disks, " << endl; + //cout << pg_per << " pgs per disk" << endl; + // cout << numpg << " logical pgs" << endl; + //cout << "numrep is " << numrep << endl; + + + int place = 1000000; + int times = place / numpg; + if (!times) times = 1; + + + //cout << "looping " << times << " times" << endl; + + float tvar = 0; + int tvarnum = 0; + + float overloadsum = 0.0; + float adjustsum = 0.0; + float afteroverloadsum = 0.0; + int chooses = 0; + int xs = 1; + for (int t=0; t v(numrep); + + c.overload.clear(); + + for (int z=0; z overloadcutoff) + overloaded++; + + if (ocount[i] > 100+(overloadcutoff-100)/2) { + adjusted++; + c.overload[i] = 100.0 / (float)ocount[i]; + //cout << "disk " << i << " has " << ocount[i] << 
endl; + } + ocount[i] = 0; + } + //cout << overloaded << " overloaded" << endl; + overloadsum += (float)overloaded / (float)ndisks; + adjustsum += (float)adjusted / (float)ndisks; + + + for (int x=xs; x overloadcutoff) { + still++; + //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; + //cout << "disk " << i << " has " << ocount[i] << endl; + } + } + //if (still) cout << "overload was " << overloaded << " now " << still << endl; + afteroverloadsum += (float)still / (float)ndisks; + + //cout << "collisions: " << c.collisions << endl; + //cout << "r bumps: " << c.bumps << endl; + + float avg = 0.0; + for (int i=0; i100; d -= 5) { + float var = go(3,d); + //cout << "## depth = " << d << endl; + //cout << d << "\t" << var << endl; + } +} diff --git a/branches/sage/cephmds2/crush/test/depth_variance.cc b/branches/sage/cephmds2/crush/test/depth_variance.cc new file mode 100644 index 0000000000000..7d60ebaae9501 --- /dev/null +++ b/branches/sage/cephmds2/crush/test/depth_variance.cc @@ -0,0 +1,185 @@ + + +#include "../crush.h" +using namespace crush; + +#include + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + Bucket *b = new TreeBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + +float go(int dep) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + if (1) { + for (int d=0; d ocount(ndisks); + //cout << ndisks << " disks, " << endl; + //cout << pg_per << " pgs 
per disk" << endl; + // cout << numpg << " logical pgs" << endl; + //cout << "numrep is " << numrep << endl; + + + int place = 100000; + int times = place / numpg; + if (!times) times = 1; + + cout << "#looping " << times << " times" << endl; + + float tvar = 0; + int tvarnum = 0; + float tavg = 0; + + int x = 0; + for (int t=0; t v(numrep); + + for (int z=0; z + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; + return b; + } else { + // mixed + Bucket *b = new TreeBucket(h+1); + //Bucket *b = new StrawBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + + +float go(int dep, int overloadcutoff) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + for (int d=0; d ocount(ndisks); + //cout << ndisks << " disks, " << endl; + //cout << pg_per << " pgs per disk" << endl; + // cout << numpg << " logical pgs" << endl; + //cout << "numrep is " << numrep << endl; + + + int place = 100000; + int times = place / numpg; + if (!times) times = 1; + + + //cout << "looping " << times << " times" << endl; + + float tavg[10]; + float tvar[10]; + for (int j=0;j<10;j++) { + tvar[j] = 0; + tavg[j] = 0; + } + int tvarnum = 0; + + float overloadsum = 0.0; + float adjustsum = 0.0; + float afteroverloadsum = 0.0; + int chooses = 0; + int xs = 1; + for (int t=0; t v(numrep); + + c.overload.clear(); + + for (int z=0; z cutoff) + overloaded++; + + if (ocount[i] > adjoff) { + adjusted++; + c.overload[i] = 
(float)target / (float)ocount[i]; + //cout << "setting overload " << i << " to " << c.overload[i] << endl; + //cout << "disk " << i << " has " << ocount[i] << endl; + } + ocount[i] = 0; + } + //cout << overloaded << " overloaded" << endl; + overloadsum += (float)overloaded / (float)ndisks; + adjustsum += (float)adjusted / (float)ndisks; + + + + if (1) { + // second pass + for (int x=xs; x= adjoff) { + adjusted++; + if (c.overload.count(i) == 0) { + c.overload[i] = 1.0; + adjusted++; + } + //else cout << "(re)adjusting " << i << endl; + c.overload[i] *= (float)target / (float)ocount[i]; + //cout << "setting overload " << i << " to " << c.overload[i] << endl; + //cout << "disk " << i << " has " << ocount[i] << endl; + } + ocount[i] = 0; + } + } + + for (int x=xs; x cutoff) { + still++; + //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; + if (c.overload.count(i)) cout << "[adjusted] "; + cout << "disk " << i << " has " << ocount[i] << endl; + } + } + //if (still) cout << "overload was " << overloaded << " now " << still << endl; + afteroverloadsum += (float)still / (float)ndisks; + + //cout << "collisions: " << c.collisions << endl; + //cout << "r bumps: " << c.bumps << endl; + + int n = ndisks/10; + float avg[10]; + float var[10]; + for (int i=0;i<10;i++) { + int s = n*i; + avg[i] = 0.0; + for (int j=0; j100; d -= 5) { + float var = go(3,d); + //cout << "## depth = " << d << endl; + //cout << d << "\t" << var << endl; + } +} diff --git a/branches/sage/cephmds2/crush/test/movement.cc b/branches/sage/cephmds2/crush/test/movement.cc new file mode 100644 index 0000000000000..2621f09457fe6 --- /dev/null +++ b/branches/sage/cephmds2/crush/test/movement.cc @@ -0,0 +1,223 @@ + + +#include "../crush.h" +using namespace crush; + +#include + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; 
imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + MixedBucket *b = new MixedBucket(h+1); + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int depth, int branching, int udisks) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + wid.push_back(udisks); + for (int d=1; d > buckets; + + if (1) { + root = make_hierarchy(c, wid, buckets, ndisks); + } + if (0) { + MixedBucket *b = new MixedBucket(1); + for (int i=0; i<10000; i++) + b->add_item(ndisks++, 10); + root = c.add_bucket(b); + } + if (0) { + vector disks; + for (int i=0; i<10000; i++) + disks.push_back(ndisks++); + UniformBucket *b = new UniformBucket(1, 0, 1, disks); + Hash h(123); + b->make_primes(h); + root = c.add_bucket(b); + } + + + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per 
= 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " disks, " << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + place(c, rule, numpg, numrep, placement1); + + if (1) { + // failed + + //for (int i=500; i<1000; i++) + //c.failed.insert(i); + c.failed.insert(0); + } + + int olddisks = ndisks; + + if (1) { + int n = udisks; + //cout << " adding " << n << " disks" << endl; + vector disks; + for (int i=0; imake_primes(h); + Bucket *o = buckets[1].back(); + c.add_bucket(b); + //cout << " adding under " << o->get_id() << endl; + c.add_item(o->get_id(), b->get_id(), b->get_weight()); + //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); + } + + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + int moved = 0; + for (int x=1; x<=numpg; x++) { + if (placement1[x] != placement2[x]) { + for (int j=0; j v; + cout << depth; + for (int branching = 3; branching < 16; branching += 1) { + float fac = testmovement(depth, branching, udisks); + v.push_back(fac); + int n = udisks * pow((float)branching, (float)depth-1); + cout << "\t" << n; + cout << "\t" << fac; + } + //for (int i=0; i + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + MixedBucket *b = new MixedBucket(h+1); + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int 
make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, map >& placement) +{ + vector v(numrep); + map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i::iterator it = ocount.begin(); + it != ocount.end(); + it++) + cout << it->first << "\t" << it->second << endl; + +} + + +float testmovement(int depth, int branching, int udisks) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + wid.push_back(udisks); + for (int d=1; d > buckets; + + if (1) { + root = make_hierarchy(c, wid, buckets, ndisks); + } + if (0) { + MixedBucket *b = new MixedBucket(1); + for (int i=0; i<10000; i++) + b->add_item(ndisks++, 10); + root = c.add_bucket(b); + } + if (0) { + vector disks; + for (int i=0; i<10000; i++) + disks.push_back(ndisks++); + UniformBucket *b = new UniformBucket(1, 0, 1, disks); + Hash h(123); + b->make_primes(h); + root = c.add_bucket(b); + } + + + + // rule + int numrep = 2; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + + /* + cout << ndisks << " disks, " << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + */ + map > placement1, placement2; + + //c.print(cout, root); + + place(c, rule, numpg, numrep, placement1); + + float over = .5; + + if (1) { + // failed + + //for (int i=500; i<1000; i++) + 
//c.failed.insert(i); + //c.failed.insert(0); + c.overload[0] = over; + } + + int olddisks = ndisks; + + + + if (0) { + int n = udisks; + //cout << " adding " << n << " disks" << endl; + vector disks; + for (int i=0; imake_primes(h); + Bucket *o = buckets[1].back(); + c.add_bucket(b); + //cout << " adding under " << o->get_id() << endl; + c.add_item(o->get_id(), b->get_id(), b->get_weight()); + //((MixedBucket*)o)->add_item(b->get_id(), b->get_weight()); + } + + //c.print(cout, root); + place(c, rule, numpg, numrep, placement2); + + vector moved(ndisks); + + //int moved = 0; + for (int d=0; d::iterator it = placement1[d].begin(); + it != placement1[d].end(); + it++) { + placement2[d].erase(*it); + } + } + + float avg = 0; + for (int d=0; d v; + cout << depth; + for (int branching = 3; branching < 16; branching += 1) { + float fac = testmovement(depth, branching, udisks); + v.push_back(fac); + int n = udisks * pow((float)branching, (float)depth-1); + //cout << "\t" << n; + //cout << "\t" << fac; + } + //for (int i=0; i + +#include +#include +using namespace std; + + +Clock g_clock; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks weight " << w << endl; + return b; + } else { + // mixed + Bucket *b = new TreeBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + + +float go(int dep, int utilization ) +{ + Hash h(73232313); + + int overloadcutoff = (int)((float)10000.0 / (float)utilization); + + //cout << "util " << utilization << " cutoff " << overloadcutoff << endl; + // crush + Crush c; + + + // buckets + int 
root = -1; + int ndisks = 0; + + vector wid; + for (int d=0; d ocount(ndisks); + //cout << ndisks << " disks, " << endl; + //cout << pg_per << " pgs per disk" << endl; + // cout << numpg << " logical pgs" << endl; + //cout << "numrep is " << numrep << endl; + + + int place = 100000; + int times = place / numpg; + if (!times) times = 1; + + + //cout << "looping " << times << " times" << endl; + + float tavg[10]; + float tvar[10]; + for (int j=0;j<10;j++) { + tvar[j] = 0; + tavg[j] = 0; + } + int tvarnum = 0; + + float overloadsum = 0.0; + float adjustsum = 0.0; + float afteroverloadsum = 0.0; + float aslowdown = 0.0; + int chooses = 0; + int xs = 1; + for (int t=0; t v(numrep); + + c.overload.clear(); + + for (int z=0; z cutoff) + overloaded++; + + if (ocount[i] > adjoff) { + adjusted++; + c.overload[i] = (float)target / (float)ocount[i]; + //cout << "setting overload " << i << " to " << c.overload[i] << endl; + //cout << "disk " << i << " has " << ocount[i] << endl; + } + ocount[i] = 0; + } + //cout << overloaded << " overloaded" << endl; + overloadsum += (float)overloaded / (float)ndisks; + adjustsum += (float)adjusted / (float)ndisks; + + + + // keep adjusting! 
+ for (int bla=0; bla<5; bla++) { + utime_t t2a = g_clock.now(); + + // second pass + for (int x=xs; x= adjoff) { + numover++; + if (c.overload.count(i) == 0) { + c.overload[i] = 1.0; + adjusted++; + } + //else cout << "(re)adjusting " << i << endl; + c.overload[i] *= (float)target / (float)ocount[i]; + //cout << "setting overload " << i << " to " << c.overload[i] << endl; + //cout << "disk " << i << " has " << ocount[i] << endl; + } + ocount[i] = 0; + } + if (!numover) break; + cout << "readjusting" << endl; + } + + utime_t t3a = g_clock.now(); + + for (int x=xs; x cutoff) { + still++; + //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; + if (c.overload.count(i)) cout << "[adjusted] "; + cout << "disk " << i << " has " << ocount[i] << endl; + } + } + //if (still) cout << "overload was " << overloaded << " now " << still << endl; + afteroverloadsum += (float)still / (float)ndisks; + + //cout << "collisions: " << c.collisions << endl; + //cout << "r bumps: " << c.bumps << endl; + + int n = ndisks/10; + float avg[10]; + float var[10]; + for (int i=0;i<10;i++) { + int s = n*i; + avg[i] = 0.0; + for (int j=0; j + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + MixedBucket *b = new MixedBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + +Bucket *make_random(Crush& c, int wid, int height, int& ndisks) +{ + int w = rand() % (wid-1) + 2; + + if (height == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; 
imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + int h = rand() % height + 1; + MixedBucket *b = new MixedBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } + +} + + +float go(int dep, int overloadcutoff) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + for (int d=0; dget_id(); + //c.print(cout, root); + } + if (0) { + MixedBucket *b = new MixedBucket(1); + for (int i=0; i<10000; i++) + b->add_item(ndisks++, 10); + root = c.add_bucket(b); + } + if (0) { + vector disks; + for (int i=0; i<10000; i++) + disks.push_back(ndisks++); + UniformBucket *b = new UniformBucket(1, 0, 10000, disks); + Hash h(123); + b->make_primes(h); + root = c.add_bucket(b); + } + //cout << ndisks << " disks" << endl; + + + + // rule + int numrep = 1; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + //cout << ndisks << " disks, " << endl; + //cout << pg_per << " pgs per disk" << endl; + // cout << numpg << " logical pgs" << endl; + //cout << "numrep is " << numrep << endl; + + + int place = 1000000; + int times = place / numpg; + if (!times) times = 1; + + + //cout << "looping " << times << " times" << endl; + + float tvar = 0; + int tvarnum = 0; + + float overloadsum = 0.0; + float adjustsum = 0.0; + float afteroverloadsum = 0.0; + int chooses = 0; + int xs = 1; + for (int t=0; t v(numrep); + + c.overload.clear(); + + for (int z=0; z overloadcutoff) + overloaded++; + + if (ocount[i] > 100+(overloadcutoff-100)/2) { + adjusted++; + c.overload[i] = 100.0 / (float)ocount[i]; 
+ //cout << "disk " << i << " has " << ocount[i] << endl; + } + ocount[i] = 0; + } + //cout << overloaded << " overloaded" << endl; + overloadsum += (float)overloaded / (float)ndisks; + adjustsum += (float)adjusted / (float)ndisks; + + + for (int x=xs; x overloadcutoff) { + still++; + //c.overload[ocount[i]] = 100.0 / (float)ocount[i]; + //cout << "disk " << i << " has " << ocount[i] << endl; + } + } + //if (still) cout << "overload was " << overloaded << " now " << still << endl; + afteroverloadsum += (float)still / (float)ndisks; + + //cout << "collisions: " << c.collisions << endl; + //cout << "r bumps: " << c.bumps << endl; + + float avg = 0.0; + for (int i=0; i100; d -= 5) { + float var = go(3,d); + //cout << "## depth = " << d << endl; + //cout << d << "\t" << var << endl; + } +} diff --git a/branches/sage/cephmds2/crush/test/sizes.cc b/branches/sage/cephmds2/crush/test/sizes.cc new file mode 100644 index 0000000000000..cc5780218210a --- /dev/null +++ b/branches/sage/cephmds2/crush/test/sizes.cc @@ -0,0 +1,131 @@ + +#include "include/types.h" +#include "include/Distribution.h" +#include "osd/OSDMap.h" + + +Distribution file_size_distn; //kb + + +list object_queue; +int max_object_size = 1024*1024*100; //kb + +off_t no; + +int get_object() //kb +{ + if (object_queue.empty()) { + int max = file_size_distn.sample(); + no++; + int filesize = max/2 + (rand() % 100) * max/200 + 1; + //cout << "file " << filesize << endl; + while (filesize > max_object_size) { + object_queue.push_back(max_object_size); + filesize -= max_object_size; + } + object_queue.push_back(filesize); + } + int s = object_queue.front(); + object_queue.pop_front(); + //cout << "object " << s << endl; + return s; +} + +void getdist(vector& v, float& avg, float& var) +{ + avg = 0.0; + for (int i=0; i pgs(n); + off_t did = 0; + + no = 0; + while (did < dist) { + off_t s = get_object(); + pgs[rand()%n] += s; + did += s; + } + while (!object_queue.empty()) + pgs[rand()%n] += get_object(); + + numo = 
no; + //cout << did/n << endl; + + //for (int i=0; i + +#include +#include +using namespace std; + + +Bucket *make_bucket(Crush& c, vector& wid, int h, map< int, list >& buckets, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + buckets[h].push_back(b); + return b; + } else { + // mixed + Bucket *b = new TreeBucket(h+1); + c.add_bucket(b); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + n->set_parent(b->get_id()); + } + buckets[h].push_back(b); + //cout << b->get_id() << " mixedbucket with " << wid[h] << " at " << h << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, map< int, list >& buckets, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, buckets, ndisks); + return b->get_id(); +} + + +void place(Crush& c, Rule& rule, int numpg, int numrep, vector& ocount) +{ + vector v(numrep); + //map ocount; + + for (int x=1; x<=numpg; x++) { + + //cout << H(x) << "\t" << h(x) << endl; + c.do_rule(rule, x, v); + //cout << "v = " << v << endl;// " " << v[0] << " " << v[1] << " " << v[2] << endl; + + bool bad = false; + for (int i=0; i wid; + wid.push_back(10); + wid.push_back(2); + + map< int, list > buckets; + root = make_hierarchy(c, wid, buckets, ndisks); + + // add small bucket + vector disks; + for (int i=0; i<3; i++) + disks.push_back(ndisks++); + UniformBucket *b = new UniformBucket(1, 0, 1, disks); + b->make_primes(h); + Bucket *o = buckets[1].back(); + c.add_bucket(b); + //cout << " adding under " << o->get_id() << endl; + c.add_item(o->get_id(), b->get_id(), b->get_weight()); + + + // rule + int numrep = 6; + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + //c.overload[10] = .1; + + int pg_per = 10000; + int numpg = 
pg_per*ndisks/numrep; + + vector ocount(ndisks); + + c.print(cout, root); + + place(c, rule, numpg, numrep, ocount); + + for (int i=0; i + +#include +#include +using namespace std; + + +int numrep = 1; + + +double go(int n, int bucket) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + Bucket *b; + vector items; + if (bucket == 0) b = new UniformBucket(1,0,10,items); + if (bucket == 1) b = new TreeBucket(1); + if (bucket == 2) b = new ListBucket(1); + if (bucket == 3) b = new StrawBucket(1); + + for (int d=0; dadd_item(ndisks++, 1); + + //if (!bucket) ((UniformBucket*)b)->make_primes(h); + + root = c.add_bucket(b); + + // rule + Rule rule; + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + + + int place = 1000000; + + + vector v(numrep); + set out; + map overload; + + utime_t start = g_clock.now(); + + for (int x=1; x <= place; x++) + c.do_rule(rule, x, v, out, overload); + + utime_t end = g_clock.now(); + + end -= start; + double el = (double)end; + + //cout << "\t" << ndisks; + + return el; +} + + +int main() +{ + + for (int n=4; n<=50; n += 4) { + cout << n; + for (int b=0; b<4; b++) { + double el = go(n,b); + cout << "\t" << el; + } + cout << endl; + } +} diff --git a/branches/sage/cephmds2/crush/test/speed_depth.cc b/branches/sage/cephmds2/crush/test/speed_depth.cc new file mode 100644 index 0000000000000..32275d16d2b31 --- /dev/null +++ b/branches/sage/cephmds2/crush/test/speed_depth.cc @@ -0,0 +1,174 @@ + +#include "../../common/Clock.h" +#include "../crush.h" +using namespace crush; + + +Clock g_clock; + +#include + +#include +#include +using namespace std; + + +int uniform = 10; +int branching = 10; +int buckettype = 0; +int numrep = 1; + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; 
imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + Bucket *b; + if (buckettype == 0) + b = new TreeBucket(h+1); + else if (buckettype == 1 || buckettype == 2) + b = new ListBucket(h+1); + else if (buckettype == 3) + b = new StrawBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + +double go(int dep, int per) +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + if (1) { + wid.push_back(uniform); + for (int d=1; d v(numrep); + + utime_t start = g_clock.now(); + + set out; + map overload; + + for (int x=1; x <= place; x++) + c.do_rule(rule, x, v, out, overload); + + utime_t end = g_clock.now(); + + end -= start; + double el = (double)end; + + //cout << "\t" << ndisks; + + return el; +} + + +int main() +{ + uniform = branching = 8; + + cout << "// dep\tuniform\tbranch\tndisks" << endl; + + for (int d=2; d<=5; d++) { + cout << d;// << "\t" << branching; + cout << "\t" << uniform; + cout << "\t" << branching; + + int n = 1; + for (int i=0; i + +#include +#include +using namespace std; + + +int branching = 10; +bool linear = false; +int numrep = 1; + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + Bucket *b; + if (linear) + b = new ListBucket(h+1); + else + b = new TreeBucket(h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + 
return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks); + return b->get_id(); +} + + +double go(int s) +{ + int dep = 2; + Hash h(73232313); + + // crush + Crush c; + + + // buckets + int root = -1; + int ndisks = 0; + + vector wid; + if (1) { + //for (int d=0; d v(numrep); + + utime_t start = g_clock.now(); + + for (int x=1; x <= place; x++) + c.do_rule(rule, x, v); + + utime_t end = g_clock.now(); + + end -= start; + double el = (double)end; + + cout << "\t" << ndisks; + + return el; +} + + +int main() +{ + branching = 8; + + int d = 2; + numrep = 2; + + for (int s = 64; s <= 32768; s *= 8) { + cout << "t"; + linear = false; + double el = go(s, d); + cout << "\t" << el; + + cout << "\tp"; + linear = true; + el = go(s, d); + cout << "\t" << el; + + cout << endl; + } +} diff --git a/branches/sage/cephmds2/crush/test/t.cc b/branches/sage/cephmds2/crush/test/t.cc new file mode 100644 index 0000000000000..0785ef47d6c04 --- /dev/null +++ b/branches/sage/cephmds2/crush/test/t.cc @@ -0,0 +1,25 @@ + +#include "../../common/Clock.h" +#include "../crush.h" +using namespace crush; + + +Clock g_clock; + +#include + +#include +#include +using namespace std; + + +int branching = 10; +bool linear = false; +int numrep = 1; + +int main() { + + Bucket *b = new UniformBucket(1, 0); + //b = new TreeBucket(1); +} + diff --git a/branches/sage/cephmds2/crush/test/testbucket.cc b/branches/sage/cephmds2/crush/test/testbucket.cc new file mode 100644 index 0000000000000..065721c2c1967 --- /dev/null +++ b/branches/sage/cephmds2/crush/test/testbucket.cc @@ -0,0 +1,61 @@ + + +#include "../Bucket.h" +using namespace crush; + +#include +#include +using namespace std; + + +ostream& operator<<(ostream& out, vector& v) +{ + out << "["; + for (int i=0; i ocount(ndisks); + + vector v(numrep); + int nplace = 0; + for (int x=1; x<1000000; x++) { + //cout << H(x) << "\t" << h(x) << endl; + for (int i=0; i +#include +using 
namespace std; + + +void getdist(vector& v, float& avg, float& var) +{ + avg = 0.0; + for (int i=0; i a(n); + vector b(n); + + for (int i=0; i c(n); + for (int i=0; i cached_by; + map cached_by_serial; + +The cached_by set _always_ includes all nodes that cache the +particular inode, but may additionally include nodes that used to +cache it but no longer do. In those cases, an expire message should +be in transit. + + +REPLICA + +The replica maintains a notion of who it believes is the authority for +each replicated inode. There are two possibilities: + + - Ordinarily, this notion is correct. + - If the part of the file system in question was recently exported to + a new MDS, the inode's old authority is acting as a CACHEPROXY, + and will forward relevant messages on to the authority. + +When a replica is expired from cache, an expire is sent to the +authority. The expire includes the serial number issued when the +replica was originally created to disambiguate potentially concurrent +replication activity. + + +EXPORTS + +- The old authority suddenly becomes a replica. Its serial is well + defined. It also becomes a CACHEPROXY, which means its cached_by + remains defined (with an alternate meaning!). While a proxy, the + node will forward relevant messages from the replica to the + authority (but not the other way around--the authority knows all + replicas). + +- Once the export is acked, the old authority sends a + message to the replica notifying it of the new authority. As soon + as all replicas acknowledge receipt of this notice, the old authority + can cease CACHEPROXY responsibilities and become a regular replica. + At this point its cached_by is no longer defined. + +- Replicas always know who the authority for the inode is, OR they + know prior owner acting as a CACHEPROXY. (They don't know which it + is.) + + +CACHED_BY + +The authority always has an inclusive list of nodes who cache an item. 
+As such it can confidently send updates to replicas for locking, +invalidating, etc. When a replica is expired from cache, an expire is +sent to the authority. If the serial matches, the node is removed +from the cached_by list. + + + + + +SUBTREE AUTHORITY DELEGATION: imports versus hashing + +Authority is generally defined recursively: an inode's authority +matches the containing directory, and a directory's authority matches +the directory inode's. Thus the authority delegation chain can be +broken/redefined in two ways: + + - Imports and exports redefine the directory inode -> directory + linkage, such that the directory authority is explicitly specified + via dir.dir_auth: + + dir.dir_auth == -1 -> directory matches its inode + dir.dir_auth >= 0 -> directory authority is dir.dir_auth + + - Hashed directories redefine the directory -> inode linkage. In + non-hashed directories, inodes match their containing directory. + In hashed directories, each dentry's authority is defined by a hash + function. + + inode.hash_seed == 0 -> inode matches containing directory + inode.hash_seed > 0 -> defined by hash(hash_seed, dentry) + +A directory's "containing_import" (bad name, FIXME) is either the +import or hashed directory that is responsible for delegating a +subtree. Note that the containing_import of a directory may be itself +because it is an import, but it cannot be itself because it is hashed. + +Thus: + + - Import and export operations' manipulation of dir_auth is + completely orthogonal to hashing operations. Hashing methods can + ignore dir_auth, except when they create imports/exports (and break + the inode<->dir auth linkage). + + - Hashdirs act sort of like imports in that they bound an + authoritative region. That is, either hashdirs or imports can be + the key for nested_exports. In some cases, a dir may be both an + import and a hash. + + - Export_dir won't export a hashdir. 
This is because it's tricky + (though not necessarily impossible) due to the way nested_exports is + used with imports versus hashdirs. + + + + +FREEZING + +There are two types of freezing: + + - TREE: recursively freezes everything nested beneath a directory, + until an export or edge of cache is reached. + - DIR: freezes the contents of a single directory. + +Some notes: + + - Occurs on the authoritative node only. + + - Used for suspending critical operations while migrating authority + between nodes or hashing/unhashing directories. + + - Freezes the contents of the cache such that items may not be added, + items cannot be auth pinned, and/or subsequently reexported. The + namespace of the affected portions of the hierarchy may not change. + The content of inodes and other orthogonal operations + (e.g. replication, inode locking and modification) are unaffected. + +Two states are defined: freezing and frozen. The freezing state is +used while waiting for auth_pins to be removed. Once all auth_pins +are gone, the state is changed to frozen. New auth_pins cannot be +added while freezing or frozen. + + +AUTH PINS + +An auth pin keeps a given item on the authoritative node until it is +removed. The pins are tracked recursively, so that a subtree cannot +be frozen if it contains any auth pins. + +If a pin is placed on a non-authoritative item, the item is allowed to +become authoritative; the specific restriction is it cannot be frozen, +which only happens during export-type operations. + + +TYPES OF EXPORTS + +- Actual export of a subtree from one node to another +- A rename between directories on different nodes exports the renamed +_inode_. (If it is a directory, it becomes an export such that the +directory itself does not move.) +- A hash or unhash operation will migrate inodes within the directory +either to or from the directory's main authority. 
+EXPORT PROCESS + + + + +HASHING + +- All nodes discover and open directory + +- Prep message distributes subdir inode replicas for exports so that + peers can open those dirs. This is necessary because subdirs are + converted into exports or imports as needed to avoid migrating + anything except the hashed dir itself. The prep is needed for the + same reasons it's important with exports: the inode authority must + always have the exported dir open so that it gets accurate dir + authority updates, and can keep the inode->dir_auth up to date. + +- MHashDir message distributes the directory contents. + +- While auth is frozen_dir, we can't get_or_open_dir. Otherwise the + Prep messages won't be inclusive of all dirs, and the + imports/exports won't get set up properly. + +TODO +readdir + + +- subtrees stop at hashed dir. hashed dir's dir_auth follows parent + subtree, unless the dir is also an explicit import. thus a hashed + dir can also be an import dir. + + +bananas +apples +blueberries +green pepper +carrots +celery + + + + diff --git a/branches/sage/cephmds2/doc/dentries.txt b/branches/sage/cephmds2/doc/dentries.txt new file mode 100644 index 0000000000000..ab14765998b2f --- /dev/null +++ b/branches/sage/cephmds2/doc/dentries.txt @@ -0,0 +1,4 @@ + +null dentries only exist + - on auth + - on replica, if they are xlock \ No newline at end of file diff --git a/branches/sage/cephmds2/doc/file_modes.txt b/branches/sage/cephmds2/doc/file_modes.txt new file mode 100644 index 0000000000000..d4ceba4034e5f --- /dev/null +++ b/branches/sage/cephmds2/doc/file_modes.txt @@ -0,0 +1,66 @@ + +underlying client capabilities: + +- read + cache +- read sync +- write sync +- write + buffer + (...potentially eventually augmented by byte ranges) + +whatever system of modes, tokens, etc. has to satisfy the basic +constraint that no conflicting capabilities are ever in the +hands of clients. + + +questions: +- is there any use to clients writing to a replica? + - reading, yes.. 
100,000 open same file.. + + +------ + +simplest approach: +- all readers, writers go through authority +- all open, close traffic at replicas forwarded to auth + +- fh state migrates with exports. + + + +-------- + +less simple: +- all writers go through authority + - open, close traffic fw +- readers from any replica + - need token from auth +- weird auth <-> replica <-> client interactions ensue! + + +-------- + +even more complex (and totally FLAWED, ignore this!) + +- clients can open a file with any replica (for read or write). +- replica gets a read or write token from the primary + - primary thus knows if it's all read, all write, mixed, or none. +- once replica has a token it can service as many clients (of given type(s)) as it wants. +- on export, tokens are moved too. + - primary give _itself_ a token too! much simpler. + +- clients maintain a mode for each open file: rdonly, wronly, rdwr, lock +- globally, the mode is controlled by the primary, based on the mixture of + read and write tokens issued + + + +- [optional] if a client has a file open rdwr and the mode is rdonly or wronly, it can + request to read or write from the mds (which might twiddle the mode for performance + reasons.. e.g. lots of ppl rdwr but no actual reading) + + + + +-------- + + diff --git a/branches/sage/cephmds2/doc/header.txt b/branches/sage/cephmds2/doc/header.txt new file mode 100644 index 0000000000000..8a3c51280461d --- /dev/null +++ b/branches/sage/cephmds2/doc/header.txt @@ -0,0 +1,12 @@ +// -*- mode:C++; tab-width:4; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ diff --git a/branches/sage/cephmds2/doc/inos.txt b/branches/sage/cephmds2/doc/inos.txt new file mode 100644 index 0000000000000..b5ab1db25ca60 --- /dev/null +++ b/branches/sage/cephmds2/doc/inos.txt @@ -0,0 +1,11 @@ + +inodeno_t namespace + - relevant both for ino's, and for the (ino) input for Filer and object storage namespace... + +1 - root inode + +100+mds - mds log/journal +200+mds - mds ino, fh allocation tables +300+mds - mds inode files (for non-embedded inodes) + +1000+ - regular files and directories \ No newline at end of file diff --git a/branches/sage/cephmds2/doc/journal.txt b/branches/sage/cephmds2/doc/journal.txt new file mode 100644 index 0000000000000..12d66f86f00f4 --- /dev/null +++ b/branches/sage/cephmds2/doc/journal.txt @@ -0,0 +1,108 @@ + + +journal is distributed among different nodes. because authority changes over time, it's not immediately clear to a recovering node replaying the journal whether the data is "real" or not (it might be exported later in the journal). + + +possibilities: + + +ONE.. bloat the journal! + +- journal entry includes full trace of dirty data (dentries, inodes) up until import point + - local renames implicit.. cache is reattached on replay + - exports are a list of exported dirs.. which are then dumped + ... + +recovery phase 1 +- each entry includes full trace (inodes + dentries) up until the import point +- cache during recovery is fragmented/dangling beneath import points +- when export is encountered items are discarded (marked clean) + +recovery phase 2 +- import roots ping store to determine attachment points (if not already known) + - if it was imported during period, attachment point is already known. + - renames affecting imports are logged too +- import roots discovered from other nodes, attached to hierarchy + +then +- maybe resume normal operations +- if recovery is a background process on a takeover mds, "export" everything to that node. + + +-> journal contains lots of clean data.. 
maybe 5+ times bigger as a result! + +possible fixes: + - collect dir traces into journal chunks so they aren't repeated as often + - each chunk summarizes traces in previous chunk + - hopefully next chunk will include many of the same traces + - if not, then the entry will include it + + + + +=== log entry types === +- all inode, dentry, dir items include a dirty flag. +- dirs are implicitly _never_ complete; even if they are, a fetch before commit is necessary to confirm + +ImportPath - log change in import path +Import - log import addition (w/ path, dirino) + +InoAlloc - allocate ino +InoRelease - release ino + +Inode - inode info, along with dentry+inode trace up to import point +Unlink - (null) dentry + trace, + flag (whether inode/dir is destroyed) +Link - (new) dentry + inode + trace + + +----------------------------- + +TWO.. +- directories in store contain path at time of commit (relative to import, and root) +- replay without attaching anything to hierarchy +- after replay, directories pinged in store to attach to hierarchy + +-> phase 2 too slow! +-> and nested dirs may reattach... that won't be apparent from journal. + - put just parent dir+dentry in dir store.. even worse on phase 2! + + +THREE +- + + + + + + + +metadata journal/log + + +event types: + +chown, chmod, utime + InodeUpdate + +mknod, mkdir, symlink + Mknod .. new inode + link + +unlink, rmdir + Unlink + +rename + Link + Unlink (foreign) +or Rename (local) + +link + Link .. link existing inode + + + + +InodeUpdate +DentryLink +DentryUnlink +InodeCreate +InodeDestroy +Mkdir? 
diff --git a/branches/sage/cephmds2/doc/lazy_posix.txt b/branches/sage/cephmds2/doc/lazy_posix.txt new file mode 100644 index 0000000000000..1d226cd03d8e4 --- /dev/null +++ b/branches/sage/cephmds2/doc/lazy_posix.txt @@ -0,0 +1,53 @@ + +http://www.usenix.org/events/fast05/wips/slides/welch.pdf + + + +-- STATLITE + statlite(const char *filename, struct statlite *buf); + fstatlite(int fd, struct statlite *buf); + lstatlite(const char *filename, struct statlite *buf); + + * file size, mtime are optionally not guaranteed to be correct + * mask field to specify which fields you need to be correct + + +-- READDIR+ + + struct dirent_plus *readdirplus(DIR *dirp); + int readdirplus_r(DIR *dirp, struct dirent_plus *entry, struct dirent_plus **result); + struct dirent_lite *readdirlite(DIR *dirp); + int readdirlite_r(DIR *dirp, struct dirent_lite *entry, struct dirent_lite **result); + + * plus returns lstat + * lite returns lstatlite + + +-- lazy i/o integrity + + O_LAZY to open(2) + + * relax data coherency + * writes may not be visible until lazyio_propagate, fsync, close + + lazyio_propagate(int fd, off_t offset, size_t count); + * my writes are safe + + lazyio_synchronize(int fd, off_t offset, size_t count); + * i will see everyone else's propagated writes + +-- read/write non-serial vectors + + ssize_t readx(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); + ssize_t writex(int fd, const struct iovec *iov, size_t iov_count, struct xtvec *xtv, size_t xtv_count); + + * like readv/writev, but serial + * + + +int lockg(int fd, int cmd, lgid_t *lgid) + group locks + +int openg(char *path, int mode, fh_t *handle); + portable file handle +int sutoc(fh_t *fh); \ No newline at end of file diff --git a/branches/sage/cephmds2/doc/osd_outline.txt b/branches/sage/cephmds2/doc/osd_outline.txt new file mode 100644 index 0000000000000..2c6f3287aac5f --- /dev/null +++ b/branches/sage/cephmds2/doc/osd_outline.txt @@ -0,0 +1,37 @@ + +intro + +osd 
cluster map + requirements + desirable properties + (c)rush + +failure detection + distributed ping or heartbeat + central filter, notifier + +design + placement seed, class/superset, groups + +normal operation + reads + writes + +recovery + triggers: failed disk, or total cluster reorganization + + notify + peering + pull + push + clean + +writes during recovery + +graceful data loss + recovery? + + + + + + diff --git a/branches/sage/cephmds2/doc/osd_replication.txt b/branches/sage/cephmds2/doc/osd_replication.txt new file mode 100644 index 0000000000000..907d00e2050a2 --- /dev/null +++ b/branches/sage/cephmds2/doc/osd_replication.txt @@ -0,0 +1,226 @@ + + +SOME GENERAL REQUIREMENTS + +- cluster expansion: + - any or all of the replicas may move to new OSDs. + +- cluster map may change frequently + - map change should translate into pending replication/migration + state quickly (or better yet, instantly), so that we could push + through a series of (say, botched) maps quickly and be fine, so long + as the final map is correct. + +- ideally, unordered osd<->osd, client<->osd communication + (mds<->mds, client<->mds communication is ordered, but file i/o + would be too slow that way?) + + + + +PRIMARY ONLY PICTURE + +let's completely ignore replication for a while, and see how +complicated the picture needs to be to reliably support cluster expansion. + +typedef __uint64_t version_t; + + +per-Object metadata: +- version #. incremented when an object is modified. + e.g. version_t version; +- on primary, keep list of stray replicas + e.g. map stray_replicas; // osds w/ stray replicas + includes old primary osd(s), until deletion is confirmed. used while rg + is importing. + + +per-RG metadata +- object list. well, a method to fetch it by querying a collection or whatever. +- negative list + e.g. map deleted_objects; + - used to enumerate deleted objects, when in "importing" state.
+- a RG "state" (enum/int) + + + + + + +Normal RG state: +- role=primary + clean - i am primary, all is well. no stray copies. i can + discard my negative object list, since my local + object store tells me everything. + + +After a map change: +- new primary + undef - initially; i don't know RG exists. +- old primary + homeless - i was primary, still have unmolested data. new primary is not yet migrating + (presumably its state=undef.) i need to contact new primary and tell them + this RG exists. + +- new primary + importing - i am migrating data from old primary. keep negative dir entries for deletions. + write locally. proxy reads (force immediate migration). do whole objects + initially (on write, block until i migrate the object). later we can do + sub-object state (where "live" object data is spread across new/old primaries). +- old primary + exporting - primary is migrating my data. + undef - when it finishes. (i will forget this RG existed.) + + +After a second map change (scenario 1): + as above, if we were clean again. + +After a second map change (scenario 2): + we weren't clean yet. +- new primary + undef - initially (until i learn RG exists) +- old primary + importing - i'm still migrating from old old primary +- old old primary + exporting - ... +- old primary +?? importing+exporting - proxy reads as before. continue migrating from old old primary. + + +After a second map change (scenario 3): + we weren't clean yet, and old old primary is also new primary +- new primary (and old old primary) + exporting - change state to importing. be sure to compare object versions, and neg dir + entries (as we always should do, really!). +- old primary + importing - note that the old import source matches new primary, and change + state to exporting, and stop importing. (unlike scenario 2) + +-> this approach could mean that a series of fast map changes could + force data to migrate down a "chain" of old primaries to reach the + new one.
maybe old primary should go from importing -> exporting, + and pass along old old primary id to new primary such that the + import is a many-to-one thing, instead of one-to-one. version + numbers and neg entries will make it easy to pick out correct versions. + + + +For the importing process on a given RG: + +- metadata for each source + - each source has a state: + 'starting' - don't know anything about source yet. query source! + this probably induces the source to change from + 'homeless' or something similar to 'exporting'. + 'importing' - i've fetched the source's object list (and neg + object list). i'm busy reading them! these lists + will shrink as the process continues. after i fetch + an object, i will erase it from the source. + (object metadata will include stray copy info + until i confirm that it's removed.) + 'finishing' - i've read all my data, and i'm telling the old person + to discard any remaining RG metadata (RG contents + should already be gone) + - unmigrated object list + - migrated but not deleted object list + - stray osd is also listed in per-object MD during this stage + - negative object list + - i can remove these items if i see a newer object version (say, + from a different import source or something). + - i can remove any local objects or ignore imported ones if it is + older than deleted version + +- the lists should be sets or otherwise queryable so that while i'm + importing and a real op comes through I can quickly determine if a + given object_id is pending migration etc or if my local store is to + be trusted. + + + + + +SOME CODE BITS + + +typedef __uint64_t version_t; +class Object { + version_t version; + map stray_replicas; +}; + + +class ReplicaGroup { + int enumerate_objects(list& ls); + + int state; + + // for unstable states, + map deleted_objects; // locally + map exporters; // importing from these guys.
+}; + +// primary +#define RG_STATE_CLEAN 1 +#define RG_STATE_IMPORTING 2 // pulling data + +// non-primary +#define RG_STATE_HOMELESS 5 // old primary; new primary not yet + // notified; not yet exporting. +#define RG_STATE_EXPORTING 6 // a newer primary is extracting my + // data. + + +struct RGExporter_t { + int import_state; + + set remaining_objects; // remote object list + set stray_objects; // imported but not deleted. + +}; + + + + + +---- +all crap from here on down + + + + +REPLICAS +- + + + + +OSD STATES +- primary, up to date. +- replica, up to date. + +- primary, proxy to old primary (primaries?) + +- replica, not up to date. + + +REPLICATION STUFF + +Per-RG metadata +- primary + - per-replica state: clean, catching up? +- replica + +Per-object metadata +- primary and replica + - version number/mtime + - rg (reverse indexed) +- primary + - replication level and state. + - commited to memory and/or disk, on which replicas (#1, #2, etc.) +- replica + + + + + +-> \ No newline at end of file diff --git a/branches/sage/cephmds2/doc/performance.txt b/branches/sage/cephmds2/doc/performance.txt new file mode 100644 index 0000000000000..7ca278bd284b1 --- /dev/null +++ b/branches/sage/cephmds2/doc/performance.txt @@ -0,0 +1,36 @@ + + +quick performance test 2005-05-11. 
fakemds, 100x100, asdf/asdf, debug 13 + -g marshalling +real 3m8.697s +user 2m53.282s +sys 0m6.291s + +real 3m3.337s +user 2m49.467s +sys 0m6.243s + + -g no marshalling +real 2m1.464s +user 1m42.680s +sys 0m8.128s + +real 1m49.469s +user 1m34.523s +sys 0m6.410s + + -O3 marshalling +real 1m29.833s +user 1m11.474s +sys 0m7.588s + +real 1m9.439s +user 0m56.071s +sys 0m5.643s + + + -O3 no marshalling +real 1m2.739s +user 0m46.578s +sys 0m7.882s + diff --git a/branches/sage/cephmds2/doc/shared_write_states_nogo.txt b/branches/sage/cephmds2/doc/shared_write_states_nogo.txt new file mode 100644 index 0000000000000..f409617d82681 --- /dev/null +++ b/branches/sage/cephmds2/doc/shared_write_states_nogo.txt @@ -0,0 +1,39 @@ + +// stable states // ------auth----- -----replica----- +#define LOCK_SYNC 0 // R . / . . . WB same ... for stat() +#define LOCK_LOCK 1 // R W / RC . . . . . / RC . . . ... for truncate(), fsync() +#define LOCK_RDONLY 2 // R . / RC R . . same +#define LOCK_MIXED 3 // . . / . R W . same +#define LOCK_WRONLY 4 // . . / . . W WB same + +// transition states +#define LOCK_GSYNCR 8 // R . / RC . . . same +#define LOCK_GSYNCMW 9 // . . / RC . . WB same +#define LOCK_GSYNCMW2 9 // . . / RC . . WB same + +#define LOCK_GLOCKSR 5 // R . / RC . . . . . / RC . . . +#define LOCK_GLOCKMW 7 // . . / RC . . . same + +#define LOCK_GRDONLYM 10 // . . / . R . . same +#define LOCK_GRDONLYM2 10 // --- . . / . R . . +#define LOCK_GRDONLYW 11 // . . / . . . . same +#define LOCK_GRDONLYW2 11 // --- . . / . . . . +#define LOCK_GRDONLYS 12 // R . / RC . . . same +#define LOCK_GRDONLYL 13 // R . / RC . . . --- + +#define LOCK_GMIXEDR 14 // R . / . R . . . . / . R . . +#define LOCK_GMIXEDR2 15 // --- . . / . R . . +#define LOCK_GMIXEDW 16 // . . / . . W . same +#define LOCK_GMIXEDW2 16 // --- . . / . . W . +#define LOCK_GMIXEDS 16 // R . / . . . . . . / . . . . +#define LOCK_GMIXEDS2 16 // --- . . / . . . . +#define LOCK_GMIXEDL 17 // R . / . . . . 
--- + +#define LOCK_GWRONLYR 18 // R . / . . . . same +#define LOCK_GWRONLYR2 18 // --- . . / . . . . +#define LOCK_GWRONLYM 19 // . . / . . . . same +#define LOCK_GWRONLYM2 19 // --- . . / . . . . +#define LOCK_GWRONLYS 20 // R . / . . . WB same +#define LOCK_GWRONLYS2 20 // --- . . / . . . . +#define LOCK_GWRONLYL 21 + diff --git a/branches/sage/cephmds2/doc/shutdown.txt b/branches/sage/cephmds2/doc/shutdown.txt new file mode 100644 index 0000000000000..e5ccde3171004 --- /dev/null +++ b/branches/sage/cephmds2/doc/shutdown.txt @@ -0,0 +1,13 @@ + +- mds0 triggers shutdown by sending a shutdown_start to all nodes. + +- from here on out, all client requests are discarded (unless they are a file close?) + +- each mds checks for outstanding inter-mds transactions. e.g. imports, discoveries, etc. once they're all done, send a shutdown_ready to mds0 + +- each mds successively disassembles its cache, flushing data to long-term storage, and sending inodeexpires, exporting imported dirs to parent (after they're clean + empty) + +- when the cache is empty, send shutdown_done to mds0 and exit. + +- mds0 exits when all mdss have finished. + diff --git a/branches/sage/cephmds2/ebofs/Allocator.cc b/branches/sage/cephmds2/ebofs/Allocator.cc new file mode 100644 index 0000000000000..805957f779a11 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/Allocator.cc @@ -0,0 +1,692 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "Allocator.h" +#include "Ebofs.h" + + +#undef dout +#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << fs->dev.get_device_name() << ").allocator."
+ + +/* + * dump_freelist: debug helper that logs and sanity-checks the allocator tables. + * + * Walks every free bucket table and then the limbo table (b == EBOFS_NUM_FREE_BUCKETS), + * logging each extent and asserting that none overlaps an extent already seen; the + * lengths found in the free buckets (limbo excluded) must add up to fs->free_blocks. + * Afterwards logs every alloc_tab entry together with its reference count. + */ +void Allocator::dump_freelist() +{ + if (1) { + interval_set free; // validate too + + block_t n = 0; + for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { + Table *tab; + if (b < EBOFS_NUM_FREE_BUCKETS) { + tab = fs->free_tab[b]; + dout(0) << "dump bucket " << b << " " << tab->get_num_keys() << endl; + } else { + tab = fs->limbo_tab; + dout(0) << "dump limbo " << tab->get_num_keys() << endl; + } + + if (tab->get_num_keys() > 0) { + Table::Cursor cursor(tab); + /* position the cursor outside assert(): the argument of assert() is not evaluated when NDEBUG is defined */ + int r = tab->find(0, cursor); + assert(r >= 0); + (void)r; + while (1) { + dout(0) << "dump ex " << cursor.current().key << "~" << cursor.current().value << endl; + assert(cursor.current().value > 0); + + if (b < EBOFS_NUM_FREE_BUCKETS) + n += cursor.current().value; + + if (free.contains( cursor.current().key, cursor.current().value )) + dout(0) << "dump bad " << cursor.current().key << "~" << cursor.current().value << endl; + assert(!free.contains( cursor.current().key, cursor.current().value )); + free.insert( cursor.current().key, cursor.current().value ); + if (cursor.move_right() <= 0) break; + } + } else { + //cout << " empty" << endl; + } + } + + assert(n == fs->free_blocks); + dout(0) << "dump combined freelist is " << free << endl; + + + // alloc_tab + if (fs->alloc_tab->get_num_keys() > 0) { + Table >::Cursor cursor(fs->alloc_tab); + /* same as for the free tables: keep the lookup out of assert() */ + int r = fs->alloc_tab->find(0, cursor); + assert(r >= 0); + (void)r; + while (1) { + dout(0) << "alloc ex " << cursor.current().key << "~" << cursor.current().value.first << " ref " + << cursor.current().value.second + << endl; + assert(cursor.current().value.first > 0); + + if (cursor.move_right() <= 0) break; + } + } + } +} + + +int Allocator::find(Extent& ex, int bucket, block_t num, block_t near, int dir) +{ + Table::Cursor cursor(fs->free_tab[bucket]); + bool found = false; + + if ((dir == DIR_ANY || dir == DIR_FWD) && + fs->free_tab[bucket]->find( near, cursor ) >= 0) { + // look to the right + do { + if (cursor.current().value >= num) + found = true; + } while (!found && cursor.move_right() > 0); + } + + 
if ((dir == DIR_ANY || dir == DIR_BACK) && + !found) { + // look to the left + fs->free_tab[bucket]->find( near, cursor ); + + while (!found && cursor.move_left() >= 0) + if (cursor.current().value >= num) + found = true; + } + + if (found) { + ex.start = cursor.current().key; + ex.length = cursor.current().value; + return 0; + } + + return -1; +} + +int Allocator::allocate(Extent& ex, block_t num, block_t near) +{ + //dump_freelist(); + + int dir = DIR_ANY; // no dir + if (near == NEAR_LAST_FWD) { + near = last_pos; + dir = DIR_FWD; // fwd + } + else if (near == NEAR_LAST) + near = last_pos; + + int bucket; + + while (1) { // try twice, if fwd = true + + // look for contiguous extent + for (bucket = pick_bucket(num); bucket < EBOFS_NUM_FREE_BUCKETS; bucket++) { + if (find(ex, bucket, num, near, dir) >= 0) { + // yay! + + // remove original + fs->free_tab[bucket]->remove( ex.start ); + fs->free_blocks -= ex.length; + + if (ex.length > num) { + if (ex.start < near) { + // to the left + if (ex.start + ex.length - num <= near) { + // by a lot. take right-most portion. + Extent left; + left.start = ex.start; + left.length = ex.length - num; + ex.start += left.length; + ex.length -= left.length; + assert(ex.length == num); + _release_loner(left); + } else { + // take middle part. + Extent left,right; + left.start = ex.start; + left.length = near - ex.start; + ex.start = near; + right.start = ex.start + num; + right.length = ex.length - left.length - num; + ex.length = num; + _release_loner(left); + _release_loner(right); + } + } + else { + // to the right. take left-most part. 
+ Extent right; + right.start = ex.start + num; + right.length = ex.length - num; + ex.length = num; + _release_loner(right); + } + } + + dout(20) << "allocate " << ex << " near " << near << endl; + last_pos = ex.end(); + //dump_freelist(); + if (g_conf.ebofs_cloneable) + alloc_inc(ex); + return num; + } + } + + if (dir == DIR_BACK || dir == DIR_ANY) break; + dir = DIR_BACK; + } + + // ok, find partial extent instead. + for (block_t trysize = num/2; trysize >= 1; trysize /= 2) { + int bucket = pick_bucket(trysize); + if (find(ex, bucket, trysize, near) >= 0) { + // yay! + assert(ex.length < num); + + fs->free_tab[bucket]->remove(ex.start); + fs->free_blocks -= ex.length; + last_pos = ex.end(); + dout(20) << "allocate partial " << ex << " (wanted " << num << ") near " << near << endl; + //dump_freelist(); + if (g_conf.ebofs_cloneable) + alloc_inc(ex); + return ex.length; + } + } + + dout(1) << "allocate failed, fs completely full! " << fs->free_blocks << endl; + assert(0); + //dump_freelist(); + return -1; +} + +int Allocator::_release_into_limbo(Extent& ex) +{ + dout(10) << "_release_into_limbo " << ex << endl; + dout(10) << "limbo is " << limbo << endl; + assert(ex.length > 0); + limbo.insert(ex.start, ex.length); + fs->limbo_blocks += ex.length; + return 0; +} + +int Allocator::release(Extent& ex) +{ + if (g_conf.ebofs_cloneable) + return alloc_dec(ex); + + _release_into_limbo(ex); + return 0; +} + +int Allocator::commit_limbo() +{ + dout(20) << "commit_limbo" << endl; + for (map::iterator i = limbo.m.begin(); + i != limbo.m.end(); + i++) { + fs->limbo_tab->insert(i->first, i->second); + //fs->free_blocks += i->second; + } + limbo.clear(); + //fs->limbo_blocks = 0; + //dump_freelist(); + return 0; +} + +int Allocator::release_limbo() +{ + //dump_freelist(); + if (fs->limbo_tab->get_num_keys() > 0) { + Table::Cursor cursor(fs->limbo_tab); + fs->limbo_tab->find(0, cursor); + while (1) { + Extent ex(cursor.current().key, cursor.current().value); + dout(20) << 
"release_limbo ex " << ex << endl; + + fs->limbo_blocks -= ex.length; + _release_merge(ex); + + if (cursor.move_right() <= 0) break; + } + } + fs->limbo_tab->clear(); + //dump_freelist(); + return 0; +} + + + +/* +int Allocator::_alloc_loner_inc(Extent& ex) +{ + Table >::Cursor cursor(fs->alloc_tab); + + if (fs->alloc_tab->find( ex.start, cursor ) + == Table >::Cursor::MATCH) { + assert(cursor.current().value.first == ex.length); + pair& v = cursor.dirty_current_value(); + v.second++; + dout(10) << "_alloc_loner_inc " << ex << " " + << (v.second-1) << " -> " << v.second + << endl; + } else { + // insert it, @1 + fs->alloc_tab->insert(ex.start, pair(ex.length,1)); + dout(10) << "_alloc_loner_inc " << ex << " 0 -> 1" << endl; + } + return 0; +} + +int Allocator::_alloc_loner_dec(Extent& ex) +{ + Table >::Cursor cursor(fs->alloc_tab); + + if (fs->alloc_tab->find( ex.start, cursor ) + == Table >::Cursor::MATCH) { + assert(cursor.current().value.first == ex.length); + if (cursor.current().value.second == 1) { + dout(10) << "_alloc_loner_dec " << ex << " 1 -> 0" << endl; + fs->alloc_tab->remove( cursor.current().key ); + } else { + pair& v = cursor.dirty_current_value(); + --v.second; + dout(10) << "_alloc_loner_dec " << ex << " " + << (v.second+1) << " -> " << v.second + << endl; + } + } else { + assert(0); + } + return 0; +} +*/ + + +int Allocator::alloc_inc(Extent ex) +{ + dout(10) << "alloc_inc " << ex << endl; + + // empty table? + if (fs->alloc_tab->get_num_keys() == 0) { + // easy. 
+ fs->alloc_tab->insert(ex.start, pair(ex.length,1)); + dout(10) << "alloc_inc + " << ex << " 0 -> 1 (first entry)" << endl; + return 0; + } + + Table >::Cursor cursor(fs->alloc_tab); + + // try to move to left (to check for overlap) + int r = fs->alloc_tab->find( ex.start, cursor ); + if (r == Table >::Cursor::OOB || + cursor.current().key > ex.start) { + r = cursor.move_left(); + dout(10) << "alloc_inc move_left r = " << r << endl; + } + + while (1) { + dout(10) << "alloc_inc loop at " << cursor.current().key + << "~" << cursor.current().value.first + << " ref " << cursor.current().value.second + << endl; + + // too far left? + if (cursor.current().key < ex.start && + cursor.current().key + cursor.current().value.first <= ex.start) { + // adjacent? + bool adjacent = false; + if (cursor.current().key + cursor.current().value.first == ex.start && + cursor.current().value.second == 1) + adjacent = true; + + // no overlap. + r = cursor.move_right(); + dout(10) << "alloc_inc move_right r = " << r << endl; + + // at end? + if (r <= 0) { + // hmm! + if (adjacent) { + // adjust previous entry + cursor.move_left(); + pair &v = cursor.dirty_current_value(); + v.first += ex.length; // yay! + dout(10) << "alloc_inc + " << ex << " 0 -> 1 (adjust at end)" << endl; + } else { + // insert at end, finish. + int r = fs->alloc_tab->insert(ex.start, pair(ex.length,1)); + dout(10) << "alloc_inc + " << ex << " 0 -> 1 (at end) .. r = " << r << endl; + //dump_freelist(); + } + return 0; + } + } + + if (cursor.current().key > ex.start) { + // gap. + // oooooo + // nnnnn..... 
+ block_t l = MIN(ex.length, cursor.current().key - ex.start); + + fs->alloc_tab->insert(ex.start, pair(l,1)); + dout(10) << "alloc_inc + " << ex.start << "~" << l << " 0 -> 1" << endl; + ex.start += l; + ex.length -= l; + if (ex.length == 0) break; + fs->alloc_tab->find( ex.start, cursor ); + } + else if (cursor.current().key < ex.start) { + block_t end = cursor.current().value.first + cursor.current().key; + + if (end <= ex.end()) { + // single split + // oooooo + // nnnnn + pair& v = cursor.dirty_current_value(); + v.first = ex.start - cursor.current().key; + int ref = v.second; + + block_t l = end - ex.start; + fs->alloc_tab->insert(ex.start, pair(l, 1+ref)); + + dout(10) << "alloc_inc " << ex.start << "~" << l + << " " << ref << " -> " << ref+1 + << " (right split)" << endl; + + ex.start += l; + ex.length -= l; + if (ex.length == 0) break; + fs->alloc_tab->find( ex.start, cursor ); + + } else { + // double split, finish. + // ------------- + // ------ + pair& v = cursor.dirty_current_value(); + v.first = ex.start - cursor.current().key; + int ref = v.second; + + fs->alloc_tab->insert(ex.start, pair(ex.length, 1+ref)); + + int rl = end - ex.end(); + fs->alloc_tab->insert(ex.end(), pair(rl, ref)); + + dout(10) << "alloc_inc " << ex + << " " << ref << " -> " << ref+1 + << " (double split finish)" + << endl; + + break; + } + } + else { + assert(cursor.current().key == ex.start); + + if (cursor.current().value.first <= ex.length) { + // inc. + // oooooo + // nnnnnnnn + pair& v = cursor.dirty_current_value(); + v.second++; + dout(10) << "alloc_inc " << ex.start << "~" << cursor.current().value.first + << " " << cursor.current().value.second-1 << " -> " + << cursor.current().value.second + << " (left split)" << endl; + ex.start += v.first; + ex.length -= v.first; + if (ex.length == 0) break; + cursor.move_right(); + } else { + // single split, finish. 
+ // oooooo + // nnn + block_t l = cursor.current().value.first - ex.length; + int ref = cursor.current().value.second; + + pair& v = cursor.dirty_current_value(); + v.first = ex.length; + v.second++; + + fs->alloc_tab->insert(ex.end(), pair(l, ref)); + + dout(10) << "alloc_inc " << ex + << " " << ref << " -> " << ref+1 + << " (left split finish)" + << endl; + + break; + } + } + } + + return 0; +} + + +int Allocator::alloc_dec(Extent ex) +{ + dout(10) << "alloc_dec " << ex << endl; + + assert(fs->alloc_tab->get_num_keys() >= 0); + + Table >::Cursor cursor(fs->alloc_tab); + + // try to move to left (to check for overlap) + int r = fs->alloc_tab->find( ex.start, cursor ); + dout(10) << "alloc_dec find r = " << r << endl; + + if (r == Table >::Cursor::OOB || + cursor.current().key > ex.start) { + r = cursor.move_left(); + dout(10) << "alloc_dec move_left r = " << r << endl; + + // too far left? + if (cursor.current().key < ex.start && + cursor.current().key + cursor.current().value.first <= ex.start) { + // no overlap. + dump_freelist(); + assert(0); + } + } + + while (1) { + dout(10) << "alloc_dec ? " << cursor.current().key + << "~" << cursor.current().value.first + << " " << cursor.current().value.second + << ", ex is " << ex + << endl; + + assert(cursor.current().key <= ex.start); // no gap allowed. + + if (cursor.current().key < ex.start) { + block_t end = cursor.current().value.first + cursor.current().key; + + if (end <= ex.end()) { + // single split + // oooooo + // ----- + pair& v = cursor.dirty_current_value(); + v.first = ex.start - cursor.current().key; + int ref = v.second; + dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first + << " " << ref + << " shortened left bit of single" << endl; + + block_t l = end - ex.start; + if (ref > 1) { + fs->alloc_tab->insert(ex.start, pair(l, ref-1)); + dout(10) << "alloc_dec . 
" << ex.start << "~" << l + << " " << ref << " -> " << ref-1 + << endl; + } else { + Extent r(ex.start, l); + _release_into_limbo(r); + } + + ex.start += l; + ex.length -= l; + if (ex.length == 0) break; + fs->alloc_tab->find( ex.start, cursor ); + + } else { + // double split, finish. + // ooooooooooooo + // ------ + pair& v = cursor.dirty_current_value(); + v.first = ex.start - cursor.current().key; + int ref = v.second; + dout(10) << "alloc_dec s " << cursor.current().key << "~" << cursor.current().value.first + << " " << ref + << " shorted left bit of double split" << endl; + + if (ref > 1) { + fs->alloc_tab->insert(ex.start, pair(ex.length, ref-1)); + dout(10) << "alloc_inc s " << ex + << " " << ref << " -> " << ref-1 + << " reinserted middle bit of double split" + << endl; + } else { + _release_into_limbo(ex); + } + + int rl = end - ex.end(); + fs->alloc_tab->insert(ex.end(), pair(rl, ref)); + dout(10) << "alloc_dec s " << ex.end() << "~" << rl + << " " << ref + << " reinserted right bit of double split" << endl; + break; + } + } + else { + assert(cursor.current().key == ex.start); + + if (cursor.current().value.first <= ex.length) { + // inc. + // oooooo + // nnnnnnnn + if (cursor.current().value.second > 1) { + pair& v = cursor.dirty_current_value(); + v.second--; + dout(10) << "alloc_dec s " << ex.start << "~" << cursor.current().value.first + << " " << cursor.current().value.second+1 << " -> " << cursor.current().value.second + << endl; + ex.start += v.first; + ex.length -= v.first; + if (ex.length == 0) break; + cursor.move_right(); + } else { + Extent r(cursor.current().key, cursor.current().value.first); + _release_into_limbo(r); + + ex.start += cursor.current().value.first; + ex.length -= cursor.current().value.first; + cursor.remove(); + + if (ex.length == 0) break; + fs->alloc_tab->find( ex.start, cursor ); + } + } else { + // single split, finish. 
+ // oooooo + // nnn + block_t l = cursor.current().value.first - ex.length; + int ref = cursor.current().value.second; + + if (ref > 1) { + pair& v = cursor.dirty_current_value(); + v.first = ex.length; + v.second--; + dout(10) << "alloc_inc . " << ex + << " " << ref << " -> " << ref-1 + << endl; + } else { + _release_into_limbo(ex); + cursor.remove(); + } + + dout(10) << "alloc_dec s " << ex.end() << "~" << l + << " " << ref + << " reinserted right bit of single split" << endl; + fs->alloc_tab->insert(ex.end(), pair(l, ref)); + break; + } + } + + + } + + return 0; +} + + +/* + * release extent into freelist + * WARNING: *ONLY* use this if you _know_ there are no adjacent free extents + */ +int Allocator::_release_loner(Extent& ex) +{ + assert(ex.length > 0); + int b = pick_bucket(ex.length); + fs->free_tab[b]->insert(ex.start, ex.length); + fs->free_blocks += ex.length; + return 0; +} + +/* + * release extent into freelist + * look for any adjacent extents and merge with them! + */ +int Allocator::_release_merge(Extent& orig) +{ + dout(15) << "_release_merge " << orig << endl; + assert(orig.length > 0); + + Extent newex = orig; + + // one after us? + for (int b=0; b::Cursor cursor(fs->free_tab[b]); + + if (fs->free_tab[b]->find( newex.start+newex.length, cursor ) + == Table::Cursor::MATCH) { + // add following extent to ours + newex.length += cursor.current().value; + + // remove it + fs->free_blocks -= cursor.current().value; + fs->free_tab[b]->remove( cursor.current().key ); + break; + } + } + + // one before us? 
+ for (int b=0; b::Cursor cursor(fs->free_tab[b]); + fs->free_tab[b]->find( newex.start+newex.length, cursor ); + if (cursor.move_left() >= 0 && + (cursor.current().key + cursor.current().value == newex.start)) { + // merge + newex.start = cursor.current().key; + newex.length += cursor.current().value; + + // remove it + fs->free_blocks -= cursor.current().value; + fs->free_tab[b]->remove( cursor.current().key ); + break; + } + } + + // ok, insert newex + _release_loner(newex); + return 0; +} diff --git a/branches/sage/cephmds2/ebofs/Allocator.h b/branches/sage/cephmds2/ebofs/Allocator.h new file mode 100644 index 0000000000000..c53ff2a69fba1 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/Allocator.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __EBOFS_ALLOCATOR_H +#define __EBOFS_ALLOCATOR_H + +#include "types.h" + +#include "include/interval_set.h" + +class Ebofs; + +class Allocator { +public: + const static block_t NEAR_LAST = 0; + const static block_t NEAR_LAST_FWD = 1; + + const static int DIR_ANY = 0; + const static int DIR_FWD = 2; + const static int DIR_BACK = 1; + +protected: + Ebofs *fs; + block_t last_pos; + + + interval_set limbo; + + static int pick_bucket(block_t num) { + int b = 0; + while (num > 1) { + b++; + num = num >> EBOFS_FREE_BUCKET_BITS; + } + if (b >= EBOFS_NUM_FREE_BUCKETS) + b = EBOFS_NUM_FREE_BUCKETS-1; + return b; + } + + int find(Extent& ex, int bucket, block_t num, block_t near, int dir = DIR_ANY); + + void dump_freelist(); + + public: + int _release_into_limbo(Extent& ex); + + int _release_loner(Extent& ex); // release loner extent + int _release_merge(Extent& ex); // release any extent (searches for adjacent) + + //int _alloc_loner_inc(Extent& ex); + //int _alloc_loner_dec(Extent& ex); + + + public: + Allocator(Ebofs *f) : fs(f), last_pos(0) {} + + int allocate(Extent& ex, block_t num, block_t near=NEAR_LAST); + int release(Extent& ex); // alias for alloc_dec + + int alloc_inc(Extent ex); + int alloc_dec(Extent ex); + + + /*int unallocate(Extent& ex) { // skip limbo + return _release_merge(ex); + } + */ + + int commit_limbo(); // limbo -> fs->limbo_tab + int release_limbo(); // fs->limbo_tab -> free_tabs + +}; + +#endif diff --git a/branches/sage/cephmds2/ebofs/BlockDevice.cc b/branches/sage/cephmds2/ebofs/BlockDevice.cc new file mode 100644 index 0000000000000..5188946574643 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/BlockDevice.cc @@ -0,0 +1,769 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License 
version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "BlockDevice.h" + +#include "config.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifndef __CYGWIN__ +#include +#endif + + +/******************************************* + * biovec + */ + +inline ostream& operator<<(ostream& out, BlockDevice::biovec &bio) +{ + out << "bio("; + if (bio.type == BlockDevice::biovec::IO_READ) out << "rd "; + if (bio.type == BlockDevice::biovec::IO_WRITE) out << "wr "; + out << bio.start << "~" << bio.length; + if (bio.note) out << " " << bio.note; + out << " " << &bio; + out << ")"; + return out; +} + + + +/******************************************* + * ElevatorQueue + */ + +#undef dout +#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").elevatorq." +#define derr(x) if (x <= g_conf.debug_bdev) cerr << "bdev(" << dev << ").elevatorq." + + +int BlockDevice::ElevatorQueue::dequeue_io(list& biols, + block_t& start, block_t& length, + interval_set& block_lock) +{ + // queue empty? + assert(!io_map.empty()); + + dout(20) << "dequeue_io el_pos " << el_pos << " dir " << el_dir_forward << endl; + + // find our position: i >= pos + map::iterator i; + + int tries = g_conf.bdev_el_bidir + 1; + while (tries > 0) { + if (el_dir_forward) { + i = io_map.lower_bound(el_pos); + if (i != io_map.end()) { + break; // not at end. good. + } + } else { + i = io_map.upper_bound(el_pos); + if (i != io_map.begin()) { + i--; // and back down one (to get i <= pos). good. + break; + } + } + + // reverse (or initial startup)? + if (g_conf.bdev_el_bidir || !el_dir_forward) { + // dout(20) << "restart reversing" << endl; + el_dir_forward = !el_dir_forward; + } + + if (el_dir_forward) { + // forward + el_pos = 0; + + if (g_conf.bdev_el_fw_max_ms) { + el_stop = g_clock.now(); + utime_t max(0, 1000*g_conf.bdev_el_fw_max_ms); // (s,us), convert ms -> us! 
+ el_stop += max; + // dout(20) << "restart forward sweep for " << max << endl; + } else { + // dout(20) << "restart fowrard sweep" << endl; + } + } else { + // reverse + el_pos = bdev->get_num_blocks(); + + if (g_conf.bdev_el_bw_max_ms) { + el_stop = g_clock.now(); + utime_t max(0, 1000*g_conf.bdev_el_bw_max_ms); // (s,us), convert ms -> us! + el_stop += max; + // dout(20) << "restart reverse sweep for " << max << endl; + } else { + // dout(20) << "restart reverse sweep" << endl; + } + } + + tries--; + } + + assert(tries > 0); // this shouldn't happen if the queue is non-empty. + + // get some biovecs + int num_bio = 0; + + dout(20) << "dequeue_io starting with " << i->first << " " << *i->second << endl; + + // merge contiguous ops + char type = i->second->type; // read or write + int num_iovs = 0; // count eventual iov's for readv/writev + + start = i->first; + length = 0; + + if (el_dir_forward) + el_pos = start; + else + el_pos = i->first + i->second->length; + + // while (contiguous) + while ((( el_dir_forward && el_pos == i->first) || + (!el_dir_forward && el_pos == i->first + i->second->length)) && + type == i->second->type) { + biovec *bio = i->second; + + // allowed? (not already submitted to kernel?) + if (block_lock.intersects(bio->start, bio->length)) { + // dout(20) << "dequeue_io " << bio->start << "~" << bio->length + // << " intersects block_lock " << block_lock << endl; + break; // stop, or go with what we've got so far + } + + // add to biols + int nv = bio->bl.buffers().size(); // how many iov's in this bio's bufferlist? + if (num_iovs + nv >= g_conf.bdev_iov_max) break; // too many! 
+ num_iovs += nv; + + start = MIN(start, bio->start); + length += bio->length; + + if (el_dir_forward) { + //dout(20) << "dequeue_io fw dequeue io at " << el_pos << " " << *i->second << endl; + biols.push_back(bio); // add at back + } else { + // dout(20) << "dequeue_io bw dequeue io at " << el_pos << " " << *i->second << endl; + biols.push_front(bio); // add at front + } + num_bio++; + + // move elevator pointer + bool at_end = false; + map::iterator prev = i; + if (el_dir_forward) { + el_pos += bio->length; // cont. next would start right after us + i++; + if (i == io_map.end()) { + at_end = true; + } + } else { + el_pos -= bio->length; + if (i == io_map.begin()) { + at_end = true; + } else { + i--; + } + } + + // dequeue + io_map.erase(prev); + bio->in_queue = 0; + + if (at_end) break; + } + + return num_bio; +} + + + +/******************************************* + * BarrierQueue + */ +#undef dout +#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ").barrierq." + +void BlockDevice::BarrierQueue::barrier() +{ + if (!qls.empty() && qls.front()->empty()) { + assert(qls.size() == 1); + dout(10) << "barrier not adding new queue, front is empty" << endl; + } else { + qls.push_back(new ElevatorQueue(bdev, dev)); + dout(10) << "barrier adding new elevator queue (now " << qls.size() << "), front queue has " + << qls.front()->size() << " ios left" << endl; + } +} + +bool BlockDevice::BarrierQueue::bump() +{ + assert(!qls.empty()); + + // is the front queue empty? 
+ if (qls.front()->empty() && + qls.front() != qls.back()) { + delete qls.front(); + qls.pop_front(); + dout(10) << "dequeue_io front empty, moving to next queue (" << qls.front()->size() << ")" << endl; + return true; + } + + return false; +} + +int BlockDevice::BarrierQueue::dequeue_io(list& biols, + block_t& start, block_t& length, + interval_set& locked) +{ + assert(!qls.empty()); + int n = qls.front()->dequeue_io(biols, start, length, locked); + bump(); // in case we emptied the front queue + return n; +} + + + + +/******************************************* + * BlockDevice + */ + +#undef dout +#define dout(x) if (x <= g_conf.debug_bdev) cout << "bdev(" << dev << ")." + + + +block_t BlockDevice::get_num_blocks() +{ + if (!num_blocks) { + assert(fd > 0); + +#ifdef BLKGETSIZE64 + // ioctl block device? + ioctl(fd, BLKGETSIZE64, &num_blocks); +#endif + + if (!num_blocks) { + // hmm, try stat! + struct stat st; + fstat(fd, &st); + num_blocks = st.st_size; + } + + num_blocks /= (__uint64_t)EBOFS_BLOCK_SIZE; + + if (g_conf.bdev_fake_mb) { + num_blocks = g_conf.bdev_fake_mb * 256; + dout(0) << "faking dev size " << g_conf.bdev_fake_mb << " mb" << endl; + } + if (g_conf.bdev_fake_max_mb && + num_blocks > (block_t)g_conf.bdev_fake_max_mb * 256ULL) { + dout(0) << "faking dev size " << g_conf.bdev_fake_max_mb << " mb" << endl; + num_blocks = g_conf.bdev_fake_max_mb * 256; + } + + } + return num_blocks; +} + + + +/** io thread + * each worker thread dequeues ios from the root_queue and submits them to the kernel. + */ +void* BlockDevice::io_thread_entry() +{ + lock.Lock(); + + int whoami = io_threads_started++; + io_threads_running++; + assert(io_threads_running <= g_conf.bdev_iothreads); + dout(10) << "io_thread" << whoami << " start, " << io_threads_running << " now running" << endl; + + // get my own fd (and file position pointer) + int fd = open_fd(); + assert(fd > 0); + + while (!io_stop) { + bool do_sleep = false; + + // queue empty? 
+ if (root_queue.empty()) { + // sleep + do_sleep = true; + } else { + dout(20) << "io_thread" << whoami << " going" << endl; + + block_t start, length; + list biols; + int n = root_queue.dequeue_io(biols, start, length, io_block_lock); + + if (n == 0) { + // failed to dequeue a do-able op, sleep for now + dout(20) << "io_thread" << whoami << " couldn't dequeue doable op, sleeping" << endl; + assert(io_threads_running > 1); // there must be someone else, if we couldn't dequeue something doable. + do_sleep = true; + } + else { + // lock blocks + assert(start == biols.front()->start); + io_block_lock.insert(start, length); + + // drop lock to do the io + lock.Unlock(); + do_io(fd, biols); + lock.Lock(); + + // unlock blocks + io_block_lock.erase(start, length); + + // someone might have blocked on our block_lock? + if (io_threads_running < g_conf.bdev_iothreads && + (int)root_queue.size() > io_threads_running) + io_wakeup.SignalAll(); + } + } + + if (do_sleep) { + do_sleep = false; + + // sleep + io_threads_running--; + dout(20) << "io_thread" << whoami << " sleeping, " << io_threads_running << " threads now running," + << " queue has " << root_queue.size() << endl; + + if (g_conf.bdev_idle_kick_after_ms > 0 && + io_threads_running == 0 && + idle_kicker) { + // first wait for signal | timeout + io_wakeup.WaitInterval(lock, utime_t(0, g_conf.bdev_idle_kick_after_ms*1000)); + + // should we still be sleeping? (did we get woken up, or did timer expire? + if (root_queue.empty() && io_threads_running == 0) { + idle_kicker->kick(); // kick + io_wakeup.Wait(lock); // and wait + } + } else { + // normal, just wait. 
+ io_wakeup.Wait(lock); + } + + io_threads_running++; + assert(io_threads_running <= g_conf.bdev_iothreads); + dout(20) << "io_thread" << whoami << " woke up, " << io_threads_running << " threads now running" << endl; + } + } + + // clean up + ::close(fd); + io_threads_running--; + + lock.Unlock(); + + dout(10) << "io_thread" << whoami << " finish" << endl; + return 0; +} + + + +/** do_io + * do a single io operation + * (lock is NOT held, but we own the *biovec) + */ +void BlockDevice::do_io(int fd, list& biols) +{ + int r; + assert(!biols.empty()); + + // get full range, type, bl + bufferlist bl; + bl.claim(biols.front()->bl); + block_t start = biols.front()->start; + block_t length = biols.front()->length; + char type = biols.front()->type; + + list::iterator p = biols.begin(); + int numbio = 1; + for (p++; p != biols.end(); p++) { + length += (*p)->length; + bl.claim_append((*p)->bl); + numbio++; + } + + // do it + dout(20) << "do_io start " << (type==biovec::IO_WRITE?"write":"read") + << " " << start << "~" << length + << " " << numbio << " bits" << endl; + if (type == biovec::IO_WRITE) { + r = _write(fd, start, length, bl); + } else if (type == biovec::IO_READ) { + r = _read(fd, start, length, bl); + } else assert(0); + dout(20) << "do_io finish " << (type==biovec::IO_WRITE?"write":"read") + << " " << start << "~" << length << endl; + + // set rval + for (p = biols.begin(); p != biols.end(); p++) + (*p)->rval = r; + + if (1) { + // put in completion queue + complete_lock.Lock(); + complete_queue.splice( complete_queue.end(), biols ); + complete_queue_len += numbio; + complete_wakeup.Signal(); + complete_lock.Unlock(); + } else { + // be slow and finish synchronously + for (p = biols.begin(); p != biols.end(); p++) + finish_io(*p); + } +} + + +/** finish_io + * + * finish an io by signaling the cond or performing a callback. + * called by completion thread, unless that's disabled above. 
+ */ +void BlockDevice::finish_io(biovec *bio) +{ + bio->done = true; + if (bio->cond) { + bio->cond->Signal(); + } + else if (bio->cb) { + bio->cb->finish((ioh_t)bio, bio->rval); + delete bio->cb; + delete bio; + } +} + +/*** completion_thread + * handle Cond signals or callbacks for completed ios + */ +void* BlockDevice::complete_thread_entry() +{ + complete_lock.Lock(); + dout(10) << "complete_thread start" << endl; + + while (!io_stop) { + + while (!complete_queue.empty()) { + list ls; + ls.swap(complete_queue); + dout(10) << "complete_thread grabbed " << complete_queue_len << " biovecs" << endl; + complete_queue_len = 0; + + complete_lock.Unlock(); + + // finish + for (list::iterator p = ls.begin(); + p != ls.end(); + p++) { + biovec *bio = *p; + dout(20) << "complete_thread finishing " << *bio << endl; + finish_io(bio); + } + + complete_lock.Lock(); + } + if (io_stop) break; + + /* + if (io_threads_running == 0 && idle_kicker) { + complete_lock.Unlock(); + idle_kicker->kick(); + complete_lock.Lock(); + if (!complete_queue.empty() || io_stop) + continue; + } + */ + + dout(25) << "complete_thread sleeping" << endl; + complete_wakeup.Wait(complete_lock); + } + + dout(10) << "complete_thread finish" << endl; + complete_lock.Unlock(); + return 0; +} + + + + +// io queue + +void BlockDevice::_submit_io(biovec *b) +{ + // NOTE: lock must be held + dout(15) << "_submit_io " << *b << endl; + + // wake up io_thread(s)? + if ((int)root_queue.size() == io_threads_running) + io_wakeup.SignalOne(); + else if ((int)root_queue.size() > io_threads_running) + io_wakeup.SignalAll(); + + // queue + root_queue.submit_io(b); + + /* + // [DEBUG] check for overlapping ios + // BUG: this doesn't detect all overlaps w/ the next queue thing. + if (g_conf.bdev_debug_check_io_overlap) { + // BUG: this doesn't catch everything! eg 1~10000000 will be missed.... 
+ multimap::iterator p = io_queue.lower_bound(b->start); + if ((p != io_queue.end() && + p->first < b->start+b->length) || + (p != io_queue.begin() && + (p--, p->second->start + p->second->length > b->start))) { + dout(1) << "_submit_io new io " << *b + << " overlaps with existing " << *p->second << endl; + cerr << "_submit_io new io " << *b + << " overlaps with existing " << *p->second << endl; + } + } + */ + +} + +int BlockDevice::_cancel_io(biovec *bio) +{ + // NOTE: lock must be held + + if (bio->in_queue == 0) { + dout(15) << "_cancel_io " << *bio << " FAILED" << endl; + return -1; + } else { + dout(15) << "_cancel_io " << *bio << endl; + bio->in_queue->cancel_io(bio); + if (root_queue.bump()) + io_wakeup.SignalAll(); // something happened! + return 0; + } +} + + + +// low level io + +int BlockDevice::_read(int fd, block_t bno, unsigned num, bufferlist& bl) +{ + dout(10) << "_read " << bno << "~" << num << endl; + + assert(fd > 0); + + off_t offset = bno * EBOFS_BLOCK_SIZE; + off_t actual = lseek(fd, offset, SEEK_SET); + assert(actual == offset); + + size_t len = num*EBOFS_BLOCK_SIZE; + assert(bl.length() >= len); + + struct iovec iov[ bl.buffers().size() ]; + int n = 0; + size_t left = len; + for (list::const_iterator i = bl.buffers().begin(); + i != bl.buffers().end(); + i++) { + assert(i->length() % EBOFS_BLOCK_SIZE == 0); + + iov[n].iov_base = (void*)i->c_str(); + iov[n].iov_len = MIN(left, i->length()); + + left -= iov[n].iov_len; + n++; + if (left == 0) break; + } + + int got = ::readv(fd, iov, n); + assert(got <= (int)len); + + return 0; +} + +int BlockDevice::_write(int fd, unsigned bno, unsigned num, bufferlist& bl) +{ + dout(10) << "_write " << bno << "~" << num << endl; + + assert(fd > 0); + + off_t offset = (off_t)bno << EBOFS_BLOCK_BITS; + assert((off_t)bno * (off_t)EBOFS_BLOCK_SIZE == offset); + off_t actual = lseek(fd, offset, SEEK_SET); + assert(actual == offset); + + // write buffers + size_t len = num*EBOFS_BLOCK_SIZE; + + struct iovec iov[ 
bl.buffers().size() ]; + + int n = 0; + size_t left = len; + for (list::const_iterator i = bl.buffers().begin(); + i != bl.buffers().end(); + i++) { + assert(i->length() % EBOFS_BLOCK_SIZE == 0); + + iov[n].iov_base = (void*)i->c_str(); + iov[n].iov_len = MIN(left, i->length()); + + assert((((unsigned long long)iov[n].iov_base) & 4095ULL) == 0); + assert((iov[n].iov_len & 4095) == 0); + + left -= iov[n].iov_len; + n++; + if (left == 0) break; + } + + int r = ::writev(fd, iov, n); + + if (r < 0) { + dout(1) << "couldn't write bno " << bno << " num " << num + << " (" << len << " bytes) in " << n << " iovs, r=" << r + << " errno " << errno << " " << strerror(errno) << endl; + dout(1) << "bl is " << bl << endl; + assert(0); + } else { + assert(r == (int)len); + } + + return 0; +} + + + +// open/close + +int BlockDevice::open_fd() +{ + return ::open(dev.c_str(), O_RDWR|O_SYNC|O_DIRECT, 0); +} + +int BlockDevice::open(kicker *idle) +{ + assert(fd == 0); + + // open? + fd = open_fd(); + if (fd < 0) { + dout(1) << "open failed, r = " << fd << " " << strerror(errno) << endl; + fd = 0; + return -1; + } + + // lock + if (g_conf.bdev_lock) { + int r = ::flock(fd, LOCK_EX|LOCK_NB); + if (r < 0) { + derr(1) << "open " << dev << " failed to get LOCK_EX" << endl; + assert(0); + return -1; + } + } + + // figure size + __uint64_t bsize = get_num_blocks(); + + dout(2) << "open " << bsize << " bytes, " << num_blocks << " blocks" << endl; + + // start thread + io_threads_started = 0; + io_threads.clear(); + for (int i=0; icreate(); + } + complete_thread.create(); + + // idle kicker? 
+ idle_kicker = idle; + + return fd; +} + + +int BlockDevice::close() +{ + assert(fd>0); + + idle_kicker = 0; + + // shut down io thread + dout(10) << "close stopping io+complete threads" << endl; + lock.Lock(); + complete_lock.Lock(); + io_stop = true; + io_wakeup.SignalAll(); + complete_wakeup.SignalAll(); + complete_lock.Unlock(); + lock.Unlock(); + + + for (int i=0; ijoin(); + delete io_threads[i]; + } + io_threads.clear(); + + complete_thread.join(); + + io_stop = false; // in case we start again + + dout(2) << "close " << endl; + + if (g_conf.bdev_lock) + ::flock(fd, LOCK_UN); + + ::close(fd); + fd = 0; + + return 0; +} + +int BlockDevice::cancel_io(ioh_t ioh) +{ + biovec *pbio = (biovec*)ioh; + + lock.Lock(); + int r = _cancel_io(pbio); + lock.Unlock(); + + // FIXME? + if (r == 0 && pbio->cb) { + //pbio->cb->finish(ioh, 0); + delete pbio->cb; + delete pbio; + } + + return r; +} + diff --git a/branches/sage/cephmds2/ebofs/BlockDevice.h b/branches/sage/cephmds2/ebofs/BlockDevice.h new file mode 100644 index 0000000000000..25adf62606947 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/BlockDevice.h @@ -0,0 +1,331 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __EBOFS_BLOCKDEVICE_H +#define __EBOFS_BLOCKDEVICE_H + +#include "include/buffer.h" +#include "include/interval_set.h" +#include "include/Context.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" + +#include "types.h" + + +typedef void *ioh_t; // opaque handle to an io request. 
(in actuality, a biovec*) + + +class BlockDevice { + public: + // callback type for io completion notification + class callback { + public: + virtual ~callback() {} + virtual void finish(ioh_t ioh, int rval) = 0; + }; + + // kicker for idle notification + class kicker { + public: + virtual ~kicker() {} + virtual void kick() = 0; + }; + + + /********************************************************/ + + class Queue; + + // io item + // two variants: one with Cond*, one with callback*. + class biovec { + public: + static const char IO_WRITE = 1; + static const char IO_READ = 2; + + char type; + block_t start, length; + bufferlist bl; + callback *cb; + Cond *cond; + int rval; + char *note; + bool done; + + Queue *in_queue; + + biovec(char t, block_t s, block_t l, bufferlist& b, callback *c, char *n=0) : + type(t), start(s), length(l), bl(b), cb(c), cond(0), rval(0), note(n), done(false), in_queue(0) {} + biovec(char t, block_t s, block_t l, bufferlist& b, Cond *c, char *n=0) : + type(t), start(s), length(l), bl(b), cb(0), cond(c), rval(0), note(n), done(false), in_queue(0) {} + }; + friend ostream& operator<<(ostream& out, biovec &bio); + + + /********************************************************/ + + /* + * Queue -- abstract IO queue interface + */ + class Queue { + public: + virtual ~Queue() {} + virtual void submit_io(biovec *b) = 0; + virtual void cancel_io(biovec *b) = 0; + virtual int dequeue_io(list& biols, + block_t& start, block_t& length, + interval_set& locked) = 0; + virtual int size() = 0; + virtual bool empty() { return size() == 0; } + }; + + /* + * ElevatorQueue - simple elevator scheduler queue + */ + class ElevatorQueue : public Queue { + BlockDevice *bdev; + const char *dev; + map io_map; + bool el_dir_forward; + block_t el_pos; + utime_t el_stop; + + public: + ElevatorQueue(BlockDevice *bd, const char *d) : + bdev(bd), dev(d), + el_dir_forward(false), + el_pos(0) {} + void submit_io(biovec *b) { + b->in_queue = this; + 
assert(io_map.count(b->start) == 0); + io_map[b->start] = b; + } + void cancel_io(biovec *b) { + assert(b->in_queue == this); + assert(io_map.count(b->start) && + io_map[b->start] == b); + io_map.erase(b->start); + b->in_queue = 0; + } + int dequeue_io(list& biols, + block_t& start, block_t& length, + interval_set& locked); + int size() { + return io_map.size(); + } + }; + + /* + * BarrierQueue - lets you specify io "barriers" + * barrier() - force completion of all prior IOs before + * future ios are started. + * bump() - must be called after cancel_io to properly + * detect empty subqueue. + */ + class BarrierQueue : public Queue { + BlockDevice *bdev; + const char *dev; + list qls; + public: + BarrierQueue(BlockDevice *bd, const char *d) : bdev(bd), dev(d) { + barrier(); + } + int size() { + // this isn't perfectly accurate. + if (!qls.empty()) + return qls.front()->size(); + return 0; + } + void submit_io(biovec *b) { + assert(!qls.empty()); + qls.back()->submit_io(b); + } + void cancel_io(biovec *b) { + assert(0); // shouldn't happen. + } + int dequeue_io(list& biols, + block_t& start, block_t& length, + interval_set& locked); + void barrier(); + bool bump(); + }; + + + private: + string dev; // my device file + int fd; + block_t num_blocks; + + Mutex lock; + + /** the root io queue. + * i current assumeit's a barrier queue,but this can be changed + * with some minor rearchitecting. + */ + BarrierQueue root_queue; + + kicker *idle_kicker; // not used.. + + /* io_block_lock - block ranges current dispatched to kernel + * once a bio is dispatched, it cannot be canceled, so an overlapping + * io and be submitted. the overlapping io cannot be dispatched + * to the kernel, however, until the original io finishes, or else + * there will be a race condition. 
+ */ + interval_set io_block_lock; // blocks currently dispatched to kernel + + // io threads + Cond io_wakeup; + bool io_stop; + int io_threads_started, io_threads_running; + + void *io_thread_entry(); + + class IOThread : public Thread { + BlockDevice *dev; + public: + IOThread(BlockDevice *d) : dev(d) {} + void *entry() { return (void*)dev->io_thread_entry(); } + } ; + + vector io_threads; + + // private io interface + int open_fd(); // get an fd (for a thread) + + void _submit_io(biovec *b); + int _cancel_io(biovec *bio); + void do_io(int fd, list& biols); // called by an io thread + + // low level io + int _read(int fd, block_t bno, unsigned num, bufferlist& bl); + int _write(int fd, unsigned bno, unsigned num, bufferlist& bl); + + + // completion callback queue + Mutex complete_lock; + Cond complete_wakeup; + list complete_queue; + int complete_queue_len; + + void finish_io(biovec *bio); + + // complete thread + void *complete_thread_entry(); + class CompleteThread : public Thread { + BlockDevice *dev; + public: + CompleteThread(BlockDevice *d) : dev(d) {} + void *entry() { return (void*)dev->complete_thread_entry(); } + } complete_thread; + + + public: + BlockDevice(const char *d) : + dev(d), fd(0), num_blocks(0), + root_queue(this, dev.c_str()), + idle_kicker(0), + io_stop(false), io_threads_started(0), io_threads_running(0), + complete_queue_len(0), + complete_thread(this) { } + ~BlockDevice() { + if (fd > 0) close(); + } + + // get size in blocks + block_t get_num_blocks(); + const char *get_device_name() const { return dev.c_str(); } + + // open/close + int open(kicker *idle = 0); + int close(); + + // state stuff + bool is_idle() { + lock.Lock(); + bool idle = (io_threads_running == 0) && root_queue.empty(); + lock.Unlock(); + return idle; + } + void barrier() { + lock.Lock(); + root_queue.barrier(); + lock.Unlock(); + } + + // ** blocking interface ** + + // read + int read(block_t bno, unsigned num, bufferptr& bptr, char *n=0) { + bufferlist bl; + 
bl.push_back(bptr); + return read(bno, num, bl, n); + } + int read(block_t bno, unsigned num, bufferlist& bl, char *n=0) { + Cond c; + biovec bio(biovec::IO_READ, bno, num, bl, &c, n); + + lock.Lock(); + _submit_io(&bio); + barrier(); // need this, to prevent starvation! + while (!bio.done) + c.Wait(lock); + lock.Unlock(); + return bio.rval; + } + + // write + int write(unsigned bno, unsigned num, bufferptr& bptr, char *n=0) { + bufferlist bl; + bl.push_back(bptr); + return write(bno, num, bl, n); + } + int write(unsigned bno, unsigned num, bufferlist& bl, char *n=0) { + Cond c; + biovec bio(biovec::IO_WRITE, bno, num, bl, &c, n); + + lock.Lock(); + _submit_io(&bio); + barrier(); // need this, to prevent starvation! + while (!bio.done) + c.Wait(lock); + lock.Unlock(); + return bio.rval; + } + + // ** non-blocking interface ** + ioh_t read(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { + biovec *pbio = new biovec(biovec::IO_READ, bno, num, bl, fin, n); + lock.Lock(); + _submit_io(pbio); + lock.Unlock(); + return (ioh_t)pbio; + } + ioh_t write(block_t bno, unsigned num, bufferlist& bl, callback *fin, char *n=0) { + biovec *pbio = new biovec(biovec::IO_WRITE, bno, num, bl, fin, n); + lock.Lock(); + _submit_io(pbio); + lock.Unlock(); + return (ioh_t)pbio; + } + int cancel_io(ioh_t ioh); + +}; + + + + +#endif diff --git a/branches/sage/cephmds2/ebofs/BufferCache.cc b/branches/sage/cephmds2/ebofs/BufferCache.cc new file mode 100644 index 0000000000000..cee7f2c12ce79 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/BufferCache.cc @@ -0,0 +1,1045 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include "BufferCache.h" +#include "Onode.h" + + +/*********** BufferHead **************/ + + +#undef dout +#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bh." + + + + + + +/************ ObjectCache **************/ + + +#undef dout +#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.oc." + + + +void ObjectCache::rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl) +{ + list waiters; + + dout(10) << "rx_finish " << start << "~" << length << endl; + for (map::iterator p = data.lower_bound(start); + p != data.end(); + p++) { + BufferHead *bh = p->second; + dout(10) << "rx_finish ?" << *bh << endl; + assert(p->first == bh->start()); + + // past? + if (p->first >= start+length) break; + if (bh->end() > start+length) break; // past + + assert(p->first >= start); + assert(bh->end() <= start+length); + + dout(10) << "rx_finish !" << *bh << endl; + + if (bh->rx_ioh == ioh) + bh->rx_ioh = 0; + + if (bh->is_rx()) { + assert(bh->get_version() == 0); + assert(bh->end() <= start+length); + assert(bh->start() >= start); + dout(10) << "rx_finish rx -> clean on " << *bh << endl; + bh->data.substr_of(bl, (bh->start()-start)*EBOFS_BLOCK_SIZE, bh->length()*EBOFS_BLOCK_SIZE); + bc->mark_clean(bh); + } + else if (bh->is_partial()) { + dout(10) << "rx_finish partial -> tx on " << *bh << endl; + + if (1) { + // double-check what block i am + vector exv; + on->map_extents(bh->start(), 1, exv); + assert(exv.size() == 1); + block_t cur_block = exv[0].start; + assert(cur_block == bh->partial_tx_to); + } + + // ok, cancel my low-level partial (since we're still here, and can bh_write ourselves) + bc->cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); + + // apply partial to myself + assert(bh->data.length() == 0); + bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); + bh->data.push_back( bp ); + bh->data.copy_in(0, EBOFS_BLOCK_SIZE, bl); + bh->apply_partial(); + + // write "normally" + 
bc->mark_dirty(bh); + bc->bh_write(on, bh, bh->partial_tx_to);//cur_block); + + // clean up a bit + bh->partial_tx_to = 0; + bh->partial_tx_epoch = 0; + bh->partial.clear(); + } + else { + dout(10) << "rx_finish ignoring status on (dirty|tx|clean) " << *bh << endl; + assert(bh->is_dirty() || // was overwritten + bh->is_tx() || // was overwritten and queued + bh->is_clean()); // was overwritten, queued, _and_ flushed to disk + } + + // trigger waiters + for (map >::iterator p = bh->waitfor_read.begin(); + p != bh->waitfor_read.end(); + p++) { + assert(p->first >= bh->start() && p->first < bh->end()); + waiters.splice(waiters.begin(), p->second); + } + bh->waitfor_read.clear(); + } + + finish_contexts(waiters); +} + + +void ObjectCache::tx_finish(ioh_t ioh, block_t start, block_t length, + version_t version, version_t epoch) +{ + dout(10) << "tx_finish " << start << "~" << length << " v" << version << endl; + for (map::iterator p = data.lower_bound(start); + p != data.end(); + p++) { + BufferHead *bh = p->second; + dout(30) << "tx_finish ?bh " << *bh << endl; + assert(p->first == bh->start()); + + // past? + if (p->first >= start+length) break; + + if (bh->tx_ioh == ioh) + bh->tx_ioh = 0; + + if (!bh->is_tx()) { + dout(10) << "tx_finish bh not marked tx, skipping" << endl; + continue; + } + assert(bh->is_tx()); + + if (version == bh->version) { + dout(10) << "tx_finish tx -> clean on " << *bh << endl; + assert(bh->end() <= start+length); + bh->set_last_flushed(version); + bc->mark_clean(bh); + } else { + dout(10) << "tx_finish leaving tx, " << bh->version << " > " << version + << " on " << *bh << endl; + assert(bh->version > version); + } + } +} + + + +/* + * return any bh's that are (partially) in this range that are TX. + */ +int ObjectCache::find_tx(block_t start, block_t len, + list& tx) +{ + map::iterator p = data.lower_bound(start); + + block_t cur = start; + block_t left = len; + + /* don't care about overlap, we want things _fully_ in start~len. 
+ if (p != data.begin() && + (p == data.end() || p->first > cur)) { + p--; // might overlap! + if (p->first + p->second->length() <= cur) + p++; // doesn't overlap. + } + */ + + while (left > 0) { + assert(cur+left == start+len); + + // at end? + if (p == data.end()) + break; + + if (p->first <= cur) { + // have it (or part of it) + BufferHead *e = p->second; + + if (e->end() <= start+len && + e->is_tx()) + tx.push_back(e); + + block_t lenfromcur = MIN(e->end() - cur, left); + cur += lenfromcur; + left -= lenfromcur; + p++; + continue; // more? + } else if (p->first > cur) { + // gap.. miss + block_t next = p->first; + left -= (next-cur); + cur = next; + continue; + } + else + assert(0); + } + + return 0; +} + + + +/* + * map a range of blocks into buffer_heads. + * - create missing buffer_heads as necessary. + * - fragment along disk extent boundaries + */ +int ObjectCache::map_read(block_t start, block_t len, + map& hits, + map& missing, + map& rx, + map& partial) { + + map::iterator p = data.lower_bound(start); + + block_t cur = start; + block_t left = len; + + if (p != data.begin() && + (p == data.end() || p->first > cur)) { + p--; // might overlap! + if (p->first + p->second->length() <= cur) + p++; // doesn't overlap. + } + + while (left > 0) { + // at end? + if (p == data.end()) { + // rest is a miss. + vector exv; + //on->map_extents(cur, left, exv); // we might consider some prefetch here. 
+ on->map_extents(cur, + //MIN(left + g_conf.ebofs_max_prefetch, // prefetch + //on->object_blocks-cur), + left, // no prefetch + exv); + for (unsigned i=0; i 0; i++) { + BufferHead *n = new BufferHead(this); + n->set_start( cur ); + n->set_length( exv[i].length ); + bc->add_bh(n); + missing[cur] = n; + dout(20) << "map_read miss " << left << " left, " << *n << endl; + cur += MIN(left,exv[i].length); + left -= MIN(left,exv[i].length); + } + assert(left == 0); + assert(cur == start+len); + break; + } + + if (p->first <= cur) { + // have it (or part of it) + BufferHead *e = p->second; + + if (e->is_clean() || + e->is_dirty() || + e->is_tx()) { + hits[cur] = e; // readable! + dout(20) << "map_read hit " << *e << endl; + bc->touch(e); + } + else if (e->is_rx()) { + rx[cur] = e; // missing, not readable. + dout(20) << "map_read rx " << *e << endl; + } + else if (e->is_partial()) { + partial[cur] = e; + dout(20) << "map_read partial " << *e << endl; + } + else { + dout(0) << "map_read ??? " << *e << endl; + assert(0); + } + + block_t lenfromcur = MIN(e->end() - cur, left); + cur += lenfromcur; + left -= lenfromcur; + p++; + continue; // more? + } else if (p->first > cur) { + // gap.. miss + block_t next = p->first; + vector exv; + on->map_extents(cur, + //MIN(next-cur, MIN(left + g_conf.ebofs_max_prefetch, // prefetch + // on->object_blocks-cur)), + MIN(next-cur, left), // no prefetch + exv); + + for (unsigned i=0; i0; i++) { + BufferHead *n = new BufferHead(this); + n->set_start( cur ); + n->set_length( exv[i].length ); + bc->add_bh(n); + missing[cur] = n; + cur += MIN(left, n->length()); + left -= MIN(left, n->length()); + dout(20) << "map_read gap " << *n << endl; + } + continue; // more? + } + else + assert(0); + } + + assert(left == 0); + assert(cur == start+len); + return 0; +} + + +/* + * map a range of pages on an object's buffer cache. + * + * - break up bufferheads that don't fall completely within the range + * - cancel rx ops we obsolete. 
+ * - resubmit rx ops if we split bufferheads + * + * - leave potentially obsoleted tx ops alone (for now) + * - don't worry about disk extent boundaries (yet) + */ +int ObjectCache::map_write(block_t start, block_t len, + interval_set& alloc, + map& hits, + version_t super_epoch) +{ + map::iterator p = data.lower_bound(start); + + dout(10) << "map_write " << *on << " " << start << "~" << len << " ... alloc " << alloc << endl; + // p->first >= start + + block_t cur = start; + block_t left = len; + + if (p != data.begin() && + (p == data.end() || p->first > cur)) { + p--; // might overlap! + if (p->first + p->second->length() <= cur) + p++; // doesn't overlap. + } + + //dump(); + + while (left > 0) { + // max for this bh (bc of (re)alloc on disk) + block_t max = left; + bool newalloc = false; + + // based on alloc/no-alloc boundary ... + if (alloc.contains(cur, left)) { + if (alloc.contains(cur)) { + block_t ends = alloc.end_after(cur); + max = MIN(left, ends-cur); + newalloc = true; + } else { + if (alloc.starts_after(cur)) { + block_t st = alloc.start_after(cur); + max = MIN(left, st-cur); + } + } + } + + // based on disk extent boundary ... + vector exv; + on->map_extents(cur, max, exv); + if (exv.size() > 1) + max = exv[0].length; + + if (newalloc) { + dout(10) << "map_write " << cur << "~" << max << " is new alloc on disk" << endl; + } else { + dout(10) << "map_write " << cur << "~" << max << " keeps old alloc on disk" << endl; + } + + // at end? 
+ if (p == data.end()) { + BufferHead *n = new BufferHead(this); + n->set_start( cur ); + n->set_length( max ); + bc->add_bh(n); + hits[cur] = n; + left -= max; + cur += max; + continue; + } + + dout(10) << "p is " << *p->second << endl; + + + if (p->first <= cur) { + BufferHead *bh = p->second; + dout(10) << "map_write bh " << *bh << " intersected" << endl; + + if (p->first < cur) { + if (cur+max >= p->first+p->second->length()) { + // we want right bit (one splice) + if (bh->is_rx() && bc->bh_cancel_read(bh)) { + BufferHead *right = bc->split(bh, cur); + bc->bh_read(on, bh); // reread left bit + bh = right; + } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { + BufferHead *right = bc->split(bh, cur); + bc->bh_write(on, bh); // rewrite left bit + bh = right; + } else { + bh = bc->split(bh, cur); // just split it + } + p++; + assert(p->second == bh); + } else { + // we want middle bit (two splices) + if (bh->is_rx() && bc->bh_cancel_read(bh)) { + BufferHead *middle = bc->split(bh, cur); + bc->bh_read(on, bh); // reread left + p++; + assert(p->second == middle); + BufferHead *right = bc->split(middle, cur+max); + bc->bh_read(on, right); // reread right + bh = middle; + } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { + BufferHead *middle = bc->split(bh, cur); + bc->bh_write(on, bh); // redo left + p++; + assert(p->second == middle); + BufferHead *right = bc->split(middle, cur+max); + bc->bh_write(on, right); // redo right + bh = middle; + } else { + BufferHead *middle = bc->split(bh, cur); + p++; + assert(p->second == middle); + bc->split(middle, cur+max); + bh = middle; + } + } + } else if (p->first == cur) { + if (p->second->length() <= max) { + // whole bufferhead, piece of cake. 
+ } else { + // we want left bit (one splice) + if (bh->is_rx() && bc->bh_cancel_read(bh)) { + BufferHead *right = bc->split(bh, cur+max); + bc->bh_read(on, right); // re-rx the right bit + } else if (bh->is_tx() && !newalloc && bc->bh_cancel_write(bh, super_epoch)) { + BufferHead *right = bc->split(bh, cur+max); + bc->bh_write(on, right); // re-tx the right bit + } else { + bc->split(bh, cur+max); // just split + } + } + } + + // try to cancel tx? + if (bh->is_tx() && !newalloc) bc->bh_cancel_write(bh, super_epoch); + + // put in our map + hits[cur] = bh; + + // keep going. + block_t lenfromcur = bh->end() - cur; + cur += lenfromcur; + left -= lenfromcur; + p++; + continue; + } else { + // gap! + block_t next = p->first; + block_t glen = MIN(next-cur, max); + dout(10) << "map_write gap " << cur << "~" << glen << endl; + BufferHead *n = new BufferHead(this); + n->set_start( cur ); + n->set_length( glen ); + bc->add_bh(n); + hits[cur] = n; + + cur += glen; + left -= glen; + continue; // more? + } + } + + assert(left == 0); + assert(cur == start+len); + return 0; +} + +/* don't need this. +int ObjectCache::scan_versions(block_t start, block_t len, + version_t& low, version_t& high) +{ + map::iterator p = data.lower_bound(start); + // p->first >= start + + if (p != data.begin() && p->first > start) { + p--; // might overlap? + if (p->first + p->second->length() <= start) + p++; // doesn't overlap. + } + if (p->first >= start+len) + return -1; // to the right. no hits. + + // start + low = high = p->second->get_version(); + + for (p++; p != data.end(); p++) { + // past? 
+ if (p->first >= start+len) break; + + const version_t v = p->second->get_version(); + if (low > v) low = v; + if (high < v) high = v; + } + + return 0; +} +*/ + +void ObjectCache::truncate(block_t blocks, version_t super_epoch) +{ + dout(7) << "truncate " << object_id + << " " << blocks << " blocks" + << endl; + + while (!data.empty()) { + block_t bhoff = data.rbegin()->first; + BufferHead *bh = data.rbegin()->second; + + if (bh->end() <= blocks) break; + + bool uncom = on->uncommitted.contains(bh->start(), bh->length()); + dout(10) << "truncate " << *bh << " uncom " << uncom + << " of " << on->uncommitted + << endl; + + if (bhoff < blocks) { + // we want right bit (one splice) + if (bh->is_rx() && bc->bh_cancel_read(bh)) { + BufferHead *right = bc->split(bh, blocks); + bc->bh_read(on, bh); // reread left bit + bh = right; + } else if (bh->is_tx() && uncom && bc->bh_cancel_write(bh, super_epoch)) { + BufferHead *right = bc->split(bh, blocks); + bc->bh_write(on, bh); // rewrite left bit + bh = right; + } else { + bh = bc->split(bh, blocks); // just split it + } + // no worries about partials up here, they're always 1 block (and thus never split) + } else { + // whole thing + // cancel any pending/queued io, if possible. 
+ if (bh->is_rx()) + bc->bh_cancel_read(bh); + if (bh->is_tx() && uncom) + bc->bh_cancel_write(bh, super_epoch); + if (bh->shadow_of) { + dout(10) << "truncate " << *bh << " unshadowing " << *bh->shadow_of << endl; + // shadow + bh->shadow_of->remove_shadow(bh); + if (bh->is_partial()) + bc->cancel_shadow_partial(bh->rx_from.start, bh); + } else { + // normal + if (bh->is_partial() && uncom) + bc->bh_cancel_partial_write(bh); + } + } + + for (map >::iterator p = bh->waitfor_read.begin(); + p != bh->waitfor_read.end(); + p++) { + finish_contexts(p->second, -1); + } + + bc->remove_bh(bh); + delete bh; + } +} + + +void ObjectCache::clone_to(Onode *other) +{ + ObjectCache *ton = 0; + + for (map::iterator p = data.begin(); + p != data.end(); + p++) { + BufferHead *bh = p->second; + dout(10) << "clone_to ? " << *bh << endl; + if (bh->is_dirty() || bh->is_tx() || bh->is_partial()) { + // dup dirty or tx bh's + if (!ton) + ton = other->get_oc(bc); + BufferHead *nbh = new BufferHead(ton); + nbh->set_start( bh->start() ); + nbh->set_length( bh->length() ); + nbh->data = bh->data; // just copy refs to underlying buffers. + bc->add_bh(nbh); + + if (bh->is_partial()) { + dout(0) << "clone_to PARTIAL FIXME NOT FULLY IMPLEMENTED ******" << endl; + nbh->partial = bh->partial; + bc->mark_partial(nbh); + // register as shadow_partial + bc->add_shadow_partial(bh->rx_from.start, nbh); + } else { + // clean buffer will shadow + bh->add_shadow(nbh); + bc->mark_clean(nbh); + } + + dout(10) << "clone_to dup " << *bh << " -> " << *nbh << endl; + } + } +} + + + +/************** BufferCache ***************/ + +#undef dout +#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs.bc." 
+ + + +BufferHead *BufferCache::split(BufferHead *orig, block_t after) +{ + dout(20) << "split " << *orig << " at " << after << endl; + + // split off right + BufferHead *right = new BufferHead(orig->get_oc()); + right->set_version(orig->get_version()); + right->epoch_modified = orig->epoch_modified; + right->last_flushed = orig->last_flushed; + right->set_state(orig->get_state()); + + block_t newleftlen = after - orig->start(); + right->set_start( after ); + right->set_length( orig->length() - newleftlen ); + + // shorten left + stat_sub(orig); + orig->set_length( newleftlen ); + stat_add(orig); + + // add right + add_bh(right); + + // adjust rx_from + if (orig->is_rx()) { + right->rx_from = orig->rx_from; + orig->rx_from.length = newleftlen; + right->rx_from.length -= newleftlen; + right->rx_from.start += newleftlen; + } + + // dup shadows + for (set::iterator p = orig->shadows.begin(); + p != orig->shadows.end(); + ++p) + right->add_shadow(*p); + + // split buffers too + bufferlist bl; + bl.claim(orig->data); + if (bl.length()) { + assert(bl.length() == (orig->length()+right->length())*EBOFS_BLOCK_SIZE); + right->data.substr_of(bl, orig->length()*EBOFS_BLOCK_SIZE, right->length()*EBOFS_BLOCK_SIZE); + orig->data.substr_of(bl, 0, orig->length()*EBOFS_BLOCK_SIZE); + } + + // move read waiters + if (!orig->waitfor_read.empty()) { + map >::iterator o, p = orig->waitfor_read.end(); + p--; + while (p != orig->waitfor_read.begin()) { + if (p->first < right->start()) break; + dout(0) << "split moving waiters at block " << p->first << " to right bh" << endl; + right->waitfor_read[p->first].swap( p->second ); + o = p; + p--; + orig->waitfor_read.erase(o); + } + } + + dout(20) << "split left is " << *orig << endl; + dout(20) << "split right is " << *right << endl; + return right; +} + + +void BufferCache::bh_read(Onode *on, BufferHead *bh, block_t from) +{ + dout(10) << "bh_read " << *on << " on " << *bh << endl; + + if (bh->is_missing()) { + mark_rx(bh); + } else { + 
assert(bh->is_partial()); + } + + // get extent. there should be only one! + vector exv; + on->map_extents(bh->start(), bh->length(), exv); + assert(exv.size() == 1); + Extent ex = exv[0]; + + if (from) { // force behavior, used for reading partials + dout(10) << "bh_read forcing read from block " << from << " (for a partial)" << endl; + ex.start = from; + ex.length = 1; + } + + // this should be empty!! + assert(bh->rx_ioh == 0); + + dout(20) << "bh_read " << *bh << " from " << ex << endl; + + C_OC_RxFinish *fin = new C_OC_RxFinish(ebofs_lock, on->oc, + bh->start(), bh->length(), + ex.start); + + //bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), fin->bl); // new buffers! + fin->bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); + + bh->rx_ioh = dev.read(ex.start, ex.length, fin->bl, + fin); + bh->rx_from = ex; + on->oc->get(); + +} + +bool BufferCache::bh_cancel_read(BufferHead *bh) +{ + if (bh->rx_ioh && dev.cancel_io(bh->rx_ioh) >= 0) { + dout(10) << "bh_cancel_read on " << *bh << endl; + bh->rx_ioh = 0; + mark_missing(bh); + int l = bh->oc->put(); + assert(l); + return true; + } + return false; +} + +void BufferCache::bh_write(Onode *on, BufferHead *bh, block_t shouldbe) +{ + dout(10) << "bh_write " << *on << " on " << *bh << " in epoch " << bh->epoch_modified << endl; + assert(bh->get_version() > 0); + + assert(bh->is_dirty()); + mark_tx(bh); + + // get extents + vector exv; + on->map_extents(bh->start(), bh->length(), exv); + assert(exv.size() == 1); + Extent ex = exv[0]; + + if (shouldbe) + assert(ex.length == 1 && ex.start == shouldbe); + + dout(20) << "bh_write " << *bh << " to " << ex << endl; + + //assert(bh->tx_ioh == 0); + + assert(bh->get_last_flushed() < bh->get_version()); + + bh->tx_block = ex.start; + bh->tx_ioh = dev.write(ex.start, ex.length, bh->data, + new C_OC_TxFinish(ebofs_lock, on->oc, + bh->start(), bh->length(), + bh->get_version(), + bh->epoch_modified), + "bh_write"); + + on->oc->get(); + inc_unflushed( 
EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); + + /* + // assert: no partials on the same block + // hose any partial on the same block + if (bh->partial_write.count(ex.start)) { + dout(10) << "bh_write hosing parital write on same block " << ex.start << " " << *bh << endl; + dec_unflushed( bh->partial_write[ex.start].epoch ); + bh->partial_write.erase(ex.start); + } + */ +} + + +bool BufferCache::bh_cancel_write(BufferHead *bh, version_t cur_epoch) +{ + if (bh->tx_ioh && dev.cancel_io(bh->tx_ioh) >= 0) { + dout(10) << "bh_cancel_write on " << *bh << endl; + bh->tx_ioh = 0; + mark_dirty(bh); + + assert(bh->epoch_modified == cur_epoch); + assert(bh->epoch_modified > 0); + dec_unflushed( EBOFS_BC_FLUSH_BHWRITE, bh->epoch_modified ); // assert.. this should be the same epoch! + + int l = bh->oc->put(); + assert(l); + return true; + } + return false; +} + +void BufferCache::tx_finish(ObjectCache *oc, + ioh_t ioh, block_t start, block_t length, + version_t version, version_t epoch) +{ + ebofs_lock.Lock(); + + // finish oc + if (oc->put() == 0) { + delete oc; + } else + oc->tx_finish(ioh, start, length, version, epoch); + + // update unflushed counter + assert(get_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch) > 0); + dec_unflushed(EBOFS_BC_FLUSH_BHWRITE, epoch); + + ebofs_lock.Unlock(); +} + +void BufferCache::rx_finish(ObjectCache *oc, + ioh_t ioh, block_t start, block_t length, + block_t diskstart, + bufferlist& bl) +{ + ebofs_lock.Lock(); + dout(10) << "rx_finish ioh " << ioh << " on " << start << "~" << length + << ", at device block " << diskstart << endl; + + // oc + if (oc->put() == 0) + delete oc; + else + oc->rx_finish(ioh, start, length, bl); + + // finish any partials? 
+ // note: these are partials that were re-written after a commit, + // or for whom the OC was destroyed (eg truncated after a commit) + map >::iterator sp = partial_write.lower_bound(diskstart); + while (sp != partial_write.end()) { + if (sp->first >= diskstart+length) break; + assert(sp->first >= diskstart); + + block_t pblock = sp->first; + map writes; + writes.swap( sp->second ); + + map >::iterator t = sp; + sp++; + partial_write.erase(t); + + for (map::iterator p = writes.begin(); + p != writes.end(); + p++) { + dout(10) << "rx_finish partial from " << pblock << " -> " << p->first + << " for epoch " << p->second.epoch + //<< " (bh.epoch_modified is now " << bh->epoch_modified << ")" + << endl; + // this had better be a past epoch + //assert(p->epoch == epoch_modified - 1); // ?? + + // make the combined block + bufferlist combined; + bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); + combined.push_back( bp ); + combined.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, bl); + BufferHead::apply_partial( combined, p->second.partial ); + + // write it! + dev.write( pblock, 1, combined, + new C_OC_PartialTxFinish( this, p->second.epoch ), + "finish_partials"); + } + } + + // shadow partials? 
+ { + list waiters; + map >::iterator sp = shadow_partials.lower_bound(diskstart); + while (sp != shadow_partials.end()) { + if (sp->first >= diskstart+length) break; + assert(sp->first >= diskstart); + + block_t pblock = sp->first; + set ls; + ls.swap( sp->second ); + + map >::iterator t = sp; + sp++; + shadow_partials.erase(t); + + for (set::iterator p = ls.begin(); + p != ls.end(); + ++p) { + BufferHead *bh = *p; + dout(10) << "rx_finish applying shadow_partial for " << pblock + << " to " << *bh << endl; + bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); + bh->data.clear(); + bh->data.push_back( bp ); + bh->data.copy_in((pblock-diskstart)*EBOFS_BLOCK_SIZE, + (pblock-diskstart+1)*EBOFS_BLOCK_SIZE, + bl); + bh->apply_partial(); + bh->set_state(BufferHead::STATE_CLEAN); + + // trigger waiters + for (map >::iterator p = bh->waitfor_read.begin(); + p != bh->waitfor_read.end(); + p++) { + assert(p->first >= bh->start() && p->first < bh->end()); + waiters.splice(waiters.begin(), p->second); + } + bh->waitfor_read.clear(); + } + } + + // kick waiters + finish_contexts(waiters); + } + + // done. 
+ ebofs_lock.Unlock(); +} + +void BufferCache::partial_tx_finish(version_t epoch) +{ + ebofs_lock.Lock(); + + dout(10) << "partial_tx_finish in epoch " << epoch << endl; + + // update unflushed counter + assert(get_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch) > 0); + dec_unflushed(EBOFS_BC_FLUSH_PARTIAL, epoch); + + ebofs_lock.Unlock(); +} + + + + +void BufferCache::bh_queue_partial_write(Onode *on, BufferHead *bh) +{ + assert(bh->get_version() > 0); + + assert(bh->is_partial()); + assert(bh->length() == 1); + + // get the block no + vector exv; + on->map_extents(bh->start(), bh->length(), exv); + assert(exv.size() == 1); + block_t b = exv[0].start; + assert(exv[0].length == 1); + bh->partial_tx_to = exv[0].start; + bh->partial_tx_epoch = bh->epoch_modified; + + dout(10) << "bh_queue_partial_write " << *on << " on " << *bh << " block " << b << " epoch " << bh->epoch_modified << endl; + + + // copy map state, queue for this block + assert(bh->rx_from.length == 1); + queue_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial, bh->partial_tx_epoch ); +} + +void BufferCache::bh_cancel_partial_write(BufferHead *bh) +{ + assert(bh->is_partial()); + assert(bh->length() == 1); + + cancel_partial( bh->rx_from.start, bh->partial_tx_to, bh->partial_tx_epoch ); +} + + +void BufferCache::queue_partial(block_t from, block_t to, + map& partial, version_t epoch) +{ + dout(10) << "queue_partial " << from << " -> " << to + << " in epoch " << epoch + << endl; + + if (partial_write[from].count(to)) { + // this should be in the same epoch. + assert( partial_write[from][to].epoch == epoch); + assert(0); // actually.. no! 
+ } else { + inc_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); + } + + partial_write[from][to].partial = partial; + partial_write[from][to].epoch = epoch; +} + +void BufferCache::cancel_partial(block_t from, block_t to, version_t epoch) +{ + assert(partial_write.count(from)); + assert(partial_write[from].count(to)); + assert(partial_write[from][to].epoch == epoch); + + dout(10) << "cancel_partial " << from << " -> " << to + << " (was epoch " << partial_write[from][to].epoch << ")" + << endl; + + partial_write[from].erase(to); + if (partial_write[from].empty()) + partial_write.erase(from); + + dec_unflushed( EBOFS_BC_FLUSH_PARTIAL, epoch ); +} + + +void BufferCache::add_shadow_partial(block_t from, BufferHead *bh) +{ + dout(10) << "add_shadow_partial from " << from << " " << *bh << endl; + shadow_partials[from].insert(bh); +} + +void BufferCache::cancel_shadow_partial(block_t from, BufferHead *bh) +{ + dout(10) << "cancel_shadow_partial from " << from << " " << *bh << endl; + shadow_partials[from].erase(bh); +} diff --git a/branches/sage/cephmds2/ebofs/BufferCache.h b/branches/sage/cephmds2/ebofs/BufferCache.h new file mode 100644 index 0000000000000..922c5e531ee56 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/BufferCache.h @@ -0,0 +1,681 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __EBOFS_BUFFERCACHE_H +#define __EBOFS_BUFFERCACHE_H + +#include "include/lru.h" +#include "include/Context.h" + +#include "common/Clock.h" + +#include "types.h" +#include "BlockDevice.h" + +#include "include/interval_set.h" + +class ObjectCache; +class BufferCache; +class Onode; + +class BufferHead : public LRUObject { + public: + /* + * - buffer_heads should always break across disk extent boundaries + * - partial buffer_heads are always 1 block. + */ + const static int STATE_MISSING = 0; // missing; data is on disk, but not loaded. + const static int STATE_CLEAN = 1; // Rw clean + const static int STATE_DIRTY = 2; // RW dirty + const static int STATE_TX = 3; // Rw flushing to disk + const static int STATE_RX = 4; // w reading from disk + const static int STATE_PARTIAL = 5; // reading from disk, + partial content map. always 1 block. + + public: + ObjectCache *oc; + + bufferlist data; + + ioh_t rx_ioh; // + Extent rx_from; + ioh_t tx_ioh; // + block_t tx_block; + block_t partial_tx_to; + version_t partial_tx_epoch; + + map partial; // partial dirty content overlayed onto incoming data + + map< block_t, list > waitfor_read; + + set shadows; // shadow bh's that clone()ed me. 
+ BufferHead* shadow_of; + + private: + int ref; + int state; + + public: + version_t epoch_modified; + + version_t version; // current version in cache + version_t last_flushed; // last version flushed to disk + + Extent object_loc; // block position _in_object_ + + utime_t dirty_stamp; + + public: + BufferHead(ObjectCache *o) : + oc(o), //cancellable_ioh(0), tx_epoch(0), + rx_ioh(0), tx_ioh(0), tx_block(0), partial_tx_to(0), partial_tx_epoch(0), + shadow_of(0), + ref(0), state(STATE_MISSING), epoch_modified(0), version(0), last_flushed(0) + {} + ~BufferHead() { + unpin_shadows(); + } + + ObjectCache *get_oc() { return oc; } + + int get() { + assert(ref >= 0); + if (ref == 0) lru_pin(); + return ++ref; + } + int put() { + assert(ref > 0); + if (ref == 1) lru_unpin(); + --ref; + return ref; + } + + block_t start() { return object_loc.start; } + void set_start(block_t s) { object_loc.start = s; } + block_t length() { return object_loc.length; } + void set_length(block_t l) { object_loc.length = l; } + block_t end() { return start() + length(); } + block_t last() { return end()-1; } + + version_t get_version() { return version; } + void set_version(version_t v) { version = v; } + version_t get_last_flushed() { return last_flushed; } + void set_last_flushed(version_t v) { + if (v <= last_flushed) cout << "last_flushed begin set to " << v << ", was " << last_flushed << endl; + assert(v > last_flushed); + last_flushed = v; + } + + utime_t get_dirty_stamp() { return dirty_stamp; } + void set_dirty_stamp(utime_t t) { dirty_stamp = t; } + + void set_state(int s) { + if (s == STATE_PARTIAL || s == STATE_RX || s == STATE_TX) get(); + if (state == STATE_PARTIAL || state == STATE_RX || state == STATE_TX) put(); + + if ((state == STATE_TX && s != STATE_TX) || + (state == STATE_PARTIAL && s != STATE_PARTIAL)) + unpin_shadows(); + + state = s; + } + int get_state() { return state; } + + bool is_missing() { return state == STATE_MISSING; } + bool is_dirty() { return state == 
STATE_DIRTY; } + bool is_clean() { return state == STATE_CLEAN; } + bool is_tx() { return state == STATE_TX; } + bool is_rx() { return state == STATE_RX; } + bool is_partial() { return state == STATE_PARTIAL; } + + //bool is_partial_writes() { return !partial_write.empty(); } + //void finish_partials(); + //void cancel_partials(); + //void queue_partial_write(block_t b); + + void add_shadow(BufferHead *dup) { + shadows.insert(dup); + dup->shadow_of = this; + dup->get(); + } + void remove_shadow(BufferHead *dup) { + shadows.erase(dup); + dup->shadow_of = 0; + dup->put(); + } + void unpin_shadows() { + for (set::iterator p = shadows.begin(); + p != shadows.end(); + ++p) { + //cout << "unpin shadow " << *p << endl; + (*p)->shadow_of = 0; + (*p)->put(); + } + shadows.clear(); + } + + void copy_partial_substr(off_t start, off_t end, bufferlist& bl) { + map::iterator i = partial.begin(); + + // skip first bits (fully to left) + while ((i->first + i->second.length() < start) && + i != partial.end()) + i++; + assert(i != partial.end()); + assert(i->first <= start); + + // first + unsigned bhoff = MAX(start, i->first) - i->first; + unsigned bhlen = MIN(end-start, i->second.length()); + bl.substr_of( i->second, bhoff, bhlen ); + + off_t pos = i->first + i->second.length(); + + // have continuous to end? + for (i++; i != partial.end(); i++) { + if (pos >= end) break; + assert(pos == i->first); + + pos = i->first + i->second.length(); + + if (pos <= end) { // this whole frag + bl.append( i->second ); + } else { // partial end + unsigned bhlen = end-start-bl.length(); + bufferlist frag; + frag.substr_of( i->second, 0, bhlen ); + bl.claim_append(frag); + break; // done. 
+ } + } + + assert(pos >= end); + assert(bl.length() == (unsigned)(end-start)); + } + + bool have_partial_range(off_t start, off_t end) { + map::iterator i = partial.begin(); + + // skip first bits (fully to left) + while ((i->first + i->second.length() < start) && + i != partial.end()) + i++; + if (i == partial.end()) return false; + + // have start? + if (i->first > start) return false; + off_t pos = i->first + i->second.length(); + + // have continuous to end? + for (i++; i != partial.end(); i++) { + assert(pos <= i->first); + if (pos < i->first) return false; + assert(pos == i->first); + pos = i->first + i->second.length(); + if (pos >= end) break; // gone far enough + } + + if (pos >= end) return true; + return false; + } + + bool partial_is_complete(off_t size) { + return have_partial_range( 0, MIN(size, EBOFS_BLOCK_SIZE) ); + //(off_t)(start()*EBOFS_BLOCK_SIZE), + //MIN( size, (off_t)(end()*EBOFS_BLOCK_SIZE) ) ); + } + void apply_partial() { + apply_partial(data, partial); + partial.clear(); + } + static void apply_partial(bufferlist& bl, map& pm) { + assert(bl.length() == (unsigned)EBOFS_BLOCK_SIZE); + //assert(partial_is_complete()); + //cout << "apply_partial" << endl; + for (map::iterator i = pm.begin(); + i != pm.end(); + i++) { + int pos = i->first; + //cout << " frag at opos " << i->first << " bhpos " << pos << " len " << i->second.length() << endl; + bl.copy_in(pos, i->second.length(), i->second); + } + pm.clear(); + } + void add_partial(off_t off, bufferlist& p) { + unsigned len = p.length(); + assert(len <= (unsigned)EBOFS_BLOCK_SIZE); + //assert(off >= (off_t)(start()*EBOFS_BLOCK_SIZE)); + //assert(off + len <= (off_t)(end()*EBOFS_BLOCK_SIZE)); + assert(off >= 0); + assert(off + len <= EBOFS_BLOCK_SIZE); + + // trim any existing that overlaps + for (map::iterator i = partial.begin(); + i != partial.end(); + ) { + if (i->first + i->second.length() <= off) { // before + i++; + continue; + } + if (i->first >= off+len) break; // past affected area. 
+ + // overlap all? + if (off <= i->first && i->first + i->second.length() <= off+len) { + // erase it and move on. + off_t dead = i->first; + i++; + partial.erase(dead); + continue; + } + // overlap tail? + else if (i->first < off && off < i->first + i->second.length()) { + // shorten. + unsigned newlen = off - i->first; + bufferlist o; + o.claim( i->second ); + i->second.substr_of(o, 0, newlen); + i++; + continue; + } + // overlap head? + else if (off < i->first && off+len < i->first + i->second.length()) { + // move. + off_t oldoff = i->first; + off_t newoff = off+len; + unsigned trim = newoff - oldoff; + partial[newoff].substr_of(i->second, trim, i->second.length()-trim); + i++; // should be at newoff! + partial.erase( oldoff ); + i++; + continue; + } else + assert(0); + } + + // insert + partial[off] = p; + } + + +}; + +inline ostream& operator<<(ostream& out, BufferHead& bh) +{ + out << "bufferhead(" << bh.start() << "~" << bh.length(); + out << " v" << bh.get_version() << "/" << bh.get_last_flushed(); + if (bh.is_missing()) out << " missing"; + if (bh.is_dirty()) out << " dirty"; + if (bh.is_clean()) out << " clean"; + if (bh.is_rx()) out << " rx"; + if (bh.is_tx()) out << " tx"; + if (bh.is_partial()) out << " partial"; + //out << " " << bh.data.length(); + out << " " << &bh; + out << ")"; + return out; +} + + +class ObjectCache { + public: + object_t object_id; + Onode *on; + BufferCache *bc; + + private: + map data; + int ref; + + public: + version_t write_count; + + + public: + ObjectCache(object_t o, Onode *_on, BufferCache *b) : + object_id(o), on(_on), bc(b), ref(0), + write_count(0) { } + ~ObjectCache() { + assert(data.empty()); + assert(ref == 0); + } + + int get() { + ++ref; + //cout << "oc.get " << object_id << " " << ref << endl; + return ref; + } + int put() { + assert(ref > 0); + --ref; + //cout << "oc.put " << object_id << " " << ref << endl; + return ref; + } + + object_t get_object_id() { return object_id; } + + void add_bh(BufferHead *bh) { 
+ // add to my map + assert(data.count(bh->start()) == 0); + + if (0) { // sanity check FIXME DEBUG + //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; + map::iterator p = data.lower_bound(bh->start()); + if (p != data.end()) { + //cout << " after " << *p->second << endl; + //cout << " after starts at " << p->first << endl; + assert(p->first >= bh->end()); + } + if (p != data.begin()) { + p--; + //cout << " before starts at " << p->second->start() + //<< " and ends at " << p->second->end() << endl; + //cout << " before " << *p->second << endl; + assert(p->second->end() <= bh->start()); + } + } + + data[bh->start()] = bh; + } + void remove_bh(BufferHead *bh) { + assert(data.count(bh->start())); + data.erase(bh->start()); + } + bool is_empty() { return data.empty(); } + + int find_tx(block_t start, block_t len, + list& tx); + + int map_read(block_t start, block_t len, + map& hits, // hits + map& missing, // read these from disk + map& rx, // wait for these to finish reading from disk + map& partial); // (maybe) wait for these to read from disk + + int map_write(block_t start, block_t len, + interval_set& alloc, + map& hits, + version_t super_epoch); // can write to these. 
+ + BufferHead *split(BufferHead *bh, block_t off); + + /*int scan_versions(block_t start, block_t len, + version_t& low, version_t& high); + */ + + void rx_finish(ioh_t ioh, block_t start, block_t length, bufferlist& bl); + void tx_finish(ioh_t ioh, block_t start, block_t length, version_t v, version_t epoch); + + void truncate(block_t blocks, version_t super_epoch); + // void tear_down(); + + void clone_to(Onode *other); + + void dump() { + for (map::iterator i = data.begin(); + i != data.end(); + i++) + cout << "dump: " << i->first << ": " << *i->second << endl; + } + +}; + + + +class BufferCache { + public: + Mutex &ebofs_lock; // hack: this is a ref to global ebofs_lock + BlockDevice &dev; + + set dirty_bh; + + LRU lru_dirty, lru_rest; + + private: + Cond stat_cond; + Cond flush_cond; + int stat_waiter; + + off_t stat_clean; + off_t stat_dirty; + off_t stat_rx; + off_t stat_tx; + off_t stat_partial; + off_t stat_missing; + +#define EBOFS_BC_FLUSH_BHWRITE 0 +#define EBOFS_BC_FLUSH_PARTIAL 1 + + map epoch_unflushed[2]; + + /* partial writes - incomplete blocks that can't be written until + * their prior content is read and overlayed with the new data. + * + * we put partial block management here because objects may be deleted + * before the read completes, but the write may have been committed in a + * prior epoch. + * + * we map: src block -> dest block -> PartialWrite + * + * really, at most there will only ever be two of these, for current+previous epochs. 
+ */ + class PartialWrite { + public: + map partial; // partial dirty content overlayed onto incoming data + version_t epoch; + }; + + map > partial_write; // queued writes w/ partial content + map > shadow_partials; + + public: + BufferCache(BlockDevice& d, Mutex& el) : + ebofs_lock(el), dev(d), + stat_waiter(0), + stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_partial(0), stat_missing(0) + {} + + + off_t get_size() { + return stat_clean+stat_dirty+stat_rx+stat_tx+stat_partial; + } + off_t get_trimmable() { + return stat_clean; + } + + + // bh's in cache + void add_bh(BufferHead *bh) { + bh->get_oc()->add_bh(bh); + if (bh->is_dirty()) { + lru_dirty.lru_insert_mid(bh); + dirty_bh.insert(bh); + } else + lru_rest.lru_insert_mid(bh); + stat_add(bh); + } + void touch(BufferHead *bh) { + if (bh->is_dirty()) { + lru_dirty.lru_touch(bh); + } else + lru_rest.lru_touch(bh); + } + void remove_bh(BufferHead *bh) { + bh->get_oc()->remove_bh(bh); + stat_sub(bh); + if (bh->is_dirty()) { + lru_dirty.lru_remove(bh); + dirty_bh.erase(bh); + } else + lru_rest.lru_remove(bh); + } + + // stats + void stat_add(BufferHead *bh) { + switch (bh->get_state()) { + case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; + case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; + case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; + case BufferHead::STATE_TX: stat_tx += bh->length(); break; + case BufferHead::STATE_RX: stat_rx += bh->length(); break; + case BufferHead::STATE_PARTIAL: stat_partial += bh->length(); break; + } + if (stat_waiter) stat_cond.Signal(); + } + void stat_sub(BufferHead *bh) { + switch (bh->get_state()) { + case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; + case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; + case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; + case BufferHead::STATE_TX: stat_tx -= bh->length(); break; + case BufferHead::STATE_RX: stat_rx -= bh->length(); 
break; + case BufferHead::STATE_PARTIAL: stat_partial -= bh->length(); break; + } + } + off_t get_stat_tx() { return stat_tx; } + off_t get_stat_rx() { return stat_rx; } + off_t get_stat_dirty() { return stat_dirty; } + off_t get_stat_clean() { return stat_clean; } + off_t get_stat_partial() { return stat_partial; } + + + map &get_unflushed(int what) { + return epoch_unflushed[what]; + } + + int get_unflushed(int what, version_t epoch) { + return epoch_unflushed[what][epoch]; + } + void inc_unflushed(int what, version_t epoch) { + epoch_unflushed[what][epoch]++; + //cout << "inc_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl; + } + void dec_unflushed(int what, version_t epoch) { + epoch_unflushed[what][epoch]--; + //cout << "dec_unflushed " << epoch << " now " << epoch_unflushed[epoch] << endl; + if (epoch_unflushed[what][epoch] == 0) + flush_cond.Signal(); + } + + void waitfor_stat() { + stat_waiter++; + stat_cond.Wait(ebofs_lock); + stat_waiter--; + } + void waitfor_flush() { + flush_cond.Wait(ebofs_lock); + } + + + // bh state + void set_state(BufferHead *bh, int s) { + // move between lru lists? 
+ if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { + lru_rest.lru_remove(bh); + lru_dirty.lru_insert_top(bh); + dirty_bh.insert(bh); + } + if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { + lru_dirty.lru_remove(bh); + lru_rest.lru_insert_mid(bh); + dirty_bh.erase(bh); + } + + // set state + stat_sub(bh); + bh->set_state(s); + stat_add(bh); + } + + void copy_state(BufferHead *bh1, BufferHead *bh2) { + set_state(bh2, bh1->get_state()); + } + + void mark_missing(BufferHead *bh) { set_state(bh, BufferHead::STATE_MISSING); }; + void mark_clean(BufferHead *bh) { set_state(bh, BufferHead::STATE_CLEAN); }; + void mark_rx(BufferHead *bh) { set_state(bh, BufferHead::STATE_RX); }; + void mark_partial(BufferHead *bh) { set_state(bh, BufferHead::STATE_PARTIAL); }; + void mark_tx(BufferHead *bh) { set_state(bh, BufferHead::STATE_TX); }; + void mark_dirty(BufferHead *bh) { + set_state(bh, BufferHead::STATE_DIRTY); + bh->set_dirty_stamp(g_clock.now()); + }; + + + // io + void bh_read(Onode *on, BufferHead *bh, block_t from=0); + void bh_write(Onode *on, BufferHead *bh, block_t shouldbe=0); + + bool bh_cancel_read(BufferHead *bh); + bool bh_cancel_write(BufferHead *bh, version_t cur_epoch); + + void bh_queue_partial_write(Onode *on, BufferHead *bh); + void bh_cancel_partial_write(BufferHead *bh); + + void queue_partial(block_t from, block_t to, map& partial, version_t epoch); + void cancel_partial(block_t from, block_t to, version_t epoch); + + void add_shadow_partial(block_t from, BufferHead *bh); + void cancel_shadow_partial(block_t from, BufferHead *bh); + + void rx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, block_t diskstart, bufferlist& bl); + void tx_finish(ObjectCache *oc, ioh_t ioh, block_t start, block_t len, version_t v, version_t e); + void partial_tx_finish(version_t epoch); + + friend class C_E_FlushPartial; + + // bh fun + BufferHead *split(BufferHead *orig, block_t after); +}; + + 
+class C_OC_RxFinish : public BlockDevice::callback { + Mutex &lock; + ObjectCache *oc; + block_t start, length; + block_t diskstart; +public: + bufferlist bl; + C_OC_RxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, block_t ds) : + lock(m), oc(o), start(s), length(l), diskstart(ds) {} + void finish(ioh_t ioh, int r) { + oc->bc->rx_finish(oc, ioh, start, length, diskstart, bl); + } +}; + +class C_OC_TxFinish : public BlockDevice::callback { + Mutex &lock; + ObjectCache *oc; + block_t start, length; + version_t version; + version_t epoch; + public: + C_OC_TxFinish(Mutex &m, ObjectCache *o, block_t s, block_t l, version_t v, version_t e) : + lock(m), oc(o), start(s), length(l), version(v), epoch(e) {} + void finish(ioh_t ioh, int r) { + oc->bc->tx_finish(oc, ioh, start, length, version, epoch); + } +}; + +class C_OC_PartialTxFinish : public BlockDevice::callback { + BufferCache *bc; + version_t epoch; +public: + C_OC_PartialTxFinish(BufferCache *b, version_t e) : + bc(b), epoch(e) {} + void finish(ioh_t ioh, int r) { + bc->partial_tx_finish(epoch); + } +}; + + +#endif diff --git a/branches/sage/cephmds2/ebofs/Cnode.h b/branches/sage/cephmds2/ebofs/Cnode.h new file mode 100644 index 0000000000000..b906a6db24c57 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/Cnode.h @@ -0,0 +1,100 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __EBOFS_CNODE_H +#define __EBOFS_CNODE_H + +#include "Onode.h" + +/* + * collection node + * + * holds attribute metadata for collections. + * colletion membership is stored in b+tree tables, independent of tte cnode. 
+ */ + +class Cnode : public LRUObject +{ + private: + int ref; + bool dirty; + + public: + coll_t coll_id; + Extent cnode_loc; + + map attr; + + public: + Cnode(coll_t cid) : ref(0), dirty(false), coll_id(cid) { + cnode_loc.length = 0; + } + ~Cnode() { + } + + block_t get_cnode_id() { return cnode_loc.start; } + int get_cnode_len() { return cnode_loc.length; } + + void get() { + if (ref == 0) lru_pin(); + ref++; + } + void put() { + ref--; + if (ref == 0) lru_unpin(); + } + int get_ref_count() { return ref; } + + void mark_dirty() { + if (!dirty) { + dirty = true; + get(); + } + } + void mark_clean() { + if (dirty) { + dirty = false; + put(); + } + } + bool is_dirty() { return dirty; } + + + int get_attr_bytes() { + int s = 0; + for (map::iterator i = attr.begin(); + i != attr.end(); + i++) { + s += i->first.length() + 1; + s += i->second.length() + sizeof(int); + } + return s; + } + + // + //???void clear(); + + +}; + +inline ostream& operator<<(ostream& out, Cnode& cn) +{ + out << "cnode(" << hex << cn.coll_id << dec; + if (cn.is_dirty()) out << " dirty"; + //out << " " << &cn; + out << ")"; + return out; +} + +#endif diff --git a/branches/sage/cephmds2/ebofs/Ebofs.cc b/branches/sage/cephmds2/ebofs/Ebofs.cc new file mode 100644 index 0000000000000..520a9c7a00e92 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/Ebofs.cc @@ -0,0 +1,3169 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "Ebofs.h" + +#include +#include + +// ******************* + +#undef dout +#define dout(x) if (x <= g_conf.debug_ebofs) cout << "ebofs(" << dev.get_device_name() << ")." 
+#define derr(x) if (x <= g_conf.debug_ebofs) cerr << "ebofs(" << dev.get_device_name() << ")." + +char *nice_blocks(block_t b) +{ + static char s[20]; + float sz = b*4.0; + if (sz > (10 << 20)) + sprintf(s,"%.1f GB", sz / (1024.0*1024.0)); + else if (sz > (10 << 10)) + sprintf(s,"%.1f MB", sz / (1024.0)); + else + sprintf(s,"%llu KB", b*4ULL); + return s; +} + +int Ebofs::mount() +{ + ebofs_lock.Lock(); + assert(!mounted); + + int r = dev.open(&idle_kicker); + if (r < 0) { + ebofs_lock.Unlock(); + return r; + } + + dout(2) << "mounting " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; + + // read super + bufferptr bp1 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); + bufferptr bp2 = buffer::create_page_aligned(EBOFS_BLOCK_SIZE); + dev.read(0, 1, bp1); + dev.read(1, 1, bp2); + + struct ebofs_super *sb1 = (struct ebofs_super*)bp1.c_str(); + struct ebofs_super *sb2 = (struct ebofs_super*)bp2.c_str(); + dout(3) << "mount super @0 epoch " << sb1->epoch << endl; + dout(3) << "mount super @1 epoch " << sb2->epoch << endl; + + // pick newest super + struct ebofs_super *sb = 0; + if (sb1->epoch > sb2->epoch) + sb = sb1; + else + sb = sb2; + super_epoch = sb->epoch; + dout(3) << "mount epoch " << super_epoch << endl; + assert(super_epoch == sb->epoch); + + free_blocks = sb->free_blocks; + limbo_blocks = sb->limbo_blocks; + + // init node pools + dout(3) << "mount nodepool" << endl; + nodepool.init( &sb->nodepool ); + nodepool.read_usemap( dev, super_epoch ); + nodepool.read_clean_nodes( dev ); + + // open tables + dout(3) << "mount opening tables" << endl; + object_tab = new Table( nodepool, sb->object_tab ); + for (int i=0; i( nodepool, sb->free_tab[i] ); + limbo_tab = new Table( nodepool, sb->limbo_tab ); + alloc_tab = new Table >( nodepool, sb->alloc_tab ); + + collection_tab = new Table( nodepool, sb->collection_tab ); + co_tab = new Table( nodepool, sb->co_tab ); + + allocator.release_limbo(); + + 
dout(3) << "mount starting commit+finisher threads" << endl; + commit_thread.create(); + finisher_thread.create(); + + dout(1) << "mounted " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; + mounted = true; + + ebofs_lock.Unlock(); + return 0; +} + + +int Ebofs::mkfs() +{ + ebofs_lock.Lock(); + assert(!mounted); + + int r = dev.open(); + if (r < 0) { + ebofs_lock.Unlock(); + return r; + } + + block_t num_blocks = dev.get_num_blocks(); + + free_blocks = 0; + limbo_blocks = 0; + + // create first noderegion + Extent nr; + nr.start = 2; + nr.length = 20+ (num_blocks / 1000); + if (nr.length < 10) nr.length = 10; + nodepool.add_region(nr); + dout(10) << "mkfs: first node region at " << nr << endl; + + // allocate two usemaps + block_t usemap_len = nodepool.get_usemap_len(); + nodepool.usemap_even.start = nr.end(); + nodepool.usemap_even.length = usemap_len; + nodepool.usemap_odd.start = nodepool.usemap_even.end(); + nodepool.usemap_odd.length = usemap_len; + dout(10) << "mkfs: even usemap at " << nodepool.usemap_even << endl; + dout(10) << "mkfs: odd usemap at " << nodepool.usemap_odd << endl; + + // init tables + struct ebofs_table empty; + empty.num_keys = 0; + empty.root = -1; + empty.depth = 0; + + object_tab = new Table( nodepool, empty ); + collection_tab = new Table( nodepool, empty ); + + for (int i=0; i( nodepool, empty ); + limbo_tab = new Table( nodepool, empty ); + alloc_tab = new Table >( nodepool, empty ); + + co_tab = new Table( nodepool, empty ); + + // add free space + Extent left; + left.start = nodepool.usemap_odd.end(); + left.length = num_blocks - left.start; + dout(10) << "mkfs: free data blocks at " << left << endl; + allocator._release_into_limbo( left ); + if (g_conf.ebofs_cloneable) { + allocator.alloc_inc(nr); + allocator.alloc_inc(nodepool.usemap_even); + allocator.alloc_inc(nodepool.usemap_odd); + } + allocator.commit_limbo(); // -> limbo_tab + allocator.release_limbo(); 
// -> free_tab + + // write nodes, super, 2x + dout(10) << "mkfs: flushing nodepool and superblocks (2x)" << endl; + + nodepool.commit_start( dev, 0 ); + nodepool.commit_wait(); + bufferptr superbp0; + prepare_super(0, superbp0); + write_super(0, superbp0); + + nodepool.commit_start( dev, 1 ); + nodepool.commit_wait(); + bufferptr superbp1; + prepare_super(1, superbp1); + write_super(1, superbp1); + + // free memory + dout(10) << "mkfs: cleaning up" << endl; + close_tables(); + + dev.close(); + + dout(2) << "mkfs: " << dev.get_device_name() << " " << dev.get_num_blocks() << " blocks, " << nice_blocks(dev.get_num_blocks()) << endl; + ebofs_lock.Unlock(); + return 0; +} + +void Ebofs::close_tables() +{ + // close tables + delete object_tab; + for (int i=0; i::iterator i = onode_map.begin(); + i != onode_map.end(); + i++) { + dout(0) << "umount *** leftover: " << i->first << " " << *(i->second) << endl; + } + + // free memory + dout(5) << "umount cleaning up" << endl; + close_tables(); + dev.close(); + readonly = unmounting = mounted = false; + + dout(1) << "umount done on " << dev.get_device_name() << endl; + ebofs_lock.Unlock(); + return 0; +} + + + +void Ebofs::prepare_super(version_t epoch, bufferptr& bp) +{ + struct ebofs_super sb; + + dout(10) << "prepare_super v" << epoch << endl; + + // fill in super + memset(&sb, 0, sizeof(sb)); + sb.s_magic = EBOFS_MAGIC; + sb.epoch = epoch; + sb.num_blocks = dev.get_num_blocks(); + + sb.free_blocks = free_blocks; + sb.limbo_blocks = limbo_blocks; + + + // tables + sb.object_tab.num_keys = object_tab->get_num_keys(); + sb.object_tab.root = object_tab->get_root(); + sb.object_tab.depth = object_tab->get_depth(); + + for (int i=0; iget_num_keys(); + sb.free_tab[i].root = free_tab[i]->get_root(); + sb.free_tab[i].depth = free_tab[i]->get_depth(); + } + sb.limbo_tab.num_keys = limbo_tab->get_num_keys(); + sb.limbo_tab.root = limbo_tab->get_root(); + sb.limbo_tab.depth = limbo_tab->get_depth(); + + sb.alloc_tab.num_keys = 
alloc_tab->get_num_keys(); + sb.alloc_tab.root = alloc_tab->get_root(); + sb.alloc_tab.depth = alloc_tab->get_depth(); + + sb.collection_tab.num_keys = collection_tab->get_num_keys(); + sb.collection_tab.root = collection_tab->get_root(); + sb.collection_tab.depth = collection_tab->get_depth(); + + sb.co_tab.num_keys = co_tab->get_num_keys(); + sb.co_tab.root = co_tab->get_root(); + sb.co_tab.depth = co_tab->get_depth(); + + // pools + sb.nodepool.num_regions = nodepool.region_loc.size(); + for (unsigned i=0; i 0) { + // periodically check for idle block device + dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms, " + << g_conf.ebofs_idle_commit_ms << " ms if idle" << endl; + long left = g_conf.ebofs_commit_ms; + while (left > 0) { + long next = MIN(left, g_conf.ebofs_idle_commit_ms); + if (commit_cond.WaitInterval(ebofs_lock, utime_t(0, next*1000)) != ETIMEDOUT) + break; // we got kicked + if (dev.is_idle()) { + dout(20) << "commit_thread bdev is idle, early commit" << endl; + break; // dev is idle + } + left -= next; + dout(20) << "commit_thread " << left << " ms left" << endl; + + // hack hack + //if (!left) g_conf.debug_ebofs = 10; + // /hack hack + } + } else { + // normal wait+timeout + dout(20) << "commit_thread sleeping (up to) " << g_conf.ebofs_commit_ms << " ms" << endl; + commit_cond.WaitInterval(ebofs_lock, utime_t(0, g_conf.ebofs_commit_ms*1000)); + } + + } else { + // DEBUG.. 
wait until kicked + dout(10) << "commit_thread no commit_ms, waiting until kicked" << endl; + commit_cond.Wait(ebofs_lock); + } + + if (unmounting) { + dout(10) << "commit_thread unmounting: final commit pass" << endl; + assert(readonly); + unmounting = false; + mounted = false; + dirty = true; + } + + if (!dirty && !limbo_blocks) { + dout(10) << "commit_thread not dirty" << endl; + } + else { + super_epoch++; + dirty = false; + + dout(10) << "commit_thread commit start, new epoch " << super_epoch << endl; + dout(2) << "commit_thread data: " + << 100*(dev.get_num_blocks()-get_free_blocks())/dev.get_num_blocks() << "% used, " + << get_free_blocks() << " (" << 100*get_free_blocks()/dev.get_num_blocks() + << "%) free in " << get_free_extents() + << ", " << get_limbo_blocks() << " (" << 100*get_limbo_blocks()/dev.get_num_blocks() + << "%) limbo in " << get_limbo_extents() + << endl; + dout(2) << "commit_thread nodes: " + << 100*nodepool.num_used()/nodepool.num_total() << "% used, " + << nodepool.num_free() << " (" << 100*nodepool.num_free()/nodepool.num_total() << "%) free, " + << nodepool.num_limbo() << " (" << 100*nodepool.num_limbo()/nodepool.num_total() << "%) limbo, " + << nodepool.num_total() << " total." << endl; + dout(2) << "commit_thread bc: " + << "size " << bc.get_size() + << ", trimmable " << bc.get_trimmable() + << ", max " << g_conf.ebofs_bc_size + << "; dirty " << bc.get_stat_dirty() + << ", tx " << bc.get_stat_tx() + << ", max dirty " << g_conf.ebofs_bc_max_dirty + << endl; + + + // (async) write onodes+condes (do this first; it currently involves inode reallocation) + commit_inodes_start(); + + allocator.commit_limbo(); // limbo -> limbo_tab + + // (async) write btree nodes + nodepool.commit_start( dev, super_epoch ); + + // blockdev barrier (prioritize our writes!) + dout(30) << "commit_thread barrier. flushing inodes " << inodes_flushing << endl; + dev.barrier(); + + // prepare super (before any changes get made!) 
+ bufferptr superbp; + prepare_super(super_epoch, superbp); + + // wait for it all to flush (drops global lock) + commit_bc_wait(super_epoch-1); + dout(30) << "commit_thread bc flushed" << endl; + commit_inodes_wait(); + dout(30) << "commit_thread inodes flushed" << endl; + nodepool.commit_wait(); + dout(30) << "commit_thread btree nodes flushed" << endl; + + // ok, now (synchronously) write the prior super! + dout(10) << "commit_thread commit flushed, writing super for prior epoch" << endl; + ebofs_lock.Unlock(); + write_super(super_epoch, superbp); + ebofs_lock.Lock(); + + dout(10) << "commit_thread wrote super" << endl; + + // free limbo space now + // (since we're done allocating things, + // AND we've flushed all previous epoch data) + allocator.release_limbo(); // limbo_tab -> free_tabs + + // do we need more node space? + if (nodepool.num_free() < nodepool.num_total() / 3) { + dout(2) << "commit_thread running low on node space, allocating more." << endl; + alloc_more_node_space(); + } + + // kick waiters + dout(10) << "commit_thread queueing commit + kicking sync waiters" << endl; + + finisher_lock.Lock(); + finisher_queue.splice(finisher_queue.end(), commit_waiters[super_epoch-1]); + commit_waiters.erase(super_epoch-1); + finisher_cond.Signal(); + finisher_lock.Unlock(); + + sync_cond.Signal(); + + dout(10) << "commit_thread commit finish" << endl; + } + + // trim bc? 
+ trim_bc(); + trim_inodes(); + + } + + dout(10) << "commit_thread finish" << endl; + commit_thread_started = false; + ebofs_lock.Unlock(); + return 0; +} + + +void Ebofs::alloc_more_node_space() +{ + dout(1) << "alloc_more_node_space free " << nodepool.num_free() << "/" << nodepool.num_total() << endl; + + if (nodepool.num_regions() < EBOFS_MAX_NODE_REGIONS) { + int want = nodepool.num_total(); + + Extent ex; + allocator.allocate(ex, want, 2); + dout(1) << "alloc_more_node_space wants " << want << " more, got " << ex << endl; + + Extent even, odd; + unsigned ulen = nodepool.get_usemap_len(nodepool.num_total() + ex.length); + allocator.allocate(even, ulen, 2); + allocator.allocate(odd, ulen, 2); + dout(1) << "alloc_more_node_space maps need " << ulen << " x2, got " << even << " " << odd << endl; + + if (even.length == ulen && odd.length == ulen) { + dout(1) << "alloc_more_node_space got " << ex << ", new usemaps at even " << even << " odd " << odd << endl; + allocator.release(nodepool.usemap_even); + allocator.release(nodepool.usemap_odd); + nodepool.add_region(ex); + nodepool.usemap_even = even; + nodepool.usemap_odd = odd; + } else { + dout (1) << "alloc_more_node_space failed to get space for new usemaps" << endl; + allocator.release(ex); + allocator.release(even); + allocator.release(odd); + //assert(0); + } + } else { + dout(1) << "alloc_more_node_space already have max node regions!" << endl; + assert(0); + } +} + + +void *Ebofs::finisher_thread_entry() +{ + finisher_lock.Lock(); + dout(10) << "finisher_thread start" << endl; + + while (!finisher_stop) { + while (!finisher_queue.empty()) { + list ls; + ls.swap(finisher_queue); + + finisher_lock.Unlock(); + + //ebofs_lock.Lock(); // um.. why lock this? 
-sage + finish_contexts(ls, 0); + //ebofs_lock.Unlock(); + + finisher_lock.Lock(); + } + if (finisher_stop) break; + + dout(30) << "finisher_thread sleeping" << endl; + finisher_cond.Wait(finisher_lock); + } + + dout(10) << "finisher_thread start" << endl; + finisher_lock.Unlock(); + return 0; +} + + +// *** onodes *** + +Onode* Ebofs::new_onode(object_t oid) +{ + Onode* on = new Onode(oid); + + assert(onode_map.count(oid) == 0); + onode_map[oid] = on; + onode_lru.lru_insert_top(on); + + assert(object_tab->lookup(oid) < 0); + object_tab->insert( oid, on->onode_loc ); // even tho i'm not placed yet + + on->get(); + on->onode_loc.start = 0; + on->onode_loc.length = 0; + + dirty_onode(on); + + dout(7) << "new_onode " << *on << endl; + return on; +} + + +Onode* Ebofs::get_onode(object_t oid) +{ + while (1) { + // in cache? + if (onode_map.count(oid)) { + // yay + Onode *on = onode_map[oid]; + on->get(); + //cout << "get_onode " << *on << endl; + return on; + } + + // on disk? + Extent onode_loc; + if (object_tab->lookup(oid, onode_loc) < 0) { + dout(10) << "onode lookup failed on " << oid << endl; + // object dne. + return 0; + } + + // already loading? + if (waitfor_onode.count(oid)) { + // yep, just wait. + Cond c; + waitfor_onode[oid].push_back(&c); + dout(10) << "get_onode " << oid << " already loading, waiting" << endl; + c.Wait(ebofs_lock); + continue; + } + + dout(10) << "get_onode reading " << oid << " from " << onode_loc << endl; + + assert(waitfor_onode.count(oid) == 0); + waitfor_onode[oid].clear(); // this should be empty initially. + + // read it! 
+ bufferlist bl; + bl.push_back( buffer::create_page_aligned( EBOFS_BLOCK_SIZE*onode_loc.length ) ); + + ebofs_lock.Unlock(); + dev.read( onode_loc.start, onode_loc.length, bl ); + ebofs_lock.Lock(); + + // add onode + Onode *on = new Onode(oid); + onode_map[oid] = on; + onode_lru.lru_insert_top(on); + + // parse data block + struct ebofs_onode *eo = (struct ebofs_onode*)bl.c_str(); + if (eo->object_id != oid) { + cerr << " wrong oid in onode block: " << eo->object_id << " != " << oid << endl; + cerr << " onode_loc is " << eo->onode_loc << endl; + cerr << " object_size " << eo->object_size << endl; + cerr << " object_blocks " << eo->object_blocks << endl; + cerr << " " << eo->num_collections << " coll + " + << eo->num_attr << " attr + " + << eo->num_extents << " extents" << endl; + assert(eo->object_id == oid); + } + on->readonly = eo->readonly; + on->onode_loc = eo->onode_loc; + on->object_size = eo->object_size; + on->object_blocks = eo->object_blocks; + + // parse + char *p = bl.c_str() + sizeof(*eo); + + // parse collection list + for (int i=0; inum_collections; i++) { + coll_t c = *((coll_t*)p); + p += sizeof(c); + on->collections.insert(c); + } + + // parse attributes + for (int i=0; inum_attr; i++) { + string key = p; + p += key.length() + 1; + int len = *(int*)(p); + p += sizeof(len); + on->attr[key] = buffer::copy(p, len); + p += len; + dout(15) << "get_onode " << *on << " attr " << key << " len " << len << endl; + } + + // parse extents + on->extent_map.clear(); + block_t n = 0; + for (int i=0; inum_extents; i++) { + Extent ex = *((Extent*)p); + on->extent_map[n] = ex; + dout(15) << "get_onode " << *on << " ex " << i << ": " << ex << endl; + n += ex.length; + p += sizeof(Extent); + } + assert(n == on->object_blocks); + + // wake up other waiters + for (list::iterator i = waitfor_onode[oid].begin(); + i != waitfor_onode[oid].end(); + i++) + (*i)->Signal(); + waitfor_onode.erase(oid); // remove Cond list + + on->get(); + //cout << "get_onode " << *on << " 
(loaded)" << endl; + return on; + } +} + + +class C_E_InodeFlush : public BlockDevice::callback { + Ebofs *ebofs; +public: + C_E_InodeFlush(Ebofs *e) : ebofs(e) {} + void finish(ioh_t ioh, int r) { + ebofs->flush_inode_finish(); + } +}; + + +void Ebofs::encode_onode(Onode *on, bufferlist& bl, unsigned& off) +{ + // onode + struct ebofs_onode eo; + eo.readonly = on->readonly; + eo.onode_loc = on->onode_loc; + eo.object_id = on->object_id; + eo.object_size = on->object_size; + eo.object_blocks = on->object_blocks; + eo.num_collections = on->collections.size(); + eo.num_attr = on->attr.size(); + eo.num_extents = on->extent_map.size(); + bl.copy_in(off, sizeof(eo), (char*)&eo); + off += sizeof(eo); + + // collections + for (set::iterator i = on->collections.begin(); + i != on->collections.end(); + i++) { + bl.copy_in(off, sizeof(*i), (char*)&(*i)); + off += sizeof(*i); + } + + // attr + for (map::iterator i = on->attr.begin(); + i != on->attr.end(); + i++) { + bl.copy_in(off, i->first.length()+1, i->first.c_str()); + off += i->first.length()+1; + int l = i->second.length(); + bl.copy_in(off, sizeof(int), (char*)&l); + off += sizeof(int); + bl.copy_in(off, l, i->second.c_str()); + off += l; + dout(15) << "write_onode " << *on << " attr " << i->first << " len " << l << endl; + } + + // extents + for (map::iterator i = on->extent_map.begin(); + i != on->extent_map.end(); + i++) { + bl.copy_in(off, sizeof(Extent), (char*)&(i->second)); + off += sizeof(Extent); + dout(15) << "write_onode " << *on << " ex " << i->first << ": " << i->second << endl; + } +} + +void Ebofs::write_onode(Onode *on) +{ + // buffer + unsigned bytes = sizeof(ebofs_onode) + on->get_collection_bytes() + on->get_attr_bytes() + on->get_extent_bytes(); + unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; + + bufferlist bl; + bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); + + // (always) relocate onode + if (1) { + if (on->onode_loc.length) + allocator.release(on->onode_loc); + + 
block_t first = 0; + if (on->extent_map.size()) + first = on->extent_map.begin()->second.start; + + allocator.allocate(on->onode_loc, blocks, first); + object_tab->remove( on->object_id ); + object_tab->insert( on->object_id, on->onode_loc ); + //object_tab->verify(); + } + + dout(10) << "write_onode " << *on << " to " << on->onode_loc << endl; + + unsigned off = 0; + encode_onode(on, bl, off); + assert(off == bytes); + + // write + dev.write( on->onode_loc.start, on->onode_loc.length, bl, + new C_E_InodeFlush(this), "write_onode" ); +} + +void Ebofs::remove_onode(Onode *on) +{ + dout(8) << "remove_onode " << *on << endl; + + assert(on->get_ref_count() >= 1); // caller + + // tear down buffer cache + if (on->oc) { + on->oc->truncate(0, super_epoch); // this will kick readers along the way. + on->close_oc(); + } + + // remove from onode map, mark dangling/deleted + onode_map.erase(on->object_id); + onode_lru.lru_remove(on); + on->deleted = true; + on->dangling = true; + + // remove from object table + //dout(0) << "remove_onode on " << *on << endl; + object_tab->remove(on->object_id); + + // free onode space + if (on->onode_loc.length) + allocator.release(on->onode_loc); + + // free data space + for (map::iterator i = on->extent_map.begin(); + i != on->extent_map.end(); + i++) + allocator.release(i->second); + on->extent_map.clear(); + + // remove from collections + for (set::iterator i = on->collections.begin(); + i != on->collections.end(); + i++) { + co_tab->remove(coll_object_t(*i,on->object_id)); + } + on->collections.clear(); + + // dirty -> clean? 
+ if (on->is_dirty()) { + on->mark_clean(); // this unpins *on + dirty_onodes.erase(on); + } + + if (on->get_ref_count() > 1) cout << "remove_onode **** will survive " << *on << endl; + put_onode(on); + + dirty = true; +} + +void Ebofs::put_onode(Onode *on) +{ + on->put(); + //cout << "put_onode " << *on << endl; + + if (on->get_ref_count() == 0 && on->dangling) { + //cout << " *** hosing on " << *on << endl; + delete on; + } +} + +void Ebofs::dirty_onode(Onode *on) +{ + if (!on->is_dirty()) { + on->mark_dirty(); + dirty_onodes.insert(on); + } + dirty = true; +} + +void Ebofs::trim_inodes(int max) +{ + unsigned omax = onode_lru.lru_get_max(); + unsigned cmax = cnode_lru.lru_get_max(); + if (max >= 0) omax = cmax = max; + dout(10) << "trim_inodes start " << onode_lru.lru_get_size() << " / " << omax << " onodes, " + << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; + + // onodes + while (onode_lru.lru_get_size() > omax) { + // expire an item + Onode *on = (Onode*)onode_lru.lru_expire(); + if (on == 0) break; // nothing to expire + + // expire + dout(20) << "trim_inodes removing onode " << *on << endl; + onode_map.erase(on->object_id); + on->dangling = true; + + if (on->get_ref_count() == 0) { + assert(on->oc == 0); // an open oc pins the onode! + delete on; + } else { + dout(-20) << "trim_inodes still active: " << *on << endl; + assert(0); // huh? 
+ } + } + + + // cnodes + while (cnode_lru.lru_get_size() > cmax) { + // expire an item + Cnode *cn = (Cnode*)cnode_lru.lru_expire(); + if (cn == 0) break; // nothing to expire + + // expire + dout(20) << "trim_inodes removing cnode " << *cn << endl; + cnode_map.erase(cn->coll_id); + + delete cn; + } + + dout(10) << "trim_inodes finish " + << onode_lru.lru_get_size() << " / " << omax << " onodes, " + << cnode_lru.lru_get_size() << " / " << cmax << " cnodes" << endl; +} + + + +// *** cnodes **** + +Cnode* Ebofs::new_cnode(coll_t cid) +{ + Cnode* cn = new Cnode(cid); + + assert(cnode_map.count(cid) == 0); + cnode_map[cid] = cn; + cnode_lru.lru_insert_top(cn); + + assert(collection_tab->lookup(cid) < 0); + collection_tab->insert( cid, cn->cnode_loc ); // even tho i'm not placed yet + + cn->get(); + cn->cnode_loc.start = 0; + cn->cnode_loc.length = 0; + + dirty_cnode(cn); + + return cn; +} + +Cnode* Ebofs::get_cnode(coll_t cid) +{ + while (1) { + // in cache? + if (cnode_map.count(cid)) { + // yay + Cnode *cn = cnode_map[cid]; + cn->get(); + return cn; + } + + // on disk? + Extent cnode_loc; + if (collection_tab->lookup(cid, cnode_loc) < 0) { + // object dne. + return 0; + } + + // already loading? + if (waitfor_cnode.count(cid)) { + // yep, just wait. + Cond c; + waitfor_cnode[cid].push_back(&c); + dout(10) << "get_cnode " << cid << " already loading, waiting" << endl; + c.Wait(ebofs_lock); + continue; + } + + dout(10) << "get_cnode reading " << cid << " from " << cnode_loc << endl; + + assert(waitfor_cnode.count(cid) == 0); + waitfor_cnode[cid].clear(); // this should be empty initially. + + // read it! 
+ bufferlist bl; + //bufferpool.alloc( EBOFS_BLOCK_SIZE*cnode_loc.length, bl ); + bl.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*cnode_loc.length) ); + + ebofs_lock.Unlock(); + dev.read( cnode_loc.start, cnode_loc.length, bl ); + ebofs_lock.Lock(); + + // parse data block + Cnode *cn = new Cnode(cid); + + cnode_map[cid] = cn; + cnode_lru.lru_insert_top(cn); + + struct ebofs_cnode *ec = (struct ebofs_cnode*)bl.c_str(); + cn->cnode_loc = ec->cnode_loc; + + // parse attributes + char *p = bl.c_str() + sizeof(*ec); + for (int i=0; inum_attr; i++) { + string key = p; + p += key.length() + 1; + int len = *(int*)(p); + p += sizeof(len); + cn->attr[key] = buffer::copy(p, len); + p += len; + dout(15) << "get_cnode " << *cn << " attr " << key << " len " << len << endl; + } + + // wake up other waiters + for (list::iterator i = waitfor_cnode[cid].begin(); + i != waitfor_cnode[cid].end(); + i++) + (*i)->Signal(); + waitfor_cnode.erase(cid); // remove Cond list + + cn->get(); + return cn; + } +} + +void Ebofs::encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off) +{ + // cnode + struct ebofs_cnode ec; + ec.cnode_loc = cn->cnode_loc; + ec.coll_id = cn->coll_id; + ec.num_attr = cn->attr.size(); + bl.copy_in(off, sizeof(ec), (char*)&ec); + off += sizeof(ec); + + // attr + for (map::iterator i = cn->attr.begin(); + i != cn->attr.end(); + i++) { + bl.copy_in(off, i->first.length()+1, i->first.c_str()); + off += i->first.length()+1; + int len = i->second.length(); + bl.copy_in(off, sizeof(int), (char*)&len); + off += sizeof(int); + bl.copy_in(off, len, i->second.c_str()); + off += len; + + dout(15) << "write_cnode " << *cn << " attr " << i->first << " len " << len << endl; + } +} + +void Ebofs::write_cnode(Cnode *cn) +{ + // allocate buffer + unsigned bytes = sizeof(ebofs_cnode) + cn->get_attr_bytes(); + unsigned blocks = (bytes-1)/EBOFS_BLOCK_SIZE + 1; + + bufferlist bl; + //bufferpool.alloc( EBOFS_BLOCK_SIZE*blocks, bl ); + bl.push_back( 
buffer::create_page_aligned(EBOFS_BLOCK_SIZE*blocks) ); + + // (always) relocate cnode! + if (1) { + if (cn->cnode_loc.length) + allocator.release(cn->cnode_loc); + + allocator.allocate(cn->cnode_loc, blocks, Allocator::NEAR_LAST_FWD); + collection_tab->remove( cn->coll_id ); + collection_tab->insert( cn->coll_id, cn->cnode_loc ); + } + + dout(10) << "write_cnode " << *cn << " to " << cn->cnode_loc << endl; + + unsigned off = 0; + encode_cnode(cn, bl, off); + assert(off == bytes); + + // write + dev.write( cn->cnode_loc.start, cn->cnode_loc.length, bl, + new C_E_InodeFlush(this), "write_cnode" ); +} + +void Ebofs::remove_cnode(Cnode *cn) +{ + dout(10) << "remove_cnode " << *cn << endl; + + // remove from table + collection_tab->remove(cn->coll_id); + + // free cnode space + if (cn->cnode_loc.length) + allocator.release(cn->cnode_loc); + + // remove from dirty list? + if (cn->is_dirty()) + dirty_cnodes.erase(cn); + + // remove from map and lru + cnode_map.erase(cn->coll_id); + cnode_lru.lru_remove(cn); + + // count down refs + cn->mark_clean(); + cn->put(); + assert(cn->get_ref_count() == 0); + + // hose. 
+ delete cn; + + dirty = true; +} + +void Ebofs::put_cnode(Cnode *cn) +{ + cn->put(); +} + +void Ebofs::dirty_cnode(Cnode *cn) +{ + if (!cn->is_dirty()) { + cn->mark_dirty(); + dirty_cnodes.insert(cn); + } + dirty = true; +} + + + + + +void Ebofs::flush_inode_finish() +{ + ebofs_lock.Lock(); + { + inodes_flushing--; + if (inodes_flushing < 1000) + dout(20) << "flush_inode_finish, " << inodes_flushing << " left" << endl; + if (inodes_flushing == 0) + inode_commit_cond.Signal(); + } + ebofs_lock.Unlock(); +} + +void Ebofs::commit_inodes_start() +{ + dout(10) << "commit_inodes_start" << endl; + + assert(inodes_flushing == 0); + + // onodes + for (set::iterator i = dirty_onodes.begin(); + i != dirty_onodes.end(); + i++) { + Onode *on = *i; + inodes_flushing++; + write_onode(on); + on->mark_clean(); + on->uncommitted.clear(); // commit allocated blocks + on->commit_waiters.clear(); // these guys are gonna get taken care of, bc we committed. + } + dirty_onodes.clear(); + + // cnodes + for (set::iterator i = dirty_cnodes.begin(); + i != dirty_cnodes.end(); + i++) { + Cnode *cn = *i; + inodes_flushing++; + write_cnode(cn); + cn->mark_clean(); + } + dirty_cnodes.clear(); + + dout(10) << "commit_inodes_start writing " << inodes_flushing << " onodes+cnodes" << endl; +} + +void Ebofs::commit_inodes_wait() +{ + // caller must hold ebofs_lock + while (inodes_flushing > 0) { + dout(10) << "commit_inodes_wait waiting for " << inodes_flushing << " onodes+cnodes to flush" << endl; + inode_commit_cond.Wait(ebofs_lock); + } + dout(10) << "commit_inodes_wait all flushed" << endl; +} + + + + + + + +// *** buffer cache *** + +void Ebofs::trim_buffer_cache() +{ + ebofs_lock.Lock(); + trim_bc(0); + ebofs_lock.Unlock(); +} + +void Ebofs::trim_bc(off_t max) +{ + if (max < 0) + max = g_conf.ebofs_bc_size; + dout(10) << "trim_bc start: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl; + + while (bc.get_size() > max && + bc.get_trimmable()) { + 
BufferHead *bh = (BufferHead*) bc.lru_rest.lru_expire(); + if (!bh) break; + + dout(25) << "trim_bc trimming " << *bh << endl; + assert(bh->is_clean()); + + ObjectCache *oc = bh->oc; + bc.remove_bh(bh); + delete bh; + + if (oc->is_empty()) { + Onode *on = oc->on; + dout(10) << "trim_bc closing oc on " << *on << endl; + on->close_oc(); + } + } + + dout(10) << "trim_bc finish: size " << bc.get_size() << ", trimmable " << bc.get_trimmable() << ", max " << max << endl; +} + + +void Ebofs::kick_idle() +{ + dout(10) << "kick_idle" << endl; + commit_cond.Signal(); + + /* + ebofs_lock.Lock(); + if (mounted && !unmounting && dirty) { + dout(0) << "kick_idle dirty, doing commit" << endl; + commit_cond.Signal(); + } else { + dout(0) << "kick_idle !dirty or !mounted or unmounting, doing nothing" << endl; + } + ebofs_lock.Unlock(); + */ +} + +void Ebofs::sync(Context *onsafe) +{ + ebofs_lock.Lock(); + if (onsafe) + commit_waiters[super_epoch].push_back(onsafe); + ebofs_lock.Unlock(); +} + +void Ebofs::sync() +{ + ebofs_lock.Lock(); + if (!dirty) { + dout(7) << "sync in " << super_epoch << ", not dirty" << endl; + } else { + dout(7) << "sync in " << super_epoch << endl; + + if (!commit_thread_started) { + dout(10) << "sync waiting for commit thread to start" << endl; + sync_cond.Wait(ebofs_lock); + } + + if (mid_commit) { + dout(10) << "sync waiting for commit in progress" << endl; + sync_cond.Wait(ebofs_lock); + } + + commit_cond.Signal(); // trigger a commit + + sync_cond.Wait(ebofs_lock); // wait + + dout(10) << "sync finish in " << super_epoch << endl; + } + ebofs_lock.Unlock(); +} + + + +void Ebofs::commit_bc_wait(version_t epoch) +{ + dout(10) << "commit_bc_wait on epoch " << epoch << endl; + + while (bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE,epoch) > 0 || + bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL,epoch) > 0) { + //dout(10) << "commit_bc_wait " << bc.get_unflushed(epoch) << " unflushed in epoch " << epoch << endl; + dout(10) << "commit_bc_wait epoch " << epoch + << ", 
unflushed bhwrite " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) + << ", unflushed partial " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) + << endl; + bc.waitfor_flush(); + } + + bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE).erase(epoch); + bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL).erase(epoch); + + dout(10) << "commit_bc_wait all flushed for epoch " << epoch + << "; " << bc.get_unflushed(EBOFS_BC_FLUSH_BHWRITE) + << " " << bc.get_unflushed(EBOFS_BC_FLUSH_PARTIAL) + << endl; +} + + + +int Ebofs::statfs(struct statfs *buf) +{ + dout(7) << "statfs" << endl; + + buf->f_type = EBOFS_MAGIC; /* type of filesystem */ + buf->f_bsize = 4096; /* optimal transfer block size */ + buf->f_blocks = dev.get_num_blocks(); /* total data blocks in file system */ + buf->f_bfree = get_free_blocks() + + get_limbo_blocks(); /* free blocks in fs */ + buf->f_bavail = get_free_blocks(); /* free blocks avail to non-superuser -- actually, for writing. */ + buf->f_files = nodepool.num_total(); /* total file nodes in file system */ + buf->f_ffree = nodepool.num_free(); /* free file nodes in fs */ + //buf->f_fsid = 0; /* file system id */ + buf->f_namelen = 8; /* maximum length of filenames */ + + return 0; +} + + + + +/* + * allocate a write to blocks on disk. + * - take care to not overwrite any "safe" data blocks. + * - allocate/map new extents on disk as necessary + */ +void Ebofs::alloc_write(Onode *on, + block_t start, block_t len, + interval_set& alloc, + block_t& old_bfirst, block_t& old_blast) +{ + // first decide what pages to (re)allocate + alloc.insert(start, len); // start with whole range + + // figure out what bits are already uncommitted + interval_set already_uncom; + already_uncom.intersection_of(alloc, on->uncommitted); + + // subtract those off, so we're left with the committed bits (that must be reallocated). 
+ alloc.subtract(already_uncom); + + dout(10) << "alloc_write must (re)alloc " << alloc << " on " << *on << endl; + + // release it (into limbo) + for (map::iterator i = alloc.m.begin(); + i != alloc.m.end(); + i++) { + // get old region + vector old; + on->map_extents(i->first, i->second, old); + for (unsigned o=0; ofirst == start) { + old_bfirst = old[0].start; + dout(20) << "alloc_write old_bfirst " << old_bfirst << " of " << old[0] << endl; + } + if (i->first+i->second == start+len) { + old_blast = old[old.size()-1].last(); + dout(20) << "alloc_write old_blast " << old_blast << " of " << old[old.size()-1] << endl; + } + } + } + + // reallocate uncommitted too? + // ( --> yes. we can always make better allocation decisions later, with more information. ) + if (g_conf.ebofs_realloc) { + list tx; + + ObjectCache *oc = on->get_oc(&bc); + oc->find_tx(start, len, tx); + + for (list::reverse_iterator p = tx.rbegin(); + p != tx.rend(); + p++) { + BufferHead *bh = *p; + + // cancelable/moveable? + if (alloc.contains(bh->start(), bh->length())) { + dout(10) << "alloc_write " << *bh << " already in " << alloc << endl; + continue; + } + + vector old; + on->map_extents(bh->start(), bh->length(), old); + assert(old.size() == 1); + + if (bh->start() >= start && bh->end() <= start+len) { + assert(bh->epoch_modified == super_epoch); + if (bc.bh_cancel_write(bh, super_epoch)) { + if (bh->length() == 1) + dout(10) << "alloc_write unallocated tx " << old[0] << ", canceled " << *bh << endl; + // no, this isn't compatible with clone() and extent reference counting. 
+ //allocator.unallocate(old[0]); // release (into free) + allocator.release(old[0]); + alloc.insert(bh->start(), bh->length()); + } else { + if (bh->length() == 1) + dout(10) << "alloc_write released tx " << old[0] << ", couldn't cancel " << *bh << endl; + allocator.release(old[0]); // release (into limbo) + alloc.insert(bh->start(), bh->length()); + } + } else { + if (bh->length() == 1) + dout(10) << "alloc_write skipped tx " << old[0] << ", not entirely within " + << start << "~" << len + << " bh " << *bh << endl; + } + } + + dout(10) << "alloc_write will (re)alloc " << alloc << " on " << *on << endl; + } + + if (alloc.empty()) return; // no need to dirty the onode below! + + + // merge alloc into onode uncommitted map + //dout(10) << " union of " << on->uncommitted << " and " << alloc << endl; + interval_set old = on->uncommitted; + on->uncommitted.union_of(alloc); + + dout(10) << "alloc_write onode.uncommitted is now " << on->uncommitted << endl; + + if (0) { + // verify + interval_set ta; + ta.intersection_of(on->uncommitted, alloc); + cout << " ta " << ta << endl; + assert(alloc == ta); + + interval_set tb; + tb.intersection_of(on->uncommitted, old); + cout << " tb " << tb << endl; + assert(old == tb); + } + + dirty_onode(on); + + // allocate the space + for (map::iterator i = alloc.m.begin(); + i != alloc.m.end(); + i++) { + dout(15) << "alloc_write alloc " << i->first << "~" << i->second << " (of " << start << "~" << len << ")" << endl; + + // allocate new space + block_t left = i->second; + block_t cur = i->first; + while (left > 0) { + Extent ex; + allocator.allocate(ex, left, Allocator::NEAR_LAST_FWD); + dout(10) << "alloc_write got " << ex << " for object offset " << cur << endl; + on->set_extent(cur, ex); // map object to new region + left -= ex.length; + cur += ex.length; + } + } +} + + + + +void Ebofs::apply_write(Onode *on, off_t off, size_t len, bufferlist& bl) +{ + ObjectCache *oc = on->get_oc(&bc); + + // map into blocks + off_t opos = off; // 
byte pos in object + size_t zleft = 0; // zeros left to write + size_t left = len; // bytes left + + block_t bstart = off / EBOFS_BLOCK_SIZE; + + if (off > on->object_size) { + zleft = off - on->object_size; + opos = on->object_size; + bstart = on->object_size / EBOFS_BLOCK_SIZE; + } + if (off+(off_t)len > on->object_size) { + dout(10) << "apply_write extending size on " << *on << ": " << on->object_size + << " -> " << off+len << endl; + on->object_size = off+len; + dirty_onode(on); + } + if (bl.length() == 0) { + zleft += len; + left = 0; + } + if (zleft) + dout(10) << "apply_write zeroing first " << zleft << " bytes of " << *on << endl; + + block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; + block_t blen = blast-bstart+1; + + // allocate write on disk. + interval_set alloc; + block_t old_bfirst = 0; // zero means not defined here (since we ultimately pass to bh_read) + block_t old_blast = 0; + alloc_write(on, bstart, blen, alloc, old_bfirst, old_blast); + dout(20) << "apply_write old_bfirst " << old_bfirst << ", old_blast " << old_blast << endl; + + if (fake_writes) { + on->uncommitted.clear(); // worst case! + return; + } + + // map b range onto buffer_heads + map hits; + oc->map_write(bstart, blen, alloc, hits, super_epoch); + + // get current versions + //version_t lowv, highv; + //oc->scan_versions(bstart, blen, lowv, highv); + //highv++; + version_t highv = ++oc->write_count; + + // copy from bl into buffer cache + unsigned blpos = 0; // byte pos in input buffer + + // write data into buffers + for (map::iterator i = hits.begin(); + i != hits.end(); + i++) { + BufferHead *bh = i->second; + bh->set_version(highv); + bh->epoch_modified = super_epoch; + + // old write in progress? 
+ if (bh->is_tx()) { // copy the buffer to avoid munging up in-flight write + dout(10) << "apply_write tx pending, copying buffer on " << *bh << endl; + bufferlist temp; + temp.claim(bh->data); + //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); + bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); + bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); + } + + // need to split off partial? (partials can only be ONE block) + if ((bh->is_missing() || bh->is_rx()) && bh->length() > 1) { + if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0)) { + BufferHead *right = bc.split(bh, bh->start()+1); + hits[right->start()] = right; + dout(10) << "apply_write split off left block for partial write; rest is " << *right << endl; + } + if ((bh->last() == blast && (len+off) % EBOFS_BLOCK_SIZE != 0) && + ((off_t)len+off < on->object_size)) { + BufferHead *right = bc.split(bh, bh->last()); + hits[right->start()] = right; + dout(10) << "apply_write split off right block for upcoming partial write; rest is " << *right << endl; + } + } + + // partial at head or tail? + if ((bh->start() == bstart && opos % EBOFS_BLOCK_SIZE != 0) || // opos, not off, in case we're zeroing... 
+ (bh->last() == blast && ((off_t)len+off) % EBOFS_BLOCK_SIZE != 0 && ((off_t)len+off) < on->object_size)) { + // locate ourselves in bh + unsigned off_in_bh = opos - bh->start()*EBOFS_BLOCK_SIZE; + assert(off_in_bh >= 0); + unsigned len_in_bh = MIN( (off_t)(zleft+left), + (off_t)(bh->end()*EBOFS_BLOCK_SIZE)-opos ); + + if (bh->is_partial() || bh->is_rx() || bh->is_missing()) { + assert(bh->is_partial() || bh->is_rx() || bh->is_missing()); + assert(bh->length() == 1); + + // add frag to partial + dout(10) << "apply_write writing into partial " << *bh << ":" + << " off_in_bh " << off_in_bh + << " len_in_bh " << len_in_bh + << endl; + unsigned z = MIN( zleft, len_in_bh ); + if (z) { + bufferptr zp(z); + zp.zero(); + bufferlist zb; + zb.push_back(zp); + bh->add_partial(off_in_bh, zb); + zleft -= z; + opos += z; + } + + bufferlist sb; + sb.substr_of(bl, blpos, len_in_bh-z); // substr in existing buffer + bufferlist cp; + cp.append(sb.c_str(), len_in_bh-z); // copy the partial bit! + bh->add_partial(off_in_bh, cp); + left -= len_in_bh-z; + blpos += len_in_bh-z; + opos += len_in_bh-z; + + if (bh->partial_is_complete(on->object_size - bh->start()*EBOFS_BLOCK_SIZE)) { + dout(10) << "apply_write completed partial " << *bh << endl; + //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); // new buffers! + bh->data.clear(); + bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); + bh->data.zero(); + bh->apply_partial(); + bc.mark_dirty(bh); + bc.bh_write(on, bh); + } + else if (bh->is_rx()) { + dout(10) << "apply_write rx -> partial " << *bh << endl; + assert(bh->length() == 1); + bc.mark_partial(bh); + bc.bh_queue_partial_write(on, bh); // queue the eventual write + } + else if (bh->is_missing()) { + dout(10) << "apply_write missing -> partial " << *bh << endl; + assert(bh->length() == 1); + bc.mark_partial(bh); + + // take care to read from _old_ disk block locations! 
+ if (bh->start() == bstart) + bc.bh_read(on, bh, old_bfirst); + else if (bh->start() == blast) + bc.bh_read(on, bh, old_blast); + else assert(0); + + bc.bh_queue_partial_write(on, bh); // queue the eventual write + } + else if (bh->is_partial()) { + dout(10) << "apply_write already partial, no need to submit rx on " << *bh << endl; + if (bh->partial_tx_epoch == super_epoch) + bc.bh_cancel_partial_write(bh); + bc.bh_queue_partial_write(on, bh); // queue the eventual write + } + + + } else { + assert(bh->is_clean() || bh->is_dirty() || bh->is_tx()); + + // just write into the bh! + dout(10) << "apply_write writing leading/tailing partial into " << *bh << ":" + << " off_in_bh " << off_in_bh + << " len_in_bh " << len_in_bh + << endl; + + // copy data into new buffers first (copy on write!) + // FIXME: only do the modified pages? this might be a big bh! + bufferlist temp; + temp.claim(bh->data); + //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); + bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); + bh->data.copy_in(0, bh->length()*EBOFS_BLOCK_SIZE, temp); + + unsigned z = MIN( zleft, len_in_bh ); + if (z) { + bufferptr zp(z); + zp.zero(); + bufferlist zb; + zb.push_back(zp); + bh->data.copy_in(off_in_bh, z, zb); + zleft -= z; + opos += z; + } + + bufferlist sub; + sub.substr_of(bl, blpos, len_in_bh-z); + bh->data.copy_in(off_in_bh+z, len_in_bh-z, sub); + blpos += len_in_bh-z; + left -= len_in_bh-z; + opos += len_in_bh-z; + + if (!bh->is_dirty()) + bc.mark_dirty(bh); + + bc.bh_write(on, bh); + } + continue; + } + + // ok, we're talking full block(s) now (modulo last block of the object) + assert(opos % EBOFS_BLOCK_SIZE == 0); + assert((off_t)(zleft+left) >= (off_t)(EBOFS_BLOCK_SIZE*bh->length()) || + opos+(off_t)(zleft+left) == on->object_size); + + // alloc new buffers. 
+ //bc.bufferpool.alloc(EBOFS_BLOCK_SIZE*bh->length(), bh->data); + bh->data.clear(); + bh->data.push_back( buffer::create_page_aligned(EBOFS_BLOCK_SIZE*bh->length()) ); + + // copy! + unsigned len_in_bh = MIN(bh->length()*EBOFS_BLOCK_SIZE, zleft+left); + assert(len_in_bh <= zleft+left); + + dout(10) << "apply_write writing into " << *bh << ":" + << " len_in_bh " << len_in_bh + << endl; + + unsigned z = MIN(len_in_bh, zleft); + if (z) { + bufferptr zp(z); + zp.zero(); + bufferlist zb; + zb.push_back(zp); + bh->data.copy_in(0, z, zb); + zleft -= z; + } + + bufferlist sub; + sub.substr_of(bl, blpos, len_in_bh-z); + bh->data.copy_in(z, len_in_bh-z, sub); + + blpos += len_in_bh-z; + left -= len_in_bh-z; + opos += len_in_bh; + + // old partial? + if (bh->is_partial() && + bh->partial_tx_epoch == super_epoch) + bc.bh_cancel_partial_write(bh); + + // mark dirty + if (!bh->is_dirty()) + bc.mark_dirty(bh); + + bc.bh_write(on, bh); + } + + assert(zleft == 0); + assert(left == 0); + assert(opos == off+(off_t)len); + //assert(blpos == bl.length()); +} + + + + +// *** file i/o *** + +bool Ebofs::attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, + Cond *will_wait_on, bool *will_wait_on_bool) +{ + dout(10) << "attempt_read " << *on << " " << off << "~" << len << endl; + ObjectCache *oc = on->get_oc(&bc); + + // map + block_t bstart = off / EBOFS_BLOCK_SIZE; + block_t blast = (len+off-1) / EBOFS_BLOCK_SIZE; + block_t blen = blast-bstart+1; + + map hits; + map missing; // read these + map rx; // wait for these + map partials; // ?? + oc->map_read(bstart, blen, hits, missing, rx, partials); + + // missing buffers? 
+ if (!missing.empty()) { + for (map::iterator i = missing.begin(); + i != missing.end(); + i++) { + dout(10) << "attempt_read missing buffer " << *(i->second) << endl; + bc.bh_read(on, i->second); + } + BufferHead *wait_on = missing.begin()->second; + block_t b = MAX(wait_on->start(), bstart); + wait_on->waitfor_read[b].push_back(new C_Cond(will_wait_on, will_wait_on_bool)); + return false; + } + + // are partials sufficient? + bool partials_ok = true; + for (map::iterator i = partials.begin(); + i != partials.end(); + i++) { + BufferHead *bh = i->second; + off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); + off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); + off_t start = MAX( off, bhstart ); + off_t end = MIN( off+(off_t)len, bhend ); + + if (!i->second->have_partial_range(start-bhstart, end-bhend)) { + if (partials_ok) { + // wait on this one + Context *c = new C_Cond(will_wait_on, will_wait_on_bool); + dout(10) << "attempt_read insufficient partial buffer " << *(i->second) << " c " << c << endl; + i->second->waitfor_read[i->second->start()].push_back(c); + } + partials_ok = false; + } + } + if (!partials_ok) return false; + + // wait on rx? + if (!rx.empty()) { + BufferHead *wait_on = rx.begin()->second; + Context *c = new C_Cond(will_wait_on, will_wait_on_bool); + dout(1) << "attempt_read waiting for read to finish on " << *wait_on << " c " << c << endl; + block_t b = MAX(wait_on->start(), bstart); + wait_on->waitfor_read[b].push_back(c); + return false; + } + + // yay, we have it all! + // concurrently walk thru hits, partials. 
+ map::iterator h = hits.begin(); + map::iterator p = partials.begin(); + + bl.clear(); + off_t pos = off; + block_t curblock = bstart; + while (curblock <= blast) { + BufferHead *bh = 0; + if (h->first == curblock) { + bh = h->second; + h++; + } else if (p->first == curblock) { + bh = p->second; + p++; + } else assert(0); + + off_t bhstart = (off_t)(bh->start()*EBOFS_BLOCK_SIZE); + off_t bhend = (off_t)(bh->end()*EBOFS_BLOCK_SIZE); + off_t start = MAX( pos, bhstart ); + off_t end = MIN( off+(off_t)len, bhend ); + + if (bh->is_partial()) { + // copy from a partial block. yuck! + bufferlist frag; + bh->copy_partial_substr( start-bhstart, end-bhstart, frag ); + bl.claim_append( frag ); + pos += frag.length(); + } else { + // copy from a full block. + if (bhstart == start && bhend == end) { + bl.append( bh->data ); + pos += bh->data.length(); + } else { + bufferlist frag; + dout(10) << "substr " << (start-bhstart) << "~" << (end-start) << " of " << bh->data.length() << " in " << *bh << endl; + frag.substr_of(bh->data, start-bhstart, end-start); + pos += frag.length(); + bl.claim_append( frag ); + } + } + + curblock = bh->end(); + /* this assert is more trouble than it's worth + assert((off_t)(curblock*EBOFS_BLOCK_SIZE) == pos || // should be aligned with next block + end != bhend || // or we ended midway through bh + (bh->last() == blast && end == bhend)); // ended last block ** FIXME WRONG??? + */ + } + + assert(bl.length() == len); + return true; +} + +int Ebofs::read(object_t oid, + off_t off, size_t len, + bufferlist& bl) +{ + ebofs_lock.Lock(); + int r = _read(oid, off, len, bl); + ebofs_lock.Unlock(); + return r; +} + +int Ebofs::_read(object_t oid, off_t off, size_t len, bufferlist& bl) +{ + dout(7) << "_read " << oid << " " << off << "~" << len << endl; + + Onode *on = get_onode(oid); + if (!on) { + dout(7) << "_read " << oid << " " << off << "~" << len << " ... dne " << endl; + return -ENOENT; // object dne? + } + + // read data into bl. block as necessary. 
+ Cond cond; + + int r = 0; + while (1) { + // check size bound + if (off >= on->object_size) { + dout(7) << "_read " << oid << " " << off << "~" << len << " ... off past eof " << on->object_size << endl; + r = -ESPIPE; // FIXME better errno? + break; + } + + size_t try_len = len ? len:on->object_size; + size_t will_read = MIN(off+(off_t)try_len, on->object_size) - off; + + bool done; + if (attempt_read(on, off, will_read, bl, &cond, &done)) + break; // yay + + // wait + while (!done) + cond.Wait(ebofs_lock); + + if (on->deleted) { + dout(7) << "_read " << oid << " " << off << "~" << len << " ... object deleted" << endl; + r = -ENOENT; + break; + } + } + + put_onode(on); + + trim_bc(); + + if (r < 0) return r; // return error, + dout(7) << "_read " << oid << " " << off << "~" << len << " ... got " << bl.length() << endl; + return bl.length(); // or bytes read. +} + + +bool Ebofs::_write_will_block() +{ + return (bc.get_stat_dirty()+bc.get_stat_tx() > g_conf.ebofs_bc_max_dirty); +} + +bool Ebofs::write_will_block() +{ + ebofs_lock.Lock(); + bool b = _write_will_block(); + ebofs_lock.Unlock(); + return b; +} + + +unsigned Ebofs::apply_transaction(Transaction& t, Context *onsafe) +{ + ebofs_lock.Lock(); + dout(7) << "apply_transaction start (" << t.ops.size() << " ops)" << endl; + + // do ops + unsigned r = 0; // bit fields indicate which ops failed. 
+ int bit = 1; + for (list::iterator p = t.ops.begin(); + p != t.ops.end(); + p++) { + switch (*p) { + case Transaction::OP_READ: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + off_t offset = t.offsets.front(); t.offsets.pop_front(); + size_t len = t.lengths.front(); t.lengths.pop_front(); + bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); + if (_read(oid, offset, len, *pbl) < 0) { + dout(7) << "apply_transaction fail on _read" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_STAT: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + struct stat *st = t.psts.front(); t.psts.pop_front(); + if (_stat(oid, st) < 0) { + dout(7) << "apply_transaction fail on _stat" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_GETATTR: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); + if ((*(pattrval.second) = _getattr(oid, attrname, pattrval.first, *(pattrval.second))) < 0) { + dout(7) << "apply_transaction fail on _getattr" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_GETATTRS: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); + if (_getattrs(oid, *pset) < 0) { + dout(7) << "apply_transaction fail on _getattrs" << endl; + r &= bit; + } + } + break; + + + case Transaction::OP_WRITE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + off_t offset = t.offsets.front(); t.offsets.pop_front(); + size_t len = t.lengths.front(); t.lengths.pop_front(); + bufferlist bl = t.bls.front(); t.bls.pop_front(); + if (_write(oid, offset, len, bl) < 0) { + dout(7) << "apply_transaction fail on _write" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_TRUNCATE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + off_t len = t.offsets.front(); t.offsets.pop_front(); + if (_truncate(oid, len) < 0) { + 
dout(7) << "apply_transaction fail on _truncate" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_REMOVE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + if (_remove(oid) < 0) { + dout(7) << "apply_transaction fail on _remove" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_SETATTR: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); + bufferlist bl; + bl.claim( t.attrbls.front() ); + t.attrbls.pop_front(); + if (_setattr(oid, attrname, bl.c_str(), bl.length()) < 0) { + dout(7) << "apply_transaction fail on _setattr" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_SETATTRS: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); + if (_setattrs(oid, *pattrset) < 0) { + dout(7) << "apply_transaction fail on _setattrs" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_RMATTR: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + if (_rmattr(oid, attrname) < 0) { + dout(7) << "apply_transaction fail on _rmattr" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_CLONE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + object_t noid = t.oids.front(); t.oids.pop_front(); + if (_clone(oid, noid) < 0) { + dout(7) << "apply_transaction fail on _clone" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_MKCOLL: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + if (_create_collection(cid) < 0) { + dout(7) << "apply_transaction fail on _create_collection" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_RMCOLL: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + if (_destroy_collection(cid) < 0) { + dout(7) << "apply_transaction fail on _destroy_collection" << endl; + r &= bit; 
+ } + } + break; + + case Transaction::OP_COLL_ADD: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + object_t oid = t.oids.front(); t.oids.pop_front(); + if (_collection_add(cid, oid) < 0) { + //dout(7) << "apply_transaction fail on _collection_add" << endl; + //r &= bit; + } + } + break; + + case Transaction::OP_COLL_REMOVE: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + object_t oid = t.oids.front(); t.oids.pop_front(); + if (_collection_remove(cid, oid) < 0) { + dout(7) << "apply_transaction fail on _collection_remove" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_COLL_SETATTR: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); + bufferlist bl; + bl.claim( t.attrbls.front() ); + t.attrbls.pop_front(); + if (_collection_setattr(cid, attrname, bl.c_str(), bl.length()) < 0) { + //if (_collection_setattr(cid, attrname, attrval.first, attrval.second) < 0) { + dout(7) << "apply_transaction fail on _collection_setattr" << endl; + r &= bit; + } + } + break; + + case Transaction::OP_COLL_RMATTR: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + if (_collection_rmattr(cid, attrname) < 0) { + dout(7) << "apply_transaction fail on _collection_rmattr" << endl; + r &= bit; + } + } + break; + + default: + cerr << "bad op " << *p << endl; + assert(0); + } + + bit = bit << 1; + } + + dout(7) << "apply_transaction finish (r = " << r << ")" << endl; + + // set up commit waiter + //if (r == 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + //} else { + //if (onsafe) delete onsafe; + //} + + ebofs_lock.Unlock(); + return r; +} + + + +int Ebofs::_write(object_t oid, off_t offset, size_t length, bufferlist& bl) +{ + dout(7) << "_write " << oid << " " << offset << "~" << length << endl; + assert(bl.length() == length); + 
+ // too much unflushed dirty data? (if so, block!) + if (_write_will_block()) { + dout(10) << "_write blocking " + << oid << " " << offset << "~" << length + << " bc: " + << "size " << bc.get_size() + << ", trimmable " << bc.get_trimmable() + << ", max " << g_conf.ebofs_bc_size + << "; dirty " << bc.get_stat_dirty() + << ", tx " << bc.get_stat_tx() + << ", max dirty " << g_conf.ebofs_bc_max_dirty + << endl; + + while (_write_will_block()) + bc.waitfor_stat(); // waits on ebofs_lock + + dout(10) << "_write unblocked " + << oid << " " << offset << "~" << length + << " bc: " + << "size " << bc.get_size() + << ", trimmable " << bc.get_trimmable() + << ", max " << g_conf.ebofs_bc_size + << "; dirty " << bc.get_stat_dirty() + << ", tx " << bc.get_stat_tx() + << ", max dirty " << g_conf.ebofs_bc_max_dirty + << endl; + } + + // out of space? + unsigned max = (length+offset) / EBOFS_BLOCK_SIZE + 10; // very conservative; assumes we have to rewrite + max += dirty_onodes.size() + dirty_cnodes.size(); + if (max >= free_blocks) { + dout(1) << "write failing, only " << free_blocks << " blocks free, may need up to " << max << endl; + return -ENOSPC; + } + + // get|create inode + Onode *on = get_onode(oid); + if (!on) on = new_onode(oid); // new inode! + if (on->readonly) { + put_onode(on); + return -EACCES; + } + + dirty_onode(on); // dirty onode! + + // apply write to buffer cache + if (length > 0) + apply_write(on, offset, length, bl); + + // done. + put_onode(on); + trim_bc(); + + return length; +} + + +/*int Ebofs::write(object_t oid, + off_t off, size_t len, + bufferlist& bl, bool fsync) +{ + // wait? + if (fsync) { + // wait for flush. 
+ Cond cond; + bool done; + int flush = 1; // write never returns positive + Context *c = new C_Cond(&cond, &done, &flush); + int r = write(oid, off, len, bl, c); + if (r < 0) return r; + + ebofs_lock.Lock(); + { + while (!done) + cond.Wait(ebofs_lock); + assert(flush <= 0); + } + ebofs_lock.Unlock(); + if (flush < 0) return flush; + return r; + } else { + // don't wait for flush. + return write(oid, off, len, bl, (Context*)0); + } +} +*/ + +int Ebofs::write(object_t oid, + off_t off, size_t len, + bufferlist& bl, Context *onsafe) +{ + ebofs_lock.Lock(); + assert(len > 0); + + // go + int r = _write(oid, off, len, bl); + + // commit waiter + if (r > 0) { + assert((size_t)r == len); + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + + +int Ebofs::_remove(object_t oid) +{ + dout(7) << "_remove " << oid << endl; + + // get inode + Onode *on = get_onode(oid); + if (!on) return -ENOENT; + + // ok remove it! 
+ remove_onode(on); + + return 0; +} + + +int Ebofs::remove(object_t oid, Context *onsafe) +{ + ebofs_lock.Lock(); + + // do it + int r = _remove(oid); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + +int Ebofs::_truncate(object_t oid, off_t size) +{ + dout(7) << "_truncate " << oid << " size " << size << endl; + + Onode *on = get_onode(oid); + if (!on) + return -ENOENT; + if (on->readonly) { + put_onode(on); + return -EACCES; + } + + int r = 0; + if (size > on->object_size) { + r = -EINVAL; // whatever + } + else if (size < on->object_size) { + // change size + on->object_size = size; + dirty_onode(on); + + // free blocks + block_t nblocks = 0; + if (size) nblocks = 1 + (size-1) / EBOFS_BLOCK_SIZE; + if (on->object_blocks > nblocks) { + vector extra; + on->truncate_extents(nblocks, extra); + for (unsigned i=0; ioc) { + on->oc->truncate(on->object_blocks, super_epoch); + if (on->oc->is_empty()) + on->close_oc(); + } + + // update uncommitted + interval_set uncom; + if (nblocks > 0) { + interval_set left; + left.insert(0, nblocks); + uncom.intersection_of(left, on->uncommitted); + } + dout(10) << "uncommitted was " << on->uncommitted << " now " << uncom << endl; + on->uncommitted = uncom; + + } + else { + assert(size == on->object_size); + } + + put_onode(on); + return r; +} + + +int Ebofs::truncate(object_t oid, off_t size, Context *onsafe) +{ + ebofs_lock.Lock(); + + int r = _truncate(oid, size); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + + + +int Ebofs::clone(object_t from, object_t to, Context *onsafe) +{ + ebofs_lock.Lock(); + + int r = _clone(from, to); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if 
(onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + +int Ebofs::_clone(object_t from, object_t to) +{ + dout(7) << "_clone " << from << " -> " << to << endl; + + if (!g_conf.ebofs_cloneable) + return -1; // no! + + Onode *fon = get_onode(from); + if (!fon) return -ENOENT; + Onode *ton = get_onode(to); + if (ton) { + put_onode(fon); + put_onode(ton); + return -EEXIST; + } + ton = new_onode(to); + assert(ton); + + // copy easy bits + ton->readonly = true; + ton->object_size = fon->object_size; + ton->object_blocks = fon->object_blocks; + ton->attr = fon->attr; + + // collections + for (set::iterator p = fon->collections.begin(); + p != fon->collections.end(); + p++) + _collection_add(*p, to); + + // extents + ton->extent_map = fon->extent_map; + for (map::iterator p = ton->extent_map.begin(); + p != ton->extent_map.end(); + ++p) { + allocator.alloc_inc(p->second); + } + + // clear uncommitted + fon->uncommitted.clear(); + + // muck with ObjectCache + if (fon->oc) + fon->oc->clone_to( ton ); + + // ok! + put_onode(ton); + put_onode(fon); + return 0; +} + + + + +/* + * pick object revision with rev < specified rev. + * (oid.rev is a noninclusive upper bound.) + * + */ +int Ebofs::pick_object_revision_lt(object_t& oid) +{ + assert(oid.rev > 0); // this is only useful for non-zero oid.rev + + int r = -EEXIST; // return code + ebofs_lock.Lock(); + { + object_t orig = oid; + object_t live = oid; + live.rev = 0; + + if (object_tab->get_num_keys() > 0) { + Table::Cursor cursor(object_tab); + + object_tab->find(oid, cursor); // this will be just _past_ highest eligible rev + if (cursor.move_left() > 0) { + bool firstpass = true; + while (1) { + object_t t = cursor.current().key; + if (t.ino != oid.ino || + t.bno != oid.bno) // passed to previous object + break; + if (oid.rev < t.rev) { // rev < desired. possible match. + r = 0; + oid = t; + break; + } + if (firstpass && oid.rev >= t.rev) { // there is no old rev < desired. try live. 
+ r = 0; + oid = live; + break; + } + if (cursor.move_left() <= 0) break; + firstpass = false; + } + } + } + + dout(8) << "find_object_revision " << orig << " -> " << oid + << " r=" << r << endl; + } + ebofs_lock.Unlock(); + return r; +} + + + + +bool Ebofs::exists(object_t oid) +{ + ebofs_lock.Lock(); + dout(8) << "exists " << oid << endl; + bool e = (object_tab->lookup(oid) == 0); + ebofs_lock.Unlock(); + return e; +} + +int Ebofs::stat(object_t oid, struct stat *st) +{ + ebofs_lock.Lock(); + int r = _stat(oid,st); + ebofs_lock.Unlock(); + return r; +} + +int Ebofs::_stat(object_t oid, struct stat *st) +{ + dout(7) << "_stat " << oid << endl; + + Onode *on = get_onode(oid); + if (!on) return -ENOENT; + + // ?? + st->st_size = on->object_size; + + put_onode(on); + return 0; +} + + +int Ebofs::_setattr(object_t oid, const char *name, const void *value, size_t size) +{ + dout(8) << "setattr " << oid << " '" << name << "' len " << size << endl; + + Onode *on = get_onode(oid); + if (!on) return -ENOENT; + if (on->readonly) { + put_onode(on); + return -EACCES; + } + + string n(name); + on->attr[n] = buffer::copy((char*)value, size); + dirty_onode(on); + put_onode(on); + + dout(8) << "setattr " << oid << " '" << name << "' len " << size << " success" << endl; + + return 0; +} + +int Ebofs::setattr(object_t oid, const char *name, const void *value, size_t size, Context *onsafe) +{ + ebofs_lock.Lock(); + int r = _setattr(oid, name, value, size); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + +int Ebofs::_setattrs(object_t oid, map& attrset) +{ + dout(8) << "setattrs " << oid << endl; + + Onode *on = get_onode(oid); + if (!on) return -ENOENT; + if (on->readonly) { + put_onode(on); + return -EACCES; + } + + on->attr = attrset; + dirty_onode(on); + put_onode(on); + return 0; +} + +int Ebofs::setattrs(object_t oid, map& attrset, 
Context *onsafe) +{ + ebofs_lock.Lock(); + int r = _setattrs(oid, attrset); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + /* getattr: locking wrapper around _getattr(). */ +int Ebofs::getattr(object_t oid, const char *name, void *value, size_t size) +{ + ebofs_lock.Lock(); + int r = _getattr(oid, name, value, size); + ebofs_lock.Unlock(); + return r; +} + /* _getattr: copy attribute `name` into value, truncated to at most `size` bytes. Returns the number of bytes copied, -1 when the attribute is not set, or -ENOENT when the object has no onode. */ +int Ebofs::_getattr(object_t oid, const char *name, void *value, size_t size) +{ + dout(8) << "_getattr " << oid << " '" << name << "' maxlen " << size << endl; + + Onode *on = get_onode(oid); + if (!on) return -ENOENT; + + string n(name); + int r = 0; + if (on->attr.count(n) == 0) { + dout(10) << "_getattr " << oid << " '" << name << "' dne" << endl; + r = -1; + } else { + r = MIN( on->attr[n].length(), size ); + dout(10) << "_getattr " << oid << " '" << name << "' got len " << r << endl; + memcpy(value, on->attr[n].c_str(), r ); + } + put_onode(on); + return r; +} + /* getattrs: locking wrapper around _getattrs(). */ +int Ebofs::getattrs(object_t oid, map &aset) +{ + ebofs_lock.Lock(); + int r = _getattrs(oid, aset); + ebofs_lock.Unlock(); + return r; +} + /* _getattrs: overwrite aset with a copy of the object's attribute map. Returns -ENOENT when the object has no onode, 0 otherwise. */ +int Ebofs::_getattrs(object_t oid, map &aset) +{ + dout(8) << "_getattrs " << oid << endl; + + Onode *on = get_onode(oid); + if (!on) return -ENOENT; + aset = on->attr; + put_onode(on); + return 0; +} + + + /* _rmattr: erase attribute `name` from the object and mark the onode dirty. Erasing a name that is not set still returns 0. Returns -ENOENT when the object has no onode and -EACCES when the onode is readonly. */ +int Ebofs::_rmattr(object_t oid, const char *name) +{ + dout(8) << "_rmattr " << oid << " '" << name << "'" << endl; + + Onode *on = get_onode(oid); + if (!on) return -ENOENT; + if (on->readonly) { + put_onode(on); + return -EACCES; + } + + string n(name); + on->attr.erase(n); + dirty_onode(on); + put_onode(on); + return 0; +} + /* rmattr: locking wrapper around _rmattr(). On success a non-null onsafe is queued on commit_waiters[super_epoch]; on failure it is deleted here. */ +int Ebofs::rmattr(object_t oid, const char *name, Context *onsafe) +{ + ebofs_lock.Lock(); + + int r = _rmattr(oid, name); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + 
ebofs_lock.Unlock(); + return r; +} + /* listattr: replace attrs with the names of all attributes set on the object. Takes ebofs_lock. Returns -ENOENT when the object has no onode, 0 otherwise. */ +int Ebofs::listattr(object_t oid, vector& attrs) +{ + ebofs_lock.Lock(); + dout(8) << "listattr " << oid << endl; + + Onode *on = get_onode(oid); + if (!on) { + ebofs_lock.Unlock(); + return -ENOENT; + } + + attrs.clear(); + for (map::iterator i = on->attr.begin(); + i != on->attr.end(); + i++) { + attrs.push_back(i->first); + } + + put_onode(on); + ebofs_lock.Unlock(); + return 0; +} + + + +/***************** collections ******************/ + /* list_collections: append every key of collection_tab to ls and return how many were appended. Takes ebofs_lock. The scan stops as soon as move_right() reports 0 or less. */ +int Ebofs::list_collections(list& ls) +{ + ebofs_lock.Lock(); + dout(9) << "list_collections " << endl; + + Table::Cursor cursor(collection_tab); + + int num = 0; + if (collection_tab->find(0, cursor) >= 0) { + while (1) { + ls.push_back(cursor.current().key); + num++; + if (cursor.move_right() <= 0) break; + } + } + + ebofs_lock.Unlock(); + return num; +} + /* _create_collection: create a cnode for cid. Returns -EEXIST when the collection is already present, 0 otherwise. */ +int Ebofs::_create_collection(coll_t cid) +{ + dout(9) << "_create_collection " << hex << cid << dec << endl; + + if (_collection_exists(cid)) + return -EEXIST; + + Cnode *cn = new_cnode(cid); + put_cnode(cn); + + return 0; +} + /* create_collection: locking wrapper around _create_collection(). On success a non-null onsafe is queued on commit_waiters[super_epoch]; on failure it is deleted here. */ +int Ebofs::create_collection(coll_t cid, Context *onsafe) +{ + ebofs_lock.Lock(); + + int r = _create_collection(cid); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + /* _destroy_collection: remove collection cid. Every member object has its co_tab entry removed and, when its onode can be loaded, cid erased from the onode's collection set; the cnode is then removed. Returns -ENOENT when the collection does not exist, 0 otherwise. NOTE(review): this calls the public collection_list(), which locks ebofs_lock itself, while the destroy_collection() wrapper already holds that lock - this presumably relies on the Mutex being recursive; confirm. */ +int Ebofs::_destroy_collection(coll_t cid) +{ + dout(9) << "_destroy_collection " << hex << cid << dec << endl; + + if (!_collection_exists(cid)) + return -ENOENT; + + Cnode *cn = get_cnode(cid); + assert(cn); + + // hose mappings + list objects; + collection_list(cid, objects); + for (list::iterator i = objects.begin(); + i != objects.end(); + i++) { + co_tab->remove(coll_object_t(cid,*i)); + + Onode *on = get_onode(*i); + if (on) { + on->collections.erase(cid); + dirty_onode(on); + put_onode(on); + } + } + + remove_cnode(cn); + return 0; +} + +int Ebofs::destroy_collection(coll_t cid, Context 
*onsafe) +{ + ebofs_lock.Lock(); + + int r = _destroy_collection(cid); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + /* collection_exists: locking wrapper around _collection_exists(). */ +bool Ebofs::collection_exists(coll_t cid) +{ + ebofs_lock.Lock(); + dout(10) << "collection_exists " << hex << cid << dec << endl; + bool r = _collection_exists(cid); + ebofs_lock.Unlock(); + return r; +} +bool Ebofs::_collection_exists(coll_t cid) +{ /* present when collection_tab->lookup(cid) returns 0 */ + return (collection_tab->lookup(cid) == 0); +} + /* _collection_add: record object oid as a member of collection cid, both in the onode's collection set and in co_tab, and mark the onode dirty. Returns -ENOENT when the collection or the object does not exist, and also (see FIXME below) when the object is already a member; 0 on success. */ +int Ebofs::_collection_add(coll_t cid, object_t oid) +{ + dout(9) << "_collection_add " << hex << cid << " object " << oid << dec << endl; + + if (!_collection_exists(cid)) + return -ENOENT; + + Onode *on = get_onode(oid); + if (!on) return -ENOENT; + + int r = 0; + + if (on->collections.count(cid) == 0) { + on->collections.insert(cid); + dirty_onode(on); + co_tab->insert(coll_object_t(cid,oid), true); + } else { + r = -ENOENT; // FIXME? already in collection. + } + + put_onode(on); + return r; +} + /* collection_add: locking wrapper around _collection_add(). On success a non-null onsafe is queued on commit_waiters[super_epoch]; on failure it is deleted here and will never fire. The _collection_add() result is returned so the caller can tell the two cases apart, matching create_collection() and destroy_collection(). */ +int Ebofs::collection_add(coll_t cid, object_t oid, Context *onsafe) +{ + ebofs_lock.Lock(); + + int r = _collection_add(cid, oid); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + /* _collection_remove: drop object oid from collection cid, in both the onode's collection set and co_tab, and mark the onode dirty. Returns -ENOENT when the collection or object does not exist or the object is not a member; 0 on success. */ +int Ebofs::_collection_remove(coll_t cid, object_t oid) +{ + dout(9) << "_collection_remove " << hex << cid << " object " << oid << dec << endl; + + if (!_collection_exists(cid)) + return -ENOENT; + + Onode *on = get_onode(oid); + if (!on) return -ENOENT; + + int r = 0; + + if (on->collections.count(cid)) { + on->collections.erase(cid); + dirty_onode(on); + co_tab->remove(coll_object_t(cid,oid)); + } else { + r = -ENOENT; // FIXME? 
+ } + + put_onode(on); + return r; +} + /* collection_remove: locking wrapper around _collection_remove(). On success a non-null onsafe is queued on commit_waiters[super_epoch]; on failure it is deleted here and will never fire. The _collection_remove() result is returned so the caller can tell the two cases apart. */ +int Ebofs::collection_remove(coll_t cid, object_t oid, Context *onsafe) +{ + ebofs_lock.Lock(); + + int r = _collection_remove(cid, oid); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + /* collection_list: append the objects of collection cid to ls and return how many were appended, or -ENOENT when the collection does not exist. Takes ebofs_lock. The scan starts at the first co_tab key for cid and ends at the first key of another collection or at the end of the table. move_right() returns 0 when it steps past the last item, so the loop must stop on 0 as well as on a negative result, as the other table scans in this file do. */ +int Ebofs::collection_list(coll_t cid, list& ls) +{ + ebofs_lock.Lock(); + dout(9) << "collection_list " << hex << cid << dec << endl; + + if (!_collection_exists(cid)) { + ebofs_lock.Unlock(); + return -ENOENT; + } + + Table::Cursor cursor(co_tab); + + int num = 0; + if (co_tab->find(coll_object_t(cid,object_t()), cursor) >= 0) { + while (1) { + const coll_t c = cursor.current().key.first; + const object_t o = cursor.current().key.second; + if (c != cid) break; // end! + dout(10) << "collection_list " << hex << cid << " includes " << o << dec << endl; + ls.push_back(o); + num++; + if (cursor.move_right() <= 0) break; + } + } + + ebofs_lock.Unlock(); + return num; +} + + /* _collection_setattr: store a copy of value[0..size) as attribute `name` on collection cid and mark the cnode dirty. Returns -ENOENT when the collection has no cnode, 0 otherwise. */ +int Ebofs::_collection_setattr(coll_t cid, const char *name, const void *value, size_t size) +{ + dout(10) << "_collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl; + + Cnode *cn = get_cnode(cid); + if (!cn) return -ENOENT; + + string n(name); + cn->attr[n] = buffer::copy((char*)value, size); + dirty_cnode(cn); + put_cnode(cn); + + return 0; +} + /* collection_setattr: locking wrapper around _collection_setattr(). On success a non-null onsafe is queued on commit_waiters[super_epoch]; on failure it is deleted here and will never fire. The _collection_setattr() result is returned so the caller can tell the two cases apart. */ +int Ebofs::collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe) +{ + ebofs_lock.Lock(); + dout(10) << "collection_setattr " << hex << cid << dec << " '" << name << "' len " << size << endl; + + int r = _collection_setattr(cid, name, value, size); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + +int Ebofs::collection_getattr(coll_t cid, const char *name, void *value, size_t 
size) +{ /* collection_getattr: copy attribute `name` of collection cid into value, truncated to at most `size` bytes. Returns the number of bytes copied, -1 when the attribute is not set, or -ENOENT when the collection has no cnode. Takes ebofs_lock. */ + ebofs_lock.Lock(); + dout(10) << "collection_getattr " << hex << cid << dec << " '" << name << "' maxlen " << size << endl; + + Cnode *cn = get_cnode(cid); + if (!cn) { + ebofs_lock.Unlock(); + return -ENOENT; + } + + string n(name); + int r; + if (cn->attr.count(n) == 0) { + r = -1; + } else { + r = MIN( cn->attr[n].length(), size ); + memcpy(value, cn->attr[n].c_str(), r); + } + + put_cnode(cn); + ebofs_lock.Unlock(); + return r; +} + /* _collection_rmattr: erase attribute `name` from collection cid and mark the cnode dirty. Erasing a name that is not set still returns 0. Returns -ENOENT when the collection has no cnode. */ +int Ebofs::_collection_rmattr(coll_t cid, const char *name) +{ + dout(10) << "_collection_rmattr " << hex << cid << dec << " '" << name << "'" << endl; + + Cnode *cn = get_cnode(cid); + if (!cn) return -ENOENT; + + string n(name); + cn->attr.erase(n); + + dirty_cnode(cn); + put_cnode(cn); + + return 0; +} + /* collection_rmattr: locking wrapper around _collection_rmattr(). On success a non-null onsafe is queued on commit_waiters[super_epoch]; on failure it is deleted here and will never fire. The _collection_rmattr() result is returned so the caller can tell the two cases apart. */ +int Ebofs::collection_rmattr(coll_t cid, const char *name, Context *onsafe) +{ + ebofs_lock.Lock(); + + int r = _collection_rmattr(cid, name); + + // set up commit waiter + if (r >= 0) { + if (onsafe) commit_waiters[super_epoch].push_back(onsafe); + } else { + if (onsafe) delete onsafe; + } + + ebofs_lock.Unlock(); + return r; +} + /* collection_listattr: replace attrs with the names of all attributes set on collection cid. Takes ebofs_lock. Returns -ENOENT when the collection has no cnode, 0 otherwise. */ +int Ebofs::collection_listattr(coll_t cid, vector& attrs) +{ + ebofs_lock.Lock(); + dout(10) << "collection_listattr " << hex << cid << dec << endl; + + Cnode *cn = get_cnode(cid); + if (!cn) { + ebofs_lock.Unlock(); + return -ENOENT; + } + + attrs.clear(); + for (map::iterator i = cn->attr.begin(); + i != cn->attr.end(); + i++) { + attrs.push_back(i->first); + } + + put_cnode(cn); + ebofs_lock.Unlock(); + return 0; +} + + + /* _export_freelist: append every extent held in the free tables and in the limbo table to bl as raw Extent structs. Index EBOFS_NUM_FREE_BUCKETS in the loop stands for the limbo table. */ +void Ebofs::_export_freelist(bufferlist& bl) +{ + for (int b=0; b<=EBOFS_NUM_FREE_BUCKETS; b++) { + Table *tab; + if (b < EBOFS_NUM_FREE_BUCKETS) { + tab = free_tab[b]; + } else { + tab = limbo_tab; + } + + if (tab->get_num_keys() > 0) { + Table::Cursor cursor(tab); + int found = tab->find(0, cursor); /* keep the lookup outside assert() so the cursor is still positioned when NDEBUG removes asserts */ + assert(found >= 0); + (void)found; + while (1) { + assert(cursor.current().value > 0); + + Extent ex(cursor.current().key, cursor.current().value); + dout(10) << "_export_freelist " << ex << endl; + 
bl.append((char*)&ex, sizeof(ex)); + if (cursor.move_right() <= 0) break; + } + } + } +} + +void Ebofs::_import_freelist(bufferlist& bl) +{ + // clear + for (int b=0; bclear(); + limbo_tab->clear(); + + // import! + int num = bl.length() / sizeof(Extent); + Extent *p = (Extent*)bl.c_str(); + for (int i=0; i *tab; + if (b < EBOFS_NUM_FREE_BUCKETS) { + tab = free_tab[b]; + dout(30) << "dump bucket " << b << " " << tab->get_num_keys() << endl; + } else { + tab = limbo_tab; + dout(30) << "dump limbo " << tab->get_num_keys() << endl;; + } + + if (tab->get_num_keys() > 0) { + Table::Cursor cursor(tab); + assert(tab->find(0, cursor) >= 0); + while (1) { + assert(cursor.current().value > 0); + + block_t l = cursor.current().value; + tfree += l; + int b = 0; + do { + l = l >> 1; + b++; + } while (l); + st.free_extent_dist[b]++; + st.free_extent_dist_sum[b] += cursor.current().value; + st.num_free_extent++; + + if (cursor.move_right() <= 0) break; + } + } + } + st.avg_free_extent = tfree / st.num_free_extent; +*/ + + // used extents is harder. 
:( + st.num_extent = 0; + st.avg_extent = 0; + st.extent_dist.clear(); + st.extent_dist_sum.clear(); + st.avg_extent_per_object = 0; + st.avg_extent_jump = 0; + + Table::Cursor cursor(object_tab); + object_tab->find(object_t(), cursor); + int nobj = 0; + int njump = 0; + while (object_tab->get_num_keys() > 0) { + Onode *on = get_onode(cursor.current().key); + assert(on); + + nobj++; + st.avg_extent_per_object += on->extent_map.size(); + + for (map::iterator p = on->extent_map.begin(); + p != on->extent_map.end(); + p++) { + block_t l = p->second.length; + + st.num_extent++; + st.avg_extent += l; + if (p->first > 0) { + njump++; + st.avg_extent_jump += l; + } + + int b = 0; + do { + l = l >> 1; + b++; + } while (l); + st.extent_dist[b]++; + st.extent_dist_sum[b] += p->second.length; + } + put_onode(on); + if (cursor.move_right() <= 0) break; + } + if (njump) st.avg_extent_jump /= njump; + if (nobj) st.avg_extent_per_object /= (float)nobj; + if (st.num_extent) st.avg_extent /= st.num_extent; + + ebofs_lock.Unlock(); +} diff --git a/branches/sage/cephmds2/ebofs/Ebofs.h b/branches/sage/cephmds2/ebofs/Ebofs.h new file mode 100644 index 0000000000000..a8efe3b6a6b4c --- /dev/null +++ b/branches/sage/cephmds2/ebofs/Ebofs.h @@ -0,0 +1,323 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include +using namespace std; + +#include +using namespace __gnu_cxx; + +#include "include/Context.h" +#include "include/buffer.h" + +template +inline ostream& operator<<(ostream& out, const pair& p) { + return out << p.first << "," << p.second; +} + +#include "types.h" +#include "Onode.h" +#include "Cnode.h" +#include "BlockDevice.h" +#include "nodes.h" +#include "Allocator.h" +#include "Table.h" + +#include "common/Mutex.h" +#include "common/Cond.h" + +#include "osd/ObjectStore.h" + +//typedef pair object_coll_t; +typedef pair coll_object_t; + + +class Ebofs : public ObjectStore { + protected: + Mutex ebofs_lock; // a beautiful global lock + + // ** debuggy ** + bool fake_writes; + + // ** super ** + BlockDevice dev; + bool mounted, unmounting, dirty; + bool readonly; + version_t super_epoch; + bool commit_thread_started, mid_commit; + Cond commit_cond; // to wake up the commit thread + Cond sync_cond; + + map > commit_waiters; + + void prepare_super(version_t epoch, bufferptr& bp); + void write_super(version_t epoch, bufferptr& bp); + int commit_thread_entry(); + + class CommitThread : public Thread { + Ebofs *ebofs; + public: + CommitThread(Ebofs *e) : ebofs(e) {} + void *entry() { + ebofs->commit_thread_entry(); + return 0; + } + } commit_thread; + + + + + // ** allocator ** + block_t free_blocks, limbo_blocks; + Allocator allocator; + friend class Allocator; + + block_t get_free_blocks() { return free_blocks; } + block_t get_limbo_blocks() { return limbo_blocks; } + block_t get_free_extents() { + int n = 0; + for (int i=0; iget_num_keys(); + return n; + } + block_t get_limbo_extents() { return limbo_tab->get_num_keys(); } + + + // ** tables and sets ** + // nodes + NodePool nodepool; // for all tables... 
+ + // tables + Table *object_tab; + Table *free_tab[EBOFS_NUM_FREE_BUCKETS]; + Table *limbo_tab; + Table > *alloc_tab; + + // collections + Table *collection_tab; + Table *co_tab; + + void close_tables(); + + + // ** onodes ** + hash_map onode_map; // onode cache + LRU onode_lru; + set dirty_onodes; + map > waitfor_onode; + + Onode* new_onode(object_t oid); // make new onode. ref++. + Onode* get_onode(object_t oid); // get cached onode, or read from disk. ref++. + void remove_onode(Onode *on); + void put_onode(Onode* o); // put it back down. ref--. + void dirty_onode(Onode* o); + void encode_onode(Onode *on, bufferlist& bl, unsigned& off); + void write_onode(Onode *on); + + // ** cnodes ** + hash_map cnode_map; + LRU cnode_lru; + set dirty_cnodes; + map > waitfor_cnode; + + Cnode* new_cnode(coll_t cid); + Cnode* get_cnode(coll_t cid); + void remove_cnode(Cnode *cn); + void put_cnode(Cnode *cn); + void dirty_cnode(Cnode *cn); + void encode_cnode(Cnode *cn, bufferlist& bl, unsigned& off); + void write_cnode(Cnode *cn); + + // ** onodes+cnodes = inodes ** + int inodes_flushing; + Cond inode_commit_cond; + + void flush_inode_finish(); + void commit_inodes_start(); + void commit_inodes_wait(); + friend class C_E_InodeFlush; + + void trim_inodes(int max = -1); + + // ** buffer cache ** + BufferCache bc; + pthread_t flushd_thread_id; + + version_t trigger_commit(); + void commit_bc_wait(version_t epoch); + void trim_bc(off_t max = -1); + + public: + void kick_idle(); + void sync(); + void sync(Context *onsafe); + void trim_buffer_cache(); + + class IdleKicker : public BlockDevice::kicker { + Ebofs *ebo; + public: + IdleKicker(Ebofs *t) : ebo(t) {} + void kick() { ebo->kick_idle(); } + } idle_kicker; + + + protected: + //void zero(Onode *on, size_t len, off_t off, off_t write_thru); + void alloc_write(Onode *on, + block_t start, block_t len, + interval_set& alloc, + block_t& old_bfirst, block_t& old_blast); + void apply_write(Onode *on, off_t off, size_t len, bufferlist& 
bl); + bool attempt_read(Onode *on, off_t off, size_t len, bufferlist& bl, + Cond *will_wait_on, bool *will_wait_on_bool); + + // ** finisher ** + // async write notification to users + Mutex finisher_lock; + Cond finisher_cond; + bool finisher_stop; + list finisher_queue; + + void *finisher_thread_entry(); + class FinisherThread : public Thread { + Ebofs *ebofs; + public: + FinisherThread(Ebofs *e) : ebofs(e) {} + void* entry() { return (void*)ebofs->finisher_thread_entry(); } + } finisher_thread; + + + void alloc_more_node_space(); + + void do_csetattrs(map > > &cmods); + void do_setattrs(Onode *on, map > &setattrs); + + + public: + Ebofs(char *devfn) : + fake_writes(false), + dev(devfn), + mounted(false), unmounting(false), dirty(false), readonly(false), + super_epoch(0), commit_thread_started(false), mid_commit(false), + commit_thread(this), + free_blocks(0), limbo_blocks(0), + allocator(this), + nodepool(ebofs_lock), + object_tab(0), limbo_tab(0), collection_tab(0), co_tab(0), + onode_lru(g_conf.ebofs_oc_size), + cnode_lru(g_conf.ebofs_cc_size), + inodes_flushing(0), + bc(dev, ebofs_lock), + idle_kicker(this), + finisher_stop(false), finisher_thread(this) { + for (int i=0; i& attrset, Context *onsafe=0); + int getattr(object_t oid, const char *name, void *value, size_t size); + int getattrs(object_t oid, map &aset); + int rmattr(object_t oid, const char *name, Context *onsafe=0); + int listattr(object_t oid, vector& attrs); + + // collections + int list_collections(list& ls); + bool collection_exists(coll_t c); + + int create_collection(coll_t c, Context *onsafe); + int destroy_collection(coll_t c, Context *onsafe); + int collection_add(coll_t c, object_t o, Context *onsafe); + int collection_remove(coll_t c, object_t o, Context *onsafe); + + int collection_list(coll_t c, list& o); + + int collection_setattr(coll_t oid, const char *name, const void *value, size_t size, Context *onsafe); + int collection_getattr(coll_t oid, const char *name, void *value, size_t 
size); + int collection_rmattr(coll_t cid, const char *name, Context *onsafe); + int collection_listattr(coll_t oid, vector& attrs); + + // maps + int map_lookup(object_t o, bufferlist& key, bufferlist& val); + int map_insert(object_t o, bufferlist& key, bufferlist& val); + int map_remove(object_t o, bufferlist& key); + int map_list(object_t o, list& keys); + int map_list(object_t o, map& vals); + int map_list(object_t o, + bufferlist& start, bufferlist& end, + map& vals); + + // crap + void _fake_writes(bool b) { fake_writes = b; } + void _get_frag_stat(FragmentationStat& st); + + void _import_freelist(bufferlist& bl); + void _export_freelist(bufferlist& bl); + + +private: + // private interface -- use if caller already holds lock + int _read(object_t oid, off_t off, size_t len, bufferlist& bl); + int _stat(object_t oid, struct stat *st); + int _getattr(object_t oid, const char *name, void *value, size_t size); + int _getattrs(object_t oid, map &aset); + + bool _write_will_block(); + int _write(object_t oid, off_t off, size_t len, bufferlist& bl); + int _truncate(object_t oid, off_t size); + int _truncate_front(object_t oid, off_t size); + int _remove(object_t oid); + int _clone(object_t from, object_t to); + int _setattr(object_t oid, const char *name, const void *value, size_t size); + int _setattrs(object_t oid, map& attrset); + int _rmattr(object_t oid, const char *name); + bool _collection_exists(coll_t c); + int _create_collection(coll_t c); + int _destroy_collection(coll_t c); + int _collection_add(coll_t c, object_t o); + int _collection_remove(coll_t c, object_t o); + int _collection_setattr(coll_t oid, const char *name, const void *value, size_t size); + int _collection_rmattr(coll_t cid, const char *name); + + +}; diff --git a/branches/sage/cephmds2/ebofs/Onode.h b/branches/sage/cephmds2/ebofs/Onode.h new file mode 100644 index 0000000000000..233c97e7ae172 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/Onode.h @@ -0,0 +1,390 @@ +// -*- mode:C++; 
tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __EBOFS_ONODE_H +#define __EBOFS_ONODE_H + +#include "include/lru.h" + +#include "types.h" +#include "BufferCache.h" + +#include "include/interval_set.h" + + +/* + * object node (like an inode) + * + * holds object metadata, including + * size + * allocation (extent list) + * attributes + * + */ + +class Onode : public LRUObject { +private: + int ref; + +public: + object_t object_id; + version_t version; // incremented on each modify. + + // data + bool readonly; + Extent onode_loc; + off_t object_size; + unsigned object_blocks; + + // onode + set collections; + map attr; + //vector extents; + map extent_map; + + interval_set uncommitted; + + ObjectCache *oc; + + bool dirty; + bool dangling; // not in onode_map + bool deleted; // deleted + + list commit_waiters; + + public: + Onode(object_t oid) : ref(0), object_id(oid), version(0), + readonly(false), + object_size(0), object_blocks(0), oc(0), + dirty(false), dangling(false), deleted(false) { + onode_loc.length = 0; + } + ~Onode() { + if (oc) delete oc; + } + + block_t get_onode_id() { return onode_loc.start; } + int get_onode_len() { return onode_loc.length; } + + int get_ref_count() { return ref; } + void get() { + if (ref == 0) lru_pin(); + ref++; + //cout << "ebofs.onode.get " << hex << object_id << dec << " " << ref << endl; + } + void put() { + ref--; + if (ref == 0) lru_unpin(); + //cout << "ebofs.onode.put " << hex << object_id << dec << " " << ref << endl; + } + + void mark_dirty() { + if (!dirty) { + dirty = true; + get(); + } + } + void mark_clean() { + if (dirty) { + dirty = false; + put(); + } + } + bool is_dirty() { return 
dirty; } + bool is_deleted() { return deleted; } + bool is_dangling() { return dangling; } + + + bool have_oc() { + return oc != 0; + } + ObjectCache *get_oc(BufferCache *bc) { + if (!oc) { + oc = new ObjectCache(object_id, this, bc); + oc->get(); + get(); + } + return oc; + } + void close_oc() { + if (oc) { + //cout << "close_oc on " << object_id << endl; + assert(oc->is_empty()); + if (oc->put() == 0){ + //cout << "************************* hosing oc" << endl; + delete oc; + } + oc = 0; + put(); + } + } + + + // allocation + void verify_extents() { + if (0) { // do crazy stupid sanity checking + block_t count = 0; + interval_set is; + + set s; + cout << "verifying" << endl; + + for (map::iterator p = extent_map.begin(); + p != extent_map.end(); + p++) { + cout << " " << p->first << ": " << p->second << endl; + assert(count == p->first); + count += p->second.length; + for (unsigned j=0;jsecond.length;j++) { + assert(s.count(p->second.start+j) == 0); + s.insert(p->second.start+j); + } + } + + assert(s.size() == count); + assert(count == object_blocks); + } + } + void set_extent(block_t offset, Extent ex) { + //cout << "set_extent " << offset << " -> " << ex << " ... " << object_blocks << endl; + assert(offset <= object_blocks); + verify_extents(); + + // at the end? + if (offset == object_blocks) { + //cout << " appending " << ex << endl; + if (!extent_map.empty() && extent_map.rbegin()->second.end() == ex.start) { + //cout << "appending " << ex << " to " << extent_map.rbegin()->second << endl; + extent_map.rbegin()->second.length += ex.length; + } else + extent_map[object_blocks] = ex; + object_blocks += ex.length; + return; + } + + // removing any extent bits we overwrite + if (!extent_map.empty()) { + // preceeding extent? 
+ map::iterator p = extent_map.lower_bound(offset); + if (p != extent_map.begin()) { + p--; + if (p->first + p->second.length > offset) { + //cout << " preceeding was " << p->second << endl; + if (p->first + p->second.length > offset+ex.length) { + // cutting chunk out of middle, add last bit + Extent &n = extent_map[offset+ex.length] = p->second; + n.start += offset+ex.length - p->first; + n.length -= offset+ex.length - p->first; + //cout << " tail frag is " << n << endl; + } + p->second.length = offset - p->first; // cut tail off preceeding extent + //cout << " preceeding now " << p->second << endl; + } + p++; + } + + // overlapping extents + while (p != extent_map.end() && + p->first < offset + ex.length) { + map::iterator next = p; + next++; + + // completely subsumed? + if (p->first + p->second.length <= offset+ex.length) { + //cout << " erasing " << p->second << endl; + extent_map.erase(p); + p = next; + continue; + } + + // spans new extent, cut off head + Extent &n = extent_map[ offset+ex.length ] = p->second; + //cout << " cut head off " << p->second; + n.start += offset+ex.length - p->first; + n.length -= offset+ex.length - p->first; + extent_map.erase(p); + //cout << ", now " << n << endl; + break; + } + } + + extent_map[ offset ] = ex; + + // extend object? + if (offset + ex.length > object_blocks) + object_blocks = offset+ex.length; + + verify_extents(); + } + + + /* map_extents(start, len, ls) + * map teh given page range into extents on disk. 
+ */ /* Appends to ls the on-disk extents backing logical blocks start .. start+len, clipping the first and last extent to the requested range. Stops early when the extent map runs out. Always returns 0. */ + int map_extents(block_t start, block_t len, vector& ls) { + //cout << "map_extents " << start << " " << len << endl; + verify_extents(); + + //assert(start+len <= object_blocks); + + map::iterator p = extent_map.lower_bound(start); + if (p != extent_map.begin() && + (p == extent_map.end() || p->first > start && p->first)) { + p--; + if (p->second.length > start - p->first) { + Extent ex; + ex.start = p->second.start + (start - p->first); + ex.length = MIN(len, p->second.length - (start - p->first)); + ls.push_back(ex); + + //cout << " got (tail of?) " << p->second << " : " << ex << endl; + + start += ex.length; + len -= ex.length; + } + p++; + } + + while (len > 0 && + p != extent_map.end()) { + assert(p->first == start); + Extent ex = p->second; + ex.length = MIN(len, ex.length); + ls.push_back(ex); + //cout << " got (head of?) " << p->second << " : " << ex << endl; + start += ex.length; + len -= ex.length; + p++; + } + + return 0; + } + /* truncate_extents: cut the object down to len logical blocks. The extent straddling len is shortened, every later extent is erased, and the freed pieces are appended to extra. object_blocks becomes len. Always returns 0. */ + int truncate_extents(block_t len, vector& extra) { + verify_extents(); + + map::iterator p = extent_map.lower_bound(len); + if (p != extent_map.begin() && + (p == extent_map.end() || p->first > len && p->first)) { + p--; + if (p->second.length > len - p->first) { + Extent ex; + ex.start = p->second.start + (len - p->first); + ex.length = p->second.length - (len - p->first); + extra.push_back(ex); + + p->second.length = len - p->first; + assert(p->second.length > 0); + + //cout << " got (tail of?) " << p->second << " : " << ex << endl; + } + p++; + } + + while (p != extent_map.end()) { + assert(p->first >= len); + extra.push_back(p->second); + map::iterator n = p; + n++; + extent_map.erase(p); + p = n; + } + + object_blocks = len; + verify_extents(); + return 0; + } + /* truncate_front_extents: drop the first len blocks of the object. Whole leading extents are erased, a final partial one is trimmed in place, and the freed pieces are appended to extra. object_blocks is reduced by the number of blocks actually removed, which is counted separately because len is consumed by the loop. The loop also stops if the map runs out, so a len larger than the object no longer dereferences begin() of an empty map. Always returns 0. NOTE(review): keys of the surviving extents are not shifted down, so after a whole extent is removed the map no longer starts at offset 0 as verify_extents() expects - confirm whether callers rebase. */ + int truncate_front_extents(block_t len, vector& extra) { + verify_extents(); + + block_t removed = 0; /* blocks actually dropped from the front */ + while (len > 0 && !extent_map.empty()) { + Extent& ex = extent_map.begin()->second; // look, this is a reference! 
+ if (ex.length > len) { + // partial first extent + Extent frontbit( ex.start, len ); + extra.push_back(frontbit); + ex.length -= len; + ex.start += len; + removed += len; + break; + } + + // pull off entire first extent. + assert(ex.length <= len); + len -= ex.length; + removed += ex.length; + extra.push_back(ex); + extent_map.erase(extent_map.begin()); + } + + object_blocks -= removed; + verify_extents(); + return 0; + } + + + + /* map_alloc_regions(start, len, map) + * map range into regions that need to be (re)allocated on disk + * because they overlap "safe" (or unallocated) parts of the object + */ + /* + void map_alloc_regions(block_t start, block_t len, + interval_set& alloc) { + interval_set already_uncom; + + alloc.insert(start, len); // start with whole range + already_uncom.intersection_of(alloc, uncommitted); + alloc.subtract(already_uncom); // take out the bits that aren't yet committed + } + */ + + + + // pack/unpack + int get_collection_bytes() { + return sizeof(coll_t) * collections.size(); + } + int get_attr_bytes() { + int s = 0; + for (map::iterator i = attr.begin(); + i != attr.end(); + i++) { + s += i->first.length() + 1; + s += i->second.length() + sizeof(int); + } + return s; + } + int get_extent_bytes() { + return sizeof(Extent) * extent_map.size(); + } + +}; + + +inline ostream& operator<<(ostream& out, Onode& on) +{ + out << "onode(" << hex << on.object_id << dec << " len=" << on.object_size; + out << " ref=" << on.get_ref_count(); + if (on.is_dirty()) out << " dirty"; + if (on.is_dangling()) out << " dangling"; + if (on.is_deleted()) out << " deleted"; + out << " uncom=" << on.uncommitted; + // out << " " << &on; + out << ")"; + return out; +} + + + +#endif diff --git a/branches/sage/cephmds2/ebofs/Table.h b/branches/sage/cephmds2/ebofs/Table.h new file mode 100644 index 0000000000000..e6b3fb39660e4 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/Table.h @@ -0,0 +1,897 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed 
file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __EBOFS_TABLE_H +#define __EBOFS_TABLE_H + +#include "types.h" +#include "nodes.h" + +/** table **/ + +#define dbtout if (25 <= g_conf.debug_ebofs) cout << "ebofs.table(" << this << ")." + + +template +class Table { + private: + NodePool &pool; + + nodeid_t root; + int nkeys; + int depth; + + public: + Table(NodePool &p, + struct ebofs_table& bts) : + pool(p), + root(bts.root), nkeys(bts.num_keys), depth(bts.depth) { + dbtout << "cons" << endl; + } + + nodeid_t get_root() { return root; } + int get_num_keys() { return nkeys; } + int get_depth() { return depth; } + + + /* + */ + class _IndexItem { // i just need a struct size for below + K k; + nodeid_t n; + }; + class IndexItem { + public: + K key; + nodeid_t node; + static const int MAX = Node::ITEM_LEN / (sizeof(_IndexItem)); + static const int MIN = MAX/2; + }; + class _LeafItem { // i just need a struct size for below + K k; + V v; + }; + class LeafItem { + public: + K key; + V value; + static const int MAX = Node::ITEM_LEN / (sizeof(_LeafItem)); + static const int MIN = MAX/2; + }; + + class Nodeptr { + public: + Node *node; + + Nodeptr() : node(0) {} + Nodeptr(Node *n) : node(n) {} + Nodeptr& operator=(Node *n) { + node = n; + return *this; + } + + LeafItem& leaf_item(int i) { return (( LeafItem*)(node->item_ptr()))[i]; } + IndexItem& index_item(int i) { return ((IndexItem*)(node->item_ptr()))[i]; } + K key(int i) { + if (node->is_index()) + return index_item(i).key; + else + return leaf_item(i).key; + } + + bool is_leaf() { return node->is_leaf(); } + bool is_index() { return node->is_index(); } + void set_type(int t) { node->set_type(t); } + + int max_items() const { + if (node->is_leaf()) + return LeafItem::MAX; + 
else + return IndexItem::MAX; + } + int min_items() const { return max_items() / 2; } + + nodeid_t get_id() { return node->get_id(); } + + int size() { return node->size(); } + void set_size(int s) { node->set_size(s); } + + void remove_at_pos(int p) { + if (node->is_index()) { + for (int i=p; ip; i--) + leaf_item(i) = leaf_item(i-1); + leaf_item(p).key = key; + leaf_item(p).value = value; + set_size(size() + 1); + } + void insert_at_index_pos(int p, K key, nodeid_t node) { + assert(is_index()); + for (int i=size(); i>p; i--) + index_item(i) = index_item(i-1); + index_item(p).key = key; + index_item(p).node = node; + set_size(size() + 1); + } + + void append_item(LeafItem& i) { + leaf_item(size()) = i; + set_size(size() + 1); + } + void append_item(IndexItem& i) { + index_item(size()) = i; + set_size(size() + 1); + } + + void split(Nodeptr& right) { + if (node->is_index()) { + for (int i=min_items(); iis_index()) + for (int i=0; i open; // open nodes + vector pos; // position within the node + //Nodeptr open[20]; + //int pos[20]; + int level; + + Cursor(Table *t) : table(t), open(t->depth), pos(t->depth), level(0) {} + + public: + + const LeafItem& current() { + assert(open[level].is_leaf()); + return open[level].leaf_item(pos[level]); + } + V& dirty_current_value() { + assert(open[level].is_leaf()); + dirty(); + return open[level].leaf_item(pos[level]).value; + } + + // ** read-only bits ** + int move_left() { + if (table->depth == 0) return OOB; + + // work up around branch + int l; + for (l = level; l >= 0; l--) + if (pos[l] > 0) break; + if (l < 0) + return OOB; // we are the first item in the btree + + // move left one + pos[l]--; + + // work back down right side + for (; lpool.get_node( open[l].index_item(pos[l]).node ); + pos[l+1] = open[l+1].size() - 1; + } + return 1; + } + int move_right() { + if (table->depth == 0) return OOB; + + // work up branch + int l; + for (l=level; l>=0; l--) + if (pos[l] < open[l].size() - 1) break; + if (l < 0) { + /* we are at 
last item in btree. */ + if (pos[level] < open[level].size()) { + pos[level]++; /* move into add position! */ + return 0; + } + return -1; + } + + /* move right one */ + assert( pos[l] < open[l].size() ); + pos[l]++; + + /* work back down */ + for (; lpool.get_node( open[l].index_item(pos[l]).node ); + pos[l+1] = 0; // furthest left + } + return 1; + } + + // ** modifications ** + void dirty() { + for (int l=level; l>=0; l--) { + if (open[l].node->is_dirty()) break; // already dirty! (and thus parents are too) + + table->pool.dirty_node(open[l].node); + if (l > 0) + open[l-1].index_item( pos[l-1] ).node = open[l].get_id(); + else + table->root = open[0].get_id(); + } + } + private: + void repair_parents() { + // did i make a change at the start of a node? + if (pos[level] == 0) { + K key = open[level].key(0); // new key parents should have + for (int j=level-1; j>=0; j--) { + if (open[j].index_item(pos[j]).key == key) + break; /* it's the same key, we can stop fixing */ + open[j].index_item(pos[j]).key = key; + if (pos[j] > 0) break; /* last in position 0.. */ + } + } + } + + public: + void remove() { + dirty(); + + // remove from node + open[level].remove_at_pos( pos[level] ); + repair_parents(); + + // was it a key? + if (level == table->depth-1) + table->nkeys--; + } + + void insert(K key, V value) { + dirty(); + + // insert + open[level].insert_at_leaf_pos(pos[level], key, value); + repair_parents(); + + // was it a key? 
+ if (level == table->depth-1) + table->nkeys++; + } + + int rotate_left() { + if (level == 0) return -1; // i am root + if (pos[level-1] == 0) return -1; // nothing to left + + Nodeptr here = open[level]; + Nodeptr parent = open[level-1]; + Nodeptr left = table->pool.get_node( parent.index_item(pos[level-1] - 1).node ); + if (left.size() == left.max_items()) return -1; // it's full + + // make both dirty + dirty(); + if (!left.node->is_dirty()) { + table->pool.dirty_node(left.node); + parent.index_item(pos[level-1]-1).node = left.get_id(); + } + + dbtout << "rotating item " << here.key(0) << " left from " << here.get_id() << " to " << left.get_id() << endl; + + /* add */ + if (here.node->is_leaf()) + left.append_item(here.leaf_item(0)); + else + left.append_item(here.index_item(0)); + + /* remove */ + here.remove_at_pos(0); + + /* fix parent index for me */ + parent.index_item( pos[level-1] ).key = here.key(0); + // we never have to update past immediate parent, since we're not at pos 0 + + /* adjust cursor */ + if (pos[level] > 0) + pos[level]--; + //else + //assert(1); /* if we were positioned here, we're equal */ + /* if it was 0, then the shifted item == our key, and we can stay here safely. */ + return 0; + } + int rotate_right() { + if (level == 0) return -1; // i am root + if (pos[level-1] + 1 >= open[level-1].size()) return -1; // nothing to right + + Nodeptr here = open[level]; + Nodeptr parent = open[level-1]; + Nodeptr right = table->pool.get_node( parent.index_item( pos[level-1] + 1 ).node ); + if (right.size() == right.max_items()) return -1; // it's full + + // make both dirty + dirty(); + if (!right.node->is_dirty()) { + table->pool.dirty_node(right.node); + parent.index_item( pos[level-1]+1 ).node = right.get_id(); + } + + if (pos[level] == here.size()) { + /* let's just move the cursor over! 
*/ + //if (sizeof(K) == 8) + dbtout << "shifting cursor right from " << here.get_id() << " to less-full node " << right.get_id() << endl; + open[level] = right; + pos[level] = 0; + pos[level-1]++; + return 0; + } + + //if (sizeof(K) == 8) + dbtout << "rotating item " << hex << here.key(here.size()-1) << dec << " right from " + << here.get_id() << " to " << right.get_id() << endl; + + /* add */ + if (here.is_index()) + right.insert_at_index_pos(0, + here.index_item( here.size()-1 ).key, + here.index_item( here.size()-1 ).node); + else + right.insert_at_leaf_pos(0, + here.leaf_item( here.size()-1 ).key, + here.leaf_item( here.size()-1 ).value); + + /* remove */ + here.set_size(here.size() - 1); + + /* fix parent index for right */ + parent.index_item( pos[level-1] + 1 ).key = right.key(0); + + return 0; + } + }; + + + public: + bool almost_full() { + if (2*(depth+1) > pool.num_free()) // worst case, plus some. + return true; + return false; + } + + int find(K key, Cursor& cursor) { + dbtout << "find " << key << endl; + + if (depth == 0) + return Cursor::OOB; + + // init + cursor.level = 0; + + // start at root + Nodeptr curnode( pool.get_node(root) ); + cursor.open[0] = curnode; + + if (curnode.size() == 0) return -1; // empty! 
+ + // find leaf + for (cursor.level = 0; cursor.level < depth-1; cursor.level++) { + /* if key=5, we want 2 3 [4] 6 7, or 3 4 [5] 5 6 (err to the left) */ + int left = 0; /* i >= left */ + int right = curnode.size()-1; /* i < right */ + while (left < right) { + int i = left + (right - left) / 2; + if (curnode.index_item(i).key < key) { + left = i + 1; + } else if (i && curnode.index_item(i-1).key >= key) { + right = i; + } else { + left = right = i; + break; + } + } + int i = left; + if (i && curnode.index_item(i).key > key) i--; + +#ifdef EBOFS_DEBUG_BTREE + int j; + for (j=0; j key) break; + } + if (i != j) { + dbtout << "btree binary search failed" << endl; + i = j; + } +#endif + + cursor.pos[cursor.level] = i; + + /* get child node */ + curnode = pool.get_node( cursor.open[cursor.level].index_item(i).node ); + cursor.open[cursor.level+1] = curnode; + } + + /* search leaf */ + /* if key=5, we want 2 3 4 [6] 7, or 3 4 [5] 5 6 (err to the right) */ + int left = 0; /* i >= left */ + int right = curnode.size(); /* i < right */ + while (left < right) { + int i = left + (right - left) / 2; + if (curnode.leaf_item(i).key < key) { + left = i + 1; + } else if (i && curnode.leaf_item(i-1).key >= key) { + right = i; + } else { + left = right = i; + break; + } + } + int i = left; + +#ifdef EBOFS_DEBUG_BTREE + int j; + for (j=0; j= key) break; + } + if (i != j) { + dbtout << "btree binary search failed" << endl; + i = j; + } +#endif + + cursor.pos[cursor.level] = i; /* first key in this node, or key insertion point */ + + if (curnode.size() >= i+1) { + if (curnode.leaf_item(i).key == key) { + return Cursor::MATCH; /* it's the actual key */ + } else { + return Cursor::INSERT; /* it's an insertion point */ + } + } + return Cursor::OOB; /* it's the end of the btree (also a valid insertion point) */ + } + + int lookup(K key) { + dbtout << "lookup" << endl; + Cursor cursor(this); + if (find(key, cursor) == Cursor::MATCH) + return 0; + return -1; + } + + int lookup(K key, V& 
value) { + dbtout << "lookup" << endl; + Cursor cursor(this); + if (find(key, cursor) == Cursor::MATCH) { + value = cursor.current().value; + return 0; + } + return -1; + } + + int insert(K key, V value) { + dbtout << "insert " << key << " -> " << value << endl; + if (almost_full()) return -1; + + // empty? + if (nkeys == 0) { + if (root == -1) { + // create a root node (leaf!) + assert(depth == 0); + Nodeptr newroot( pool.new_node(Node::TYPE_LEAF) ); + root = newroot.get_id(); + depth++; + } + assert(depth == 1); + assert(root >= 0); + } + + // start at/near key + Cursor cursor(this); + find(key, cursor); + + // insert loop + nodeid_t nodevalue = 0; + while (1) { + + /* room in this node? */ + if (cursor.open[cursor.level].size() < cursor.open[cursor.level].max_items()) { + if (cursor.open[cursor.level].is_leaf()) + cursor.insert( key, value ); // will dirty, etc. + else { + // indices are already dirty + cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); + } + verify("insert 1"); + return 0; + } + + /* this node is full. */ + assert( cursor.open[cursor.level].size() == cursor.open[cursor.level].max_items() ); + + /* can we rotate? */ + if (false) // NO! there's a bug in here somewhere, don't to it. + if (cursor.level > 0) { + if ((cursor.pos[cursor.level-1] > 0 + && cursor.rotate_left() >= 0) || + (cursor.pos[cursor.level-1] + 1 < cursor.open[cursor.level-1].size() + && cursor.rotate_right() >= 0)) { + + if (cursor.open[cursor.level].is_leaf()) + cursor.insert( key, value ); // will dirty, etc. 
+ else { + // indices are already dirty + cursor.open[cursor.level].insert_at_index_pos(cursor.pos[cursor.level], key, nodevalue); + } + verify("insert 2"); + return 0; + } + } + + /** split node **/ + + if (cursor.level == depth-1) { + dbtout << "splitting leaf " << cursor.open[cursor.level].get_id() << endl; + } else { + dbtout << "splitting index " << cursor.open[cursor.level].get_id() << endl; + } + + cursor.dirty(); + + // split + Nodeptr leftnode = cursor.open[cursor.level]; + Nodeptr newnode( pool.new_node(leftnode.node->get_type()) ); + leftnode.split( newnode ); + + /* insert our item */ + if (cursor.pos[cursor.level] > leftnode.size()) { + // not with cursor, since this node isn't added yet! + if (newnode.is_leaf()) { + newnode.insert_at_leaf_pos( cursor.pos[cursor.level] - leftnode.size(), + key, value ); + nkeys++; + } else { + newnode.insert_at_index_pos( cursor.pos[cursor.level] - leftnode.size(), + key, nodevalue ); + } + } else { + // with cursor (if leaf) + if (leftnode.is_leaf()) + cursor.insert( key, value ); + else + leftnode.insert_at_index_pos( cursor.pos[cursor.level], + key, nodevalue ); + } + + /* are we at the root? */ + if (cursor.level == 0) { + /* split root. */ + dbtout << "that split was the root " << root << endl; + Nodeptr newroot( pool.new_node(Node::TYPE_INDEX) ); + + /* new root node */ + newroot.set_size(2); + newroot.index_item(0).key = leftnode.key(0); + newroot.index_item(0).node = root; + newroot.index_item(1).key = newnode.key(0); + newroot.index_item(1).node = newnode.get_id(); + + /* heighten tree */ + depth++; + root = newroot.get_id(); + verify("insert 3"); + return 0; + } + + /* now insert newindex in level-1 */ + nodevalue = newnode.get_id(); + key = newnode.key(0); + cursor.level--; + cursor.pos[cursor.level]++; // ...to the right of leftnode! 
+ } + } + + + int remove(K key) { + dbtout << "remove " << key << endl; + + if (almost_full()) { + cout << "table almost full, failing" << endl; + assert(0); + return -1; + } + + Cursor cursor(this); + if (find(key, cursor) <= 0) { + cerr << "remove " << key << " 0x" << hex << key << dec << " .. dne" << endl; + g_conf.debug_ebofs = 33; + g_conf.ebofs_verify = true; + verify("remove dne"); + assert(0); + return -1; // key dne + } + + + while (1) { + cursor.remove(); + + // balance + adjust + + if (cursor.level == 0) { + // useless root index? + if (cursor.open[0].size() == 1 && + depth > 1) { + depth--; + root = cursor.open[0].index_item(0).node; + pool.release( cursor.open[0].node ); + } + + // note: root can be small, but not empty + else if (nkeys == 0) { + assert(cursor.open[cursor.level].size() == 0); + assert(depth == 1); + root = -1; + depth = 0; + pool.release(cursor.open[0].node); + } + verify("remove 1"); + return 0; + } + + if (cursor.open[cursor.level].size() > cursor.open[cursor.level].min_items()) { + verify("remove 2"); + return 0; + } + + // borrow from siblings? + Nodeptr left; + Nodeptr right; + + // left? 
+ if (cursor.pos[cursor.level-1] > 0) { + int left_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] - 1).node; + left = pool.get_node( left_loc ); + + if (left.size() > left.min_items()) { + /* move cursor left, shift right */ + cursor.pos[cursor.level] = 0; + cursor.open[cursor.level] = left; + cursor.pos[cursor.level-1]--; + cursor.rotate_right(); + verify("remove 3"); + return 0; + } + + /* combine to left */ + right = cursor.open[cursor.level]; + } + else { + assert(cursor.pos[cursor.level-1] < cursor.open[cursor.level-1].size() - 1); + int right_loc = cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1] + 1 ).node; + right = pool.get_node( right_loc ); + + if (right.size() > right.min_items()) { + /* move cursor right, shift an item left */ + cursor.pos[cursor.level] = 1; + cursor.open[cursor.level] = right; + cursor.pos[cursor.level-1]++; + cursor.rotate_left(); + verify("remove 4"); + return 0; + } + + /* combine to left */ + left = cursor.open[cursor.level]; + cursor.pos[cursor.level-1]++; /* move cursor to (soon-to-be-empty) right side item */ + } + + // note: cursor now points to _right_ node. + + /* combine (towards left) + * (this makes it so our next delete will be in the index + * interior, which is less scary.) + */ + dbtout << "combining nodes " << left.get_id() << " and " << right.get_id() << endl; + + left.merge(right); + + // dirty left + right + cursor.dirty(); // right + if (!left.node->is_dirty()) { + pool.dirty_node(left.node); + cursor.open[cursor.level-1].index_item( cursor.pos[cursor.level-1]-1 ).node = left.get_id(); + } + + pool.release(right.node); + + cursor.level--; // now point to the link to the obsolete (right-side) sib */ + } + + } + + void clear(Cursor& cursor, int node_loc, int level) { + dbtout << "clear" << endl; + + Nodeptr node = pool.get_node( node_loc ); + cursor.open[level] = node; + + // hose children? 
+ if (level < depth-1) { + for (int i=0; i max) + max = node.key(i); + + if (level < depth-1) { + // index + cursor.pos[level] = i; + err += verify_sub( cursor, cursor.open[level].index_item(i).node, level+1, count, last, on ); + } else { + // leaf + count++; + last = node.key(i); + } + } + + if (level) { + // verify that parent's keys are appropriate + if (min != cursor.open[level-1].index_item(cursor.pos[level-1]).key) { + dbtout << ":: key in index node " << cursor.open[level-1].get_id() + << " != min in child " << node_loc + << "(key is " << hex << cursor.open[level-1].index_item(cursor.pos[level-1]).key + << ", min is " << min << ")" << dec << endl; + err++; + } + if (cursor.pos[level-1] < cursor.open[level-1].size()-1) { + if (max > cursor.open[level-1].index_item(1+cursor.pos[level-1]).key) { + dbtout << ":: next key in index node " << cursor.open[level-1].get_id() + << " < max in child " << node_loc + << "(key is " << hex << cursor.open[level-1].index_item(1+cursor.pos[level-1]).key + << ", max is " << max << ")" << dec << endl; + err++; + } + } + } + + //return err; + + // print it + char s[1000]; + strcpy(s," "); + s[level+1] = 0; + if (1) { + if (root == node_loc) { + dbtout << s << "root " << node_loc << ": " + << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; + } else if (level == depth-1) { + dbtout << s << "leaf " << node_loc << ": " + << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; + } else { + dbtout << s << "indx " << node_loc << ": " + << node.size() << " / " << node.max_items() << " keys, " << hex << min << "-" << max << dec << endl; + } + + if (0) { + for (int i=0; i " << node.leaf_item(i).value << dec << endl; + } + } + } + } + + return err; + } + + void verify(const char *on) { + if (!g_conf.ebofs_verify) + return; + + if (root == -1 && depth == 0) { + return; // empty! 
+ } + + int count = 0; + Cursor cursor(this); + K last; + + int before = g_conf.debug_ebofs; + g_conf.debug_ebofs = 0; + + int err = verify_sub(cursor, root, 0, count, last, on); + if (count != nkeys) { + cerr << "** count " << count << " != nkeys " << nkeys << endl; + err++; + } + + g_conf.debug_ebofs = before; + + // ok? + if (err) { + cerr << "verify failure, called by '" << on << "'" << endl; + g_conf.debug_ebofs = 30; + // do it again, so we definitely get the dump. + int count = 0; + Cursor cursor(this); + K last; + verify_sub(cursor, root, 0, count, last, on); + assert(err == 0); + } + } + +}; + + +#endif diff --git a/branches/sage/cephmds2/ebofs/mkfs.ebofs.cc b/branches/sage/cephmds2/ebofs/mkfs.ebofs.cc new file mode 100644 index 0000000000000..af5f57842068a --- /dev/null +++ b/branches/sage/cephmds2/ebofs/mkfs.ebofs.cc @@ -0,0 +1,299 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include +#include "ebofs/Ebofs.h" + + +int main(int argc, char **argv) +{ + // args + vector args; + argv_to_vec(argc, argv, args); + parse_config_options(args); + + if (args.size() < 1) { + cerr << "usage: mkfs.ebofs [options] " << endl; + return -1; + } + char *filename = args[0]; + + // mkfs + Ebofs mfs(filename); + int r = mfs.mkfs(); + if (r < 0) exit(r); + + if (args.size() > 1) { // pass an extra arg of some sort to trigger the test crapola + // test-o-rama! 
+ Ebofs fs(filename); + fs.mount(); + + /* + if (1) { + // partial write tests + char crap[1024*1024]; + memset(crap, 0, 1024*1024); + + bufferlist small; + small.append(crap, 10); + bufferlist med; + med.append(crap, 1000); + bufferlist big; + big.append(crap, 1024*1024); + + cout << "0" << endl; + fs.write(10, 0, 1024*1024, big, (Context*)0); + fs.sync(); + fs.trim_buffer_cache(); + + cout << "1" << endl; + fs.write(10, 10, 10, small, 0); + fs.write(10, 1, 1000, med, 0); + fs.sync(); + fs.trim_buffer_cache(); + + cout << "2" << endl; + fs.write(10, 10, 10, small, 0); + //fs.sync(); + fs.write(10, 1, 1000, med, 0); + fs.sync(); + fs.trim_buffer_cache(); + + cout << "3" << endl; + fs.write(10, 1, 1000, med, 0); + fs.write(10, 10000, 10, small, 0); + fs.truncate(10, 100, 0); + fs.sync(); + fs.trim_buffer_cache(); + + cout << "4" << endl; + fs.remove(10); + fs.sync(); + fs.write(10, 10, 10, small, 0); + fs.sync(); + fs.write(10, 1, 1000, med, 0); + fs.sync(); + fs.truncate(10, 100, 0); + fs.write(10, 10, 10, small, 0); + fs.trim_buffer_cache(); + + + + } + + if (0) { // onode write+read test + bufferlist bl; + char crap[1024*1024]; + memset(crap, 0, 1024*1024); + bl.append(crap, 10); + + fs.write(10, 10, 0, bl, (Context*)0); + fs.umount(); + + Ebofs fs2(filename); + fs2.mount(); + fs2.read(10, 10, 0, bl); + fs2.umount(); + + return 0; + } + + + if (0) { // small write + read test + bufferlist bl; + char crap[1024*1024]; + memset(crap, 0, 1024*1024); + + object_t oid = 10; + int n = 10000; + int l = 128; + bl.append(crap, l); + + + char *p = bl.c_str(); + off_t o = 0; + for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __EBOFS_NODES_H +#define __EBOFS_NODES_H + +/** nodes, node regions **/ + +#include "types.h" +#include "BlockDevice.h" + + +/* + + disk wire memory + + free free -> free can alloc + free used -> dirty can modify + + free used used -> tx + free used free -> limbo + + used used -> clean + used free -> limbo + + + // meaningless + used free free -> free can alloc + used free used __DNE__ + + +*/ + +#undef debofs +#define debofs(x) if (x < g_conf.debug_ebofs) cout << "ebofs.nodepool." + + +class Node { + public: + // bit fields + static const int STATE_CLEAN = 1; + static const int STATE_DIRTY = 2; + static const int STATE_TX = 3; + + static const int ITEM_LEN = EBOFS_NODE_BYTES - sizeof(int) - sizeof(int) - sizeof(int); + + static const int TYPE_INDEX = 1; + static const int TYPE_LEAF = 2; + + protected: + nodeid_t id; + int state; // use bit fields above! + + bufferptr bptr; + bufferptr shadow_bptr; + + // in disk buffer + int *type; + int *nrecs; + + public: + Node(nodeid_t i, bufferptr& b, int s) : id(i), state(s), bptr(b) { + nrecs = (int*)(bptr.c_str()); + type = (int*)(bptr.c_str() + sizeof(*nrecs)); + } + + + // id + nodeid_t get_id() const { return id; } + void set_id(nodeid_t n) { id = n; } + + // buffer + bufferptr& get_buffer() { return bptr; } + + char *item_ptr() { return bptr.c_str() + sizeof(*nrecs) + sizeof(*type); } + + // size + int size() { return *nrecs; } + void set_size(int s) { *nrecs = s; } + + // type + int& get_type() { return *type; } + void set_type(int t) { *type = t; } + bool is_index() { return *type == TYPE_INDEX; } + bool is_leaf() { return *type == TYPE_LEAF; } + + + // state + bool is_dirty() { return state == STATE_DIRTY; } + bool is_tx() { return state == STATE_TX; } + bool is_clean() { return state == STATE_CLEAN; } + + void set_state(int s) { state = s; } + + void make_shadow() { + assert(is_tx()); + + shadow_bptr = bptr; + + // new buffer + bptr = buffer::create_page_aligned(EBOFS_NODE_BYTES); + nrecs = 
(int*)(bptr.c_str()); + type = (int*)(bptr.c_str() + sizeof(*nrecs)); + + // copy contents! + memcpy(bptr.c_str(), shadow_bptr.c_str(), EBOFS_NODE_BYTES); + } + +}; + + + + + +class NodePool { + protected: + map node_map; // open node map + + public: + vector region_loc; // region locations + Extent usemap_even; + Extent usemap_odd; + + protected: + // on-disk block states + int num_nodes; + set free; + set dirty; + set tx; + set clean; // aka used + set limbo; + + Mutex &ebofs_lock; + Cond commit_cond; + int flushing; + + static int make_nodeid(int region, int offset) { + return (region << 24) | offset; + } + static int nodeid_region(nodeid_t nid) { + return nid >> 24; + } + static int nodeid_offset(nodeid_t nid) { + return nid & ((1 << 24) - 1); + } + + + public: + NodePool(Mutex &el) : + num_nodes(0), + ebofs_lock(el), + flushing(0) {} + ~NodePool() { + // nodes + release_all(); + } + + int num_free() { return free.size(); } + int num_dirty() { return dirty.size(); } + int num_limbo() { return limbo.size(); } + int num_tx() { return tx.size(); } + int num_clean() { return clean.size(); } + int num_total() { return num_nodes; } + int num_used() { return num_clean() + num_dirty() + num_tx(); } + + int get_usemap_len(int n=0) { + if (n == 0) n = num_nodes; + return ((n-1) / 8 / EBOFS_BLOCK_SIZE) + 1; + } + + int num_regions() { return region_loc.size(); } + + // the caller had better adjust usemap locations... 
+ void add_region(Extent ex) { + int region = region_loc.size(); + assert(ex.length <= (1 << 24)); + region_loc.push_back(ex); + for (unsigned o = 0; o < ex.length; o++) { + free.insert( make_nodeid(region, o) ); + } + num_nodes += ex.length; + } + + int init(struct ebofs_nodepool *np) { + // regions + assert(region_loc.empty()); + num_nodes = 0; + for (int i=0; inum_regions; i++) { + debofs(3) << "init region " << i << " at " << np->region_loc[i] << endl; + region_loc.push_back( np->region_loc[i] ); + num_nodes += np->region_loc[i].length; + } + + // usemap + usemap_even = np->node_usemap_even; + usemap_odd = np->node_usemap_odd; + debofs(3) << "init even map at " << usemap_even << endl; + debofs(3) << "init odd map at " << usemap_odd << endl; + + return 0; + } + + void close() { + release_all(); + + region_loc.clear(); + free.clear(); + dirty.clear(); + tx.clear(); + clean.clear(); + limbo.clear(); + flushing = 0; + node_map.clear(); + } + + + // *** blocking i/o routines *** + + int read_usemap(BlockDevice& dev, version_t epoch) { + // read map + Extent loc; + if (epoch & 1) + loc = usemap_odd; + else + loc = usemap_even; + + bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length); + dev.read(loc.start, loc.length, bp); + + // parse + unsigned region = 0; // current region + unsigned roff = 0; // offset in region + for (unsigned byte = 0; byte> 1; // move one bit right. + roff++; + if (roff == region_loc[region].length) { + // next region! + roff = 0; + region++; + break; + } + } + if (region == region_loc.size()) break; + } + return 0; + } + + int read_clean_nodes(BlockDevice& dev) { + /* + this relies on the clean set begin defined so that we know which nodes + to read. so it only really works when called from mount()! 
+ */ + for (unsigned r=0; rflushed_usemap(); + } + }; + + void flushed_usemap() { + ebofs_lock.Lock(); + flushing--; + if (flushing == 0) + commit_cond.Signal(); + ebofs_lock.Unlock(); + } + + public: + int write_usemap(BlockDevice& dev, version_t version) { + // alloc + Extent loc; + if (version & 1) + loc = usemap_odd; + else + loc = usemap_even; + + bufferptr bp = buffer::create_page_aligned(EBOFS_BLOCK_SIZE*loc.length); + + // fill in + unsigned region = 0; // current region + unsigned roff = 0; // offset in region + for (unsigned byte = 0; byte> 1; + if (roff == region_loc[region].length) { + // next region! + roff = 0; + region++; + break; + } + } + + *(unsigned char*)(bp.c_str() + byte) = x; + if (region == region_loc.size()) break; + } + + + // write + bufferlist bl; + bl.append(bp); + dev.write(loc.start, loc.length, bl, + new C_NP_FlushUsemap(this), "usemap"); + return 0; + } + + + + // *** node commit *** + private: + + class C_NP_FlushNode : public BlockDevice::callback { + NodePool *pool; + nodeid_t nid; + public: + C_NP_FlushNode(NodePool *p, nodeid_t n) : + pool(p), nid(n) {} + void finish(ioh_t ioh, int r) { + pool->flushed_node(nid); + } + }; + + void flushed_node(nodeid_t nid) { + ebofs_lock.Lock(); + + // mark nid clean|limbo + if (tx.count(nid)) { // tx -> clean + tx.erase(nid); + clean.insert(nid); + + // make node itself clean + node_map[nid]->set_state(Node::STATE_CLEAN); + } + else { // already limbo (was dirtied, or released) + assert(limbo.count(nid)); + } + + flushing--; + if (flushing == 0) + commit_cond.Signal(); + ebofs_lock.Unlock(); + } + + public: + void commit_start(BlockDevice& dev, version_t version) { + dout(20) << "ebofs.nodepool.commit_start start" << endl; + + assert(flushing == 0); + /*if (0) + for (unsigned i=0; i tx (write to disk) + assert(tx.empty()); + set didb; + for (set::iterator i = dirty.begin(); + i != dirty.end(); + i++) { + Node *n = get_node(*i); + assert(n); + assert(n->is_dirty()); + 
n->set_state(Node::STATE_TX); + + unsigned region = nodeid_region(*i); + block_t off = nodeid_offset(*i); + block_t b = region_loc[region].start + off; + + if (1) { // sanity check debug FIXME + assert(didb.count(b) == 0); + didb.insert(b); + } + + bufferlist bl; + bl.append(n->get_buffer()); + dev.write(b, EBOFS_NODE_BLOCKS, + bl, + new C_NP_FlushNode(this, *i), "node"); + flushing++; + + tx.insert(*i); + } + dirty.clear(); + + // limbo -> free + for (set::iterator i = limbo.begin(); + i != limbo.end(); + i++) { + free.insert(*i); + } + limbo.clear(); + + dout(20) << "ebofs.nodepool.commit_start finish" << endl; + } + + void commit_wait() { + while (flushing > 0) + commit_cond.Wait(ebofs_lock); + dout(20) << "ebofs.nodepool.commit_wait finish" << endl; + } + + + + + + + + + + // *** nodes *** + // opened node + Node* get_node(nodeid_t nid) { + //dbtout << "pool.get " << nid << endl; + assert(node_map.count(nid)); + return node_map[nid]; + } + + // unopened node + /* not implemented yet!! + Node* open_node(nodeid_t nid) { + Node *n = node_regions[ NodeRegion::nodeid_region(nid) ]->open_node(nid); + dbtout << "pool.open_node " << n->get_id() << endl; + node_map[n->get_id()] = n; + return n; + } + */ + + // allocate id/block on disk. always free -> dirty. 
+ nodeid_t alloc_id() { + // pick node id + assert(!free.empty()); + nodeid_t nid = *(free.begin()); + free.erase(nid); + dirty.insert(nid); + return nid; + } + + // new node + Node* new_node(int type) { + nodeid_t nid = alloc_id(); + debofs(15) << "ebofs.nodepool.new_node " << nid << endl; + + // alloc node + bufferptr bp = buffer::create_page_aligned(EBOFS_NODE_BYTES); + Node *n = new Node(nid, bp, Node::STATE_DIRTY); + n->set_type(type); + n->set_size(0); + + assert(node_map.count(nid) == 0); + node_map[nid] = n; + return n; + } + + void release(Node *n) { + const nodeid_t nid = n->get_id(); + debofs(15) << "ebofs.nodepool.release on " << nid << endl; + node_map.erase(nid); + + if (n->is_dirty()) { + assert(dirty.count(nid)); + dirty.erase(nid); + free.insert(nid); + } else if (n->is_clean()) { + assert(clean.count(nid)); + clean.erase(nid); + limbo.insert(nid); + } else if (n->is_tx()) { + assert(tx.count(nid)); // i guess htis happens? -sage + tx.erase(nid); + limbo.insert(nid); + } + + delete n; + } + + void release_all() { + while (!node_map.empty()) { + map::iterator i = node_map.begin(); + debofs(2) << "ebofs.nodepool.release_all leftover " << i->first << " " << i->second << endl; + release( i->second ); + } + assert(node_map.empty()); + } + + void dirty_node(Node *n) { + // get new node id? + nodeid_t oldid = n->get_id(); + nodeid_t newid = alloc_id(); + debofs(15) << "ebofs.nodepool.dirty_node on " << oldid << " now " << newid << endl; + + // release old block + if (n->is_clean()) { + assert(clean.count(oldid)); + clean.erase(oldid); + } else { + assert(n->is_tx()); + assert(tx.count(oldid)); + tx.erase(oldid); + + // move/copy current -> shadow buffer as necessary + n->make_shadow(); + } + limbo.insert(oldid); + node_map.erase(oldid); + + n->set_state(Node::STATE_DIRTY); + + // move to new one! 
+ n->set_id(newid); + node_map[newid] = n; + } + + + +}; + +#endif diff --git a/branches/sage/cephmds2/ebofs/test.ebofs.cc b/branches/sage/cephmds2/ebofs/test.ebofs.cc new file mode 100644 index 0000000000000..0e6a7625c502a --- /dev/null +++ b/branches/sage/cephmds2/ebofs/test.ebofs.cc @@ -0,0 +1,224 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include +#include "ebofs/Ebofs.h" + +bool stop = false; + + +int nt = 0; +class Tester : public Thread { + Ebofs &fs; + int t; + + char b[1024*1024]; + +public: + Tester(Ebofs &e) : fs(e), t(nt) { nt++; } + void *entry() { + + while (!stop) { + object_t oid; + oid.ino = (rand() % 10) + 0x10000000; + coll_t cid = rand() % 50; + off_t off = rand() % 10000;//0;//rand() % 1000000; + off_t len = 1+rand() % 100000; + char *a = "one"; + if (rand() % 2) a = "two"; + int l = 3;//rand() % 10; + + switch (rand() % 10) { + case 0: + { + oid.rev = rand() % 10; + cout << t << " read " << hex << oid << dec << " at " << off << " len " << len << endl; + bufferlist bl; + fs.read(oid, off, len, bl); + int l = MIN(len,bl.length()); + if (l) { + cout << t << " got " << l << endl; + bl.copy(0, l, b); + char *p = b; + while (l--) { + assert(*p == 0 || + *p == (char)(off ^ oid.ino)); + off++; + p++; + } + } + } + break; + + case 1: + { + cout << t << " write " << hex << oid << dec << " at " << off << " len " << len << endl; + for (int j=0;j args; + argv_to_vec(argc, argv, args); + parse_config_options(args); + + // args + if (args.size() != 3) return -1; + char *filename = args[0]; + int seconds = atoi(args[1]); + int threads = atoi(args[2]); + + cout << "dev " << filename << " .. 
" << threads << " threads .. " << seconds << " seconds" << endl; + + Ebofs fs(filename); + if (fs.mount() < 0) return -1; + + + // explicit tests + if (1) { + // verify that clone() plays nice with partial writes + object_t oid(1,1); + bufferptr bp(10000); + bp.zero(); + bufferlist bl; + bl.push_back(bp); + fs.write(oid, 0, 10000, bl, 0); + + fs.sync(); + fs.trim_buffer_cache(); + + // induce a partial write + bufferlist bl2; + bl2.substr_of(bl, 0, 100); + fs.write(oid, 100, 100, bl2, 0); + + // clone it + object_t oid2; + oid2 = oid; + oid2.rev = 1; + fs.clone(oid, oid2, 0); + + // ... + if (0) { + // make sure partial still behaves after orig is removed... + fs.remove(oid, 0); + + // or i read for oid2... + bufferlist rbl; + fs.read(oid2, 0, 200, rbl); + } + if (1) { + // make sure things behave if we remove the clone + fs.remove(oid2,0); + } + } + // /explicit tests + + list ls; + for (int i=0; icreate(); + ls.push_back(t); + } + + utime_t now = g_clock.now(); + utime_t dur(seconds,0); + utime_t end = now + dur; + cout << "stop at " << end << endl; + while (now < end) { + sleep(1); + now = g_clock.now(); + cout << now << endl; + } + + cout << "stopping" << endl; + stop = true; + + while (!ls.empty()) { + Tester *t = ls.front(); + ls.pop_front(); + t->join(); + delete t; + } + + fs.umount(); + return 0; +} + diff --git a/branches/sage/cephmds2/ebofs/types.h b/branches/sage/cephmds2/ebofs/types.h new file mode 100644 index 0000000000000..1b85d138ec342 --- /dev/null +++ b/branches/sage/cephmds2/ebofs/types.h @@ -0,0 +1,168 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __EBOFS_TYPES_H +#define __EBOFS_TYPES_H + +#include +#include "include/buffer.h" +#include "include/Context.h" +#include "common/Cond.h" + +#include +#include +#include +#include +using namespace std; +using namespace __gnu_cxx; + + +#include "include/object.h" + + +#ifndef MIN +# define MIN(a,b) ((a)<=(b) ? (a):(b)) +#endif +#ifndef MAX +# define MAX(a,b) ((a)>=(b) ? (a):(b)) +#endif + + +/* +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(unsigned long long __x) const { + static hash H; + return H((__x >> 32) ^ (__x & 0xffffffff)); + } + }; + + template<> struct hash< std::string > + { + size_t operator()( const std::string& x ) const + { + static hash H; + return H(x.c_str()); + } + }; +} +*/ + + +// disk +typedef __uint64_t block_t; // disk location/sector/block + +static const int EBOFS_BLOCK_SIZE = 4096; +static const int EBOFS_BLOCK_BITS = 12; // 1<<12 == 4096 + +class Extent { + public: + block_t start, length; + + Extent() : start(0), length(0) {} + Extent(block_t s, block_t l) : start(s), length(l) {} + + block_t last() const { return start + length - 1; } + block_t end() const { return start + length; } +}; + +inline ostream& operator<<(ostream& out, Extent& ex) +{ + return out << ex.start << "~" << ex.length; +} + + +// tree/set nodes +typedef int nodeid_t; + +static const int EBOFS_NODE_BLOCKS = 1; +static const int EBOFS_NODE_BYTES = EBOFS_NODE_BLOCKS * EBOFS_BLOCK_SIZE; +static const int EBOFS_MAX_NODE_REGIONS = 10; // pick a better value! + +struct ebofs_nodepool { + Extent node_usemap_even; // for even sb versions + Extent node_usemap_odd; // for odd sb versions + + int num_regions; + Extent region_loc[EBOFS_MAX_NODE_REGIONS]; +}; + + +// objects + +typedef __uint64_t coll_t; + +struct ebofs_onode { + Extent onode_loc; /* this is actually the block we live in */ + + object_t object_id; /* for kicks */ + off_t object_size; /* file size in bytes. should this be 64-bit? 
*/ + unsigned object_blocks; + bool readonly; + + int num_collections; + int num_attr; // num attr in onode + int num_extents; /* number of extents used. if 0, data is in the onode */ +}; + +struct ebofs_cnode { + Extent cnode_loc; /* this is actually the block we live in */ + coll_t coll_id; + int num_attr; // num attr in cnode +}; + + +// table +struct ebofs_table { + nodeid_t root; /* root node of btree */ + int num_keys; + int depth; +}; + + +// super +typedef __uint64_t version_t; + +static const unsigned EBOFS_MAGIC = 0x000EB0F5; + +static const int EBOFS_NUM_FREE_BUCKETS = 5; /* see alloc.h for bucket constraints */ +static const int EBOFS_FREE_BUCKET_BITS = 2; + + +struct ebofs_super { + unsigned s_magic; + + unsigned epoch; // version of this superblock. + + unsigned num_blocks; /* # blocks in filesystem */ + + // some basic stats, for kicks + unsigned free_blocks; /* unused blocks */ + unsigned limbo_blocks; /* limbo blocks */ + //unsigned num_objects; + //unsigned num_fragmented; + + struct ebofs_nodepool nodepool; + + // tables + struct ebofs_table free_tab[EBOFS_NUM_FREE_BUCKETS]; + struct ebofs_table limbo_tab; + struct ebofs_table alloc_tab; + struct ebofs_table object_tab; // object directory + struct ebofs_table collection_tab; // collection directory + struct ebofs_table co_tab; +}; + + +#endif diff --git a/branches/sage/cephmds2/fakefuse.cc b/branches/sage/cephmds2/fakefuse.cc new file mode 100644 index 0000000000000..f021d83bac035 --- /dev/null +++ b/branches/sage/cephmds2/fakefuse.cc @@ -0,0 +1,147 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include +#include +#include +using namespace std; + +#include "config.h" + +#include "mon/Monitor.h" + +#include "mds/MDS.h" +#include "osd/OSD.h" +#include "client/Client.h" +#include "client/fuse.h" + +#include "common/Timer.h" + +#include "msg/FakeMessenger.h" + + + + +#define NUMMDS g_conf.num_mds +#define NUMOSD g_conf.num_osd +#define NUMCLIENT g_conf.num_client + + +class C_Test : public Context { +public: + void finish(int r) { + cout << "C_Test->finish(" << r << ")" << endl; + } +}; +class C_Test2 : public Context { +public: + void finish(int r) { + cout << "C_Test2->finish(" << r << ")" << endl; + g_timer.add_event_after(2, new C_Test); + } +}; + + + +int main(int argc, char **argv) { + cerr << "fakefuse starting" << endl; + + vector args; + argv_to_vec(argc, argv, args); + parse_config_options(args); + + // start messenger thread + fakemessenger_startthread(); + + //g_timer.add_event_after(5.0, new C_Test2); + //g_timer.add_event_after(10.0, new C_Test); + + vector nargs; + for (unsigned i=0; iinit(); + } + for (int i=0; iinit(); + } + + for (int i=0; iinit(); + } + + + // create client + Client *client[NUMCLIENT]; + for (int i=0; iinit(); + + + // start up fuse + // use my argc, argv (make sure you pass a mount point!) + cout << "starting fuse on pid " << getpid() << endl; + client[i]->mount(); + ceph_fuse_main(client[i], argc, argv); + client[i]->unmount(); + cout << "fuse finished on pid " << getpid() << endl; + client[i]->shutdown(); + } + + + + // wait for it to finish + cout << "DONE -----" << endl; + fakemessenger_wait(); // blocks until messenger stops + + + // cleanup + for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include +#include +#include +using namespace std; + +#include "config.h" + +#include "mds/MDCluster.h" + +#include "mds/MDS.h" +#include "osd/OSD.h" +#include "mon/Monitor.h" +#include "client/Client.h" + +#include "client/SyntheticClient.h" + +#include "msg/FakeMessenger.h" + +#include "common/Timer.h" + +#define NUMMDS g_conf.num_mds +#define NUMOSD g_conf.num_osd +#define NUMCLIENT g_conf.num_client + +class C_Test : public Context { +public: + void finish(int r) { + cout << "C_Test->finish(" << r << ")" << endl; + } +}; + + +int main(int argc, char **argv) +{ + cerr << "fakesyn start" << endl; + + //cerr << "inode_t " << sizeof(inode_t) << endl; + + vector args; + argv_to_vec(argc, argv, args); + + parse_config_options(args); + + int start = 0; + + parse_syn_options(args); + + vector nargs; + + for (unsigned i=0; iinit(); + } + for (int i=0; iinit(); + if (g_conf.mds_local_osd) + mdsosd[i]->init(); + } + + for (int i=0; iinit(); + } + + + // create client(s) + for (int i=0; iinit(); + + // use my argc, argv (make sure you pass a mount point!) + //cout << "mounting" << endl; + client[i]->mount(); + + //cout << "starting synthetic client " << endl; + syn[i] = new SyntheticClient(client[i]); + + syn[i]->start_thread(); + } + + + for (int i=0; ijoin_thread(); + delete syn[i]; + + client[i]->unmount(); + //cout << "unmounted" << endl; + client[i]->shutdown(); + } + + + // wait for it to finish + fakemessenger_wait(); + + // cleanup + for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include +#include +#include +using namespace std; + +#include "config.h" + +#include "mds/MDS.h" +#include "osd/OSD.h" +#include "mon/Monitor.h" +#include "client/Client.h" + +#include "client/SyntheticClient.h" + +#include "msg/FakeMessenger.h" + +#include "common/Timer.h" + +#define NUMMDS g_conf.num_mds +#define NUMOSD g_conf.num_osd +#define NUMCLIENT g_conf.num_client + +class C_Test : public Context { +public: + void finish(int r) { + cout << "C_Test->finish(" << r << ")" << endl; + } +}; + + +int main(int argc, char **argv) +{ + cerr << "fakesyn start" << endl; + + //cerr << "inode_t " << sizeof(inode_t) << endl; + + vector args; + argv_to_vec(argc, argv, args); + + parse_config_options(args); + + int start = 0; + + parse_syn_options(args); + + vector nargs; + + for (unsigned i=0; iinit(); + } + for (int i=0; iinit(); + if (g_conf.mds_local_osd) + mdsosd[i]->init(); + } + + for (int i=0; iinit(); + } + + + // create client(s) + for (int i=0; iinit(); + + // use my argc, argv (make sure you pass a mount point!) + //cout << "mounting" << endl; + client[i]->mount(); + + //cout << "starting synthetic client " << endl; + syn[i] = new SyntheticClient(client[i]); + + syn[i]->start_thread(); + } + + + for (int i=0; ijoin_thread(); + delete syn[i]; + + client[i]->unmount(); + //cout << "unmounted" << endl; + client[i]->shutdown(); + } + + + // wait for it to finish + fakemessenger_wait(); + + // cleanup + for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __CONTEXT_H +#define __CONTEXT_H + +#include "config.h" + +#include +#include +#include + +#include + + +/* + * Context - abstract callback class + */ +class Context { + public: + virtual ~Context() {} // we want a virtual destructor!!! 
+ virtual void finish(int r) = 0; +}; + + +/* + * finish and destroy a list of Contexts + */ +inline void finish_contexts(std::list& finished, + int result = 0) +{ + using std::cout; + using std::endl; + + if (finished.empty()) return; + + dout(10) << finished.size() << " contexts to finish with " << result << endl; + for (std::list::iterator it = finished.begin(); + it != finished.end(); + it++) { + Context *c = *it; + dout(10) << "---- " << c << endl; + c->finish(result); + delete c; + } +} + +/* + * C_Contexts - set of Contexts + */ +class C_Contexts : public Context { + std::list clist; + +public: + void add(Context* c) { + clist.push_back(c); + } + void take(std::list& ls) { + clist.splice(clist.end(), ls); + } + void finish(int r) { + finish_contexts(clist, r); + } +}; + + +/* + * C_Gather + * + * BUG: does not report errors. + */ +class C_Gather : public Context { +public: + class C_GatherSub : public Context { + C_Gather *gather; + int num; + public: + C_GatherSub(C_Gather *g, int n) : gather(g), num(n) {} + void finish(int r) { + gather->finish(num); + } + }; + +private: + Context *onfinish; + std::set waitfor; + int num; + +public: + C_Gather(Context *f) : onfinish(f), num(0) {} + + void finish(int r) { + assert(waitfor.count(r)); + waitfor.erase(r); + if (waitfor.empty()) { + onfinish->finish(0); + delete onfinish; + } + } + + Context *new_sub() { + num++; + waitfor.insert(num); + return new C_GatherSub(this, num); + } +}; + +#endif diff --git a/branches/sage/cephmds2/include/Distribution.h b/branches/sage/cephmds2/include/Distribution.h new file mode 100644 index 0000000000000..00f352d59efab --- /dev/null +++ b/branches/sage/cephmds2/include/Distribution.h @@ -0,0 +1,74 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General 
Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __DISTRIBUTION_H +#define __DISTRIBUTION_H + +#include +#include +using namespace std; + +class Distribution { + vector p; + vector v; + + public: + //Distribution() { + //} + + unsigned get_width() { + return p.size(); + } + + void clear() { + p.clear(); + v.clear(); + } + void add(int val, float pr) { + p.push_back(pr); + v.push_back(val); + } + + void random() { + float sum = 0.0; + for (unsigned i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __BUFFER_H +#define __BUFFER_H + +#include "common/Mutex.h" + +#include +#include + +using std::cout; +using std::endl; + +#ifndef __CYGWIN__ +# include +#endif + +#define BUFFER_PAGE_SIZE 4096 // fixme. + +// +// these are in config.o +extern Mutex bufferlock; +extern long buffer_total_alloc; +// + +class buffer { +private: + + /* hack for memory utilization debugging. */ + static void inc_total_alloc(unsigned len) { + bufferlock.Lock(); + buffer_total_alloc += len; + bufferlock.Unlock(); + } + static void dec_total_alloc(unsigned len) { + bufferlock.Lock(); + buffer_total_alloc -= len; + bufferlock.Unlock(); + } + + /* + * an abstract raw buffer. with a reference count. + */ + class raw { + public: + char *data; + unsigned len; + int nref; + Mutex lock; // we'll make it non-recursive. + + raw(unsigned l) : len(l), nref(0), lock(false) {} + raw(char *c, unsigned l) : data(c), len(l), nref(0), lock(false) {} + virtual ~raw() {}; + + // no copying. 
+ raw(const raw &other); + const raw& operator=(const raw &other); + + virtual raw* clone_empty() = 0; + raw *clone() { + raw *c = clone_empty(); + memcpy(c->data, data, len); + return c; + } + }; + + friend std::ostream& operator<<(std::ostream& out, const raw &r); + + /* + * primitive buffer types + */ + class raw_char : public raw { + public: + raw_char(unsigned l) : raw(l) { + data = new char[len]; + inc_total_alloc(len); + } + ~raw_char() { + delete[] data; + dec_total_alloc(len); + } + raw* clone_empty() { + return new raw_char(len); + } + }; + + class raw_static : public raw { + public: + raw_static(const char *d, unsigned l) : raw((char*)d, l) { } + ~raw_static() {} + raw* clone_empty() { + return new raw_char(len); + } + }; + +#ifndef __CYGWIN__ + class raw_mmap_pages : public raw { + public: + raw_mmap_pages(unsigned l) : raw(l) { + data = (char*)::mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + inc_total_alloc(len); + } + ~raw_mmap_pages() { + ::munmap(data, len); + dec_total_alloc(len); + } + raw* clone_empty() { + return new raw_mmap_pages(len); + } + }; + + class raw_posix_aligned : public raw { + public: + raw_posix_aligned(unsigned l) : raw(l) { + ::posix_memalign((void**)&data, BUFFER_PAGE_SIZE, len); + inc_total_alloc(len); + } + ~raw_posix_aligned() { + ::free((void*)data); + dec_total_alloc(len); + } + raw* clone_empty() { + return new raw_posix_aligned(len); + } + }; +#endif + +#ifdef __CYGWIN__ + class raw_hack_aligned : public raw { + char *realdata; + public: + raw_hack_aligned(unsigned l) : raw(l) { + realdata = new char[len+4095]; + unsigned off = ((unsigned)realdata) % 4096; + if (off) + data = realdata + 4096 - off; + else + data = realdata; + inc_total_alloc(len+4095); + //cout << "hack aligned " << (unsigned)data + //<< " in raw " << (unsigned)realdata + //<< " off " << off << endl; + assert(((unsigned)data & 4095) == 0); + } + ~raw_hack_aligned() { + delete[] realdata; + dec_total_alloc(len+4095); + } + raw* 
clone_empty() { + return new raw_hack_aligned(len); + } + }; +#endif + +public: + + /* + * named constructors + */ + + static raw* copy(const char *c, unsigned len) { + raw* r = new raw_char(len); + memcpy(r->data, c, len); + return r; + } + static raw* create(unsigned len) { + return new raw_char(len); + } + + static raw* create_page_aligned(unsigned len) { +#ifndef __CYGWIN__ + return new raw_mmap_pages(len); +#else + return new raw_hack_aligned(len); +#endif + } + + + /* + * a buffer pointer. references (a subsequence of) a raw buffer. + */ + class ptr { + raw *_raw; + unsigned _off, _len; + + public: + ptr() : _raw(0), _off(0), _len(0) {} + ptr(raw *r) : _raw(r), _off(0), _len(r->len) { // no lock needed; this is an unref raw. + ++r->nref; + } + ptr(unsigned l) : _off(0), _len(l) { + _raw = create(l); + ++_raw->nref; + } + ptr(char *d, unsigned l) : _off(0), _len(l) { // ditto. + _raw = copy(d, l); + ++_raw->nref; + } + ptr(const ptr& p) : _raw(p._raw), _off(p._off), _len(p._len) { + if (_raw) { + _raw->lock.Lock(); + ++_raw->nref; + _raw->lock.Unlock(); + } + } + ptr(const ptr& p, unsigned o, unsigned l) : _raw(p._raw), _off(p._off + o), _len(l) { + assert(o+l <= p._len); + assert(_raw); + _raw->lock.Lock(); + ++_raw->nref; + _raw->lock.Unlock(); + } + ptr& operator= (const ptr& p) { + // be careful -- we need to properly handle self-assignment. 
+ if (p._raw) { + p._raw->lock.Lock(); + ++p._raw->nref; // inc new + p._raw->lock.Unlock(); + } + release(); // dec (+ dealloc) old (if any) + _raw = p._raw; // change my ref + _off = p._off; + _len = p._len; + return *this; + } + ~ptr() { + release(); + } + + void release() { + if (_raw) { + _raw->lock.Lock(); + if (--_raw->nref == 0) { + //cout << "hosing raw " << (void*)_raw << " len " << _raw->len << std::endl; + _raw->lock.Unlock(); + delete _raw; // dealloc old (if any) + } else + _raw->lock.Unlock(); + _raw = 0; + } + } + + // misc + bool at_buffer_head() const { return _off == 0; } + bool at_buffer_tail() const { return _off + _len == _raw->len; } + + // accessors + const char *c_str() const { assert(_raw); return _raw->data + _off; } + char *c_str() { assert(_raw); return _raw->data + _off; } + unsigned length() const { return _len; } + unsigned offset() const { return _off; } + unsigned unused_tail_length() const { return _raw->len - (_off+_len); } + const char& operator[](unsigned n) const { + assert(_raw); + assert(n < _len); + return _raw->data[_off + n]; + } + char& operator[](unsigned n) { + assert(_raw); + assert(n < _len); + return _raw->data[_off + n]; + } + + const char *raw_c_str() const { assert(_raw); return _raw->data; } + unsigned raw_length() const { assert(_raw); return _raw->len; } + int raw_nref() const { assert(_raw); return _raw->nref; } + + void copy_out(unsigned o, unsigned l, char *dest) const { + assert(_raw); + assert(o >= 0 && o <= _len); + assert(l >= 0 && o+l <= _len); + memcpy(dest, c_str()+o, l); + } + + unsigned wasted() { + assert(_raw); + return _raw->len - _len; + } + + // modifiers + void set_offset(unsigned o) { _off = o; } + void set_length(unsigned l) { _len = l; } + + void append(const char *p, unsigned l) { + assert(_raw); + assert(l <= unused_tail_length()); + memcpy(c_str() + _len, p, l); + _len += l; + } + + void copy_in(unsigned o, unsigned l, const char *src) { + assert(_raw); + assert(o >= 0 && o <= _len); + 
assert(l >= 0 && o+l <= _len); + memcpy(c_str()+o, src, l); + } + + void zero() { + memset(c_str(), 0, _len); + } + + void clean() { + //raw *newraw = _raw->makesib(_len); + } + }; + + friend std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp); + + /* + * list - the useful bit! + */ + + class list { + // my private bits + std::list _buffers; + unsigned _len; + + public: + // cons/des + list() : _len(0) {} + list(const list& other) : _buffers(other._buffers), _len(other._len) { } + list(unsigned l) : _len(0) { + ptr bp(l); + push_back(bp); + } + ~list() {} + + list& operator= (const list& other) { + _buffers = other._buffers; + _len = other._len; + return *this; + } + + const std::list& buffers() const { return _buffers; } + + unsigned length() const { +#if 0 + // DEBUG: verify _len + unsigned len = 0; + for (std::list::iterator it = _buffers.begin(); + it != _buffers.end(); + it++) { + len += (*it).length(); + } + assert(len == _len); +#endif + return _len; + } + + + // modifiers + void clear() { + _buffers.clear(); + _len = 0; + } + void push_front(ptr& bp) { + _buffers.push_front(bp); + _len += bp.length(); + } + void push_front(raw *r) { + ptr bp(r); + _buffers.push_front(bp); + _len += bp.length(); + } + void push_back(ptr& bp) { + _buffers.push_back(bp); + _len += bp.length(); + } + void push_back(raw *r) { + ptr bp(r); + _buffers.push_back(bp); + _len += bp.length(); + } + void zero() { + for (std::list::iterator it = _buffers.begin(); + it != _buffers.end(); + it++) + it->zero(); + } + + // sort-of-like-assignment-op + void claim(list& bl) { + // free my buffers + clear(); + claim_append(bl); + } + void claim_append(list& bl) { + // steal the other guy's buffers + _len += bl._len; + _buffers.splice( _buffers.end(), bl._buffers ); + bl._len = 0; + } + + // crope lookalikes + void copy(unsigned off, unsigned len, char *dest) { + assert(off >= 0); + assert(off + len <= length()); + /*assert(off < length()); + if (off + len > length()) + len = 
length() - off; + */ + // advance to off + std::list::iterator curbuf = _buffers.begin(); + + // skip off + while (off > 0) { + assert(curbuf != _buffers.end()); + if (off >= (*curbuf).length()) { + // skip this buffer + off -= (*curbuf).length(); + curbuf++; + } else { + // somewhere in this buffer! + break; + } + } + + // copy + while (len > 0) { + // is the rest ALL in this buffer? + if (off + len <= (*curbuf).length()) { + (*curbuf).copy_out(off, len, dest); // yup, last bit! + break; + } + + // get as much as we can from this buffer. + unsigned howmuch = (*curbuf).length() - off; + (*curbuf).copy_out(off, howmuch, dest); + + dest += howmuch; + len -= howmuch; + off = 0; + curbuf++; + assert(curbuf != _buffers.end()); + } + } + + void copy_in(unsigned off, unsigned len, const char *src) { + assert(off >= 0); + assert(off + len <= length()); + + // advance to off + std::list::iterator curbuf = _buffers.begin(); + + // skip off + while (off > 0) { + assert(curbuf != _buffers.end()); + if (off >= (*curbuf).length()) { + // skip this buffer + off -= (*curbuf).length(); + curbuf++; + } else { + // somewhere in this buffer! + break; + } + } + + // copy + while (len > 0) { + // is the rest ALL in this buffer? + if (off + len <= (*curbuf).length()) { + (*curbuf).copy_in(off, len, src); // yup, last bit! + break; + } + + // get as much as we can from this buffer. 
+ unsigned howmuch = (*curbuf).length() - off; + (*curbuf).copy_in(off, howmuch, src); + + src += howmuch; + len -= howmuch; + off = 0; + curbuf++; + assert(curbuf != _buffers.end()); + } + } + void copy_in(unsigned off, unsigned len, const list& bl) { + unsigned left = len; + for (std::list::const_iterator i = bl._buffers.begin(); + i != bl._buffers.end(); + i++) { + unsigned l = (*i).length(); + if (left < l) l = left; + copy_in(off, l, (*i).c_str()); + left -= l; + if (left == 0) break; + off += l; + } + } + + + void append(const char *data, unsigned len) { + if (len == 0) return; + + unsigned alen = 0; + + // copy into the tail buffer? + if (!_buffers.empty()) { + unsigned avail = _buffers.back().unused_tail_length(); + if (avail > 0) { + //std::cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf " << _buffers.back() << std::endl; + if (avail > len) + avail = len; + _buffers.back().append(data, avail); + _len += avail; + data += avail; + len -= avail; + } + alen = _buffers.back().length(); + } + if (len == 0) return; + + // just add another buffer. + // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter! + if (alen < 4096) alen = 4096; + ptr bp = create(alen); + bp.set_length(len); + bp.copy_in(0, len, data); + push_back(bp); + } + void append(ptr& bp) { + push_back(bp); + } + void append(ptr& bp, unsigned off, unsigned len) { + assert(len+off <= bp.length()); + ptr tempbp(bp, off, len); + push_back(tempbp); + } + void append(const list& bl) { + list temp(bl); // copy list + claim_append(temp); // and append + } + + + /* + * get a char + */ + const char& operator[](unsigned n) { + assert(n < _len); + for (std::list::iterator p = _buffers.begin(); + p != _buffers.end(); + p++) { + if (n >= p->length()) { + n -= p->length(); + continue; + } + return (*p)[n]; + } + assert(0); + } + + /* + * return a contiguous ptr to whole bufferlist contents. 
+     */
+    char *c_str() {
+      if (_buffers.size() == 1) {
+        return _buffers.front().c_str();  // good, we're already contiguous.
+      }
+      else if (_buffers.size() == 0) {
+        return 0;                         // no buffers
+      } 
+      else {
+        ptr newbuf = create(length());     // make one new contiguous buffer.
+        copy(0, length(), newbuf.c_str()); // copy myself into it.
+        clear();
+        push_back(newbuf);
+        return newbuf.c_str();            // now it'll work.
+      }
+    }
+
+    /*
+     * become a reference to bytes [off, off+len) of other.  no data is
+     * copied; the new ptrs share other's raw buffers.
+     *
+     * the result is assembled on the side and installed at the end, so
+     * other may be *this.
+     */
+    void substr_of(list& other, unsigned off, unsigned len) {
+      assert(off + len <= other.length());
+
+      std::list<ptr> sub;
+      unsigned sublen = 0;
+      
+      // skip off
+      std::list<ptr>::iterator curbuf = other._buffers.begin();
+      while (off > 0) {
+        assert(curbuf != other._buffers.end());
+        if (off >= (*curbuf).length()) {
+          // skip this buffer
+          //cout << "skipping over " << *curbuf << endl;
+          off -= (*curbuf).length();
+          curbuf++;
+        } else {
+          // somewhere in this buffer!
+          //cout << "somewhere in " << *curbuf << endl;
+          break;
+        }
+      }
+      
+      while (len > 0) {
+        assert(curbuf != other._buffers.end());
+
+        // partial?
+        if (off + len < (*curbuf).length()) {
+          //cout << "copying partial of " << *curbuf << endl;
+          sub.push_back( ptr( *curbuf, off, len ) );
+          sublen += len;
+          break;
+        }
+        
+        // through end
+        //cout << "copying end (all?) of " << *curbuf << endl;
+        unsigned howmuch = (*curbuf).length() - off;
+        sub.push_back( ptr( *curbuf, off, howmuch ) );
+        sublen += howmuch;
+        len -= howmuch;
+        off = 0;
+        curbuf++;
+      }
+
+      // install the result; our previous buffers are released when sub
+      // goes out of scope.
+      _buffers.swap(sub);
+      _len = sublen;
+    }
+
+
+    // funky modifer
+    //  remove bytes [off, off+len) from this list.  if claim_by is given,
+    //  references to the removed bytes are appended to it.
+    void splice(unsigned off, unsigned len, list *claim_by=0 /*, bufferlist& replace_with */) {    // fixme?
+      assert(off < length()); 
+      assert(len > 0);
+      assert(off + len <= length());   // otherwise the loop below runs off the end of _buffers
+      //cout << "splice off " << off << " len " << len << " ... mylen = " << length() << endl;
+        
+      // skip off
+      std::list<ptr>::iterator curbuf = _buffers.begin();
+      while (off > 0) {
+        assert(curbuf != _buffers.end());
+        if (off >= (*curbuf).length()) {
+          // skip this buffer
+          //cout << "off = " << off << " skipping over " << *curbuf << endl;
+          off -= (*curbuf).length();
+          curbuf++;
+        } else {
+          // somewhere in this buffer!
+          //cout << "off = " << off << " somewhere in " << *curbuf << endl;
+          break;
+        }
+      }
+
+      if (off) {
+        // add a reference to the front bit
+        //  insert it before curbuf (which we'll hose)
+        //cout << "keeping front " << off << " of " << *curbuf << endl;
+        _buffers.insert( curbuf, ptr( *curbuf, 0, off ) );
+        _len += off;
+      }
+      
+      while (len > 0) {
+        // partial?
+        if (off + len < (*curbuf).length()) {
+          //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl;
+          if (claim_by) 
+            claim_by->append( *curbuf, off, len );
+          (*curbuf).set_offset( off+len + (*curbuf).offset() );    // ignore beginning big
+          (*curbuf).set_length( (*curbuf).length() - (len+off) );
+          _len -= off+len;
+          //cout << " now " << *curbuf << endl;
+          break;
+        }
+        
+        // hose though the end
+        unsigned howmuch = (*curbuf).length() - off;
+        //cout << "discarding " << howmuch << " of " << *curbuf << endl;
+        if (claim_by) 
+          claim_by->append( *curbuf, off, howmuch );
+        _len -= (*curbuf).length();
+        _buffers.erase( curbuf++ );
+        len -= howmuch;
+        off = 0;
+      }
+        
+      // splice in *replace (implement me later?)
+    }
+
+  };
+
+};
+
+typedef buffer::ptr bufferptr;
+typedef buffer::list bufferlist;
+
+
+/*
+ * bufferlist comparison: byte by byte from the front; if one list is a
+ * prefix of the other, the shorter one sorts first.
+ */
+inline bool operator>(bufferlist& l, bufferlist& r) {
+  for (unsigned p = 0; ; p++) {
+    if (l.length() > p && r.length() == p) return true;   // r is a proper prefix of l
+    if (l.length() == p) return false;                    // l ran out (first, or together with r)
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+inline bool operator>=(bufferlist& l, bufferlist& r) {
+  for (unsigned p = 0; ; p++) {
+    if (l.length() > p && r.length() == p) return true;   // r is a proper prefix of l
+    if (r.length() == p && l.length() == p) return true;  // equal
+    if (l.length() == p) return false;                    // l is a proper prefix of r
+    if (l[p] > r[p]) return true;
+    if (l[p] < r[p]) return false;
+  }
+}
+inline bool operator<(bufferlist& l, bufferlist& r) {
+  return r > l;
+}
+inline bool operator<=(bufferlist& l, bufferlist& r) {
+  return r >= l;
+}
+
+
+// debug output: raw buffer address, length and reference count.
+inline std::ostream& operator<<(std::ostream& out, const buffer::raw &r) {
+  return out << "buffer::raw(" << (void*)r.data << " len " << r.len << " nref " << r.nref << ")";
+}
+
+// debug output: offset~length of the ptr plus the raw buffer it points into.
+inline std::ostream& operator<<(std::ostream& out, const buffer::ptr& bp) {
+  out << "buffer::ptr(" << bp.offset() << "~" << bp.length()
+      << " " << (void*)bp.c_str() 
+      << " in raw " << (void*)bp.raw_c_str()
+      << " len " << bp.raw_length()
+      << " nref " << bp.raw_nref() << ")";
+  return out;
+}
+
+// debug output: total length, then one ptr per line.
+inline std::ostream& operator<<(std::ostream& out, const buffer::list& bl) {
+  out << "buffer::list(len=" << bl.length() << "," << std::endl;
+
+  std::list<buffer::ptr>::const_iterator it = bl.buffers().begin();
+  while (it != bl.buffers().end()) {
+    out << "\t" << *it;
+    if (++it == bl.buffers().end()) break;
+    out << "," << std::endl;
+  }
+  out << std::endl << ")";
+  return out;
+}
+
+
+
+
+// encoder/decode helpers
+
+// string
+//  encoded as the characters followed by the terminating NUL.
+inline void _encode(const std::string& s, bufferlist& bl) 
+{
+  bl.append(s.c_str(), s.length()+1);
+}
+//  reads a NUL-terminated string at 'off' and advances 'off' past the NUL.
+inline void _decode(std::string& s, bufferlist& bl, int& off) 
+{
+  // an encoded string is at least its NUL byte, so off must lie inside
+  // bl (this also rules out the null c_str() of an empty list).
+  assert(off >= 0 && (unsigned)off < bl.length());
+  s = bl.c_str() + off;
+  off += s.length() + 1;
+}
+
+// bufferptr (encapsulated)
+inline void _encode(bufferptr& bp, bufferlist& bl) 
+{
+  size_t len = 
bp.length(); + bl.append((char*)&len, sizeof(len)); + bl.append(bp); +} +inline void _decode(bufferptr& bp, bufferlist& bl, int& off) +{ + size_t len; + bl.copy(off, sizeof(len), (char*)&len); + off += sizeof(len); + bufferlist s; + s.substr_of(bl, off, len); + off += len; + + if (s.buffers().size() == 1) + bp = s.buffers().front(); + else + bp = buffer::copy(s.c_str(), s.length()); +} + +// bufferlist (encapsulated) +inline void _encode(const bufferlist& s, bufferlist& bl) +{ + size_t len = s.length(); + bl.append((char*)&len, sizeof(len)); + bl.append(s); +} +inline void _decode(bufferlist& s, bufferlist& bl, int& off) +{ + size_t len; + bl.copy(off, sizeof(len), (char*)&len); + off += sizeof(len); + s.substr_of(bl, off, len); + off += len; +} + +#include +#include +#include +#include + +// set +template +inline void _encode(std::set& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename std::set::iterator it = s.begin(); + it != s.end(); + it++) { + T v = *it; + bl.append((char*)&v, sizeof(v)); + n--; + } + assert(n==0); +} +template +inline void _decode(std::set& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i +template +inline void _encode(std::vector& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename std::vector::iterator it = s.begin(); + it != s.end(); + it++) { + T v = *it; + bl.append((char*)&v, sizeof(v)); + n--; + } + assert(n==0); +} +template +inline void _decode(std::vector& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + s = std::vector(n); + for (int i=0; i +template +inline void _encode(const std::list& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename std::list::const_iterator it = s.begin(); + it != s.end(); + it++) { + T v = *it; + bl.append((char*)&v, sizeof(v)); + n--; + } + 
assert(n==0); +} +template +inline void _decode(std::list& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i +inline void _encode(std::map& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (std::map::iterator it = s.begin(); + it != s.end(); + it++) { + _encode(it->first, bl); + _encode(it->second, bl); + n--; + } + assert(n==0); +} +inline void _decode(std::map& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i +template +inline void _encode(const std::map& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + //std::cout << "n = " << n << std::endl; + for (typename std::map::const_iterator it = s.begin(); + it != s.end(); + it++) { + T k = it->first; + bl.append((char*)&k, sizeof(k)); + _encode(it->second, bl); + n--; + //std::cout << "--n = " << n << " after k " << k << std::endl; + } + assert(n==0); +} +template +inline void _decode(std::map& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i +template +inline void _encode(const std::map& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename std::map::const_iterator it = s.begin(); + it != s.end(); + it++) { + T k = it->first; + U v = it->second; + bl.append((char*)&k, sizeof(k)); + bl.append((char*)&v, sizeof(v)); + n--; + } + assert(n==0); +} +template +inline void _decode(std::map& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define SYSERROR() syserror("At %s:%d", __FILE__, __LINE__) + +#define ASSERT(c) \ + ((c) || (exiterror("Assertion failed at %s:%d", __FILE__, __LINE__), 1)) + +/* print usage error message and exit */ +extern void userror(const char *use, const char *fmt, ...); + +/* print system error message and exit */ +extern void syserror(const char *fmt, ...); + +/* print error message and exit */ +extern void exiterror(const char *fmt, ...); + +/* print error message */ +extern void error(const char *fmt, ...); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/branches/sage/cephmds2/include/filepath.h b/branches/sage/cephmds2/include/filepath.h new file mode 100644 index 0000000000000..5585e536b42db --- /dev/null +++ b/branches/sage/cephmds2/include/filepath.h @@ -0,0 +1,206 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __FILEPATH_H +#define __FILEPATH_H + + +/* + * BUG: /a/b/c is equivalent to a/b/c in dentry-breakdown, but not string. + * -> should it be different? how? should this[0] be "", with depth 4? 
+ * + */ + + +#include +#include +#include +using namespace std; + +#include +using namespace __gnu_cxx; + +#include "buffer.h" + + +class filepath { + string path; + vector bits; + + void rebuild() { + if (absolute()) + path = "/"; + else + path.clear(); + for (unsigned i=0; i::iterator it = bits.begin(); + it != bits.end(); + it++) { + r.append((*it).c_str(), (*it).length()+1); + } + } + + void _unrope(crope& r, int& off) { + clear(); + + char n; + r.copy(off, sizeof(char), (char*)&n); + off += sizeof(char); + for (int i=0; i::iterator it = bits.begin(); + it != bits.end(); + it++) { + bl.append((*it).c_str(), (*it).length()+1); + } + } + + void _decode(bufferlist& bl, int& off) { + clear(); + + char n; + bl.copy(off, sizeof(char), (char*)&n); + off += sizeof(char); + for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __INTERVAL_SET_H +#define __INTERVAL_SET_H + +#include +#include +#include +using namespace std; + +#ifndef MIN +# define MIN(a,b) ((a)<=(b) ? (a):(b)) +#endif +#ifndef MAX +# define MAX(a,b) ((a)>=(b) ? (a):(b)) +#endif + + +template +class interval_set { + public: + map m; // map start -> len + + // helpers + private: + typename map::const_iterator find_inc(T start) const { + typename map::const_iterator p = m.lower_bound(start); // p->first >= start + if (p != m.begin() && + (p == m.end() || p->first > start)) { + p--; // might overlap? + if (p->first + p->second <= start) + p++; // it doesn't. + } + return p; + } + + typename map::iterator find_inc_m(T start) { + typename map::iterator p = m.lower_bound(start); + if (p != m.begin() && + (p == m.end() || p->first > start)) { + p--; // might overlap? + if (p->first + p->second <= start) + p++; // it doesn't. 
+ } + return p; + } + + typename map::const_iterator find_adj(T start) const { + typename map::const_iterator p = m.lower_bound(start); + if (p != m.begin() && + (p == m.end() || p->first > start)) { + p--; // might touch? + if (p->first + p->second < start) + p++; // it doesn't. + } + return p; + } + + typename map::iterator find_adj_m(T start) { + typename map::iterator p = m.lower_bound(start); + if (p != m.begin() && + (p == m.end() || p->first > start)) { + p--; // might touch? + if (p->first + p->second < start) + p++; // it doesn't. + } + return p; + } + + public: + bool operator==(const interval_set& other) const { + return m == other.m; + } + + void clear() { + m.clear(); + } + + bool contains(T i) const { + typename map::const_iterator p = find_inc(i); + if (p == m.end()) return false; + if (p->first > i) return false; + if (p->first+p->second <= i) return false; + assert(p->first <= i && p->first+p->second > i); + return true; + } + bool contains(T start, T len) const { + typename map::const_iterator p = find_inc(start); + if (p == m.end()) return false; + if (p->first > start) return false; + if (p->first+p->second <= start) return false; + assert(p->first <= start && p->first+p->second > start); + if (p->first+p->second < start+len) return false; + return true; + } + bool intersects(T start, T len) const { + interval_set a; + a.insert(start, len); + interval_set i; + i.intersection_of( *this, a ); + if (i.empty()) return false; + return true; + } + + // outer range of set + bool empty() const { + return m.empty(); + } + T start() const { + assert(!empty()); + typename map::const_iterator p = m.begin(); + return p->first; + } + T end() const { + assert(!empty()); + typename map::const_iterator p = m.end(); + p--; + return p->first+p->second; + } + + // interval start after p (where p not in set) + bool starts_after(T i) const { + assert(!contains(i)); + typename map::const_iterator p = find_inc(i); + if (p == m.end()) return false; + return true; + } + 
T start_after(T i) const { + assert(!contains(i)); + typename map::const_iterator p = find_inc(i); + return p->first; + } + + // interval end that contains start + T end_after(T start) const { + assert(contains(start)); + typename map::const_iterator p = find_inc(start); + return p->first+p->second; + } + + void insert(T val) { + insert(val, 1); + } + + void insert(T start, T len) { + //cout << "insert " << start << "~" << len << endl; + assert(len > 0); + typename map::iterator p = find_adj_m(start); + if (p == m.end()) { + m[start] = len; // new interval + } else { + if (p->first < start) { + + if (p->first + p->second != start) { + //cout << "p is " << p->first << "~" << p->second << ", start is " << start << ", len is " << len << endl; + assert(0); + } + + assert(p->first + p->second == start); + p->second += len; // append to end + + typename map::iterator n = p; + n++; + if (n != m.end() && + start+len == n->first) { // combine with next, too! + p->second += n->second; + m.erase(n); + } + } else { + if (start+len == p->first) { + m[start] = len + p->second; // append to front + m.erase(p); + } else { + assert(p->first > start+len); + m[start] = len; // new interval + } + } + } + } + + void erase(T val) { + erase(val, 1); + } + + void erase(T start, T len) { + typename map::iterator p = find_inc_m(start); + + assert(p != m.end()); + assert(p->first <= start); + + T before = start - p->first; + assert(p->second >= before+len); + T after = p->second - before - len; + + if (before) + p->second = before; // shorten bit before + else + m.erase(p); + if (after) + m[start+len] = after; + } + + + void subtract(const interval_set &a) { + for (typename map::const_iterator p = a.m.begin(); + p != a.m.end(); + p++) + erase(p->first, p->second); + } + + void insert(const interval_set &a) { + for (typename map::const_iterator p = a.m.begin(); + p != a.m.end(); + p++) + insert(p->first, p->second); + } + + + void intersection_of(const interval_set &a, const interval_set &b) 
{ + assert(&a != this); + assert(&b != this); + clear(); + + typename map::const_iterator pa = a.m.begin(); + typename map::const_iterator pb = b.m.begin(); + + while (pa != a.m.end() && pb != b.m.end()) { + // passing? + if (pa->first + pa->second <= pb->first) + { pa++; continue; } + if (pb->first + pb->second <= pa->first) + { pb++; continue; } + T start = MAX(pa->first, pb->first); + T end = MIN(pa->first+pa->second, pb->first+pb->second); + assert(end > start); + insert(start, end-start); + if (pa->first+pa->second > pb->first+pb->second) + pb++; + else + pa++; + } + } + + void union_of(const interval_set &a, const interval_set &b) { + assert(&a != this); + assert(&b != this); + clear(); + + //cout << "union_of" << endl; + + // a + m = a.m; + + // - (a*b) + interval_set ab; + ab.intersection_of(a, b); + subtract(ab); + + // + b + insert(b); + return; + } + void union_of(const interval_set &b) { + interval_set a; + a.m.swap(m); + union_of(a, b); + } + + bool subset_of(const interval_set &big) const { + for (typename map::const_iterator i = m.begin(); + i != m.end(); + i++) + if (!big.contains(i->first, i->second)) return false; + return true; + } + +}; + +template +inline ostream& operator<<(ostream& out, const interval_set &s) { + out << "["; + for (typename map::const_iterator i = s.m.begin(); + i != s.m.end(); + i++) { + if (i != s.m.begin()) out << ","; + out << i->first << "~" << i->second; + } + out << "]"; + return out; +} + + +#endif diff --git a/branches/sage/cephmds2/include/lru.h b/branches/sage/cephmds2/include/lru.h new file mode 100644 index 0000000000000..63096d0e32079 --- /dev/null +++ b/branches/sage/cephmds2/include/lru.h @@ -0,0 +1,321 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 
2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#ifndef __LRU_H +#define __LRU_H + +#include +#include +using namespace std; + +#include "config.h" + + + +class LRUObject { + private: + LRUObject *lru_next, *lru_prev; + bool lru_pinned; + class LRU *lru; + class LRUList *lru_list; + + public: + LRUObject() { + lru_next = lru_prev = NULL; + lru_list = 0; + lru_pinned = false; + lru = 0; + } + + // pin/unpin item in cache + void lru_pin(); + void lru_unpin(); + bool lru_is_expireable() { return !lru_pinned; } + + friend class LRU; + friend class LRUList; +}; + + +class LRUList { + private: + LRUObject *head, *tail; + __uint32_t len; + + public: + LRUList() { + head = tail = 0; + len = 0; + } + + __uint32_t get_length() { return len; } + + LRUObject *get_head() { + return head; + } + LRUObject *get_tail() { + return tail; + } + + void insert_head(LRUObject *o) { + o->lru_next = head; + o->lru_prev = NULL; + if (head) { + head->lru_prev = o; + } else { + tail = o; + } + head = o; + o->lru_list = this; + len++; + } + void insert_tail(LRUObject *o) { + o->lru_next = NULL; + o->lru_prev = tail; + if (tail) { + tail->lru_next = o; + } else { + head = o; + } + tail = o; + o->lru_list = this; + len++; + } + + void remove(LRUObject *o) { + assert(o->lru_list == this); + if (o->lru_next) + o->lru_next->lru_prev = o->lru_prev; + else + tail = o->lru_prev; + if (o->lru_prev) + o->lru_prev->lru_next = o->lru_next; + else + head = o->lru_next; + o->lru_next = o->lru_prev = NULL; + o->lru_list = 0; + assert(len>0); + len--; + } + +}; + + +class LRU { + protected: + LRUList lru_top, lru_bot, lru_pintail; + __uint32_t lru_num, lru_num_pinned; + __uint32_t lru_max; // max items + double lru_midpoint; + + friend class LRUObject; + //friend class MDCache; // hack + + public: + LRU(int max = 0) { + lru_num = 0; + lru_num_pinned = 0; + lru_midpoint = .9; + lru_max = max; + } + + __uint32_t lru_get_size() { return lru_num; } + __uint32_t 
lru_get_top() { return lru_top.get_length(); } + __uint32_t lru_get_bot() { return lru_bot.get_length(); } + __uint32_t lru_get_pintail() { return lru_pintail.get_length(); } + __uint32_t lru_get_max() { return lru_max; } + __uint32_t lru_get_num_pinned() { return lru_num_pinned; } + + void lru_set_max(__uint32_t m) { lru_max = m; } + void lru_set_midpoint(float f) { lru_midpoint = f; } + + + // insert at top of lru + void lru_insert_top(LRUObject *o) { + //assert(!o->lru_in_lru); + //o->lru_in_lru = true; + assert(!o->lru); + o->lru = this; + lru_top.insert_head( o ); + lru_num++; + if (o->lru_pinned) lru_num_pinned++; + lru_adjust(); + } + + // insert at mid point in lru + void lru_insert_mid(LRUObject *o) { + //assert(!o->lru_in_lru); + //o->lru_in_lru = true; + assert(!o->lru); + o->lru = this; + lru_bot.insert_head(o); + lru_num++; + if (o->lru_pinned) lru_num_pinned++; + } + + // insert at bottom of lru + void lru_insert_bot(LRUObject *o) { + assert(!o->lru); + o->lru = this; + lru_bot.insert_tail(o); + lru_num++; + if (o->lru_pinned) lru_num_pinned++; + } + + /* + // insert at bottom of lru + void lru_insert_pintail(LRUObject *o) { + assert(!o->lru); + o->lru = this; + + assert(o->lru_pinned); + + lru_pintail.insert_head(o); + lru_num++; + lru_num_pinned += o->lru_pinned; + } + */ + + + + + // adjust top/bot balance, as necessary + void lru_adjust() { + if (!lru_max) return; + + unsigned toplen = lru_top.get_length(); + unsigned topwant = (unsigned)(lru_midpoint * (double)lru_max); + while (toplen > 0 && + toplen > topwant) { + // remove from tail of top, stick at head of bot + // FIXME: this could be way more efficient by moving a whole chain of items. + + LRUObject *o = lru_top.get_tail(); + lru_top.remove(o); + lru_bot.insert_head(o); + toplen--; + } + } + + + // remove an item + LRUObject *lru_remove(LRUObject *o) { + // not in list + //assert(o->lru_in_lru); + //if (!o->lru_in_lru) return o; // might have expired and been removed that way. 
+ if (!o->lru) return o; + + + if (o->lru_list == &lru_top) + lru_top.remove(o); + else if (o->lru_list == &lru_bot) + lru_bot.remove(o); + else if (o->lru_list == &lru_pintail) + lru_pintail.remove(o); + else + assert(0); + + lru_num--; + if (o->lru_pinned) lru_num_pinned--; + o->lru = 0; + return o; + } + + // touch item -- move to head of lru + bool lru_touch(LRUObject *o) { + lru_remove(o); + lru_insert_top(o); + return true; + } + + // touch item -- move to midpoint (unless already higher) + bool lru_midtouch(LRUObject *o) { + if (o->lru_list == &lru_top) return false; + + lru_remove(o); + lru_insert_mid(o); + return true; + } + + // touch item -- move to bottom + bool lru_bottouch(LRUObject *o) { + lru_remove(o); + lru_insert_bot(o); + return true; + } + + + // expire -- expire a single item + LRUObject *lru_get_next_expire() { + LRUObject *p; + + // look through tail of bot + while (lru_bot.get_length()) { + p = lru_bot.get_tail(); + if (!p->lru_pinned) return p; + + // move to pintail + lru_bot.remove(p); + lru_pintail.insert_head(p); + } + + // ok, try head then + while (lru_top.get_length()) { + p = lru_top.get_tail(); + if (!p->lru_pinned) return p; + + // move to pintail + lru_top.remove(p); + lru_pintail.insert_head(p); + } + + // no luck! 
+ return NULL; + } + + LRUObject *lru_expire() { + LRUObject *p = lru_get_next_expire(); + if (p) + return lru_remove(p); + return NULL; + } + + + void lru_status() { + dout(10) << "lru: " << lru_num << " items, " << lru_top.get_length() << " top, " << lru_bot.get_length() << " bot, " << lru_pintail.get_length() << " pintail" << endl; + } + +}; + + +inline void LRUObject::lru_pin() +{ + lru_pinned = true; + if (lru) lru->lru_num_pinned++; +} +inline void LRUObject::lru_unpin() { + lru_pinned = false; + if (lru) { + lru->lru_num_pinned--; + + // move from pintail -> bot + if (lru_list == &lru->lru_pintail) { + lru->lru_pintail.remove(this); + lru->lru_bot.insert_tail(this); + } + } +} + +#endif diff --git a/branches/sage/cephmds2/include/object.h b/branches/sage/cephmds2/include/object.h new file mode 100644 index 0000000000000..3a66c4ab83d54 --- /dev/null +++ b/branches/sage/cephmds2/include/object.h @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __OBJECT_H +#define __OBJECT_H + +#include +#include +using namespace std; + + +typedef __uint32_t objectrev_t; + +struct object_t { + static const __uint32_t MAXREV = 0xffffffffU; + + __uint64_t ino; // "file" identifier + __uint32_t bno; // "block" in that "file" + objectrev_t rev; // revision. normally ctime (as epoch). 
+ + object_t() : ino(0), bno(0), rev(0) {} + object_t(__uint64_t i, __uint32_t b) : ino(i), bno(b), rev(0) {} +}; + + +inline bool operator==(const object_t l, const object_t r) { + return (l.ino == r.ino) && (l.bno == r.bno) && (l.rev == r.rev); +} +inline bool operator!=(const object_t l, const object_t r) { + return (l.ino != r.ino) || (l.bno != r.bno) || (l.rev != r.rev); +} +inline bool operator>(const object_t l, const object_t r) { + if (l.ino > r.ino) return true; + if (l.ino < r.ino) return false; + if (l.bno > r.bno) return true; + if (l.bno < r.bno) return false; + if (l.rev > r.rev) return true; + return false; +} +inline bool operator<(const object_t l, const object_t r) { + if (l.ino < r.ino) return true; + if (l.ino > r.ino) return false; + if (l.bno < r.bno) return true; + if (l.bno > r.bno) return false; + if (l.rev < r.rev) return true; + return false; +} +inline bool operator>=(const object_t l, const object_t r) { + return !(l < r); +} +inline bool operator<=(const object_t l, const object_t r) { + return !(l > r); +} +inline ostream& operator<<(ostream& out, const object_t o) { + out << hex << o.ino << '.'; + out.setf(ios::right); + out.fill('0'); + out << setw(8) << o.bno << dec; + out.unsetf(ios::right); + if (o.rev) + out << '.' 
<< o.rev; + return out; +} +namespace __gnu_cxx { + template<> struct hash<__uint64_t> { + size_t operator()(__uint64_t __x) const { + static hash<__uint32_t> H; + return H((__x >> 32) ^ (__x & 0xffffffff)); + } + }; + + template<> struct hash { + size_t operator()(const object_t &r) const { + static hash<__uint64_t> H; + static hash<__uint32_t> I; + return H(r.ino) ^ I(r.bno); + } + }; +} + +#endif diff --git a/branches/sage/cephmds2/include/oldbuffer.h b/branches/sage/cephmds2/include/oldbuffer.h new file mode 100644 index 0000000000000..fda7336bc6461 --- /dev/null +++ b/branches/sage/cephmds2/include/oldbuffer.h @@ -0,0 +1,357 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __BUFFER_H +#define __BUFFER_H + +#include +#include + +#include +using namespace std; + +// bit masks +#define BUFFER_MODE_NOCOPY 0 +#define BUFFER_MODE_COPY 1 // copy on create, my buffer + +#define BUFFER_MODE_NOFREE 0 +#define BUFFER_MODE_FREE 2 + +#define BUFFER_MODE_CUSTOMFREE 4 + +#define BUFFER_MODE_DEFAULT 3//(BUFFER_MODE_COPY|BUFFER_MODE_FREE) + + +// debug crap +#include "config.h" +#define bdbout(x) if (x <= g_conf.debug_buffer) cout + +#include "common/Mutex.h" + +// HACK: in config.cc +/* + * WARNING: bufferlock placements are tricky for efficiency. note that only bufferptr and + * buffer ever use buffer._ref, and only bufferptr should call ~buffer(). + * + * So, I only need to protect: + * - buffer()'s modification of buffer_total_alloc + * - ~bufferptr() check of buffer._ref, and ~buffer's mod of buffer_total_alloc + * + * I don't protect + * - buffer._get() .. 
increment is atomic on any sane architecture + * - buffer._put() .. only called by ~bufferptr. + * - ~buffer .. only called by ~bufferptr *** I HOPE!! + */ +extern Mutex bufferlock; +extern long buffer_total_alloc; + + +typedef void (buffer_free_func_t)(void*,char*,unsigned); + + +/* + * buffer - the underlying buffer container. with a reference count. + * + * the buffer never shrinks. + * + * some invariants: + * _len never shrinks + * _len <= _alloc_len + */ +class buffer { + protected: + //wtf + //static Mutex bufferlock; + //static long buffer_total_alloc;// = 0; + + private: + // raw buffer alloc + char *_dataptr; + bool _myptr; + unsigned _len; + unsigned _alloc_len; + + // ref counts + unsigned _ref; + int _get() { + bdbout(1) << "buffer.get " << *this << " get " << _ref+1 << endl; + return ++_ref; + } + int _put() { + bdbout(1) << "buffer.put " << *this << " put " << _ref-1 << endl; + assert(_ref > 0); + return --_ref; + } + + // custom (de!)allocator + buffer_free_func_t *free_func; + void *free_func_arg; + + friend class bufferptr; + + public: + // constructors + buffer() : _dataptr(0), _myptr(true), _len(0), _alloc_len(0), _ref(0), free_func(0), free_func_arg(0) { + bdbout(1) << "buffer.cons " << *this << endl; + } + buffer(unsigned a) : _dataptr(0), _myptr(true), _len(a), _alloc_len(a), _ref(0), free_func(0), free_func_arg(0) { + bdbout(1) << "buffer.cons " << *this << endl; + _dataptr = new char[a]; + bufferlock.Lock(); + buffer_total_alloc += _alloc_len; + bufferlock.Unlock(); + bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; + } + ~buffer() { + bdbout(1) << "buffer.des " << *this << " " << (void*)free_func << endl; + if (free_func) { + bdbout(1) << "buffer.custom_free_func " << free_func_arg << " " << (void*)_dataptr << endl; + free_func( free_func_arg, _dataptr, _alloc_len ); + } + else if (_dataptr && _myptr) { + bdbout(1) << "buffer.free " << (void*)_dataptr << endl; + delete[] _dataptr; + buffer_total_alloc -= _alloc_len; + } + } + + 
buffer(const char *p, int l, int mode=BUFFER_MODE_DEFAULT, int alloc_len=0, + buffer_free_func_t free_func=0, void* free_func_arg=0) : + _dataptr(0), + _myptr(false), + _len(l), + _ref(0), + free_func(0), free_func_arg(0) { + + if (alloc_len) + _alloc_len = alloc_len; + else + _alloc_len = l; + + _myptr = mode & BUFFER_MODE_FREE ? true:false; + bdbout(1) << "buffer.cons " << *this << " mode = " << mode << ", myptr=" << _myptr << endl; + if (mode & BUFFER_MODE_COPY) { + _dataptr = new char[_alloc_len]; + bdbout(1) << "buffer.malloc " << (void*)_dataptr << endl; + bufferlock.Lock(); + buffer_total_alloc += _alloc_len; + bufferlock.Unlock(); + memcpy(_dataptr, p, l); + bdbout(1) << "buffer.copy " << *this << endl; + } else { + _dataptr = (char*)p; // ugly + bdbout(1) << "buffer.claim " << *this << " myptr=" << _myptr << endl; + } + + if (mode & BUFFER_MODE_CUSTOMFREE && free_func) { + this->free_func = free_func; + this->free_func_arg = free_func_arg; + } + } + + // operators + buffer& operator=(buffer& other) { + assert(0); // not implemented, no reasonable assignment semantics. + return *this; + } + + char *c_str() { + return _dataptr; + } + + bool has_free_func() { return free_func != 0; } + + // accessor + unsigned alloc_length() { + return _alloc_len; + } + void set_length(unsigned l) { + assert(l <= _alloc_len); + _len = l; + } + unsigned length() { return _len; } + unsigned unused_tail_length() { return _alloc_len - _len; } + + friend ostream& operator<<(ostream& out, buffer& b); +}; + +inline ostream& operator<<(ostream& out, buffer& b) { + return out << "buffer(this=" << &b << " len=" << b._len << ", alloc=" << b._alloc_len << ", data=" << (void*)b._dataptr << " ref=" << b._ref << ")"; +} + + +/* + * smart pointer class for buffer + * + * we reference count the actual buffer. + * we also let you refer to a subset of a buffer. + * we implement the high-level buffer accessor methods. 
+ * + * some invariants: + * _off < _buffer->_len + * _off + _len <= _buffer->_len + */ +class bufferptr { + private: + buffer *_buffer; + unsigned _len, _off; + + public: + // empty cons + bufferptr() : + _buffer(0), + _len(0), + _off(0) { } + // main cons - the entire buffer + bufferptr(buffer *b) : + _buffer(b), + _len(b->_len), + _off(0) { + assert(_buffer->_ref == 0); + _buffer->_get(); // this is always the first one. + } + // subset cons - a subset of another bufferptr (subset) + bufferptr(const bufferptr& bp, unsigned len, unsigned off) { + bufferlock.Lock(); + _buffer = bp._buffer; + _len = len; + _off = bp._off + off; + _buffer->_get(); + assert(_off < _buffer->_len); // sanity checks + assert(_off + _len <= _buffer->_len); + bufferlock.Unlock(); + } + + // copy cons + bufferptr(const bufferptr &other) { + bufferlock.Lock(); + _buffer = other._buffer; + _len = other._len; + _off = other._off; + if (_buffer) _buffer->_get(); + bufferlock.Unlock(); + } + + // assignment operator + bufferptr& operator=(const bufferptr& other) { + //assert(0); + // discard old + discard_buffer(); + + // point to other + bufferlock.Lock(); + _buffer = other._buffer; + _len = other._len; + _off = other._off; + if (_buffer) _buffer->_get(); + bufferlock.Unlock(); + return *this; + } + + ~bufferptr() { + discard_buffer(); + } + + void discard_buffer() { + if (_buffer) { + bufferlock.Lock(); + if (_buffer->_put() == 0) + delete _buffer; + _buffer = 0; + bufferlock.Unlock(); + } + } + + + // dereference to get the actual buffer + buffer& operator*() { + return *_buffer; + } + + + bool at_buffer_head() const { + return _off == 0; + } + bool at_buffer_tail() const { + return _off + _len == _buffer->_len; + } + + // accessors for my subset + char *c_str() { + return _buffer->c_str() + _off; + } + unsigned length() const { + return _len; + } + unsigned offset() const { + return _off; + } + unsigned unused_tail_length() { + if (!at_buffer_tail()) return 0; + return 
_buffer->unused_tail_length(); + } + + + + // modifiers + void set_offset(unsigned off) { + assert(off <= _buffer->_alloc_len); + _off = off; + } + void set_length(unsigned len) { + assert(len >= 0 && _off + len <= _buffer->_alloc_len); + if (_buffer->_len < _off + len) + _buffer->_len = _off + len; // set new buffer len (_IF_ i'm expanding it) + _len = len; // my len too + } + void zero() { + //bzero((void*)c_str(), _len); + memset((void*)c_str(), 0, _len); + } + + + // crope lookalikes + void append(const char *p, unsigned len) { + assert(len + _len + _off <= _buffer->_alloc_len); // FIXME later for auto-expansion? + + // copy + memcpy(c_str() + _len, p, len); + _buffer->_len += len; + _len += len; + } + void copy_out(unsigned off, unsigned len, char *dest) { + assert(off >= 0 && off <= _len); + assert(len >= 0 && off + len <= _len); + memcpy(dest, c_str() + off, len); + } + void copy_in(unsigned off, unsigned len, const char *src) { + assert(off >= 0 && off <= _len); + assert(len >= 0 && off + len <= _len); + memcpy(c_str() + off, src, len); + } + + friend ostream& operator<<(ostream& out, bufferptr& bp); +}; + + +inline ostream& operator<<(ostream& out, bufferptr& bp) { + return out << "bufferptr(len=" << bp._len << " off=" << bp._off + << " cstr=" << (void*)bp.c_str() + << " buf=" << *bp._buffer + << ")"; +} + + + +#endif diff --git a/branches/sage/cephmds2/include/oldbufferlist.h b/branches/sage/cephmds2/include/oldbufferlist.h new file mode 100644 index 0000000000000..466a5ead25d77 --- /dev/null +++ b/branches/sage/cephmds2/include/oldbufferlist.h @@ -0,0 +1,681 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __BUFFERLIST_H +#define __BUFFERLIST_H + +#include "buffer.h" + +#include +#include +#include +#include +using namespace std; + +#include +using namespace __gnu_cxx; + + +// debug crap +#include "config.h" +#define bdbout(x) if (x <= g_conf.debug_buffer) cout + + + +class bufferlist { + private: + /* local state limited to _buffers, and _len. + * we maintain _len ourselves, so we must be careful when fiddling with buffers! + */ + list _buffers; + unsigned _len; + + public: + // cons/des + bufferlist() : _len(0) { + bdbout(1) << "bufferlist.cons " << this << endl; + } + bufferlist(const bufferlist& bl) : _len(0) { + //assert(0); // o(n) and stupid + bdbout(1) << "bufferlist.cons " << this << endl; + _buffers = bl._buffers; + _len = bl._len; + } + ~bufferlist() { + bdbout(1) << "bufferlist.des " << this << endl; + } + + bufferlist& operator=(bufferlist& bl) { + //assert(0); // actually, this should be fine, just slow (O(n)) and stupid. + bdbout(1) << "bufferlist.= " << this << endl; + _buffers = bl._buffers; + _len = bl._len; + return *this; + } + + + // accessors + list& buffers() { + return _buffers; + } + //list::iterator begin() { return _buffers.begin(); } + //list::iterator end() { return _buffers.end(); } + + unsigned length() const { +#if 0 + { // DEBUG: verify _len + int len = 0; + for (list::iterator it = _buffers.begin(); + it != _buffers.end(); + it++) { + len += (*it).length(); + } + assert(len == _len); + } +#endif + return _len; + } + + void _rope(crope& r) { + for (list::iterator it = _buffers.begin(); + it != _buffers.end(); + it++) + r.append((*it).c_str(), (*it).length()); + } + + // modifiers + void clear() { + _buffers.clear(); + _len = 0; + } + void push_front(bufferptr& bp) { + _buffers.push_front(bp); + _len += bp.length(); + } + void push_front(buffer *b) { + bufferptr bp(b); + _buffers.push_front(bp); + _len += bp.length(); + } + void push_back(bufferptr& bp) { + _buffers.push_back(bp); + _len += bp.length(); + } + void 
push_back(buffer *b) { + bufferptr bp(b); + + _buffers.push_back(bp); + _len += bp.length(); + + } + void zero() { + for (list::iterator it = _buffers.begin(); + it != _buffers.end(); + it++) + it->zero(); + } + + // sort-of-like-assignment-op + void claim(bufferlist& bl) { + // free my buffers + clear(); + claim_append(bl); + } + void claim_append(bufferlist& bl) { + // steal the other guy's buffers + _len += bl._len; + _buffers.splice( _buffers.end(), bl._buffers ); + bl._len = 0; + } + + + + + // crope lookalikes + void copy(unsigned off, unsigned len, char *dest) { + assert(off >= 0); + assert(off + len <= length()); + /*assert(off < length()); + if (off + len > length()) + len = length() - off; + */ + // advance to off + list::iterator curbuf = _buffers.begin(); + + // skip off + while (off > 0) { + assert(curbuf != _buffers.end()); + if (off >= (*curbuf).length()) { + // skip this buffer + off -= (*curbuf).length(); + curbuf++; + } else { + // somewhere in this buffer! + break; + } + } + + // copy + while (len > 0) { + // is the rest ALL in this buffer? + if (off + len <= (*curbuf).length()) { + (*curbuf).copy_out(off, len, dest); // yup, last bit! + break; + } + + // get as much as we can from this buffer. + unsigned howmuch = (*curbuf).length() - off; + (*curbuf).copy_out(off, howmuch, dest); + + dest += howmuch; + len -= howmuch; + off = 0; + curbuf++; + assert(curbuf != _buffers.end()); + } + } + + void copy_in(unsigned off, unsigned len, const char *src) { + assert(off >= 0); + assert(off + len <= length()); + + // advance to off + list::iterator curbuf = _buffers.begin(); + + // skip off + while (off > 0) { + assert(curbuf != _buffers.end()); + if (off >= (*curbuf).length()) { + // skip this buffer + off -= (*curbuf).length(); + curbuf++; + } else { + // somewhere in this buffer! + break; + } + } + + // copy + while (len > 0) { + // is the rest ALL in this buffer? 
+ if (off + len <= (*curbuf).length()) { + (*curbuf).copy_in(off, len, src); // yup, last bit! + break; + } + + // get as much as we can from this buffer. + unsigned howmuch = (*curbuf).length() - off; + (*curbuf).copy_in(off, howmuch, src); + + src += howmuch; + len -= howmuch; + off = 0; + curbuf++; + assert(curbuf != _buffers.end()); + } + } + void copy_in(unsigned off, unsigned len, bufferlist& bl) { + unsigned left = len; + for (list::iterator i = bl._buffers.begin(); + i != bl._buffers.end(); + i++) { + unsigned l = (*i).length(); + if (left < l) l = left; + copy_in(off, l, (*i).c_str()); + left -= l; + if (left == 0) break; + off += l; + } + } + + + void append(const char *data, unsigned len) { + if (len == 0) return; + + unsigned alen = 0; + + // copy into the tail buffer? + if (!_buffers.empty()) { + unsigned avail = _buffers.back().unused_tail_length(); + if (avail > 0) { + //cout << "copying up to " << len << " into tail " << avail << " bytes of tail buf" << endl; + if (avail > len) + avail = len; + unsigned blen = _buffers.back().length(); + memcpy(_buffers.back().c_str() + blen, data, avail); + blen += avail; + _buffers.back().set_length(blen); + _len += avail; + data += avail; + len -= avail; + } + alen = _buffers.back().length(); + } + if (len == 0) return; + + // just add another buffer. + // alloc a bit extra, in case we do a bunch of appends. FIXME be smarter! + if (alen < 1024) alen = 1024; + push_back(new buffer(data, len, BUFFER_MODE_DEFAULT, len+alen)); + } + void append(bufferptr& bp) { + push_back(bp); + } + void append(bufferptr& bp, unsigned len, unsigned off) { + bufferptr tempbp(bp, len, off); + push_back(tempbp); + } + void append(const bufferlist& bl) { + bufferlist temp = bl; // copy list + claim_append(temp); // and append + } + + + /* + * return a contiguous ptr to whole bufferlist contents. + */ + char *c_str() { + if (_buffers.size() == 1) { + return _buffers.front().c_str(); // good, we're already contiguous. 
+ } + else if (_buffers.size() == 0) { + return 0; // no buffers + } + else { + // make one new contiguous buffer. + bufferptr newbuf = new buffer(length()); + unsigned off = 0; + + for (list::iterator it = _buffers.begin(); + it != _buffers.end(); + it++) { + //assert((*(*it)).has_free_func() == false); // not allowed if there's a funky free_func.. -sage ...for debugging at least! + memcpy(newbuf.c_str() + off, + (*it).c_str(), (*it).length()); + off += (*it).length(); + } + assert(off == newbuf.length()); + + _buffers.clear(); + _buffers.push_back( newbuf ); + + // now it'll work. + return c_str(); + } + } + + + void substr_of(bufferlist& other, unsigned off, unsigned len) { + assert(off + len <= other.length()); + clear(); + + // skip off + list::iterator curbuf = other._buffers.begin(); + while (off > 0) { + assert(curbuf != _buffers.end()); + if (off >= (*curbuf).length()) { + // skip this buffer + //cout << "skipping over " << *curbuf << endl; + off -= (*curbuf).length(); + curbuf++; + } else { + // somewhere in this buffer! + //cout << "somewhere in " << *curbuf << endl; + break; + } + } + + while (len > 0) { + // partial? + if (off + len < (*curbuf).length()) { + //cout << "copying partial of " << *curbuf << endl; + _buffers.push_back( bufferptr( *curbuf, len, off ) ); + _len += len; + break; + } + + // through end + //cout << "copying end (all?) of " << *curbuf << endl; + unsigned howmuch = (*curbuf).length() - off; + _buffers.push_back( bufferptr( *curbuf, howmuch, off ) ); + _len += howmuch; + len -= howmuch; + off = 0; + curbuf++; + } + } + + // funky modifer + void splice(unsigned off, unsigned len, bufferlist *claim_by=0 /*, bufferlist& replace_with */) { // fixme? + assert(off < length()); + assert(len > 0); + //cout << "splice off " << off << " len " << len << " ... 
mylen = " << length() << endl; + + // skip off + list::iterator curbuf = _buffers.begin(); + while (off > 0) { + assert(curbuf != _buffers.end()); + if (off >= (*curbuf).length()) { + // skip this buffer + //cout << "off = " << off << " skipping over " << *curbuf << endl; + off -= (*curbuf).length(); + curbuf++; + } else { + // somewhere in this buffer! + //cout << "off = " << off << " somewhere in " << *curbuf << endl; + break; + } + } + assert(off >= 0); + + if (off) { + // add a reference to the front bit + // insert it before curbuf (which we'll hose) + //cout << "keeping front " << off << " of " << *curbuf << endl; + _buffers.insert( curbuf, bufferptr( *curbuf, off, 0 ) ); + _len += off; + } + + while (len > 0) { + // partial? + if (off + len < (*curbuf).length()) { + //cout << "keeping end of " << *curbuf << ", losing first " << off+len << endl; + if (claim_by) + claim_by->append( *curbuf, len, off ); + (*curbuf).set_offset( off+len + (*curbuf).offset() ); // ignore beginning big + (*curbuf).set_length( (*curbuf).length() - (len+off) ); + _len -= off+len; + //cout << " now " << *curbuf << endl; + break; + } + + // hose though the end + unsigned howmuch = (*curbuf).length() - off; + //cout << "discarding " << howmuch << " of " << *curbuf << endl; + if (claim_by) + claim_by->append( *curbuf, howmuch, off ); + _len -= (*curbuf).length(); + _buffers.erase( curbuf++ ); + len -= howmuch; + off = 0; + } + + // splice in *replace (implement me later?) 
+ } + + friend ostream& operator<<(ostream& out, bufferlist& bl); + +}; + +inline ostream& operator<<(ostream& out, bufferlist& bl) { + out << "bufferlist(len=" << bl.length() << endl; + for (list::iterator it = bl._buffers.begin(); + it != bl._buffers.end(); + it++) + out << "\t" << *it << endl; + out << ")" << endl; + return out; +} + + + +// encoder/decode helpers + +// string +inline void _encode(const string& s, bufferlist& bl) +{ + bl.append(s.c_str(), s.length()+1); +} +inline void _decode(string& s, bufferlist& bl, int& off) +{ + s = bl.c_str() + off; + off += s.length() + 1; +} + +// bufferptr (encapsulated) +inline void _encode(bufferptr& bp, bufferlist& bl) +{ + size_t len = bp.length(); + bl.append((char*)&len, sizeof(len)); + bl.append(bp); +} +inline void _decode(bufferptr& bp, bufferlist& bl, int& off) +{ + size_t len; + bl.copy(off, sizeof(len), (char*)&len); + off += sizeof(len); + bufferlist s; + s.substr_of(bl, off, len); + off += len; + + if (s.buffers().size() == 1) + bp = s.buffers().front(); + else + bp = new buffer(s.c_str(), s.length()); +} + +// bufferlist (encapsulated) +inline void _encode(const bufferlist& s, bufferlist& bl) +{ + size_t len = s.length(); + bl.append((char*)&len, sizeof(len)); + bl.append(s); +} +inline void _decode(bufferlist& s, bufferlist& bl, int& off) +{ + size_t len; + bl.copy(off, sizeof(len), (char*)&len); + off += sizeof(len); + s.substr_of(bl, off, len); + off += len; +} + + +// set +template +inline void _encode(set& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename set::iterator it = s.begin(); + it != s.end(); + it++) { + T v = *it; + bl.append((char*)&v, sizeof(v)); + n--; + } + assert(n==0); +} +template +inline void _decode(set& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i +template +inline void _encode(vector& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, 
sizeof(n)); + for (typename vector::iterator it = s.begin(); + it != s.end(); + it++) { + T v = *it; + bl.append((char*)&v, sizeof(v)); + n--; + } + assert(n==0); +} +template +inline void _decode(vector& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + s = vector(n); + for (int i=0; i +template +inline void _encode(const list& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename list::const_iterator it = s.begin(); + it != s.end(); + it++) { + T v = *it; + bl.append((char*)&v, sizeof(v)); + n--; + } + assert(n==0); +} +template +inline void _decode(list& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i +inline void _encode(map& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (map::iterator it = s.begin(); + it != s.end(); + it++) { + _encode(it->first, bl); + _encode(it->second, bl); + n--; + } + assert(n==0); +} +inline void _decode(map& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i +template +inline void _encode(const map& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename map::const_iterator it = s.begin(); + it != s.end(); + it++) { + T k = it->first; + bl.append((char*)&k, sizeof(k)); + _encode(it->second, bl); + n--; + } + assert(n==0); +} +template +inline void _decode(map& s, bufferlist& bl, int& off) +{ + s.clear(); + int n; + bl.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i +template +inline void _encode(const map& s, bufferlist& bl) +{ + int n = s.size(); + bl.append((char*)&n, sizeof(n)); + for (typename map::const_iterator it = s.begin(); + it != s.end(); + it++) { + T k = it->first; + U v = it->second; + bl.append((char*)&k, sizeof(k)); + bl.append((char*)&v, sizeof(v)); + n--; + } + 
#ifndef __RANGESET_H
#define __RANGESET_H

/*
 * rangeset<T> -- a set of integer-like values, stored compactly as a map of
 * disjoint inclusive ranges (first -> last).
 *
 * my first container with iterator!  it's pretty ugly.
 */

#include <map>
#include <iostream>
#include <iterator>
#include <cassert>
using namespace std;

/*
 * The underlying storage: one map entry per range, keyed by the range's
 * first value and mapping to its last value (inclusive, i.e. [first,last]).
 */
template <class T>
struct _rangeset_base {
  map<T,T> ranges;  // pair(first,last) (inclusive, e.g. [first,last])

  typedef typename map<T,T>::iterator mapit;

  // get iterator for the range that includes val, or ranges.end() if no
  // range does.
  mapit get_range_for(T val) {
    // The only candidate is the last range that starts at or before val,
    // i.e. the entry just before the first range that starts after val.
    mapit it = ranges.upper_bound(val);
    if (it == ranges.begin())
      return ranges.end();        // every range starts after val
    --it;
    if (it->second >= val)
      return it;
    return ranges.end();          // val falls in the gap after this range
  }
};


/*
 * Iterator over the individual values of a rangeset, in increasing order.
 *
 * It refers to (does not copy) the range map it was created from, so it is
 * only usable while that map is alive.  An iterator at the end position
 * always carries current == T(), which is what makes two end iterators
 * compare equal.
 */
template <class T>
class rangeset_iterator
{
  map<T,T> *themap;                  // map being walked; not owned
  typename map<T,T>::iterator pos;   // range that `current` lies in
  T current;                         // value we point at; T() at the end

public:
  // NOTE(review): the original iterator category was lost in this copy of
  // the file; input iterator is the weakest category that fits -- confirm.
  typedef std::input_iterator_tag iterator_category;
  typedef T                       value_type;
  typedef std::ptrdiff_t          difference_type;
  typedef T*                      pointer;
  typedef T&                      reference;

  // cons
  rangeset_iterator() : themap(0), pos(), current() {}

  rangeset_iterator(typename map<T,T>::iterator& it, map<T,T>& ranges)
    : themap(&ranges), pos(it), current() {
    if (pos != ranges.end())
      current = pos->first;
  }

  bool operator==(const rangeset_iterator& rit) const {
    return pos == rit.pos && current == rit.current;
  }
  bool operator!=(const rangeset_iterator& rit) const {
    return !(*this == rit);
  }

  T& operator*() {
    return current;
  }

  // pre-increment: step to the next value and return the new position.
  rangeset_iterator& operator++() {
    assert(themap && pos != themap->end());   // can't step past the end
    if (current < pos->second) {
      // written without ++ so that T only needs `+` and assignment
      current = current + 1;
    } else {
      ++pos;
      current = (pos != themap->end()) ? pos->first : T();
    }
    return *this;
  }

  // post-increment: step to the next value and return the old position.
  rangeset_iterator operator++(int) {
    rangeset_iterator old(*this);
    ++*this;
    return old;
  }
};


/*
 * The set itself.  Adjacent values are merged into a single range on
 * insert(), and ranges are shrunk or split on erase().
 *
 * insert() and erase() compute val-1 and val+1 without any overflow check.
 */
template <class T>
class rangeset
{
  typedef typename map<T,T>::iterator map_iterator;

  _rangeset_base<T> theset;
  unsigned long long _size;   // number of individual values in the set

public:
  rangeset() : _size(0) {}
  typedef rangeset_iterator<T> iterator;

  // iterate over individual values
  iterator begin() {
    map_iterator it = theset.ranges.begin();
    return iterator(it, theset.ranges);
  }

  iterator end() {
    map_iterator it = theset.ranges.end();
    return iterator(it, theset.ranges);
  }

  // direct access to the underlying first -> last map
  map_iterator map_begin() {
    return theset.ranges.begin();
  }
  map_iterator map_end() {
    return theset.ranges.end();
  }
  int map_size() {
    return theset.ranges.size();
  }

  // add the whole range [v1,v2] to the map as-is; no merging with, or
  // checking against, ranges that are already present.
  void map_insert(T v1, T v2) {
    theset.ranges.insert(pair<T,T>(v1,v2));
    _size += v2 - v1+1;
  }


  // ...
  bool contains(T val) {
    if (theset.get_range_for(val) == theset.ranges.end()) return false;
    assert(!empty());
    return true;
  }

  // add a single value; it must not already be present.
  void insert(T val) {
    assert(!contains(val));

    map_iterator left = theset.get_range_for(val-1);
    map_iterator right = theset.get_range_for(val+1);

    if (left != theset.ranges.end() &&
        right != theset.ranges.end()) {
      // join!  val plugs the one-value gap between left and right.
      left->second = right->second;
      theset.ranges.erase(right);
    }
    else if (left != theset.ranges.end()) {
      // add to left range
      left->second = val;
    }
    else if (right != theset.ranges.end()) {
      // add to right range.  map keys can't be changed in place, so
      // re-insert the range under its new first value.
      theset.ranges.insert(pair<T,T>(val, right->second));
      theset.ranges.erase(right);
    }
    else {
      // new range
      theset.ranges.insert(pair<T,T>(val,val));
    }
    _size++;
  }

  // number of individual values (not ranges) in the set.
  unsigned size() {
    return _size;
  }

  bool empty() {
    if (theset.ranges.empty()) {
      assert(_size == 0);
      return true;
    }
    assert(_size>0);
    return false;
  }


  // smallest value in the set, which must not be empty.
  T first() {
    assert(!empty());
    map_iterator it = theset.ranges.begin();
    return it->first;
  }

  // remove a single value; it must be present.
  void erase(T val) {
    assert(contains(val));
    map_iterator it = theset.get_range_for(val);
    assert(it != theset.ranges.end());

    if (val == it->first && val == it->second) {
      // entire range
      theset.ranges.erase(it);
    }
    else if (val == it->first) {
      // beginning: the range now starts one later, so it needs a new key.
      theset.ranges.insert(pair<T,T>(val+1, it->second));
      theset.ranges.erase(it);
    }
    else if (val == it->second) {
      // end
      it->second = val-1;
    }
    else {
      // middle split.  Shrink the existing entry to the left half in place
      // (its key is unchanged, so inserting it again would do nothing), and
      // add the right half as a new entry.
      T last = it->second;
      it->second = val-1;
      theset.ranges.insert(pair<T,T>(val+1, last));
    }
    _size--;
  }

  // print every range, one per line, to cout.
  void dump() {
    for (typename map<T,T>::iterator it = theset.ranges.begin();
         it != theset.ranges.end();
         it++) {
      cout << " " << it->first << "-" << it->second << endl;
    }
  }

};


#endif
/* Bits of st_litemask, one per optional statlite field. */
#define S_STATLITE_SIZE     1
#define S_STATLITE_BLKSIZE  2
#define S_STATLITE_BLOCKS   4
#define S_STATLITE_ATIME    8
#define S_STATLITE_MTIME    16
#define S_STATLITE_CTIME    32

/*
 * S_REQUIRE*(m): the mask m with the given field's bit added.
 * The argument is parenthesized so that any expression can be passed.
 */
#define S_REQUIRESIZE(m)     ((m) | S_STATLITE_SIZE)
#define S_REQUIREBLKSIZE(m)  ((m) | S_STATLITE_BLKSIZE)
#define S_REQUIREBLOCKS(m)   ((m) | S_STATLITE_BLOCKS)
#define S_REQUIREATIME(m)    ((m) | S_STATLITE_ATIME)
#define S_REQUIREMTIME(m)    ((m) | S_STATLITE_MTIME)
#define S_REQUIRECTIME(m)    ((m) | S_STATLITE_CTIME)

/*
 * S_ISVALID*(m): nonzero if the given field's bit is set in the mask m.
 * Without the parentheses, S_ISVALIDSIZE(a | b) would parse as
 * a | (b & S_STATLITE_SIZE), because & binds tighter than |.
 */
#define S_ISVALIDSIZE(m)     ((m) & S_STATLITE_SIZE)
#define S_ISVALIDBLKSIZE(m)  ((m) & S_STATLITE_BLKSIZE)
#define S_ISVALIDBLOCKS(m)   ((m) & S_STATLITE_BLOCKS)
#define S_ISVALIDATIME(m)    ((m) & S_STATLITE_ATIME)
#define S_ISVALIDMTIME(m)    ((m) & S_STATLITE_MTIME)
#define S_ISVALIDCTIME(m)    ((m) & S_STATLITE_CTIME)


// readdirplus etc.
+ +struct dirent_plus { + struct dirent d_dirent; /* dirent struct for this entry */ + struct stat d_stat; /* attributes for this entry */ + int d_stat_err;/* errno for d_stat, or 0 */ +}; +struct dirent_lite { + struct dirent d_dirent; /* dirent struct for this entry */ + struct statlite d_stat; /* attributes for this entry */ + int d_stat_err;/* errno for d_stat, or 0 */ +}; + +} +#endif diff --git a/branches/sage/cephmds2/include/types.h b/branches/sage/cephmds2/include/types.h new file mode 100644 index 0000000000000..d93d9c2f7c636 --- /dev/null +++ b/branches/sage/cephmds2/include/types.h @@ -0,0 +1,537 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MDS_TYPES_H +#define __MDS_TYPES_H + +extern "C" { +#include +#include +#include +#include "statlite.h" +} + +#include +#include +#include +#include +#include +#include +using namespace std; + +#include +using namespace __gnu_cxx; + + +#include "object.h" + + +#ifndef MIN +# define MIN(a,b) ((a) < (b) ? (a):(b)) +#endif +#ifndef MAX +# define MAX(a,b) ((a) > (b) ? 
/** object layout
 * how objects are mapped into PGs
 */
#define OBJECT_LAYOUT_DEFAULT   0  // see g_conf
#define OBJECT_LAYOUT_HASH      1
#define OBJECT_LAYOUT_LINEAR    2
#define OBJECT_LAYOUT_HASHINO   3
#define OBJECT_LAYOUT_STARTOSD  4

/** pg layout
 * how PGs are mapped into (sets of) OSDs
 */
#define PG_LAYOUT_CRUSH   0
#define PG_LAYOUT_HASH    1
#define PG_LAYOUT_LINEAR  2
#define PG_LAYOUT_HYBRID  3

/** FileLayout
 * specifies a striping and replication strategy
 */

//#define FILE_LAYOUT_CRUSH  0    // stripe via crush
//#define FILE_LAYOUT_LINEAR 1    // stripe linearly across cluster

struct FileLayout {
  // layout
  int object_layout;   // one of OBJECT_LAYOUT_*

  // FIXME: make this a union?
  // rushstripe
  int stripe_size;     // stripe unit, in bytes
  int stripe_count;    // over this many objects
  int object_size;     // until objects are this big, then use a new set of objects.

  // period = bytes before i start on a new set of objects.
  int period() const { return object_size * stripe_count; }

  int osd;             // osdlocal

  int num_rep;         // replication

  // An empty layout.  Every field gets a defined value, so a
  // default-constructed layout can be copied and compared safely; osd is
  // negative, as it is whenever the other constructor picks
  // OBJECT_LAYOUT_DEFAULT.
  FileLayout() :
    object_layout(OBJECT_LAYOUT_DEFAULT),
    stripe_size(0), stripe_count(0), object_size(0),
    osd(-1),
    num_rep(0) { }

  // ss = stripe size, sc = stripe count, os = object size, nr = number of
  // replicas, o = osd.  A non-negative o selects OBJECT_LAYOUT_STARTOSD;
  // otherwise OBJECT_LAYOUT_DEFAULT is used.
  FileLayout(int ss, int sc, int os, int nr=2, int o=-1) :
    object_layout(o < 0 ? OBJECT_LAYOUT_DEFAULT:OBJECT_LAYOUT_STARTOSD),
    stripe_size(ss), stripe_count(sc), object_size(os),
    osd(o),
    num_rep(nr) { }

};
// lame 128-bit value class: just a pair of 64-bit halves, no arithmetic.
class lame128_t {
public:
  __uint64_t hi, lo;
  lame128_t(__uint64_t h=0, __uint64_t l=0) : hi(h), lo(l) {}
};

// Prints as "<hi>.<lo>".  Takes a const reference so that const objects and
// temporaries can be printed as well.
inline std::ostream& operator<<(std::ostream& out, const lame128_t& oid) {
  return out << oid.hi << "." << oid.lo;
}
return *this; } + pg_t operator++() { ++u.val; return *this; } + */ + operator __uint64_t() const { return u.val; } +}; + +inline ostream& operator<<(ostream& out, pg_t pg) { + //return out << hex << pg.val << dec; + if (pg.u.fields.ruleset) + out << (int)pg.u.fields.ruleset << '.'; + out << (int)pg.u.fields.nrep << '.'; + if (pg.u.fields.preferred) + out << pg.u.fields.preferred << '.'; + out << hex << pg.u.fields.ps << dec; + return out; +} + +namespace __gnu_cxx { + template<> struct hash< pg_t > + { + size_t operator()( const pg_t& x ) const + { + static hash<__uint64_t> H; + return H(x); + } + }; +} + + + +// compound rados version type +class eversion_t { +public: + epoch_t epoch; + version_t version; + eversion_t(epoch_t e=0, version_t v=0) : epoch(e), version(v) {} +}; + +inline bool operator==(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) && (l.version == r.version); +} +inline bool operator!=(const eversion_t& l, const eversion_t& r) { + return (l.epoch != r.epoch) || (l.version != r.version); +} +inline bool operator<(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version < r.version):(l.epoch < r.epoch); +} +inline bool operator<=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version <= r.version):(l.epoch <= r.epoch); +} +inline bool operator>(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? (l.version > r.version):(l.epoch > r.epoch); +} +inline bool operator>=(const eversion_t& l, const eversion_t& r) { + return (l.epoch == r.epoch) ? 
(l.version >= r.version):(l.epoch >= r.epoch); +} +inline ostream& operator<<(ostream& out, const eversion_t e) { + return out << e.epoch << "'" << e.version; +} + + + +#define PG_NONE 0xffffffffL + + +typedef __uint16_t snapv_t; // snapshot version + + +class OSDSuperblock { +public: + const static __uint64_t MAGIC = 0xeb0f505dULL; + __uint64_t magic; + __uint64_t fsid; // unique fs id (random number) + int whoami; // my role in this fs. + epoch_t current_epoch; // most recent epoch + epoch_t oldest_map, newest_map; // oldest/newest maps we have. + OSDSuperblock(__uint64_t f=0, int w=0) : + magic(MAGIC), fsid(f), whoami(w), + current_epoch(0), oldest_map(0), newest_map(0) {} +}; + +inline ostream& operator<<(ostream& out, OSDSuperblock& sb) +{ + return out << "sb(fsid " << sb.fsid + << " osd" << sb.whoami + << " e" << sb.current_epoch + << " [" << sb.oldest_map << "," << sb.newest_map + << "])"; +} + +class MonSuperblock { +public: + const static __uint64_t MAGIC = 0x00eb0f5000ULL; + __uint64_t magic; + __uint64_t fsid; + int whoami; // mon # + epoch_t current_epoch; + MonSuperblock(__uint64_t f=0, int w=0) : + magic(MAGIC), fsid(f), whoami(w), current_epoch(0) {} +}; + + +// new types + +class ObjectExtent { + public: + object_t oid; // object id + off_t start; // in object + size_t length; // in object + + objectrev_t rev; // which revision? + pg_t pgid; // where to find the object + + map buffer_extents; // off -> len. extents in buffer being mapped (may be fragmented bc of striping!) 
// -- io helpers --
//
// Stream printers for standard containers.  Elements (and map keys and
// values) are printed with their own operator<<.

// set: elements separated by commas, no surrounding brackets.
template<class A>
inline std::ostream& operator<<(std::ostream& out, const std::set<A>& iset) {
  for (typename std::set<A>::const_iterator it = iset.begin();
       it != iset.end();
       ++it) {
    if (it != iset.begin()) out << ",";
    out << *it;
  }
  return out;
}

// multiset: same format as set; repeated elements are printed repeatedly.
template<class A>
inline std::ostream& operator<<(std::ostream& out, const std::multiset<A>& iset) {
  for (typename std::multiset<A>::const_iterator it = iset.begin();
       it != iset.end();
       ++it) {
    if (it != iset.begin()) out << ",";
    out << *it;
  }
  return out;
}

// map: "{k1=v1,k2=v2}"; an empty map prints as "{}".
template<class A, class B>
inline std::ostream& operator<<(std::ostream& out, const std::map<A,B>& m)
{
  out << "{";
  for (typename std::map<A,B>::const_iterator it = m.begin();
       it != m.end();
       ++it) {
    if (it != m.begin()) out << ",";
    out << it->first << "=" << it->second;
  }
  out << "}";
  return out;
}
redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +/* + * uofs.h + * + * user-level object-based file system + */ + + #ifndef _UOFS_H_ + #define _UOFS_H_ + + #include + #include + #include + + + int device_open(char *path, int xflags); + void device_findsizes(int fd, long long *sz, int *bsz); + + int uofs_format(int bdev_id, int donode_size, int bd_ratio, int reg_size, int sb_size, int lb_size, + int nr_hash_table_buckets, int delay_allocation, int flush_interval); + + int uofs_mount(int bdev_id); + void uofs_shutdown(void); + + int uofs_read(long long oid, void *buf, off_t offset, size_t count); + int uofs_write(long long oid, void *buf, off_t offset, size_t count); + int uofs_del(long long oid); + int uofs_sync(long long oid); + int uofs_exist(long long oid); + + int uofs_get_size(long long oid); + + void uofs_superblock_printout(void); + int get_large_object_pages(void); + + int uofs_buffer_size(void); + #endif diff --git a/branches/sage/cephmds2/jobs/alc.tp b/branches/sage/cephmds2/jobs/alc.tp new file mode 100644 index 0000000000000..c600850c54be0 --- /dev/null +++ b/branches/sage/cephmds2/jobs/alc.tp @@ -0,0 +1,38 @@ +#PSUB -s /bin/bash # Sets your shell in batch +#PSUB -c alc # Where to run the job + +#PSUB -eo # Send std error & std out to the same file + +#PSUB -ln $NUM # Number of nodes to use +#PSUB -g $NUM # Total Number of tasks to use +#PSUB -cpn 1 # cpus per node + +####PSUB -c 1024Mb # memory limit +#PSUB -lc 1500 # Core file size per process +#PSUB -nr # Do not automatically resubmit job +#PSUB -tM 20m # Select time limit. The default time limit + # is only 30 minutes! Time can be HH:MM:SS or HH:MM + +#PSUB -o $CWD/$OUT # filename for output + +# Put your commands here. Remember to 'cd' to the appropriate +# directory, because the job will initially be in your home directory. 
+# To run a parallel job, you need to use the srun. + + + +echo job $PSUB_JOBID nodes $NUM name $NAME + +# environment +cd $CWD +export LD_LIBRARY_PATH=/usr/lib/mpi/mpi_gnu/lib + +# create fakestore dirs +srun -l -N $NUM -ppbatch bash -c "test -d tmp/osddata || mkdir tmp/osddata || echo cant make osddata ; uptime" + +# go +srun -l -N $NUM -ppbatch $CMD && touch $DONE + +# clean up fakestore +srun -l -N $NUM -ppbatch bash -c 'uptime ; rm -r tmp/osddata/*' + diff --git a/branches/sage/cephmds2/jobs/alcdat/makedirs b/branches/sage/cephmds2/jobs/alcdat/makedirs new file mode 100644 index 0000000000000..af5a098a254c9 --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/makedirs @@ -0,0 +1,45 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], + 'nummds' => [1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], + + 'cper' => [15,20], + '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', + 'numclient' => '$nummds * $cper', + 'numosd' => '$nummds > 1 ? $nummds:2', + 'n' => '1 + $cnode + $nummds + $numosd' ], + + # parameters + 'fs' => 'ebofs', + #'fs' => 'fakestore', + + 'mds_bal_rep' => 10000, # none of that! + 'mds_decay_halflife' => 30, + + 'mds_bal_interval' => 45, + 'mds_bal_max' => [2], + + 'until' => 300, # --syn until $n ... 
when to stop clients + 'kill_after' => 400, + 'start' => 100, + 'end' => 300, + + 'makedirs' => 1, + 'makedirs_dirs' => 10, + 'makedirs_files' => 10, + 'makedirs_depth' => 4, + + # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 + # --osd_pg_layout linear + 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', + #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/alcdat/makedirs.big b/branches/sage/cephmds2/jobs/alcdat/makedirs.big new file mode 100644 index 0000000000000..c67b2b93dd742 --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/makedirs.big @@ -0,0 +1,45 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], + 'nummds' => [160, 200],#[1, 2, 8, 16, 32, 48, 64, 80, 96, 112, 128],#144, 160, 192, 208], + + 'cper' => [15,20], + '_dep' => [ 'cnode' => '40',#$nummds',# / 4 + 1', + 'numclient' => '$nummds * $cper', + 'numosd' => '$nummds * .8', + 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], + + # parameters + 'fs' => 'ebofs', + #'fs' => 'fakestore', + + 'mds_bal_rep' => 10000, # none of that! + 'mds_decay_halflife' => 30, + + 'mds_bal_interval' => 45, + 'mds_bal_max' => 2, + + 'until' => 300, # --syn until $n ... 
when to stop clients + 'kill_after' => 400, + 'start' => 100, + 'end' => 300, + + 'makedirs' => 1, + 'makedirs_dirs' => 10, + 'makedirs_files' => 10, + 'makedirs_depth' => 4, + + # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 + # --osd_pg_layout linear + 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', + #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/alcdat/makedirs.tput b/branches/sage/cephmds2/jobs/alcdat/makedirs.tput new file mode 100644 index 0000000000000..8dd5ae4c47d8c --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/makedirs.tput @@ -0,0 +1,46 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192], + 'nummds' => [4, 16, 64],#[1, 16, 64, 128],#144, 160, 192, 208], + + #'cper' => [2, 5, 7, 10, 13, 16, 20, 30, 40, 50, 100, 150], + 'cper' => [13, 30, 40], # just for final run... + '_dep' => [ 'cnode' => '$nummds',# / 4 + 1', + 'numclient' => '$nummds * $cper', + 'numosd' => '$nummds', + 'n' => '1 + $cnode + $nummds + $numosd' ], + + # parameters + 'fs' => 'ebofs', + #'fs' => 'fakestore', + + 'mds_bal_rep' => 10000, # none of that! + 'mds_decay_halflife' => 30, + + 'mds_bal_interval' => 45, + 'mds_bal_max' => 2, + + 'until' => 300, # --syn until $n ... 
when to stop clients + 'kill_after' => 400, + 'start' => 100, + 'end' => 300, + + 'makedirs' => 1, + 'makedirs_dirs' => 10, + 'makedirs_files' => 10, + 'makedirs_depth' => 4, + + # --meta_log_layout_scount 32 --meta_log_layout_ssize 256 + # --osd_pg_layout linear + 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60', + #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', + + 'comb' => { + 'x' => 'cper',#nummds', + 'vars' => [ 'mds.req', 'cl.lat' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/alcdat/makefiles.shared b/branches/sage/cephmds2/jobs/alcdat/makefiles.shared new file mode 100644 index 0000000000000..ab96702c73289 --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/makefiles.shared @@ -0,0 +1,32 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + 'nummds' => [1, 2, 4, 8, 16, 32, 64, 96, 128], #2, 4, 8, 16, 32, 48, 64, 80, 96], + + 'cper' => [25, 50, 100, 150],# 100, 150, 200], + + '_dep' => [ 'cnode' => '$nummds', + 'numclient' => '$nummds * $cper', + 'numosd' => '$nummds', + 'n' => '1 + $cnode + $nummds + $numosd' ], + + # parameters + 'fs' => 'ebofs', + + 'mds_bal_hash_wr' => 1000, + + 'until' => 180, # --syn until $n ... when to stop clients + 'kill_after' => 250, + 'start' => 30, + 'end' => 180, + + 'custom' => '--tcp_skip_rank0 --meta_log_layout_num_rep 1 --meta_dir_layout_num_rep 1 --mds_shutdown_check 60 --syn makefiles 100000 1000 0', + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req', 'cl.lat' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/alcdat/openshared b/branches/sage/cephmds2/jobs/alcdat/openshared new file mode 100644 index 0000000000000..5ed7ba95894b3 --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/openshared @@ -0,0 +1,32 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + 'nummds' => [1, 4, 16, 64, 128, 192 ], + + 'cper' => [10, 50, 100, 150], + '_dep' => [ 'cnode' => '$nummds',# > 30 ? 
30:$nummds', + 'numclient' => '$nummds*$cper', + 'numosd' => '$nummds > 30 ? 30:$nummds', + 'n' => '1 + $cnode + $nummds + $numosd' ], + + # parameters + 'fs' => 'ebofs', + + 'mds_bal_interval' => 10000, + 'mds_bal_hash_wr' => 1000, + + 'until' => 120, # --syn until $n ... when to stop clients + 'kill_after' => 180, + 'start' => 10, + 'end' => 120, + + 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 10 --mds_shutdown_check 60 --syn only 0 --syn createshared 10 --syn sleep 5 --syn openshared 10 10000', + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req', 'cl.lat' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/alcdat/ossh.include b/branches/sage/cephmds2/jobs/alcdat/ossh.include new file mode 100644 index 0000000000000..c9a368ba5c60f --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/ossh.include @@ -0,0 +1,45 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 10, + + #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], + #'nummds' => [1, 2, 4, 6, 7], # googoo + 'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], + + #'trace' => ['make.lib', 'make.include'], + + 'mds_bal_interval' => 45, + 'mds_bal_max' => 2,#6, #[ 2,4,6 ], + 'mds_decay_halflife' => 30, + 'mds_bal_rep' => 1500, + 'mds_bal_hash_rd' => 100000, + + 'cper' => [15, 20],#25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], + #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], + + '_dep' => [ 'cnode' => '$nummds', + 'numclient' => '$nummds * $cper', + 'numosd' => '$nummds', + 'n' => '1 + $cnode + $nummds + $numosd' ], + + 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', + + # parameters + 'fs' => 'ebofs', + + #'until' => 500, + #'kill_after' => 600, + #'start' => 200, + #'end' => 500, + 'until' => 300, # --syn until $n ... 
when to stop clients + 'kill_after' => 400, + 'start' => 200, + 'end' => 300, + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/alcdat/ossh.include.big b/branches/sage/cephmds2/jobs/alcdat/ossh.include.big new file mode 100644 index 0000000000000..b92895a53a763 --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/ossh.include.big @@ -0,0 +1,46 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 10, + + #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], + #'nummds' => [1, 2, 4, 6, 7], # googoo + #'nummds' => [ 2, 4, 8, 16, 32, 48, 64, 80, 96, 128 ], + 'nummds' => [160,200], + + #'trace' => ['make.lib', 'make.include'], + + 'mds_bal_interval' => 45, + 'mds_bal_max' => 2,#6, #[ 2,4,6 ], + 'mds_decay_halflife' => 30, + 'mds_bal_rep' => 1500, + 'mds_bal_hash_rd' => 100000, + + 'cper' => [25, 50], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], + #'cper' => 125, #[30, 50, 75, 100, 125, 150], #50, #[10,50,100],# [ 50, 75, 100 ], + + '_dep' => [ 'cnode' => '$nummds', + 'numclient' => '$nummds * $cper', + 'numosd' => '$nummds * .6', + 'n' => '415'],#1 + $cnode + $nummds + $numosd' ], + + 'custom' => '--tcp_skip_rank0 --tcp_overlay_clients --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.include 1 --syn sleep 30 --syn trace traces/openssh/make.include 1000', + + # parameters + 'fs' => 'ebofs', + + #'until' => 500, + #'kill_after' => 600, + #'start' => 200, + #'end' => 500, + 'until' => 300, # --syn until $n ... 
when to stop clients + 'kill_after' => 400, + 'start' => 200, + 'end' => 300, + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/alcdat/ossh.lib b/branches/sage/cephmds2/jobs/alcdat/ossh.lib new file mode 100644 index 0000000000000..73372866f051f --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/ossh.lib @@ -0,0 +1,45 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 10, + + #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], + 'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], + + #'nummds' => [1, 2, 4, 6, 7], # googoo + #'trace' => ['make.lib', 'make.include'], + + 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], + #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], + + 'mds_decay_halflife' => 30, + 'mds_bal_rep' => 1500, + 'cper' => [10, 16], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], + + '_dep' => [ 'cnode' => '$nummds', + 'numclient' => '$nummds * $cper', + 'numosd' => '$nummds', + 'n' => '1 + $cnode + $nummds + $numosd' ], + + + 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', + + # parameters + #'fs' => ['fakestore'], + 'fs' => 'ebofs', + + #'until' => 500, + #'kill_after' => 600, + #'start' => 200, + #'end' => 500, + 'until' => 300, # --syn until $n ... 
when to stop clients + 'kill_after' => 400, + 'start' => 150, + 'end' => 300, + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/alcdat/ossh.lib.big b/branches/sage/cephmds2/jobs/alcdat/ossh.lib.big new file mode 100644 index 0000000000000..b9e0dd1ff68cd --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/ossh.lib.big @@ -0,0 +1,46 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 10, + + #'nummds' => [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128], + #'nummds' => [2, 4, 8, 16, 32, 48, 64, 80, 96, 128], + 'nummds' => [160,200], + + #'nummds' => [1, 2, 4, 6, 7], # googoo + #'trace' => ['make.lib', 'make.include'], + + 'mds_bal_interval' => 90, #[30, 60, 90], #$[60,90], #[60,90],#[30, 60, 90], + #'mds_bal_max' => [4, 10],#6,#[2,4,6,8], + + 'mds_decay_halflife' => 30, + 'mds_bal_rep' => 1500, + 'cper' => [25, 50, 100], #50,#[25, 50, 75, 100],#50,# [ 50, 100 ], + + '_dep' => [ 'cnode' => 0,#'30', + 'numclient' => '$nummds * $cper', + 'numosd' => '$nummds * .6', + 'n' => '415'],#'1 + $cnode + $nummds + $numosd' ], + + + 'custom' => '--tcp_skip_rank0 --debug_mds_balancer 1 --mds_shutdown_check 60 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn trace traces/openssh/make.lib 1000', + + # parameters + #'fs' => ['fakestore'], + 'fs' => 'ebofs', + + #'until' => 500, + #'kill_after' => 600, + #'start' => 200, + #'end' => 500, + 'until' => 300, # --syn until $n ... 
when to stop clients + 'kill_after' => 400, + 'start' => 150, + 'end' => 300, + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/alcdat/striping b/branches/sage/cephmds2/jobs/alcdat/striping new file mode 100644 index 0000000000000..de71828d12bde --- /dev/null +++ b/branches/sage/cephmds2/jobs/alcdat/striping @@ -0,0 +1,48 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + 'nummds' => 1, + 'numosd' => 10, + + 'cnode' => 10, + 'cper' => [ 10, 25, 50, 100 ], + + '_dep' => [ 'numclient' => '$cper * $cnode', + 'n' => '1 + $cnode + $nummds + $numosd', + 'file_layout_osize' => '$writefile_size' ], + + # parameters + 'fs' => 'ebofs', + #'fs' => 'fakestore', + + 'until' => 160, # --syn until $n ... when to stop clients + 'kill_after' => 200, + 'start' => 100, + 'end' => 160, + + 'writefile' => 1, + 'writefile_size' => [ +# 4*1024*1024, + 1024*1024 ], +# 256*1024, +# 64*1024 + 'writefile_mb' => 100000, + + 'osd_pg_bits' => 10,#16, + #'osd_pg_bits' => [ 16, 20 ], + + #'osd_object_layout' => [ 'hash', 'hashino', 'linear' ], + 'osd_pg_layout' => [ 'crush', +# 'hash', + 'linear' ], + + 'custom' => '--tcp_skip_rank0 --file_layout_num_rep 1 --mds_shutdown_check 60', + + 'comb' => { + 'x' => 'cper',#writefile_size', + 'vars' => [ 'osd.c_wrb', 'osd.c_wr' ], + } +}; diff --git a/branches/sage/cephmds2/jobs/mds/log_striping b/branches/sage/cephmds2/jobs/mds/log_striping new file mode 100644 index 0000000000000..46242cdda4f00 --- /dev/null +++ b/branches/sage/cephmds2/jobs/mds/log_striping @@ -0,0 +1,36 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + 'kill_after' => 300, + + 'nummds' => 1, + 'numosd' => 8, + 'numclient' => 100, + 'n' => 16, + + # parameters + 'fs' => ['ebofs','fakestore'], + 'meta_log_ssize' => [ 128, 256, 1024, 1 << 15, 1 << 20 ], + 'meta_log_scount' => 4,#[ 1, 2, 4, 8 ], + + 'until' => 200, # --syn until $n ... 
when to stop clients + + 'makedirs' => 1, + 'makedirs_dirs' => 10, + 'makedirs_files' => 10, + 'makedirs_depth' => 4, + + 'custom' => '--tcp_skip_rank0', + #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', + + # for final summation (script/sum.pl) + 'start' => 100, + 'end' => 550, + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/mds/makedir_lat b/branches/sage/cephmds2/jobs/mds/makedir_lat new file mode 100644 index 0000000000000..63374f52a36c0 --- /dev/null +++ b/branches/sage/cephmds2/jobs/mds/makedir_lat @@ -0,0 +1,33 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + 'nummds' => 1, + 'numosd' => 8, + 'numclient' => [1],#, 40, 80, 160 ], + 'n' => 20, + + 'fs' => 'ebofs', + + 'start' => 20, + 'end' => 40, + 'until' => 40, + 'kill_after' => 60, + + 'makedirs' => 1, + 'makedirs_dirs' => 10, + 'makedirs_files' => 10, + 'makedirs_depth' => 5, + + 'mds_local_osd' => [ 0, 1 ], + 'meta_log_layout_num_rep' => [ 0, 1, 2, 3, 4], + + 'custom' => '--tcp_skip_rank0', + + 'comb' => { + 'x' => 'meta_log_layout_num_rep', + 'vars' => [ 'mds.log.lat', 'cl.lat', 'osd.rlat' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/mds/makedirs b/branches/sage/cephmds2/jobs/mds/makedirs new file mode 100644 index 0000000000000..4ca42d72fa37e --- /dev/null +++ b/branches/sage/cephmds2/jobs/mds/makedirs @@ -0,0 +1,40 @@ +#!/usr/bin/perl + +# hi there +{ + '_psub' => 'jobs/alc.tp', + + 'sleep' => 3, + + 'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], + + 'cper' => 50, + '_dep' => [ 'cnode' => '$nummds', + 'numclient' => '$cnode * $cper', + 'numosd' => '$nummds * 2', + 'n' => '1 + $cnode + $nummds + $numosd' ], + + # parameters + #'fs' => 'ebofs', + 'fs' => 'fakestore', + + 'until' => 300, # --syn until $n ... 
when to stop clients + 'kill_after' => 400, + + 'makedirs' => 1, + 'makedirs_dirs' => 10, + 'makedirs_files' => 10, + 'makedirs_depth' => 3, + + 'custom' => '--tcp_skip_rank0', + #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', + + # for final summation (script/sum.pl) + 'start' => 100, + 'end' => 550, + + 'comb' => { + 'x' => 'nummds', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/mds/opensshlib b/branches/sage/cephmds2/jobs/mds/opensshlib new file mode 100644 index 0000000000000..d8b61ae52c655 --- /dev/null +++ b/branches/sage/cephmds2/jobs/mds/opensshlib @@ -0,0 +1,44 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + 'nummds' => [1, 2, 4, 7], # googoo + #'nummds' => [1, 2, 4, 6, 8, 12, 16, 24, 32, 40, 48, 64], # alc + + + # parameters + 'fs' => 'ebofs', + #'fs' => 'fakestore', + + 'until' => 300, # --syn until $n ... when to stop clients + 'kill_after' => 400, + 'start' => 150, + 'end' => 300, + + 'mds_bal_interval' => 90,#[60, 90], + #'mds_bal_max' => [3,4,5], + 'mds_bal_max' => 4, + 'mds_decay_halflife' => 30,#[15, 25, 30, 45, 60], + 'mds_bal_rep' => 1500,#[1000, 1500, 2000], + + 'decay_hl' => 100,#[ 25, 50, 100, 150 ], + + 'cper' => 100, #[50, 75, 100, 125, 150, 200], + '_dep' => [ 'cnode' => '$nummds', + 'numclient' => '$nummds * $cper', + 'numosd' => '$nummds * 2', + 'n' => '1 + $cnode + $nummds + $numosd', + 'mds_bal_rep' => '$mds_decay_halflife * $decay_hl'], + + 'custom' => '--tcp_skip_rank0 --syn only 0 --syn trace traces/openssh/untar.lib 1 --syn sleep 10 --syn randomsleep 30 --syn trace traces/openssh/make.lib 100 --debug_mds_balancer 1 --mds_shutdown_check 60', + #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', + + # for final summation (script/sum.pl) + + 'comb' => { + 'x' => 'nummds',#decay_hl',#'nummds', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/meta1 b/branches/sage/cephmds2/jobs/meta1 new file mode 100644 index 
0000000000000..743212f1c3009 --- /dev/null +++ b/branches/sage/cephmds2/jobs/meta1 @@ -0,0 +1,19 @@ +#!/bin/sh + +# makedirs for 300 seconds +# first bit in memory +# second bit is commiting from journal too +# then walk fs for 300 seconds +# this should all be in memory. + +JOB="meta1" +ARGS="--numosd 10 --fullmkfs --syn until 180 --syn makedirs 10 10 4 --syn until 360 --syn repeatwalk --mds_bal_max 1 --osd_fsync 0 --mds_log_max_len 200000 --mds_cache_size 500000" + +#rm core* ; make tcpsyn && mpiexec -l -n 17 ./tcpsyn $ARGS --nummds 1 --log_name $JOB/1 --numclient 25 > log/$JOB/o.1 +#rm core* ; make tcpsyn && mpiexec -l -n 18 ./tcpsyn $ARGS --nummds 2 --log_name $JOB/2 --numclient 50 > log/$JOB/o.2 +#rm core* ; make tcpsyn && mpiexec -l -n 20 ./tcpsyn $ARGS --nummds 4 --log_name $JOB/4 --numclient 100 > log/$JOB/o.4 +#rm core* ; make tcpsyn && mpiexec -l -n 24 ./tcpsyn $ARGS --nummds 8 --log_name $JOB/8 --numclient 200 > log/$JOB/o.8 +#rm core* ; make tcpsyn && mpiexec -l -n 28 ./tcpsyn $ARGS --nummds 12 --log_name $JOB/12 --numclient 300 > log/$JOB/o.12 +rm core* ; make tcpsyn && mpiexec -l -n 32 ./tcpsyn $ARGS --nummds 16 --log_name $JOB/16 --numclient 300 > log/$JOB/o.16 + + diff --git a/branches/sage/cephmds2/jobs/meta1.proc.sh b/branches/sage/cephmds2/jobs/meta1.proc.sh new file mode 100755 index 0000000000000..616acbefff619 --- /dev/null +++ b/branches/sage/cephmds2/jobs/meta1.proc.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +for d in 1 2 4 8 12 +do + echo $d + cd $d + ../../../script/sum.pl mds? mds?? > mds.sum + ../../../script/sum.pl -avg mds? mds?? > mds.avg + + ../../../script/sum.pl -start 90 -end 180 mds? mds?? > mds.sum.makedirs + ../../../script/sum.pl -start 200 -end 300 mds? mds?? > mds.sum.walk + + cd .. 
+done diff --git a/branches/sage/cephmds2/jobs/osd/ebofs b/branches/sage/cephmds2/jobs/osd/ebofs new file mode 100644 index 0000000000000..5d11523f6f832 --- /dev/null +++ b/branches/sage/cephmds2/jobs/osd/ebofs @@ -0,0 +1,51 @@ +# hi there +{ + # startup + 'n' => 30, # mpi nodes + 'sleep' => 3, # seconds between runs + 'nummds' => 1, + 'numosd' => 8, + 'numclient' => 100,#[10, 50, 100, 200, 400], + +'kill_after' => 200, + + # parameters + 'fs' => 'ebofs',#[ +# 'obfs', +# 'fakestore', +# 'ebofs' +# ], + 'until' => 100, # --syn until $n ... when to stop clients + 'writefile' => 1, + 'writefile_size' => [ +# 2560000, + 1024000, + 262144, +# 131072, +# 98304, + 65536, +# 16384, +# 4096, + 256, +# 16, +# 1 + ], + 'writefile_mb' => 1000, + + 'ebofs_idle_commit_ms' => [ 100, 500 ], + 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], + +# 'custom' => '--tcp_skip_rank0',# --osd_maxthreads 0', + 'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', + + # for final summation (script/sum.pl) + 'start' => 30, + 'end' => 90, + +'comb' => { + 'x' => 'writefile_size', + 'vars' => [ 'osd.c_wrb' ], +# 'maptitle' => { 'osd_object_layout=' => '', +# ',osd_pg_layout=' => ' + '} + } +}; diff --git a/branches/sage/cephmds2/jobs/osd/mds_log b/branches/sage/cephmds2/jobs/osd/mds_log new file mode 100644 index 0000000000000..0f99f6998dcfc --- /dev/null +++ b/branches/sage/cephmds2/jobs/osd/mds_log @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +# hi there +{ + #'_psub' => 'jobs/alc.tp', + 'sleep' => 3, + + 'nummds' => 1, + 'numclient' => [5, 10, 15, 25, 50, 100, 200, 300, 400], + #'numclient' => [ 50, 100, 200 ], + 'numosd' => [2,4],#[ 4, 8, 12, 16, 20, 24 ], + 'n' => 12, + + # parameters + 'fs' => 'fakestore',#['ebofs', 'fakestore','obfs'], + #'fs' => 'ebofs', + #'ebofs_commit_ms' => [ 1000, 5000 ], + #'osd_maxthreads' => [ 0, 1, 2, 4, 8 ], + + 'until' => 100, # --syn until $n ... 
when to stop clients + 'kill_after' => 300, + 'start' => 20, + 'end' => 90, + + 'makedirs' => 1, + 'makedirs_dirs' => 10, + 'makedirs_files' => 10, + 'makedirs_depth' => 3, + + + #'meta_log_layout_ssize' => [256, 512, 1024, 4096, 16384, 65536, 262400], + #'meta_log_layout_scount' => [2, 4, 8], + #'meta_log_layout_num_rep' => [1, 2], + #'meta_log_layout_num_rep' => 1, + + 'custom' => '--tcp_skip_rank0 --mds_shutdown_check 60', + #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', + + 'comb' => { + 'x' => 'numclient',#'meta_log_layout_ssize', + 'vars' => [ 'mds.req' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/osd/osd_threads b/branches/sage/cephmds2/jobs/osd/osd_threads new file mode 100644 index 0000000000000..ef271f9e88710 --- /dev/null +++ b/branches/sage/cephmds2/jobs/osd/osd_threads @@ -0,0 +1,33 @@ +# hi there +{ + # startup + 'n' => 30, # mpi nodes + 'sleep' => 10, # seconds between runs + 'nummds' => 1, + 'numosd' => 8, + 'numclient' => 50, + + # parameters + 'fs' => [ +# 'obfs', + 'fakestore', + 'ebofs' + ], + 'until' => 100, # --syn until $n ... 
when to stop clients + 'writefile' => 1, + 'writefile_size' => [ + 1024000, + 131072, + 65536, + 16 + ], + 'writefile_mb' => 1000, + + 'osd_maxthreads' => [0, 1, 2, 4, 8], + + 'custom' => '--tcp_skip_rank0', + + # for final summation (script/sum.pl) + 'start' => 30, + 'end' => 90 +}; diff --git a/branches/sage/cephmds2/jobs/osd/striping b/branches/sage/cephmds2/jobs/osd/striping new file mode 100644 index 0000000000000..ea8cabe643274 --- /dev/null +++ b/branches/sage/cephmds2/jobs/osd/striping @@ -0,0 +1,78 @@ +#!/usr/bin/perl +# hi there +{ + # startup + #'n' => 28, # mpi nodes + + 'sleep' => 3, # seconds between runs + 'nummds' => 1, + + 'numosd' => [2,3,4,5,6,7,8,10,12], #[6, 8, 10, 12, 16], + 'numosd' => [14], + #'cper' => [4, 5, 6, 7, 8, 9, 10, 11, 12], #[1, 4, 6, 8, 16, 32, 64], + #'cper' => [4, 6, 8, 10, 12, 16, 24, 32 ], #[1, 4, 6, 8, 16, 32, 64], + 'cper' => [30], + + '_dep' => [ 'cnode' => '$numosd', + 'numclient' => '$cnode * $cper', + 'n' => 38],#'$nummds + $numosd + $cnode'], + #'numclient' => [5, 10, 20, 50, 75, 100, 150 ], + + 'start' => 30, + 'end' => 90, + 'until' => 100, # --syn until $n ... 
when to stop clients + 'kill_after' => 260, + + # parameters + 'fs' => 'ebofs', + 'writefile' => 1, + + 'writefile_size' => [# 4096, + # 16*1024, + # 64*1024, + # 256*1024, + 1024*1024 ], +# 'writefile_size' => [ +# 2048*1024, +# 1048576, +# 512*1024, +# 262144, +# 65536, +# 16384 +# ], + 'writefile_mb' => 1000, + + 'file_layout_num_rep'=> [1,2,3], + + 'osd_pg_bits' => 12,#[6, 8, 10, 12, 14], + + 'osd_object_layout' => [ 'hashino' ],#'hash', 'hashino', 'linear' ], + 'osd_pg_layout' => [ 'crush', 'linear' ],#, 'linear'],#, 'hash' ],#, 'linear' ],#, 'hash' ], + + #'custom' => '--tcp_skip_rank0', # --osd_maxthreads 0', + #'custom' => '--debug_after 110 --debug_osd 15 --debug_filer 15 --debug 5', + + # for final summation (script/sum.pl) + + 'comb' => { + 'x' => 'numosd',#'writefile_size', + 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ], +# 'maptitle' => { 'osd_object_layout=' => '', +# ',osd_pg_layout=' => ' + '} + } +}; + + +=item some googoo notes + +for 1mb 1x writes, + + with numosd=6, min cper=6 to saturate (cper_saturate) + googoo saturates at numosd=8. (osd_saturate) + + -> so, numosd=6 or 7 is a safe size! 
+ + + + +=cut diff --git a/branches/sage/cephmds2/jobs/osd/wr_lat2 b/branches/sage/cephmds2/jobs/osd/wr_lat2 new file mode 100644 index 0000000000000..47053dd61f3ab --- /dev/null +++ b/branches/sage/cephmds2/jobs/osd/wr_lat2 @@ -0,0 +1,44 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + 'nummds' => 1, + 'numosd' => [12], + 'numclient' => [1],#, 40, 80, 160 ], + 'n' => 16, + + 'fs' => 'ebofs', + + 'start' => 10, + 'end' => 40, + 'until' => 40, + 'kill_after' => 90, + + 'writefile' => 1, + 'writefile_size' => [4096, + 8*1024, + 16*1024, + 32*1024, + 64*1024, + 128*1024, + 256*1024, + 512*1024, + 1024*1024], + 'writefile_mb' => 10000, + + #'tcp_multi_out' => [0,1], + +# 'mds_local_osd' => [ 0, 1 ], + 'file_layout_num_rep' => [1,2,3],#, 2, 3, 4], + + 'client_oc' => [0,1], + + 'custom' => '--tcp_skip_rank0', + + 'comb' => { + 'x' => 'writefile_size',#'file_layout_num_rep', + 'vars' => [ 'osd.c_wrb','cl.wrlat' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/osd/write_sizes b/branches/sage/cephmds2/jobs/osd/write_sizes new file mode 100644 index 0000000000000..57369f3a97c50 --- /dev/null +++ b/branches/sage/cephmds2/jobs/osd/write_sizes @@ -0,0 +1,60 @@ +#!/usr/bin/perl +# hi there +{ + # startup + 'n' => 30, # mpi nodes + 'sleep' => 3, # seconds between runs + 'nummds' => 1, + 'numosd' => 6, + 'numclient' => 100,#[25,50,100,300],#100,#[10, 50, 100, 200, 400], + + 'until' => 100, # --syn until $n ... 
when to stop clients + 'kill_after' => 300, + + # parameters + 'fs' => [ +# 'obfs', + 'fakestore', +# 'ebofs' + ], + 'writefile' => 1, + 'writefile_size' => [ +# 2048*1024, + 1024*1024, + 512*1024, + 256*1024, + 128*1024, + 64*1024, + 48*1024, + 32*1024, + 28*1024, + 24*1024, + 16*1024, + 12*1024, + 8*1024, + 4096, +# 256, +# 16, +# 1 + ], + 'writefile_mb' => 1000, + + 'file_layout_num_rep'=> 1,#[1,2], + + +# 'ebofs_idle_commit_ms' => [ 100, 500 ], +# 'ebofs_abp_max_alloc' => [ 4096*16, 4096*64 ], + + 'custom' => '--debug_after 110 --debug_mds 15 --debug 5 --mds_shutdown_check 60', + + # for final summation (script/sum.pl) + 'start' => 30, + 'end' => 90, + + 'comb' => { + 'x' => 'writefile_size', + 'vars' => [ 'osd.c_wrb' ], +# 'maptitle' => { 'osd_object_layout=' => '', +# ',osd_pg_layout=' => ' + '} + } +}; diff --git a/branches/sage/cephmds2/jobs/rados/map_dist b/branches/sage/cephmds2/jobs/rados/map_dist new file mode 100644 index 0000000000000..39f16daa1cdc2 --- /dev/null +++ b/branches/sage/cephmds2/jobs/rados/map_dist @@ -0,0 +1,32 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + 'osdbits' => [6,7,8],#,9],10,11], + 'pgperbits' => [3],#,4,5],#[4,6,8], + + 'nummds' => 1, + + '_dep' => [ 'numosd' => '1 << $osdbits', + 'osd_pg_bits' => '$pgperbits + $osdbits', + 'n' => '3 + $numosd / 32'], + 'numclient' => 0, + + 'fake_osdmap_updates' => [30], + + 'fs' => 'ebofs', + + 'start' => 30, + 'end' => 300, + 'kill_after' => 300, + + 'custom' => '--bdev_lock 0 --ms_stripe_osds --osd_maxthreads 0', + #'custom' => '--tcp_skip_rank0', + + 'comb' => { + 'x' => 'osdbits', + 'vars' => [ 'osd.sum=mapi', 'osd.sum=mapidup', 'osd.numpg', 'osd.pingset' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/rados/rep_lat b/branches/sage/cephmds2/jobs/rados/rep_lat new file mode 100644 index 0000000000000..3f5ab0c8a7d87 --- /dev/null +++ b/branches/sage/cephmds2/jobs/rados/rep_lat @@ -0,0 +1,43 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + 'nummds' => 1, + 'numosd' => 
8, #[6], + 'numclient' => 1,#, 40, 80, 160 ], + 'n' => 10, + + 'fs' => 'ebofs', + + 'start' => 10, + 'end' => 40, + 'until' => 40, + 'kill_after' => 45, + + 'writefile' => 1, + 'writefile_size' => [4096, +# 8*1024, +# 16*1024, +# 32*1024, + 64*1024, +# 128*1024, +# 256*1024, +# 512*1024, +# 1024*1024 +], + 'writefile_mb' => 10000, + + 'osd_rep' => [0,1,2], + + 'file_layout_num_rep' => [1,2,3,4,5,6],#, 2, 3, 4], + + 'osd_pg_bits' => 4, + 'custom' => '--osd_max_rep 8', + + 'comb' => { + 'x' => 'file_layout_num_rep', + 'vars' => [ 'cl.wrlat' ] + } +}; diff --git a/branches/sage/cephmds2/jobs/rados/wr_sizes b/branches/sage/cephmds2/jobs/rados/wr_sizes new file mode 100644 index 0000000000000..9b73477ea6142 --- /dev/null +++ b/branches/sage/cephmds2/jobs/rados/wr_sizes @@ -0,0 +1,50 @@ +#!/usr/bin/perl + +# hi there +{ + 'sleep' => 3, + + 'nummds' => 1, + 'numosd' => [8],#10,14,16], + 'numclient' => [10*16], + 'n' => 15, + + 'fs' => 'ebofs', + + 'start' => 60, + 'end' => 90, + 'until' => 90, + 'kill_after' => 190, + + 'writefile' => 1, + 'writefile_size' => [4096, + 8*1024, + 16*1024, + 32*1024, + 64*1024, + 128*1024, + 256*1024, + # 512*1024, +# 4*1024*1024, +# 2*1024*1024, +# 1024*1024 +], + 'writefile_mb' => 10000, + + 'file_layout_num_rep' => 1, + 'file_layout_ssize' => 4*1024*1024, + 'file_layout_osize' => 4*1024*1024, + + 'osd_pg_bits' => 12, + +# 'ebofs_freelist' => [0, 1080, 65400], + + 'custom' => '--objecter_buffer_uncommitted 0', + + #'custom' => '--tcp_skip_rank0', + + 'comb' => { + 'x' => 'writefile_size', + 'vars' => [ 'osd.c_wrb', 'cl.wrlat' ] + } +}; diff --git a/branches/sage/cephmds2/mds/Anchor.h b/branches/sage/cephmds2/mds/Anchor.h new file mode 100644 index 0000000000000..8da2bbdb52cd5 --- /dev/null +++ b/branches/sage/cephmds2/mds/Anchor.h @@ -0,0 +1,55 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; 
you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __ANCHOR_H
+#define __ANCHOR_H
+
+#include <string>
+using std::string;
+
+#include "include/types.h"
+#include "include/buffer.h"
+
+// Anchor: one anchor record -- an inode, the directory that contains it,
+// the name of the dentry that refers to it, and a reference count.
+class Anchor {
+public:
+  inodeno_t ino;      // my ino
+  inodeno_t dirino;   // containing dir
+  string ref_dn;      // referring dentry
+  int nref;           // reference count
+
+  // Zero the scalar fields so a default-constructed record never carries
+  // indeterminate values (e.g. if it is encoded before being decoded into).
+  Anchor() : ino(0), dirino(0), nref(0) {}
+
+  // ref_dn is only copied, so it is taken by const reference; existing
+  // callers that pass a non-const string keep working.
+  Anchor(inodeno_t ino, inodeno_t dirino, const string& ref_dn, int nref=0)
+    : ino(ino), dirino(dirino), ref_dn(ref_dn), nref(nref) {}
+
+  // Append ino, dirino and nref to bl as raw bytes, then ref_dn via ::_encode.
+  // The field order here must match _decode below.
+  void _encode(bufferlist &bl) {
+    bl.append((char*)&ino, sizeof(ino));
+    bl.append((char*)&dirino, sizeof(dirino));
+    bl.append((char*)&nref, sizeof(nref));
+    ::_encode(ref_dn, bl);
+  }
+
+  // Read the fields back from bl starting at byte offset off, in the same
+  // order _encode wrote them; off is advanced past everything consumed.
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    bl.copy(off, sizeof(dirino), (char*)&dirino);
+    off += sizeof(dirino);
+    bl.copy(off, sizeof(nref), (char*)&nref);
+    off += sizeof(nref);
+    ::_decode(ref_dn, bl, off);
+  }
+} ;
+
+#endif
diff --git a/branches/sage/cephmds2/mds/AnchorClient.cc b/branches/sage/cephmds2/mds/AnchorClient.cc
new file mode 100644
index 0000000000000..b330a93cec6ca
--- /dev/null
+++ b/branches/sage/cephmds2/mds/AnchorClient.cc
@@ -0,0 +1,149 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+using std::cout;
+using std::cerr;
+using std::endl;
+
+#include "Anchor.h"
+#include "AnchorClient.h"
+#include "MDSMap.h"
+
+#include "include/Context.h"
+#include "msg/Messenger.h"
+
+#include "MDS.h"
+
+#include "messages/MAnchorRequest.h"
+#include "messages/MAnchorReply.h"
+
+#include "config.h"
+#undef dout
+// dout logs to stdout, derr to stderr (same split as AnchorTable.cc).
+#define dout(x) if (x <= g_conf.debug) cout << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient "
+#define derr(x) if (x <= g_conf.debug) cerr << g_clock.now() << " " << messenger->get_myaddr() << ".anchorclient "
+
+
+// Entry point for incoming messages; only anchor replies are expected here.
+void AnchorClient::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+  case MSG_MDS_ANCHORREPLY:
+    handle_anchor_reply((MAnchorReply*)m);
+    break;
+
+  default:
+    assert(0);
+  }
+}
+
+// Match a reply to the request recorded for its ino, hand the result to the
+// caller and fire (then delete) the completion context, if one was given.
+// NOTE(review): m itself is not deleted here -- confirm who owns the message.
+void AnchorClient::handle_anchor_reply(class MAnchorReply *m)
+{
+  switch (m->get_op()) {
+
+  case ANCHOR_OP_LOOKUP:
+    {
+      assert(pending_lookup_trace.count(m->get_ino()) == 1);
+
+      // copy the trace into the vector the caller passed to lookup()
+      *(pending_lookup_trace[ m->get_ino() ]) = m->get_trace();
+      Context *onfinish = pending_lookup_context[ m->get_ino() ];
+
+      pending_lookup_trace.erase(m->get_ino());
+      pending_lookup_context.erase(m->get_ino());
+
+      if (onfinish) {
+        onfinish->finish(0);
+        delete onfinish;
+      }
+    }
+    break;
+
+  case ANCHOR_OP_UPDATE:
+  case ANCHOR_OP_CREATE:
+  case ANCHOR_OP_DESTROY:
+    {
+      assert(pending_op.count(m->get_ino()) == 1);
+
+      Context *onfinish = pending_op[m->get_ino()];
+      pending_op.erase(m->get_ino());
+
+      if (onfinish) {
+        onfinish->finish(0);
+        delete onfinish;
+      }
+    }
+    break;
+
+  default:
+    assert(0);
+  }
+
+}
+
+
+
+/*
+ * public async interface
+ *
+ * Each call records its completion context keyed by ino and sends a request
+ * to the MDS that mdsmap reports as holding the anchor table.  Pending state
+ * is keyed by ino alone, so a second request for the same ino issued before
+ * the first reply arrives overwrites the first one's entry.
+ *
+ * NOTE(review): the template arguments of 'vector' were missing from this
+ * copy of the patch; vector<Anchor*> is a reconstruction -- confirm against
+ * MAnchorRequest::set_trace / MAnchorReply::get_trace.
+ */
+
+// Look up ino; trace is filled in when the reply arrives, so it must stay
+// valid until onfinish has run (only its address is stored here).
+void AnchorClient::lookup(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish)
+{
+  // send message
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_LOOKUP, ino);
+
+  pending_lookup_trace[ino] = &trace;
+  pending_lookup_context[ino] = onfinish;
+
+  messenger->send_message(req,
+                          MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+                          MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
+}
+
+// Send a CREATE request for ino carrying the given trace.
+void AnchorClient::create(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish)
+{
+  // send message
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_CREATE, ino);
+  req->set_trace(trace);
+
+  pending_op[ino] = onfinish;
+
+  messenger->send_message(req,
+                          MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+                          MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
+}
+
+// Send an UPDATE request for ino carrying the given trace.
+void AnchorClient::update(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish)
+{
+  // send message
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_UPDATE, ino);
+  req->set_trace(trace);
+
+  pending_op[ino] = onfinish;
+
+  messenger->send_message(req,
+                          MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+                          MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
+}
+
+// Send a DESTROY request for ino.
+void AnchorClient::destroy(inodeno_t ino, Context *onfinish)
+{
+  // send message
+  MAnchorRequest *req = new MAnchorRequest(ANCHOR_OP_DESTROY, ino);
+
+  pending_op[ino] = onfinish;
+
+  messenger->send_message(req,
+                          MSG_ADDR_MDS(mdsmap->get_anchortable()), mdsmap->get_inst(mdsmap->get_anchortable()),
+                          MDS_PORT_ANCHORMGR, MDS_PORT_ANCHORCLIENT);
+}
+
+
diff --git a/branches/sage/cephmds2/mds/AnchorClient.h b/branches/sage/cephmds2/mds/AnchorClient.h
new file mode 100644
index 0000000000000..80b736a4b65c7
--- /dev/null
+++ b/branches/sage/cephmds2/mds/AnchorClient.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __ANCHORCLIENT_H
+#define __ANCHORCLIENT_H
+
+#include <vector>
+using std::vector;
+#include <ext/hash_map>
+using __gnu_cxx::hash_map;
+
+#include "include/types.h"
+#include "msg/Dispatcher.h"
+
+#include "Anchor.h"
+
+class Messenger;
+class MDSMap;
+class Context;
+
+// AnchorClient: asynchronous client side of the anchor table.  Requests are
+// sent through the messenger to the MDS named by the MDSMap; the supplied
+// Context is finished when the matching reply is dispatched.
+//
+// NOTE(review): the template arguments below were missing from this copy of
+// the patch.  The inodeno_t -> Context* maps follow from how AnchorClient.cc
+// indexes and assigns them; the vector element type (Anchor*) is a
+// reconstruction -- confirm against MAnchorRequest / MAnchorReply.
+class AnchorClient : public Dispatcher {
+  Messenger *messenger;
+  MDSMap *mdsmap;
+
+  // remote state: outstanding requests, keyed by ino
+  hash_map<inodeno_t, Context*> pending_op;
+  hash_map<inodeno_t, Context*> pending_lookup_context;
+  hash_map<inodeno_t, vector<Anchor*>*> pending_lookup_trace;
+
+  void handle_anchor_reply(class MAnchorReply *m);
+
+
+public:
+  AnchorClient(Messenger *ms, MDSMap *mm) : messenger(ms), mdsmap(mm) {}
+
+  // async user interface
+  void lookup(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish);
+  void create(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish);
+  void update(inodeno_t ino, vector<Anchor*>& trace, Context *onfinish);
+  void destroy(inodeno_t ino, Context *onfinish);
+
+  void dispatch(Message *m);
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/AnchorTable.cc b/branches/sage/cephmds2/mds/AnchorTable.cc
new file mode 100644
index 0000000000000..7b881de0339da
--- /dev/null
+++ b/branches/sage/cephmds2/mds/AnchorTable.cc
@@ -0,0 +1,347 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + +#include "AnchorTable.h" +#include "MDS.h" + +#include "osdc/Filer.h" + +#include "msg/Messenger.h" +#include "messages/MAnchorRequest.h" +#include "messages/MAnchorReply.h" + +#include "common/Clock.h" + +#include "config.h" +#undef dout +#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable " +#define derr(x) if (x <= g_conf.debug_mds) cerr << g_clock.now() << " " << mds->messenger->get_myaddr() << ".anchortable " + +AnchorTable::AnchorTable(MDS *mds) +{ + this->mds = mds; + opening = false; + opened = false; + + memset(&table_inode, 0, sizeof(table_inode)); + table_inode.ino = MDS_INO_ANCHORTABLE+mds->get_nodeid(); + table_inode.layout = g_OSD_FileLayout; +} + +/* + * basic updates + */ + +bool AnchorTable::add(inodeno_t ino, inodeno_t dirino, string& ref_dn) +{ + dout(7) << "add " << std::hex << ino << " dirino " << dirino << std::dec << " ref_dn " << ref_dn << endl; + + // parent should be there + assert(dirino < 1000 || // system dirino + anchor_map.count(dirino)); // have + + if (anchor_map.count(ino) == 0) { + // new item + anchor_map[ ino ] = new Anchor(ino, dirino, ref_dn); + dout(10) << " add: added " << std::hex << ino << std::dec << endl; + return true; + } else { + dout(10) << " add: had " << std::hex << ino << std::dec << endl; + return false; + } +} + +void AnchorTable::inc(inodeno_t ino) +{ + dout(7) << "inc " << std::hex << ino << std::dec << endl; + + assert(anchor_map.count(ino) != 0); + Anchor *anchor = anchor_map[ino]; + assert(anchor); + + while (1) { + anchor->nref++; + + dout(10) << " inc: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl; + ino = anchor->dirino; + + if (ino == 0) break; + if (anchor_map.count(ino) == 0) break; + anchor = anchor_map[ino]; + assert(anchor); + } +} + +void AnchorTable::dec(inodeno_t ino) +{ + dout(7) << "dec " << std::hex << ino << std::dec << endl; + + assert(anchor_map.count(ino) != 0); + Anchor *anchor = 
anchor_map[ino]; + assert(anchor); + + while (true) { + anchor->nref--; + + if (anchor->nref == 0) { + dout(10) << " dec: record " << std::hex << ino << std::dec << " now 0, removing" << endl; + inodeno_t dirino = anchor->dirino; + anchor_map.erase(ino); + delete anchor; + ino = dirino; + } else { + dout(10) << " dec: record " << std::hex << ino << std::dec << " now " << anchor->nref << endl; + ino = anchor->dirino; + } + + if (ino == 0) break; + if (anchor_map.count(ino) == 0) break; + anchor = anchor_map[ino]; + assert(anchor); + } +} + + +/* + * high level + */ + +void AnchorTable::lookup(inodeno_t ino, vector& trace) +{ + dout(7) << "lookup " << std::hex << ino << std::dec << endl; + + assert(anchor_map.count(ino) == 1); + Anchor *anchor = anchor_map[ino]; + assert(anchor); + + while (true) { + dout(10) << " record " << std::hex << anchor->ino << " dirino " << anchor->dirino << std::dec << " ref_dn " << anchor->ref_dn << endl; + trace.insert(trace.begin(), anchor); // lame FIXME + + if (anchor->dirino < MDS_INO_BASE) break; + + assert(anchor_map.count(anchor->dirino) == 1); + anchor = anchor_map[anchor->dirino]; + assert(anchor); + } +} + +void AnchorTable::create(inodeno_t ino, vector& trace) +{ + dout(7) << "create " << std::hex << ino << std::dec << endl; + + // make sure trace is in table + for (unsigned i=0; iino, trace[i]->dirino, trace[i]->ref_dn); + + inc(ino); // ok! +} + +void AnchorTable::destroy(inodeno_t ino) +{ + dec(ino); +} + + + +/* + * messages + */ + +void AnchorTable::dispatch(Message *m) +{ + switch (m->get_type()) { + case MSG_MDS_ANCHORREQUEST: + handle_anchor_request((MAnchorRequest*)m); + break; + + default: + assert(0); + } +} + + + +void AnchorTable::handle_anchor_request(class MAnchorRequest *m) +{ + // make sure i'm open! 
+ if (!opened) { + dout(7) << "not open yet" << endl; + + waiting_for_open.push_back(new C_MDS_RetryMessage(mds,m)); + + if (!opening) { + opening = true; + load(0); + } + return; + } + + // go + MAnchorReply *reply = new MAnchorReply(m); + + switch (m->get_op()) { + + case ANCHOR_OP_LOOKUP: + lookup( m->get_ino(), reply->get_trace() ); + break; + + case ANCHOR_OP_UPDATE: + destroy( m->get_ino() ); + create( m->get_ino(), m->get_trace() ); + break; + + case ANCHOR_OP_CREATE: + create( m->get_ino(), m->get_trace() ); + break; + + case ANCHOR_OP_DESTROY: + destroy( m->get_ino() ); + break; + + default: + assert(0); + } + + // send reply + mds->messenger->send_message(reply, m->get_source(), m->get_source_port()); + delete m; +} + + + + +// primitive load/save for now! + +// load/save entire table for now! + +void AnchorTable::save(Context *onfinish) +{ + dout(7) << "save" << endl; + if (!opened) return; + + // build up write + bufferlist tabbl; + + int num = anchor_map.size(); + tabbl.append((char*)&num, sizeof(int)); + + for (hash_map::iterator it = anchor_map.begin(); + it != anchor_map.end(); + it++) { + dout(14) << " saving anchor for " << std::hex << it->first << std::dec << endl; + Anchor *a = it->second; + assert(a); + a->_encode(tabbl); + } + + bufferlist bl; + size_t size = tabbl.length(); + bl.append((char*)&size, sizeof(size)); + bl.claim_append(tabbl); + + dout(7) << " " << num << " anchors, " << size << " bytes" << endl; + + // write! 
+ mds->filer->write(table_inode, + 0, bl.length(), + bl, 0, + NULL, onfinish); +} + + + +class C_AT_Load : public Context { + AnchorTable *at; +public: + size_t size; + bufferlist bl; + C_AT_Load(size_t size, AnchorTable *at) { + this->size = size; + this->at = at; + } + void finish(int result) { + assert(result > 0); + + at->load_2(size, bl); + } +}; + +class C_AT_LoadSize : public Context { + AnchorTable *at; + MDS *mds; +public: + bufferlist bl; + C_AT_LoadSize(AnchorTable *at, MDS *mds) { + this->at = at; + this->mds = mds; + } + void finish(int r) { + size_t size = 0; + assert(bl.length() >= sizeof(size)); + bl.copy(0, sizeof(size), (char*)&size); + cout << "r is " << r << " size is " << size << endl; + if (r > 0 && size > 0) { + C_AT_Load *c = new C_AT_Load(size, at); + mds->filer->read(at->table_inode, + sizeof(size), size, + &c->bl, + c); + } else { + // fail + bufferlist empty; + at->load_2(0, empty); + } + } +}; + +void AnchorTable::load(Context *onfinish) +{ + dout(7) << "load" << endl; + + assert(!opened); + + waiting_for_open.push_back(onfinish); + + C_AT_LoadSize *c = new C_AT_LoadSize(this, mds); + mds->filer->read(table_inode, + 0, sizeof(size_t), + &c->bl, + c); +} + +void AnchorTable::load_2(size_t size, bufferlist& bl) +{ + // num + int off = 0; + int num; + bl.copy(0, sizeof(num), (char*)&num); + off += sizeof(num); + + // parse anchors + for (int i=0; i_decode(bl, off); + dout(10) << "load_2 decoded " << std::hex << a->ino << " dirino " << a->dirino << std::dec << " ref_dn " << a->ref_dn << endl; + anchor_map[a->ino] = a; + } + + dout(7) << "load_2 got " << num << " anchors" << endl; + + opened = true; + opening = false; + + // finish + finish_contexts(waiting_for_open); +} + diff --git a/branches/sage/cephmds2/mds/AnchorTable.h b/branches/sage/cephmds2/mds/AnchorTable.h new file mode 100644 index 0000000000000..2e6c1d7b07788 --- /dev/null +++ b/branches/sage/cephmds2/mds/AnchorTable.h @@ -0,0 +1,82 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __ANCHORTABLE_H +#define __ANCHORTABLE_H + +#include "Anchor.h" +#include "include/Context.h" + +#include +using namespace __gnu_cxx; + +class MDS; + + +class AnchorTable { + MDS *mds; + hash_map anchor_map; + + bool opening, opened; + list waiting_for_open; + + public: + inode_t table_inode; + + public: + AnchorTable(MDS *mds); + + protected: + // + bool have_ino(inodeno_t ino) { + return true; // always in memory for now. + } + void fetch_ino(inodeno_t ino, Context *onfinish) { + assert(!opened); + load(onfinish); + } + + // adjust table + bool add(inodeno_t ino, inodeno_t dirino, string& ref_dn); + void inc(inodeno_t ino); + void dec(inodeno_t ino); + + + // high level interface + void lookup(inodeno_t ino, vector& trace); + void create(inodeno_t ino, vector& trace); + void destroy(inodeno_t ino); + + // messages + public: + void dispatch(class Message *m); + protected: + void handle_anchor_request(class MAnchorRequest *m); + + + public: + + // load/save entire table for now! 
+ void reset() { + opened = true; + anchor_map.clear(); + } + void save(Context *onfinish); + void load(Context *onfinish); + void load_2(size_t size, bufferlist& bl); + + +}; + +#endif diff --git a/branches/sage/cephmds2/mds/CDentry.cc b/branches/sage/cephmds2/mds/CDentry.cc new file mode 100644 index 0000000000000..2cfbbd80b58be --- /dev/null +++ b/branches/sage/cephmds2/mds/CDentry.cc @@ -0,0 +1,141 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "CDentry.h" +#include "CInode.h" +#include "CDir.h" + +#include + +#undef dout +#define dout(x) if ((x) <= g_conf.debug) cout << "mds.dentry " + + +// CDentry + +ostream& operator<<(ostream& out, CDentry& dn) +{ + out << "[dentry " << dn.get_name(); + if (dn.is_pinned()) out << " " << dn.num_pins() << " pins"; + + if (dn.is_null()) out << " NULL"; + if (dn.is_remote()) out << " REMOTE"; + + if (dn.get_lockstate() == DN_LOCK_UNPINNING) out << " unpinning"; + if (dn.is_dirty()) out << " dirty"; + if (dn.get_lockstate() == DN_LOCK_PREXLOCK) out << " prexlock=" << dn.get_xlockedby() << " g=" << dn.get_gather_set(); + if (dn.get_lockstate() == DN_LOCK_XLOCK) out << " xlock=" << dn.get_xlockedby(); + + out << " dirv=" << dn.get_parent_dir_version(); + + out << " inode=" << dn.get_inode(); + out << " " << &dn; + out << " in " << *dn.get_dir(); + out << "]"; + return out; +} + +CDentry::CDentry(const CDentry& m) { + assert(0); //std::cerr << "copy cons called, implement me" << endl; +} + + +void CDentry::mark_dirty() +{ + dout(10) << " mark_dirty " << *this << endl; + + // dir is now dirty (if it wasn't already) + dir->mark_dirty(); + + // pin inode? 
+ if (is_primary() && !dirty && inode) inode->get(CINODE_PIN_DNDIRTY); + + // i now live in that (potentially newly dirty) version + parent_dir_version = dir->get_version(); + + dirty = true; +} +void CDentry::mark_clean() { + dout(10) << " mark_clean " << *this << endl; + assert(dirty); + assert(parent_dir_version <= dir->get_version()); + + if (parent_dir_version < dir->get_last_committed_version()) + cerr << " bad mark_clean " << *this << endl; + + assert(parent_dir_version >= dir->get_last_committed_version()); + + if (is_primary() && dirty && inode) inode->put(CINODE_PIN_DNDIRTY); + dirty = false; +} + + +void CDentry::make_path(string& s) +{ + if (dir->inode->get_parent_dn()) + dir->inode->get_parent_dn()->make_path(s); + + s += "/"; + s += name; +} + + +void CDentry::link_remote(CInode *in) +{ + assert(is_remote()); + assert(in->ino() == remote_ino); + + inode = in; + in->add_remote_parent(this); +} + +void CDentry::unlink_remote() +{ + assert(is_remote()); + assert(inode); + + inode->remove_remote_parent(this); + inode = 0; +} + + + + + +// = +const CDentry& CDentry::operator= (const CDentry& right) { + assert(0); //std::cerr << "copy op called, implement me" << endl; + return *this; +} + + // comparisons + bool CDentry::operator== (const CDentry& right) const { + return name == right.name; + } + bool CDentry::operator!= (const CDentry& right) const { + return name == right.name; + } + bool CDentry::operator< (const CDentry& right) const { + return name < right.name; + } + bool CDentry::operator> (const CDentry& right) const { + return name > right.name; + } + bool CDentry::operator>= (const CDentry& right) const { + return name >= right.name; + } + bool CDentry::operator<= (const CDentry& right) const { + return name <= right.name; + } diff --git a/branches/sage/cephmds2/mds/CDentry.h b/branches/sage/cephmds2/mds/CDentry.h new file mode 100644 index 0000000000000..a399ef7acfe5a --- /dev/null +++ b/branches/sage/cephmds2/mds/CDentry.h @@ -0,0 +1,188 @@ +// 
-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#ifndef __CDENTRY_H +#define __CDENTRY_H + +#include +#include +#include +using namespace std; + +#include "include/types.h" + +class CInode; +class CDir; + +#define DN_LOCK_SYNC 0 +#define DN_LOCK_PREXLOCK 1 +#define DN_LOCK_XLOCK 2 +#define DN_LOCK_UNPINNING 3 // waiting for pins to go away + +#define DN_XLOCK_FOREIGN ((Message*)0x1) // not 0, not a valid pointer. + +class Message; + +// dentry +class CDentry { + protected: + string name; + CInode *inode; + CDir *dir; + + inodeno_t remote_ino; // if remote dentry + + // state + bool dirty; + version_t parent_dir_version; // dir version when last touched. 
+ + // locking + int lockstate; + Message *xlockedby; + set gather_set; + + int npins; + multiset pinset; + + friend class Migrator; + friend class Locker; + friend class Renamer; + friend class Server; + friend class MDCache; + friend class MDS; + friend class CInode; + friend class C_MDC_XlockRequest; + + public: + // cons + CDentry() : + inode(0), + dir(0), + remote_ino(0), + dirty(0), + parent_dir_version(0), + lockstate(DN_LOCK_SYNC), + xlockedby(0), + npins(0) { } + CDentry(const string& n, inodeno_t ino, CInode *in=0) : + name(n), + inode(in), + dir(0), + remote_ino(ino), + dirty(0), + parent_dir_version(0), + lockstate(DN_LOCK_SYNC), + xlockedby(0), + npins(0) { } + CDentry(const string& n, CInode *in) : + name(n), + inode(in), + dir(0), + remote_ino(0), + dirty(0), + parent_dir_version(0), + lockstate(DN_LOCK_SYNC), + xlockedby(0), + npins(0) { } + + CInode *get_inode() { return inode; } + CDir *get_dir() { return dir; } + const string& get_name() { return name; } + inodeno_t get_remote_ino() { return remote_ino; } + + void set_remote_ino(inodeno_t ino) { remote_ino = ino; } + + // dentry type is primary || remote || null + // inode ptr is required for primary, optional for remote, undefined for null + bool is_primary() { return remote_ino == 0 && inode != 0; } + bool is_remote() { return remote_ino > 0; } + bool is_null() { return (remote_ino == 0 && inode == 0) ? 
true:false; } + + // remote links + void link_remote(CInode *in); + void unlink_remote(); + + + // copy cons + CDentry(const CDentry& m); + const CDentry& operator= (const CDentry& right); + + // comparisons + bool operator== (const CDentry& right) const; + bool operator!= (const CDentry& right) const; + bool operator< (const CDentry& right) const; + bool operator> (const CDentry& right) const; + bool operator>= (const CDentry& right) const; + bool operator<= (const CDentry& right) const; + + // misc + void make_path(string& p); + + // -- state + __uint64_t get_parent_dir_version() { return parent_dir_version; } + void float_parent_dir_version(__uint64_t ge) { + if (parent_dir_version < ge) + parent_dir_version = ge; + } + + bool is_dirty() { return dirty; } + bool is_clean() { return !dirty; } + + void mark_dirty(); + void mark_clean(); + + + // -- locking + int get_lockstate() { return lockstate; } + set& get_gather_set() { return gather_set; } + + bool is_sync() { return lockstate == DN_LOCK_SYNC; } + bool can_read() { return (lockstate == DN_LOCK_SYNC) || (lockstate == DN_LOCK_UNPINNING); } + bool can_read(Message *m) { return is_xlockedbyme(m) || can_read(); } + bool is_xlocked() { return lockstate == DN_LOCK_XLOCK; } + Message* get_xlockedby() { return xlockedby; } + bool is_xlockedbyother(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m != xlockedby; } + bool is_xlockedbyme(Message *m) { return (lockstate == DN_LOCK_XLOCK) && m == xlockedby; } + bool is_prexlockbyother(Message *m) { + return (lockstate == DN_LOCK_PREXLOCK) && m != xlockedby; + } + + // pins + void pin(Message *m) { + npins++; + pinset.insert(m); + assert(pinset.size() == (unsigned)npins); + } + void unpin(Message *m) { + npins--; + assert(npins >= 0); + assert(pinset.count(m) > 0); + pinset.erase(pinset.find(m)); + assert(pinset.size() == (unsigned)npins); + } + bool is_pinnable(Message *m) { + return (lockstate == DN_LOCK_SYNC) || + (lockstate == DN_LOCK_UNPINNING && pinset.count(m)); 
+ } + bool is_pinned() { return npins>0; } + int num_pins() { return npins; } + + friend class CDir; +}; + +ostream& operator<<(ostream& out, CDentry& dn); + + +#endif diff --git a/branches/sage/cephmds2/mds/CDir.cc b/branches/sage/cephmds2/mds/CDir.cc new file mode 100644 index 0000000000000..a590e6821e1de --- /dev/null +++ b/branches/sage/cephmds2/mds/CDir.cc @@ -0,0 +1,914 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "CDir.h" +#include "CDentry.h" +#include "CInode.h" + +#include "MDS.h" +#include "MDSMap.h" + +#include "include/Context.h" +#include "common/Clock.h" + +#include + +#include "config.h" +#undef dout +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache.dir(" << inode->inode.ino << ") " + + +// PINS +int cdir_pins[CDIR_NUM_PINS] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + +static char* cdir_pin_names[CDIR_NUM_PINS] = { + "child", + "opened", + "waiter", + "import", + "export", + "freeze", + "proxy", + "authpin", + "imping", + "impex", + "hashed", + "hashing", + "dirty", + "reqpins" +}; + + +ostream& operator<<(ostream& out, CDir& dir) +{ + string path; + dir.get_inode()->make_path(path); + out << "[dir " << dir.ino() << " " << path << "/"; + if (dir.is_dirty()) out << " dirty"; + if (dir.is_import()) out << " import"; + if (dir.is_export()) out << " export"; + if (dir.is_rep()) out << " repl"; + if (dir.is_hashed()) out << " hashed"; //=" << (int)dir.get_inode()->inode.hash_seed; + if (dir.is_auth()) { + out << " auth"; + if (dir.is_open_by_anyone()) + out << "+" << dir.get_open_by(); + } else { + out << " rep@" << 
dir.authority(); + if (dir.get_replica_nonce() > 1) + out << "." << dir.get_replica_nonce(); + } + + if (dir.is_pinned()) { + out << " |"; + for(set::iterator it = dir.get_ref_set().begin(); + it != dir.get_ref_set().end(); + it++) + if (*it < CDIR_NUM_PINS) + out << " " << cdir_pin_names[*it]; + else + out << " " << *it; + } + + if (dir.get_dir_auth() != CDIR_AUTH_PARENT) + out << " dir_auth=" << dir.get_dir_auth(); + + out << " state=" << dir.get_state(); + out << " sz=" << dir.get_nitems() << "+" << dir.get_nnull(); + + out << " v=" << dir.get_version(); + out << " cv=" << dir.get_committing_version(); + out << " lastcv=" << dir.get_last_committed_version(); + + out << " " << &dir; + return out << "]"; +} + + +// ------------------------------------------------------------------- +// CDir + +CDir::CDir(CInode *in, MDS *mds, bool auth) +{ + inode = in; + this->mds = mds; + + nitems = 0; + nnull = 0; + state = CDIR_STATE_INITIAL; + + version = 0; + committing_version = 0; + last_committed_version = 0; + + ref = 0; + + // auth + dir_auth = -1; + assert(in->is_dir()); + if (auth) + state |= CDIR_STATE_AUTH; + /* + if (in->dir_is_hashed()) { + assert(0); // when does this happen? + state |= CDIR_STATE_HASHED; + } + */ + + auth_pins = 0; + nested_auth_pins = 0; + request_pins = 0; + + dir_rep = CDIR_REP_NONE; +} + + + + +/*** + * linking fun + */ + +CDentry* CDir::add_dentry( const string& dname, inodeno_t ino) +{ + // foreign + assert(lookup(dname) == 0); + + // create dentry + CDentry* dn = new CDentry(dname, ino); + dn->dir = this; + dn->parent_dir_version = version; + + // add to dir + assert(items.count(dn->name) == 0); + assert(null_items.count(dn->name) == 0); + + items[dn->name] = dn; + nitems++; + + dout(12) << "add_dentry " << *dn << endl; + + // pin? 
+ if (nnull + nitems == 1) get(CDIR_PIN_CHILD); + + assert(nnull + nitems == items.size()); + assert(nnull == null_items.size()); + return dn; +} + + +CDentry* CDir::add_dentry( const string& dname, CInode *in ) +{ + // primary + assert(lookup(dname) == 0); + + // create dentry + CDentry* dn = new CDentry(dname, in); + dn->dir = this; + dn->parent_dir_version = version; + + // add to dir + assert(items.count(dn->name) == 0); + assert(null_items.count(dn->name) == 0); + + items[dn->name] = dn; + + if (in) { + link_inode_work( dn, in ); + } else { + assert(dn->inode == 0); + null_items[dn->name] = dn; + nnull++; + } + + dout(12) << "add_dentry " << *dn << endl; + + // pin? + if (nnull + nitems == 1) get(CDIR_PIN_CHILD); + + assert(nnull + nitems == items.size()); + assert(nnull == null_items.size()); + return dn; +} + + + +void CDir::remove_dentry(CDentry *dn) +{ + dout(12) << "remove_dentry " << *dn << endl; + + if (dn->inode) { + // detach inode and dentry + unlink_inode_work(dn); + } else { + // remove from null list + assert(null_items.count(dn->name) == 1); + null_items.erase(dn->name); + nnull--; + } + + // remove from list + assert(items.count(dn->name) == 1); + items.erase(dn->name); + + delete dn; + + // unpin? 
+ if (nnull + nitems == 0) put(CDIR_PIN_CHILD); + + assert(nnull + nitems == items.size()); + assert(nnull == null_items.size()); +} + +void CDir::link_inode( CDentry *dn, inodeno_t ino) +{ + //dout(12) << "link_inode " << *dn << " remote " << ino << endl; + + assert(dn->is_null()); + dn->set_remote_ino(ino); + nitems++; + + assert(null_items.count(dn->name) == 1); + null_items.erase(dn->name); + nnull--; +} + +void CDir::link_inode( CDentry *dn, CInode *in ) +{ + assert(!dn->is_remote()); + + link_inode_work(dn,in); + //dout(12) << "link_inode " << *dn << " " << *in << endl; + + // remove from null list + assert(null_items.count(dn->name) == 1); + null_items.erase(dn->name); + nnull--; + + assert(nnull + nitems == items.size()); + assert(nnull == null_items.size()); +} + +void CDir::link_inode_work( CDentry *dn, CInode *in ) +{ + dn->inode = in; + in->set_primary_parent(dn); + + nitems++; // adjust dir size + + // set dir version + in->parent_dir_version = get_version(); + + // clear dangling + in->state_clear(CINODE_STATE_DANGLING); + + // dn dirty? 
+ if (dn->is_dirty()) in->get(CINODE_PIN_DNDIRTY); + + // adjust auth pin count + if (in->auth_pins + in->nested_auth_pins) + adjust_nested_auth_pins( in->auth_pins + in->nested_auth_pins ); +} + +void CDir::unlink_inode( CDentry *dn ) +{ + dout(12) << "unlink_inode " << *dn << " " << *dn->inode << endl; + + unlink_inode_work(dn); + + // add to null list + assert(null_items.count(dn->name) == 0); + null_items[dn->name] = dn; + nnull++; + + assert(nnull + nitems == items.size()); + assert(nnull == null_items.size()); +} + +void CDir::unlink_inode_work( CDentry *dn ) +{ + CInode *in = dn->inode; + + if (dn->is_remote()) { + // remote + if (in) + dn->unlink_remote(); + + dn->set_remote_ino(0); + } else { + // primary + assert(dn->is_primary()); + + // explicitly define auth + in->dangling_auth = in->authority(); + //dout(10) << "unlink_inode " << *in << " dangling_auth now " << in->dangling_auth << endl; + + // unlink auth_pin count + if (in->auth_pins + in->nested_auth_pins) + adjust_nested_auth_pins( 0 - (in->auth_pins + in->nested_auth_pins) ); + + // set dangling flag + in->state_set(CINODE_STATE_DANGLING); + + // dn dirty? 
+ if (dn->is_dirty()) in->put(CINODE_PIN_DNDIRTY); + + // detach inode + in->remove_primary_parent(dn); + dn->inode = 0; + } + + nitems--; // adjust dir size +} + +void CDir::remove_null_dentries() { + dout(12) << "remove_null_dentries " << *this << endl; + + list dns; + for (CDir_map_t::iterator it = null_items.begin(); + it != null_items.end(); + it++) { + dns.push_back(it->second); + } + + for (list::iterator it = dns.begin(); + it != dns.end(); + it++) { + CDentry *dn = *it; + assert(dn->is_sync()); + remove_dentry(dn); + } + assert(null_items.empty()); + assert(nnull == 0); + assert(nnull + nitems == items.size()); +} + + + +/**************************************** + * WAITING + */ + +bool CDir::waiting_for(int tag) +{ + return waiting.count(tag) > 0; +} + +bool CDir::waiting_for(int tag, const string& dn) +{ + if (!waiting_on_dentry.count(dn)) + return false; + return waiting_on_dentry[dn].count(tag) > 0; +} + +void CDir::add_waiter(int tag, + const string& dentry, + Context *c) { + if (waiting.empty() && waiting_on_dentry.size() == 0) + get(CDIR_PIN_WAITER); + waiting_on_dentry[ dentry ].insert(pair(tag,c)); + dout(10) << "add_waiter dentry " << dentry << " tag " << tag << " " << c << " on " << *this << endl; +} + +void CDir::add_waiter(int tag, Context *c) { + // hierarchical? + if (tag & CDIR_WAIT_ATFREEZEROOT && (is_freezing() || is_frozen())) { + if (is_freezing_tree_root() || is_frozen_tree_root() || + is_freezing_dir() || is_frozen_dir()) { + // it's us, pin here. (fall thru) + } else { + // pin parent! + dout(10) << "add_waiter " << tag << " " << c << " should be ATFREEZEROOT, " << *this << " is not root, trying parent" << endl; + inode->parent->dir->add_waiter(tag, c); + return; + } + } + + // this dir. 
+ if (waiting.empty() && waiting_on_dentry.size() == 0) + get(CDIR_PIN_WAITER); + waiting.insert(pair(tag,c)); + dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl; +} + + +void CDir::take_waiting(int mask, + const string& dentry, + list& ls, + int num) +{ + if (waiting_on_dentry.empty()) return; + + multimap::iterator it = waiting_on_dentry[dentry].begin(); + while (it != waiting_on_dentry[dentry].end()) { + if (it->first & mask) { + ls.push_back(it->second); + dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; + waiting_on_dentry[dentry].erase(it++); + + if (num) { + if (num == 1) break; + num--; + } + } else { + dout(10) << "take_waiting dentry " << dentry << " mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl; + it++; + } + } + + // did we clear dentry? + if (waiting_on_dentry[dentry].empty()) + waiting_on_dentry.erase(dentry); + + // ...whole map? 
+ if (waiting_on_dentry.size() == 0 && waiting.empty()) + put(CDIR_PIN_WAITER); +} + +/* NOTE: this checks dentry waiters too */ +void CDir::take_waiting(int mask, + list& ls) +{ + if (waiting_on_dentry.size()) { + // try each dentry + hash_map >::iterator it = + waiting_on_dentry.begin(); + while (it != waiting_on_dentry.end()) { + take_waiting(mask, (it++)->first, ls); // not post-inc + } + } + + // waiting + if (!waiting.empty()) { + multimap::iterator it = waiting.begin(); + while (it != waiting.end()) { + if (it->first & mask) { + ls.push_back(it->second); + dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl; + waiting.erase(it++); + } else { + dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this<< endl; + it++; + } + } + + if (waiting_on_dentry.size() == 0 && waiting.empty()) + put(CDIR_PIN_WAITER); + } +} + + +void CDir::finish_waiting(int mask, int result) +{ + dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl; + + list finished; + take_waiting(mask, finished); + finish_contexts(finished, result); +} + +void CDir::finish_waiting(int mask, const string& dn, int result) +{ + dout(11) << "finish_waiting mask " << mask << " dn " << dn << " result " << result << " on " << *this << endl; + + list finished; + take_waiting(mask, dn, finished); + finish_contexts(finished, result); +} + + +// dirty/clean + +void CDir::mark_dirty() +{ + if (!state_test(CDIR_STATE_DIRTY)) { + version++; + state_set(CDIR_STATE_DIRTY); + dout(10) << "mark_dirty (was clean) " << *this << " new version " << version << endl; + get(CDIR_PIN_DIRTY); + } + else if (state_test(CDIR_STATE_COMMITTING) && + committing_version == version) { + version++; // now dirtier than committing version! 
+ dout(10) << "mark_dirty (committing) " << *this << " new version " << version << "/" << committing_version << endl; + } else { + dout(10) << "mark_dirty (already dirty) " << *this << " version " << version << endl; + } +} + +void CDir::mark_clean() +{ + dout(10) << "mark_clean " << *this << " version " << version << endl; + if (state_test(CDIR_STATE_DIRTY)) { + state_clear(CDIR_STATE_DIRTY); + put(CDIR_PIN_DIRTY); + } +} + + + +// ref counts + +void CDir::put(int by) { + cdir_pins[by]--; + + // bad? + if (ref == 0 || ref_set.count(by) != 1) { + dout(7) << *this << " bad put by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl; + assert(ref_set.count(by) == 1); + assert(ref > 0); + } + + ref--; + ref_set.erase(by); + + // inode + if (ref == 0) + inode->put(CINODE_PIN_DIR); + + dout(7) << *this << " put by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl; +} + +void CDir::get(int by) { + cdir_pins[by]++; + + // inode + if (ref == 0) + inode->get(CINODE_PIN_DIR); + + // bad? + if (ref_set.count(by)) { + dout(7) << *this << " bad get by " << by << " " << cdir_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl; + assert(ref_set.count(by) == 0); + } + + ref++; + ref_set.insert(by); + + dout(7) << *this << " get by " << by << " " << cdir_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl; +} + + + +/******************************** + * AUTHORITY + */ + +/* + * simple rule: if dir_auth isn't explicit, auth is the same as the inode. + */ +int CDir::authority() +{ + if (get_dir_auth() >= 0) + return get_dir_auth(); + + /* + CDir *parent = inode->get_parent_dir(); + if (parent) + return parent->authority(); + + // root, or dangling + assert(inode->is_root()); // no dirs under danglers!? 
+ //assert(inode->is_root() || inode->is_dangling()); + */ + + return inode->authority(); +} + +int CDir::dentry_authority(const string& dn ) +{ + // hashing -- subset of nodes have hashed the contents + if (is_hashing() && !hashed_subset.empty()) { + int hashauth = mds->hash_dentry( inode->ino(), dn ); // hashed + if (hashed_subset.count(hashauth)) + return hashauth; + } + + // hashed + if (is_hashed()) { + return mds->hash_dentry( inode->ino(), dn ); // hashed + } + + if (get_dir_auth() == CDIR_AUTH_PARENT) { + //dout(15) << "dir_auth = parent at " << *this << endl; + return inode->authority(); // same as my inode + } + + // it's explicit for this whole dir + //dout(15) << "dir_auth explicit " << dir_auth << " at " << *this << endl; + return get_dir_auth(); +} + +void CDir::set_dir_auth(int d) +{ + dout(10) << "setting dir_auth=" << d << " from " << dir_auth << " on " << *this << endl; + dir_auth = d; +} + + +/***************************************** + * AUTH PINS + */ + +void CDir::auth_pin() { + if (auth_pins == 0) + get(CDIR_PIN_AUTHPIN); + auth_pins++; + + dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; + + inode->nested_auth_pins++; + if (inode->parent) + inode->parent->dir->adjust_nested_auth_pins( 1 ); +} + +void CDir::auth_unpin() { + auth_pins--; + if (auth_pins == 0) + put(CDIR_PIN_AUTHPIN); + + dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl; + assert(auth_pins >= 0); + + // pending freeze? 
+ if (auth_pins + nested_auth_pins == 0) + on_freezeable(); + + inode->nested_auth_pins--; + if (inode->parent) + inode->parent->dir->adjust_nested_auth_pins( -1 ); +} + +void CDir::adjust_nested_auth_pins(int inc) +{ + CDir *dir = this; + + while (1) { + // dir + dir->nested_auth_pins += inc; + + dout(10) << "adjust_nested_auth_pins on " << *dir << " count now " << dir->auth_pins << " + " << dir->nested_auth_pins << endl; + assert(dir->nested_auth_pins >= 0); + + // pending freeze? + if (dir->auth_pins + dir->nested_auth_pins == 0) + dir->on_freezeable(); + + // its inode + dir->inode->nested_auth_pins += inc; + + if (dir->inode->parent) + dir = dir->inode->parent->dir; + else + break; + } +} + + + +/***************************************************************************** + * FREEZING + */ + +void CDir::on_freezeable() +{ + // check for anything pending freezeable + + /* NOTE: the first of these will likely freeze the dir, and unmark + FREEZING. additional ones will re-flag FREEZING. this isn't + particularly graceful, and might cause problems if the first one + needs to know about other waiters.... FIXME? 
*/ + + finish_waiting(CDIR_WAIT_FREEZEABLE); +} + +// FREEZE TREE + +class C_MDS_FreezeTree : public Context { + CDir *dir; + Context *con; +public: + C_MDS_FreezeTree(CDir *dir, Context *c) { + this->dir = dir; + this->con = c; + } + virtual void finish(int r) { + dir->freeze_tree_finish(con); + } +}; + +void CDir::freeze_tree(Context *c) +{ + assert(!is_frozen()); + assert(!is_freezing()); + + if (is_freezeable()) { + dout(10) << "freeze_tree " << *this << endl; + + state_set(CDIR_STATE_FROZENTREE); + inode->auth_pin(); // auth_pin for duration of freeze + + // easy, we're frozen + c->finish(0); + delete c; + + } else { + state_set(CDIR_STATE_FREEZINGTREE); + dout(10) << "freeze_tree + wait " << *this << endl; + + // need to wait for auth pins to expire + add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); + } +} + +void CDir::freeze_tree_finish(Context *c) +{ + // freezeable now? + if (!is_freezeable()) { + // wait again! + dout(10) << "freeze_tree_finish still waiting " << *this << endl; + state_set(CDIR_STATE_FREEZINGTREE); + add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeTree(this, c)); + return; + } + + dout(10) << "freeze_tree_finish " << *this << endl; + state_set(CDIR_STATE_FROZENTREE); + state_clear(CDIR_STATE_FREEZINGTREE); // actually, this may get set again by next context? + + inode->auth_pin(); // auth_pin for duration of freeze + + // continue to frozen land + if (c) { + c->finish(0); + delete c; + } +} + +void CDir::unfreeze_tree() +{ + dout(10) << "unfreeze_tree " << *this << endl; + state_clear(CDIR_STATE_FROZENTREE); + + // unpin (may => FREEZEABLE) FIXME: is this order good? + inode->auth_unpin(); + + // waiters? 
+ finish_waiting(CDIR_WAIT_UNFREEZE); +} + +bool CDir::is_freezing_tree() +{ + CDir *dir = this; + while (1) { + if (dir->is_freezing_tree_root()) return true; + if (dir->is_import()) return false; + if (dir->is_hashed()) return false; + if (dir->inode->parent) + dir = dir->inode->parent->dir; + else + return false; // root on replica + } +} + +bool CDir::is_frozen_tree() +{ + CDir *dir = this; + while (1) { + if (dir->is_frozen_tree_root()) return true; + if (dir->is_import()) return false; + if (dir->is_hashed()) return false; + if (dir->inode->parent) + dir = dir->inode->parent->dir; + else + return false; // root on replica + } +} + + + +// FREEZE DIR + +class C_MDS_FreezeDir : public Context { + CDir *dir; + Context *con; +public: + C_MDS_FreezeDir(CDir *dir, Context *c) { + this->dir = dir; + this->con = c; + } + virtual void finish(int r) { + dir->freeze_dir_finish(con); + } +}; + +void CDir::freeze_dir(Context *c) +{ + assert(!is_frozen()); + assert(!is_freezing()); + + if (is_freezeable_dir()) { + dout(10) << "freeze_dir " << *this << endl; + + state_set(CDIR_STATE_FROZENDIR); + inode->auth_pin(); // auth_pin for duration of freeze + + // easy, we're frozen + c->finish(0); + delete c; + + } else { + state_set(CDIR_STATE_FREEZINGDIR); + dout(10) << "freeze_dir + wait " << *this << endl; + + // need to wait for auth pins to expire + add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); + } +} + +void CDir::freeze_dir_finish(Context *c) +{ + // freezeable now? + if (!is_freezeable_dir()) { + // wait again! + dout(10) << "freeze_dir_finish still waiting " << *this << endl; + state_set(CDIR_STATE_FREEZINGDIR); + add_waiter(CDIR_WAIT_FREEZEABLE, new C_MDS_FreezeDir(this, c)); + return; + } + + dout(10) << "freeze_dir_finish " << *this << endl; + state_set(CDIR_STATE_FROZENDIR); + state_clear(CDIR_STATE_FREEZINGDIR); // actually, this may get set again by next context? 
+ + inode->auth_pin(); // auth_pin for duration of freeze + + // continue to frozen land + if (c) { + c->finish(0); + delete c; + } +} + +void CDir::unfreeze_dir() +{ + dout(10) << "unfreeze_dir " << *this << endl; + state_clear(CDIR_STATE_FROZENDIR); + + // unpin (may => FREEZEABLE) FIXME: is this order good? + inode->auth_unpin(); + + // waiters? + finish_waiting(CDIR_WAIT_UNFREEZE); +} + + + + + + + + + +// ----------------------------------------------------------------- +// debug shite + + +void CDir::dump(int depth) { + string ind(depth, '\t'); + + dout(10) << "dump:" << ind << *this << endl; + + map::iterator iter = items.begin(); + while (iter != items.end()) { + CDentry* d = iter->second; + if (d->inode) { + char isdir = ' '; + if (d->inode->dir != NULL) isdir = '/'; + dout(10) << "dump: " << ind << *d << " = " << *d->inode << endl; + d->inode->dump(depth+1); + } else { + dout(10) << "dump: " << ind << *d << " = [null]" << endl; + } + iter++; + } + + if (!(state_test(CDIR_STATE_COMPLETE))) + dout(10) << ind << "..." << endl; + if (state_test(CDIR_STATE_DIRTY)) + dout(10) << ind << "[dirty]" << endl; + +} + diff --git a/branches/sage/cephmds2/mds/CDir.h b/branches/sage/cephmds2/mds/CDir.h new file mode 100644 index 0000000000000..a1e857a72f9f9 --- /dev/null +++ b/branches/sage/cephmds2/mds/CDir.h @@ -0,0 +1,706 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef __CDIR_H +#define __CDIR_H + +#include "include/types.h" +#include "include/buffer.h" +#include "config.h" +#include "common/DecayCounter.h" + +#include +#include + +#include +#include +#include +#include +using namespace std; + +#include +using __gnu_cxx::hash_map; + + +#include "CInode.h" + +class CDentry; +class MDS; +class MDCluster; +class Context; + + +// directory authority types +// >= 0 is the auth mds +#define CDIR_AUTH_PARENT -1 // default + + +#define CDIR_NONCE_EXPORT 1 + + +// state bits +#define CDIR_STATE_AUTH (1<<0) // auth for this dir (hashing doesn't count) +#define CDIR_STATE_PROXY (1<<1) // proxy auth + +#define CDIR_STATE_COMPLETE (1<<2) // the complete contents are in cache +#define CDIR_STATE_DIRTY (1<<3) // has been modified since last commit + +#define CDIR_STATE_FROZENTREE (1<<4) // root of tree (bounded by exports) +#define CDIR_STATE_FREEZINGTREE (1<<5) // in process of freezing +#define CDIR_STATE_FROZENDIR (1<<6) +#define CDIR_STATE_FREEZINGDIR (1<<7) + +#define CDIR_STATE_COMMITTING (1<<8) // mid-commit +#define CDIR_STATE_FETCHING (1<<9) // currently fetching + +#define CDIR_STATE_DELETED (1<<10) + +#define CDIR_STATE_IMPORT (1<<11) // flag set if this is an import. +#define CDIR_STATE_EXPORT (1<<12) +#define CDIR_STATE_IMPORTINGEXPORT (1<<13) + +#define CDIR_STATE_HASHED (1<<14) // if hashed +#define CDIR_STATE_HASHING (1<<15) +#define CDIR_STATE_UNHASHING (1<<16) + + + + + +// these state bits are preserved by an import/export +// ...except if the directory is hashed, in which case none of them are! 
+#define CDIR_MASK_STATE_EXPORTED (CDIR_STATE_COMPLETE\ + |CDIR_STATE_DIRTY) +#define CDIR_MASK_STATE_IMPORT_KEPT (CDIR_STATE_IMPORT\ + |CDIR_STATE_EXPORT\ + |CDIR_STATE_IMPORTINGEXPORT) +#define CDIR_MASK_STATE_EXPORT_KEPT (CDIR_STATE_HASHED\ + |CDIR_STATE_FROZENTREE\ + |CDIR_STATE_FROZENDIR\ + |CDIR_STATE_EXPORT\ + |CDIR_STATE_PROXY) + +// common states +#define CDIR_STATE_CLEAN 0 +#define CDIR_STATE_INITIAL 0 + +// directory replication +#define CDIR_REP_ALL 1 +#define CDIR_REP_NONE 0 +#define CDIR_REP_LIST 2 + + + +// pins + +#define CDIR_PIN_CHILD 0 +#define CDIR_PIN_OPENED 1 // open by another node +#define CDIR_PIN_WAITER 2 // waiter(s) + +#define CDIR_PIN_IMPORT 3 +#define CDIR_PIN_EXPORT 4 +#define CDIR_PIN_FREEZE 5 +#define CDIR_PIN_PROXY 6 // auth just changed. + +#define CDIR_PIN_AUTHPIN 7 + +#define CDIR_PIN_IMPORTING 8 +#define CDIR_PIN_IMPORTINGEXPORT 9 + +#define CDIR_PIN_HASHED 10 +#define CDIR_PIN_HASHING 11 +#define CDIR_PIN_DIRTY 12 + +#define CDIR_PIN_REQUEST 13 + +#define CDIR_NUM_PINS 14 + + + +// wait reasons +#define CDIR_WAIT_DENTRY 1 // wait for item to be in cache + // waiters: path_traverse + // trigger: handle_discover, fetch_dir_2 +#define CDIR_WAIT_COMPLETE 2 // wait for complete dir contents + // waiters: fetch_dir, commit_dir + // trigger: fetch_dir_2 +#define CDIR_WAIT_FREEZEABLE 4 // hard_pins removed + // waiters: freeze, freeze_finish + // trigger: auth_unpin, adjust_nested_auth_pins +#define CDIR_WAIT_UNFREEZE 8 // unfreeze + // waiters: path_traverse, handle_discover, handle_inode_update, + // export_dir_frozen (mdcache) + // handle_client_readdir (mds) + // trigger: unfreeze +#define CDIR_WAIT_AUTHPINNABLE CDIR_WAIT_UNFREEZE + // waiters: commit_dir (mdstore) + // trigger: (see CDIR_WAIT_UNFREEZE) +#define CDIR_WAIT_COMMITTED 32 // did commit (who uses this?**) + // waiters: commit_dir (if already committing) + // trigger: commit_dir_2 +#define CDIR_WAIT_IMPORTED 64 // import finish + // waiters: import_dir_block + // 
triggers: handle_export_dir_finish + +#define CDIR_WAIT_EXPORTWARNING 8192 // on bystander. + // waiters: handle_export_dir_notify + // triggers: handle_export_dir_warning +#define CDIR_WAIT_EXPORTPREPACK 16384 + // waiter export_dir + // trigger handle_export_dir_prep_ack + +#define CDIR_WAIT_HASHED (1<<17) // hash finish +#define CDIR_WAIT_THISHASHEDREADDIR (1<<18) // current readdir lock +#define CDIR_WAIT_NEXTHASHEDREADDIR (1<<19) // after current readdir lock finishes + +#define CDIR_WAIT_DNREAD (1<<20) +#define CDIR_WAIT_DNLOCK (1<<21) +#define CDIR_WAIT_DNUNPINNED (1<<22) +#define CDIR_WAIT_DNPINNABLE (CDIR_WAIT_DNREAD|CDIR_WAIT_DNUNPINNED) + +#define CDIR_WAIT_DNREQXLOCK (1<<23) + +#define CDIR_WAIT_ANY (0xffffffff) + +#define CDIR_WAIT_ATFREEZEROOT (CDIR_WAIT_AUTHPINNABLE|\ + CDIR_WAIT_UNFREEZE) // hmm, same same + + +ostream& operator<<(ostream& out, class CDir& dir); + + +// CDir +typedef map CDir_map_t; + + +extern int cdir_pins[CDIR_NUM_PINS]; + + +class CDir { + public: + CInode *inode; + + protected: + // contents + CDir_map_t items; // non-null AND null + CDir_map_t null_items; // null and foreign + size_t nitems; // non-null + size_t nnull; // null + //size_t nauthitems; + //size_t namesize; + + // state + unsigned state; + version_t version; + version_t committing_version; + version_t last_committed_version; + + // authority, replicas + set open_by; // nodes that have me open + map open_by_nonce; + int replica_nonce; + int dir_auth; + + // reference counting/pins + int ref; // reference count + set ref_set; + + // lock nesting, freeze + int auth_pins; + int nested_auth_pins; + int request_pins; + + // hashed dirs + set hashed_subset; // HASHING: subset of mds's that are hashed + public: + // for class MDS + map, list > > hashed_readdir; + protected: + + // context + MDS *mds; + + + // waiters + multimap waiting; // tag -> context + hash_map< string, multimap > + waiting_on_dentry; + + // cache control (defined for authority; hints for replicas) + 
int dir_rep; + set dir_rep_by; // if dir_rep == CDIR_REP_LIST + + // popularity + meta_load_t popularity[MDS_NPOP]; + + // friends + friend class Migrator; + friend class CInode; + friend class MDCache; + friend class MDiscover; + friend class MDBalancer; + + friend class CDirDiscover; + friend class CDirExport; + + public: + CDir(CInode *in, MDS *mds, bool auth); + + + + // -- accessors -- + inodeno_t ino() { return inode->ino(); } + CInode *get_inode() { return inode; } + CDir *get_parent_dir() { return inode->get_parent_dir(); } + + CDir_map_t::iterator begin() { return items.begin(); } + CDir_map_t::iterator end() { return items.end(); } + size_t get_size() { + + //if ( is_auth() && !is_hashed()) assert(nauthitems == nitems); + //if (!is_auth() && !is_hashed()) assert(nauthitems == 0); + + return nitems; + } + size_t get_nitems() { return nitems; } + size_t get_nnull() { return nnull; } + /* + size_t get_auth_size() { + assert(nauthitems <= nitems); + return nauthitems; + } + */ + + /* + float get_popularity() { + return popularity[0].get(); + } + */ + + + // -- dentries and inodes -- + public: + CDentry* lookup(const string& n) { + map::iterator iter = items.find(n); + if (iter == items.end()) + return 0; + else + return iter->second; + } + + CDentry* add_dentry( const string& dname, CInode *in=0 ); + CDentry* add_dentry( const string& dname, inodeno_t ino ); + void remove_dentry( CDentry *dn ); // delete dentry + void link_inode( CDentry *dn, inodeno_t ino ); + void link_inode( CDentry *dn, CInode *in ); + void unlink_inode( CDentry *dn ); + private: + void link_inode_work( CDentry *dn, CInode *in ); + void unlink_inode_work( CDentry *dn ); + + void remove_null_dentries(); // on empty, clean dir + + // -- authority -- + public: + int authority(); + int dentry_authority(const string& d); + int get_dir_auth() { return dir_auth; } + void set_dir_auth(int d); + + bool is_open_by_anyone() { return !open_by.empty(); } + bool is_open_by(int mds) { return 
open_by.count(mds); } + int get_open_by_nonce(int mds) { + map::iterator it = open_by_nonce.find(mds); + return it->second; + } + set::iterator open_by_begin() { return open_by.begin(); } + set::iterator open_by_end() { return open_by.end(); } + set& get_open_by() { return open_by; } + + int get_replica_nonce() { assert(!is_auth()); return replica_nonce; } + + int open_by_add(int mds) { + int nonce = 1; + + if (is_open_by(mds)) { // already had it? + nonce = get_open_by_nonce(mds) + 1; // new nonce (+1) + dout(10) << *this << " issuing new nonce " << nonce << " to mds" << mds << endl; + open_by_nonce.erase(mds); + } else { + if (open_by.empty()) + get(CDIR_PIN_OPENED); + open_by.insert(mds); + } + open_by_nonce.insert(pair(mds,nonce)); // first! serial of 1. + return nonce; // default nonce + } + void open_by_remove(int mds) { + //if (!is_open_by(mds)) return; + assert(is_open_by(mds)); + + open_by.erase(mds); + open_by_nonce.erase(mds); + if (open_by.empty()) + put(CDIR_PIN_OPENED); + } + void open_by_clear() { + if (!open_by.empty()) + put(CDIR_PIN_OPENED); + open_by.clear(); + open_by_nonce.clear(); + } + + + + // for giving to clients + void get_dist_spec(set& ls, int auth) { + if (( popularity[MDS_POP_CURDOM].pop[META_POP_IRD].get() > g_conf.mds_bal_replicate_threshold)) { + //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl; + ls = open_by; + if (!ls.empty()) ls.insert(auth); + } + } + + + // -- state -- + unsigned get_state() { return state; } + void reset_state(unsigned s) { + state = s; + dout(10) << " cdir:" << *this << " state reset" << endl; + } + void state_clear(unsigned mask) { + state &= ~mask; + dout(10) << " cdir:" << *this << " state -" << mask << " = " << state << endl; + } + void state_set(unsigned mask) { + state |= mask; + dout(10) << " cdir:" << *this << " state +" << mask << " = " << state << endl; + } + unsigned state_test(unsigned mask) { return state & mask; } + + bool is_complete() { return 
state & CDIR_STATE_COMPLETE; } + bool is_dirty() { return state_test(CDIR_STATE_DIRTY); } + + bool is_auth() { return state & CDIR_STATE_AUTH; } + bool is_proxy() { return state & CDIR_STATE_PROXY; } + bool is_import() { return state & CDIR_STATE_IMPORT; } + bool is_export() { return state & CDIR_STATE_EXPORT; } + + bool is_hashed() { return state & CDIR_STATE_HASHED; } + bool is_hashing() { return state & CDIR_STATE_HASHING; } + bool is_unhashing() { return state & CDIR_STATE_UNHASHING; } + + bool is_rep() { + if (dir_rep == CDIR_REP_NONE) return false; + return true; + } + + + + // -- dirtyness -- + version_t get_version() { return version; } + void float_version(version_t ge) { + if (version < ge) + version = ge; + } + void set_version(version_t v) { version = v; } + + version_t get_committing_version() { return committing_version; } + version_t get_last_committed_version() { return last_committed_version; } + // as in, we're committing the current version. + void set_committing_version() { committing_version = version; } + void set_last_committed_version(version_t v) { last_committed_version = v; } + void mark_dirty(); + void mark_clean(); + void mark_complete() { state_set(CDIR_STATE_COMPLETE); } + bool is_clean() { return !state_test(CDIR_STATE_DIRTY); } + + + + + // -- reference counting -- + void put(int by); + void get(int by); + bool is_pinned_by(int by) { + return ref_set.count(by); + } + bool is_pinned() { return ref > 0; } + int get_ref() { return ref; } + set& get_ref_set() { return ref_set; } + void request_pin_get() { + if (request_pins == 0) get(CDIR_PIN_REQUEST); + request_pins++; + } + void request_pin_put() { + request_pins--; + if (request_pins == 0) put(CDIR_PIN_REQUEST); + } + + + // -- waiters -- + bool waiting_for(int tag); + bool waiting_for(int tag, const string& dn); + void add_waiter(int tag, Context *c); + void add_waiter(int tag, + const string& dentry, + Context *c); + void take_waiting(int mask, list& ls); // includes dentry waiters 
+ void take_waiting(int mask, + const string& dentry, + list& ls, + int num=0); + void finish_waiting(int mask, int result = 0); // ditto + void finish_waiting(int mask, const string& dn, int result = 0); // ditto + + + // -- auth pins -- + bool can_auth_pin() { return !(is_frozen() || is_freezing()); } + int is_auth_pinned() { return auth_pins; } + void auth_pin(); + void auth_unpin(); + void adjust_nested_auth_pins(int inc); + void on_freezeable(); + + // -- freezing -- + void freeze_tree(Context *c); + void freeze_tree_finish(Context *c); + void unfreeze_tree(); + + void freeze_dir(Context *c); + void freeze_dir_finish(Context *c); + void unfreeze_dir(); + + bool is_freezing() { return is_freezing_tree() || is_freezing_dir(); } + bool is_freezing_tree(); + bool is_freezing_tree_root() { return state & CDIR_STATE_FREEZINGTREE; } + bool is_freezing_dir() { return state & CDIR_STATE_FREEZINGDIR; } + + bool is_frozen() { return is_frozen_dir() || is_frozen_tree(); } + bool is_frozen_tree(); + bool is_frozen_tree_root() { return state & CDIR_STATE_FROZENTREE; } + bool is_frozen_dir() { return state & CDIR_STATE_FROZENDIR; } + + bool is_freezeable() { + if (auth_pins == 0 && nested_auth_pins == 0) return true; + return false; + } + bool is_freezeable_dir() { + if (auth_pins == 0) return true; + return false; + } + + + + // debuggin bs + void dump(int d = 0); +}; + + + +// -- encoded state -- + +// discover + +class CDirDiscover { + inodeno_t ino; + int nonce; + int dir_auth; + int dir_rep; + set rep_by; + + public: + CDirDiscover() {} + CDirDiscover(CDir *dir, int nonce) { + ino = dir->ino(); + this->nonce = nonce; + dir_auth = dir->dir_auth; + dir_rep = dir->dir_rep; + rep_by = dir->dir_rep_by; + } + + void update_dir(CDir *dir) { + assert(dir->ino() == ino); + assert(!dir->is_auth()); + + dir->replica_nonce = nonce; + dir->dir_auth = dir_auth; + dir->dir_rep = dir_rep; + dir->dir_rep_by = rep_by; + } + + inodeno_t get_ino() { return ino; } + + + void 
_encode(bufferlist& bl) { + bl.append((char*)&ino, sizeof(ino)); + bl.append((char*)&nonce, sizeof(nonce)); + bl.append((char*)&dir_auth, sizeof(dir_auth)); + bl.append((char*)&dir_rep, sizeof(dir_rep)); + ::_encode(rep_by, bl); + } + + void _decode(bufferlist& bl, int& off) { + bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + bl.copy(off, sizeof(nonce), (char*)&nonce); + off += sizeof(nonce); + bl.copy(off, sizeof(dir_auth), (char*)&dir_auth); + off += sizeof(dir_auth); + bl.copy(off, sizeof(dir_rep), (char*)&dir_rep); + off += sizeof(dir_rep); + ::_decode(rep_by, bl, off); + } + +}; + + +// export + +typedef struct { + inodeno_t ino; + __uint64_t nitems; // actual real entries + __uint64_t nden; // num dentries (including null ones) + version_t version; + unsigned state; + meta_load_t popularity_justme; + meta_load_t popularity_curdom; + int dir_auth; + int dir_rep; + int nopen_by; + int nrep_by; + // ints follow +} CDirExport_st; + +class CDirExport { + CDirExport_st st; + set open_by; + map open_by_nonce; + set rep_by; + + public: + CDirExport() {} + CDirExport(CDir *dir) { + memset(&st, 0, sizeof(st)); + + st.ino = dir->ino(); + st.nitems = dir->nitems; + st.nden = dir->items.size(); + st.version = dir->version; + st.state = dir->state; + st.dir_auth = dir->dir_auth; + st.dir_rep = dir->dir_rep; + + st.popularity_justme.take( dir->popularity[MDS_POP_JUSTME] ); + st.popularity_curdom.take( dir->popularity[MDS_POP_CURDOM] ); + dir->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom; + dir->popularity[MDS_POP_NESTED] -= st.popularity_curdom; + + rep_by = dir->dir_rep_by; + open_by = dir->open_by; + open_by_nonce = dir->open_by_nonce; + } + + inodeno_t get_ino() { return st.ino; } + __uint64_t get_nden() { return st.nden; } + + void update_dir(CDir *dir) { + assert(dir->ino() == st.ino); + + //dir->nitems = st.nitems; + dir->version = st.version; + if (dir->state & CDIR_STATE_HASHED) + dir->state |= CDIR_STATE_AUTH; // just inherit auth flag when 
hashed + else + dir->state = (dir->state & CDIR_MASK_STATE_IMPORT_KEPT) | // remember import flag, etc. + (st.state & CDIR_MASK_STATE_EXPORTED); + dir->dir_auth = st.dir_auth; + dir->dir_rep = st.dir_rep; + + dir->popularity[MDS_POP_JUSTME] += st.popularity_justme; + dir->popularity[MDS_POP_CURDOM] += st.popularity_curdom; + dir->popularity[MDS_POP_ANYDOM] += st.popularity_curdom; + dir->popularity[MDS_POP_NESTED] += st.popularity_curdom; + + dir->replica_nonce = 0; // no longer defined + + if (!dir->open_by.empty()) + dout(0) << "open_by not empty non import, " << *dir << ", " << dir->open_by << endl; + + dir->dir_rep_by = rep_by; + dir->open_by = open_by; + dout(12) << "open_by in export is " << open_by << ", dir now " << dir->open_by << endl; + dir->open_by_nonce = open_by_nonce; + if (!open_by.empty()) + dir->get(CDIR_PIN_OPENED); + if (dir->is_dirty()) + dir->get(CDIR_PIN_DIRTY); + } + + + void _encode(bufferlist& bl) { + st.nrep_by = rep_by.size(); + st.nopen_by = open_by_nonce.size(); + bl.append((char*)&st, sizeof(st)); + + // open_by + for (map::iterator it = open_by_nonce.begin(); + it != open_by_nonce.end(); + it++) { + int m = it->first; + bl.append((char*)&m, sizeof(int)); + int n = it->second; + bl.append((char*)&n, sizeof(int)); + } + + // rep_by + for (set::iterator it = rep_by.begin(); + it != rep_by.end(); + it++) { + int m = *it; + bl.append((char*)&m, sizeof(int)); + } + } + + int _decode(bufferlist& bl, int off = 0) { + bl.copy(off, sizeof(st), (char*)&st); + off += sizeof(st); + + // open_by + for (int i=0; i(m,n)); + } + + // rep_by + for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include "CInode.h" +#include "CDir.h" +#include "CDentry.h" + +#include "MDS.h" +#include "MDCache.h" +#include "AnchorTable.h" + +#include "common/Clock.h" + +#include + +#include "config.h" +#undef dout +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mdcache->mds->get_nodeid() << ".cache.inode(" << inode.ino << ") " + + +int cinode_pins[CINODE_NUM_PINS]; // counts + + +ostream& operator<<(ostream& out, CInode& in) +{ + string path; + in.make_path(path); + out << "[inode " << in.inode.ino << " " << path << (in.is_dir() ? "/ ":" "); + if (in.is_auth()) { + out << "auth"; + if (in.is_cached_by_anyone()) { + //out << "+" << in.get_cached_by(); + for (set::iterator it = in.cached_by_begin(); + it != in.cached_by_end(); + it++) { + out << "+" << *it << "." << in.get_cached_by_nonce(*it); + } + } + } else { + out << "rep@" << in.authority(); + //if (in.get_replica_nonce() > 1) + out << "." << in.get_replica_nonce(); + assert(in.get_replica_nonce() >= 0); + } + + if (in.is_symlink()) out << " symlink"; + + out << " v" << in.get_version(); + + out << " hard=" << in.hardlock; + out << " file=" << in.filelock; + + if (in.is_pinned()) { + out << " |"; + for(set::iterator it = in.get_ref_set().begin(); + it != in.get_ref_set().end(); + it++) + if (*it < CINODE_NUM_PINS) + out << " " << cinode_pin_names[*it]; + else + out << " " << *it; + } + + // hack: spit out crap on which clients have caps + if (!in.get_client_caps().empty()) { + out << " caps={"; + for (map::iterator it = in.get_client_caps().begin(); + it != in.get_client_caps().end(); + it++) { + if (it != in.get_client_caps().begin()) out << ","; + out << it->first; + } + out << "}"; + } + out << " " << &in; + out << "]"; + return out; +} + + +// ====== CInode ======= +CInode::CInode(MDCache *c, bool auth) : LRUObject() { + mdcache = c; + + ref = 0; + + parent = NULL; + + dir = NULL; // CDir opened separately + + auth_pins = 0; + nested_auth_pins = 0; + 
num_request_pins = 0; + + state = 0; + + committing_version = committed_version = 0; + + if (auth) state_set(CINODE_STATE_AUTH); +} + +CInode::~CInode() { + if (dir) { delete dir; dir = 0; } +} + +CDir *CInode::get_parent_dir() +{ + if (parent) + return parent->dir; + return NULL; +} +CInode *CInode::get_parent_inode() +{ + if (parent) + return parent->dir->inode; + return NULL; +} + +bool CInode::dir_is_auth() { + if (dir) + return dir->is_auth(); + else + return is_auth(); +} + +CDir *CInode::get_or_open_dir(MDS *mds) +{ + assert(is_dir()); + + if (dir) return dir; + + // can't open a dir if we're frozen_dir, bc of hashing stuff. + assert(!is_frozen_dir()); + + // only auth can open dir alone. + assert(is_auth()); + set_dir( new CDir(this, mds, true) ); + dir->dir_auth = -1; + return dir; +} + +CDir *CInode::set_dir(CDir *newdir) +{ + assert(dir == 0); + dir = newdir; + return dir; +} + +void CInode::set_auth(bool a) +{ + if (!is_dangling() && !is_root() && + is_auth() != a) { + /* + CDir *dir = get_parent_dir(); + if (is_auth() && !a) + dir->nauthitems--; + else + dir->nauthitems++; + */ + } + + if (a) state_set(CINODE_STATE_AUTH); + else state_clear(CINODE_STATE_AUTH); +} + + + +void CInode::make_path(string& s) +{ + if (parent) { + parent->make_path(s); + } + else if (is_root()) { + s = ""; // root + } + else { + s = "(dangling)"; // dangling + } +} + +void CInode::make_anchor_trace(vector& trace) +{ + if (parent) { + parent->dir->inode->make_anchor_trace(trace); + + dout(7) << "make_anchor_trace adding " << ino() << " dirino " << parent->dir->inode->ino() << " dn " << parent->name << endl; + trace.push_back( new Anchor(ino(), + parent->dir->inode->ino(), + parent->name) ); + } + else if (state_test(CINODE_STATE_DANGLING)) { + dout(7) << "make_anchor_trace dangling " << ino() << " on mds " << dangling_auth << endl; + string ref_dn; + trace.push_back( new Anchor(ino(), + MDS_INO_INODEFILE_OFFSET+dangling_auth, + ref_dn) ); + } + else + assert(is_root()); +} + + 
+ + +void CInode::mark_dirty() { + + dout(10) << "mark_dirty " << *this << endl; + + if (!parent) { + dout(10) << " dangling, not marking dirty!" << endl; + return; + } + + /* + NOTE: I may already be dirty, but this fn _still_ needs to be called so that + the directory is (perhaps newly) dirtied, and so that parent_dir_version is + updated below. + */ + + // only auth can get dirty. "dirty" async data in replicas is relative to (say) filelock state, not dirty flag. + assert(is_auth()); + + // touch my private version + inode.version++; + if (!(state & CINODE_STATE_DIRTY)) { + state |= CINODE_STATE_DIRTY; + get(CINODE_PIN_DIRTY); + } + + // relative to parent dir: + if (parent) { + // dir is now dirty (if it wasn't already) + parent->dir->mark_dirty(); + + // i now live in that (potentially newly dirty) version + parent_dir_version = parent->dir->get_version(); + } +} + +void CInode::mark_clean() +{ + dout(10) << " mark_clean " << *this << endl; + if (state & CINODE_STATE_DIRTY) { + state &= ~CINODE_STATE_DIRTY; + put(CINODE_PIN_DIRTY); + } +} + +// state + + + + + +// new state encoders + +void CInode::encode_file_state(bufferlist& bl) +{ + bl.append((char*)&inode.size, sizeof(inode.size)); + bl.append((char*)&inode.mtime, sizeof(inode.mtime)); + bl.append((char*)&inode.atime, sizeof(inode.atime)); // ?? 
+}
+
+// Read size, mtime and atime (in that order) out of `r` starting at byte
+// offset `off`; `off` is advanced past each field as it is consumed.
+void CInode::decode_file_state(bufferlist& r, int& off)
+{
+  r.copy(off, sizeof(inode.size), (char*)&inode.size);
+  off += sizeof(inode.size);
+  r.copy(off, sizeof(inode.mtime), (char*)&inode.mtime);
+  off += sizeof(inode.mtime);
+  r.copy(off, sizeof(inode.atime), (char*)&inode.atime);
+  off += sizeof(inode.atime);
+}
+
+/* not used currently
+void CInode::decode_merge_file_state(crope& r, int& off)
+{
+  __uint64_t size;
+  r.copy(off, sizeof(size), (char*)&size);
+  off += sizeof(size);
+  if (size > inode.size) inode.size = size;
+
+  time_t t;
+  r.copy(off, sizeof(t), (char*)&t);
+  off += sizeof(t);
+  if (t > inode.mtime) inode.mtime = t;
+
+  r.copy(off, sizeof(t), (char*)&t);
+  off += sizeof(t);
+  if (t > inode.atime) inode.atime = t;
+}
+*/
+
+// Append mode, uid, gid and ctime (in that order) to `r` as raw bytes.
+void CInode::encode_hard_state(bufferlist& r)
+{
+  r.append((char*)&inode.mode, sizeof(inode.mode));
+  r.append((char*)&inode.uid, sizeof(inode.uid));
+  r.append((char*)&inode.gid, sizeof(inode.gid));
+  r.append((char*)&inode.ctime, sizeof(inode.ctime));
+}
+
+// Inverse of encode_hard_state(): reads mode, uid, gid, ctime from `r` at
+// `off`, advancing `off` past each field.
+void CInode::decode_hard_state(bufferlist& r, int& off)
+{
+  r.copy(off, sizeof(inode.mode), (char*)&inode.mode);
+  off += sizeof(inode.mode);
+  r.copy(off, sizeof(inode.uid), (char*)&inode.uid);
+  off += sizeof(inode.uid);
+  r.copy(off, sizeof(inode.gid), (char*)&inode.gid);
+  off += sizeof(inode.gid);
+  r.copy(off, sizeof(inode.ctime), (char*)&inode.ctime);
+  off += sizeof(inode.ctime);
+}
+
+
+// old state encoders
+
+/*
+void CInode::encode_basic_state(bufferlist& r)
+{
+  // inode
+  r.append((char*)&inode, sizeof(inode));
+  ::_encode(cached_by, r);
+  ::_encode(cached_by_nonce, r);
+}
+
+void CInode::decode_basic_state(bufferlist& r, int& off)
+{
+  // inode
+  r.copy(0,sizeof(inode_t), (char*)&inode);
+  off += sizeof(inode_t);
+
+  bool empty = cached_by.empty();
+  ::_decode(cached_by, r, off);
+  ::_decode(cached_by_nonce, r, off);
+  if (!empty)
+    get(CINODE_PIN_CACHED);
+}
+*/
+
+
+// waiting
+
+// The three freeze queries below simply forward to the directory holding
+// our primary dentry; an inode without a parent dentry is never frozen.
+bool CInode::is_frozen()
+{
+  if (parent && parent->dir->is_frozen())
+    return true;
+  return false;
+}
+
+bool CInode::is_frozen_dir()
+{
+  if (parent && parent->dir->is_frozen_dir())
+    return true;
+  return false;
+}
+
+bool CInode::is_freezing()
+{
+  if (parent && parent->dir->is_freezing())
+    return true;
+  return false;
+}
+
+// True if at least one waiter is registered under exactly this tag value.
+bool CInode::waiting_for(int tag)
+{
+  return waiting.count(tag) > 0;
+}
+
+// Register context `c` to be completed when an event matching `tag` fires.
+// The first waiter takes the CINODE_PIN_WAITER pin.
+void CInode::add_waiter(int tag, Context *c) {
+  // waiting on hierarchy?
+  // (is_freezing()/is_frozen() are only true when parent is non-null, so
+  // the dereference below is safe.)
+  if ((tag & CDIR_WAIT_ATFREEZEROOT) && (is_freezing() || is_frozen())) {
+    parent->dir->add_waiter(tag, c);
+    return;
+  }
+
+  // this inode.
+  if (waiting.size() == 0)
+    get(CINODE_PIN_WAITER);
+  waiting.insert(pair<int,Context*>(tag,c));
+  dout(10) << "add_waiter " << tag << " " << c << " on " << *this << endl;
+
+}
+
+// Move every waiter whose tag shares a bit with `mask` onto `ls`, and drop
+// the CINODE_PIN_WAITER pin once no waiters remain.
+void CInode::take_waiting(int mask, list<Context*>& ls)
+{
+  if (waiting.empty()) return;
+
+  multimap<int,Context*>::iterator it = waiting.begin();
+  while (it != waiting.end()) {
+    if (it->first & mask) {
+      ls.push_back(it->second);
+      dout(10) << "take_waiting mask " << mask << " took " << it->second << " tag " << it->first << " on " << *this << endl;
+
+      // post-increment so `it` is advanced before its node is erased
+      waiting.erase(it++);
+    } else {
+      dout(10) << "take_waiting mask " << mask << " SKIPPING " << it->second << " tag " << it->first << " on " << *this << endl;
+      it++;
+    }
+  }
+
+  if (waiting.empty())
+    put(CINODE_PIN_WAITER);
+}
+
+// Collect the waiters matching `mask` and hand them to finish_contexts()
+// with `result`.
+void CInode::finish_waiting(int mask, int result)
+{
+  dout(11) << "finish_waiting mask " << mask << " result " << result << " on " << *this << endl;
+
+  list<Context*> finished;
+  take_waiting(mask, finished);
+  finish_contexts(finished, result);
+}
+
+
+// auth_pins
+
+// An inode can be auth-pinned whenever its containing dir can be; an inode
+// with no parent dentry is always pinnable.
+bool CInode::can_auth_pin() {
+  if (parent)
+    return parent->dir->can_auth_pin();
+  return true;
+}
+
+// Take one auth pin; the first one also takes CINODE_PIN_AUTHPIN. The
+// parent dir's nested count is bumped to match.
+void CInode::auth_pin() {
+  if (auth_pins == 0)
+    get(CINODE_PIN_AUTHPIN);
+  auth_pins++;
+
+  dout(7) << "auth_pin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+
+  if (parent)
+    parent->dir->adjust_nested_auth_pins( 1 );
+}
+
+// Release one auth pin; the last one drops CINODE_PIN_AUTHPIN.
+void CInode::auth_unpin() {
+  // catch an unbalanced unpin before the counter goes negative
+  assert(auth_pins > 0);
+
+  auth_pins--;
+  if (auth_pins == 0)
+    put(CINODE_PIN_AUTHPIN);
+
+  dout(7) << "auth_unpin on " << *this << " count now " << auth_pins << " + " << nested_auth_pins << endl;
+
+  if (parent)
+    parent->dir->adjust_nested_auth_pins( -1 );
+}
+
+
+
+// authority
+
+// Which mds is authoritative for this inode: the explicit dangling_auth if
+// dangling, 0 for the root, otherwise whatever the parent dir reports for
+// our dentry name.
+int CInode::authority() {
+  if (is_dangling())
+    return dangling_auth;      // explicit
+  if (is_root())
+    return 0;  // i am root
+  assert(parent);
+  return parent->dir->dentry_authority( parent->name );
+}
+
+
+// Record mds `rep` as a replica holder and build the discover record to
+// send it. Caller owns the returned object. Auth only.
+CInodeDiscover* CInode::replicate_to( int rep )
+{
+  assert(is_auth());
+
+  // relax locks?
+  if (!is_cached_by_anyone())
+    replicate_relax_locks();
+
+  // return the thinger
+  int nonce = cached_by_add( rep );
+  return new CInodeDiscover( this, nonce );
+}
+
+
+// debug crap -----------------------------
+
+void CInode::dump(int dep)
+{
+  string ind(dep, '\t');
+  //cout << ind << "[inode " << this << "]" << endl;
+
+  if (dir)
+    dir->dump(dep);
+}
+
diff --git a/branches/sage/cephmds2/mds/CInode.h b/branches/sage/cephmds2/mds/CInode.h
new file mode 100644
index 0000000000000..3d754ad9c4fbc
--- /dev/null
+++ b/branches/sage/cephmds2/mds/CInode.h
@@ -0,0 +1,757 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+
+#ifndef __CINODE_H
+#define __CINODE_H
+
+#include "config.h"
+#include "include/types.h"
+#include "include/lru.h"
+
+#include "CDentry.h"
+#include "Lock.h"
+#include "Capability.h"
+
+#include "mdstypes.h"
+
+// NOTE(review): the six system header names were lost in extraction; the
+// list below is reconstructed from what this header uses -- confirm.
+#include <cassert>
+#include <list>
+#include <vector>
+#include <set>
+#include <map>
+#include <iostream>
+using namespace std;
+
+
+
+
+
+// pins for keeping an item in cache (and debugging)
+#define CINODE_PIN_DIR        0
+#define CINODE_PIN_CACHED     1
+#define CINODE_PIN_DIRTY      2   // must flush
+#define CINODE_PIN_PROXY      3   // can't expire yet
+#define CINODE_PIN_WAITER     4   // waiter
+
+#define CINODE_PIN_CAPS       5   // local fh's
+
+#define CINODE_PIN_DNDIRTY    7   // dentry is dirty
+
+#define CINODE_PIN_AUTHPIN    8
+#define CINODE_PIN_IMPORTING  9   // multipurpose, for importing
+#define CINODE_PIN_REQUEST    10  // request is logging, finishing
+#define CINODE_PIN_RENAMESRC  11  // pinned on dest for foreign rename
+#define CINODE_PIN_ANCHORING  12
+
+#define CINODE_PIN_OPENINGDIR 13
+
+#define CINODE_PIN_DENTRYLOCK 14
+
+#define CINODE_NUM_PINS       15
+
+// Printable name for each CINODE_PIN_* id, indexed by the id itself.
+// Slot 6 has no pin defined and holds a placeholder.  The elements are
+// string literals, so they must be pointed to through const char*.
+static const char *cinode_pin_names[CINODE_NUM_PINS] = {
+  "dir",
+  "cached",
+  "dirty",
+  "proxy",
+  "waiter",
+  "caps",
+  "--",
+  "dndirty",
+  "authpin",
+  "imping",
+  "request",
+  "rensrc",
+  "anching",
+  "opdir",
+  "dnlock"
+};
+
+
+
+
+
+
+// wait reasons
+#define CINODE_WAIT_AUTHPINNABLE  CDIR_WAIT_UNFREEZE
+    // waiters: write_hard_start, read_file_start, write_file_start  (mdcache)
+    //          handle_client_chmod, handle_client_touch             (mds)
+    // trigger: (see CDIR_WAIT_UNFREEZE)
+#define CINODE_WAIT_GETREPLICA    (1<<11)  // update/replicate individual inode
+    // waiters: import_dentry_inode
+    // trigger: handle_inode_replicate_ack
+
+#define CINODE_WAIT_DIR           (1<<13)
+    // waiters: traverse_path
+    // triggers: handle_discover_reply
+
+#define CINODE_WAIT_LINK          (1<<14)  // as in remotely nlink++
+#define CINODE_WAIT_ANCHORED      (1<<15)
+#define CINODE_WAIT_UNLINK        (1<<16)  // as in remotely nlink--
+
+#define CINODE_WAIT_HARDR         (1<<17)  // 131072
+#define CINODE_WAIT_HARDW         (1<<18)  // 262...
+#define CINODE_WAIT_HARDB         (1<<19)
+#define CINODE_WAIT_HARDRWB       (CINODE_WAIT_HARDR|CINODE_WAIT_HARDW|CINODE_WAIT_HARDB)
+#define CINODE_WAIT_HARDSTABLE    (1<<20)
+#define CINODE_WAIT_HARDNORD      (1<<21)
+#define CINODE_WAIT_FILER         (1<<22)
+#define CINODE_WAIT_FILEW         (1<<23)
+#define CINODE_WAIT_FILEB         (1<<24)
+#define CINODE_WAIT_FILERWB       (CINODE_WAIT_FILER|CINODE_WAIT_FILEW|CINODE_WAIT_FILEB)
+#define CINODE_WAIT_FILESTABLE    (1<<25)
+#define CINODE_WAIT_FILENORD      (1<<26)
+#define CINODE_WAIT_FILENOWR      (1<<27)
+
+#define CINODE_WAIT_RENAMEACK       (1<<28)
+#define CINODE_WAIT_RENAMENOTIFYACK (1<<29)
+
+#define CINODE_WAIT_CAPS            (1<<30)
+
+
+
+
+#define CINODE_WAIT_ANY           0xffffffff
+
+
+// state
+#define CINODE_STATE_AUTH        (1<<0)
+#define CINODE_STATE_ROOT        (1<<1)
+
+#define CINODE_STATE_DIRTY       (1<<2)
+#define CINODE_STATE_UNSAFE      (1<<3)   // not logged yet
+#define CINODE_STATE_DANGLING    (1<<4)   // delete me when i expire; i have no dentry
+#define CINODE_STATE_UNLINKING   (1<<5)
+#define CINODE_STATE_PROXY       (1<<6)   // can't expire yet
+#define CINODE_STATE_EXPORTING   (1<<7)   // on nonauth bystander.
+
+#define CINODE_STATE_ANCHORING   (1<<8)
+
+#define CINODE_STATE_OPENINGDIR  (1<<9)
+
+//#define CINODE_STATE_RENAMING    (1<<8)  // moving me
+//#define CINODE_STATE_RENAMINGTO  (1<<9)  // rename target (will be unlinked)
+
+
+// misc
+#define CINODE_EXPORT_NONCE      1 // nonce given to replicas created by export
+#define CINODE_HASHREPLICA_NONCE 1 // hashed inodes that are duped ???FIXME???
+
+class Context;
+class CDentry;
+class CDir;
+class MDS;
+class Message;
+class CInode;
+class CInodeDiscover;
+class MDCache;
+
+//class MInodeSyncStart;
+
+ostream& operator<<(ostream& out, CInode& in);
+
+
+extern int cinode_pins[CINODE_NUM_PINS];  // counts
+
+
+// cached inode wrapper
+//
+// Holds the inode_t plus the cache bookkeeping kept alongside it: a
+// reference count with one named pin per reason, the replica set
+// (cached_by), waiters, client capabilities and auth pins.
+class CInode : public LRUObject {
+ public:
+  MDCache *mdcache;
+
+  inode_t          inode;     // the inode itself
+
+  CDir            *dir;       // directory, if we have it opened.
+  string           symlink;   // symlink dest, if symlink
+
+  // inode metadata locks
+  CLock        hardlock;
+  CLock        filelock;
+
+ protected:
+  int              ref;       // reference count
+  set<int>         ref_set;   // which CINODE_PIN_* reasons currently hold a ref
+  version_t        parent_dir_version;  // parent dir version when i was last touched.
+  version_t        committing_version;
+  version_t        committed_version;
+
+  unsigned         state;
+
+  // parent dentries in cache
+  CDentry         *parent;             // primary link
+  set<CDentry*>    remote_parents;     // if hard linked
+
+  // -- distributed caching
+  set<int>         cached_by;        // [auth] mds's that cache me.
+  /* NOTE: on replicas, this doubles as replicated_by, but the
+     cached_by_* access methods below should NOT be used in those
+     cases, as the semantics are different! */
+  map<int,int>     cached_by_nonce;  // [auth] nonce issued to each replica
+  int              replica_nonce;    // [replica] defined on replica
+
+  int              dangling_auth;    // explicit auth, when dangling.
+
+  int              num_request_pins;
+
+  // waiters
+  multimap<int, Context*>  waiting;
+
+  // file capabilities
+  map<int, Capability>  client_caps;         // client -> caps
+
+  map<int, int>         mds_caps_wanted;     // [auth] mds -> caps wanted
+  int                   replica_caps_wanted; // [replica] what i've requested from auth
+  utime_t               replica_caps_wanted_keep_until;
+
+
+ private:
+  // lock nesting
+  int auth_pins;
+  int nested_auth_pins;
+
+ public:
+  meta_load_t popularity[MDS_NPOP];
+
+  // friends
+  friend class Server;
+  friend class Locker;
+  friend class Migrator;
+  friend class MDCache;
+  friend class CDir;
+  friend class CInodeExport;
+  friend class CInodeDiscover;
+
+ public:
+  // ---------------------------
+  CInode(MDCache *c, bool auth=true);
+  ~CInode();
+
+
+  // -- accessors --
+  bool is_file()    { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_FILE)    ? true:false; }
+  bool is_symlink() { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_SYMLINK) ? true:false; }
+  bool is_dir()     { return ((inode.mode & INODE_TYPE_MASK) == INODE_MODE_DIR)     ? true:false; }
+
+  bool is_anchored() { return inode.anchored; }
+
+  bool is_root() { return state & CINODE_STATE_ROOT; }
+  bool is_proxy() { return state & CINODE_STATE_PROXY; }
+
+  bool is_auth() { return state & CINODE_STATE_AUTH; }
+  void set_auth(bool auth);
+  bool is_replica() { return !is_auth(); }
+  int get_replica_nonce() { assert(!is_auth()); return replica_nonce; }
+
+  inodeno_t ino() { return inode.ino; }
+  inode_t& get_inode() { return inode; }
+  CDentry* get_parent_dn() { return parent; }
+  CDir *get_parent_dir();
+  CInode *get_parent_inode();
+  CInode *get_realm_root();   // import, hash, or root
+
+  CDir *get_or_open_dir(MDS *mds);
+  CDir *set_dir(CDir *newdir);
+
+  bool dir_is_auth();
+
+
+
+  // -- misc --
+  void make_path(string& s);
+  // NOTE(review): the vector element type was lost in extraction;
+  // `class Anchor*` is assumed here -- confirm against the definition.
+  void make_anchor_trace(vector<class Anchor*>& trace);
+
+
+
+  // -- state --
+  unsigned get_state() { return state; }
+  void state_clear(unsigned mask) { state &= ~mask; }
+  void state_set(unsigned mask) { state |= mask; }
+  unsigned state_test(unsigned mask) { return state & mask; }
+
+  bool is_unsafe() { return state & CINODE_STATE_UNSAFE; }
+  bool is_dangling() { return state & CINODE_STATE_DANGLING; }
+  bool is_unlinking() { return state & CINODE_STATE_UNLINKING; }
+
+  void mark_unsafe() { state |= CINODE_STATE_UNSAFE; }
+  void mark_safe() { state &= ~CINODE_STATE_UNSAFE; }
+
+  // -- state encoding --
+  //void encode_basic_state(bufferlist& r);
+  //void decode_basic_state(bufferlist& r, int& off);
+
+
+  void encode_file_state(bufferlist& r);
+  void decode_file_state(bufferlist& r, int& off);
+
+  void encode_hard_state(bufferlist& r);
+  void decode_hard_state(bufferlist& r, int& off);
+
+
+  // -- dirtyness --
+  version_t get_version() { return inode.version; }
+  version_t get_parent_dir_version() { return parent_dir_version; }
+  // raise parent_dir_version to at least `ge`; never lowers it
+  void float_parent_dir_version(version_t ge) {
+    if (parent_dir_version < ge)
+      parent_dir_version = ge;
+  }
+  version_t get_committing_version() { return committing_version; }
+  version_t get_last_committed_version() { return committed_version; }
+  void set_committing_version(version_t v) { committing_version = v; }
+  // the in-flight commit finished: promote committing -> committed
+  void set_committed_version() {
+    committed_version = committing_version;
+    committing_version = 0;
+  }
+
+  bool is_dirty() { return state & CINODE_STATE_DIRTY; }
+  bool is_clean() { return !is_dirty(); }
+
+  void mark_dirty();
+  void mark_clean();
+
+
+
+  // -- cached_by -- to be used ONLY when we're authoritative or cacheproxy
+  bool is_cached_by_anyone() { return !cached_by.empty(); }
+  bool is_cached_by(int mds) { return cached_by.count(mds); }
+  int num_cached_by() { return cached_by.size(); }
+  // cached_by_add returns a nonce
+  // (1 for a new replica, previous nonce + 1 for a repeat).  The first
+  // replica takes CINODE_PIN_CACHED.
+  int cached_by_add(int mds) {
+    int nonce = 1;
+    if (is_cached_by(mds)) {    // already had it?
+      nonce = get_cached_by_nonce(mds) + 1;   // new nonce (+1)
+      dout(10) << *this << " issuing new nonce " << nonce << " to mds" << mds << endl;
+      cached_by_nonce.erase(mds);
+    } else {
+      if (cached_by.empty())
+        get(CINODE_PIN_CACHED);
+      cached_by.insert(mds);
+    }
+    cached_by_nonce.insert(pair<int,int>(mds,nonce));   // first!  serial of 1.
+    return nonce;   // default nonce
+  }
+  // add a replica with a caller-supplied nonce
+  void cached_by_add(int mds, int nonce) {
+    if (cached_by.empty())
+      get(CINODE_PIN_CACHED);
+    cached_by.insert(mds);
+    cached_by_nonce.insert(pair<int,int>(mds,nonce));
+  }
+  // nonce last issued to `mds`; it must currently have one
+  int get_cached_by_nonce(int mds) {
+    map<int,int>::iterator it = cached_by_nonce.find(mds);
+    assert(it != cached_by_nonce.end());   // never dereference end()
+    return it->second;
+  }
+  void cached_by_remove(int mds) {
+    //if (!is_cached_by(mds)) return;
+    assert(is_cached_by(mds));
+
+    cached_by.erase(mds);
+    cached_by_nonce.erase(mds);
+    if (cached_by.empty())
+      put(CINODE_PIN_CACHED);
+  }
+  void cached_by_clear() {
+    if (cached_by.size())
+      put(CINODE_PIN_CACHED);
+    cached_by.clear();
+    cached_by_nonce.clear();
+  }
+  set<int>::iterator cached_by_begin() { return cached_by.begin(); }
+  set<int>::iterator cached_by_end() { return cached_by.end(); }
+  set<int>& get_cached_by() { return cached_by; }
+
+  CInodeDiscover* replicate_to(int rep);
+
+
+  // -- waiting --
+  bool waiting_for(int tag);
+  void add_waiter(int tag, Context *c);
+  void take_waiting(int tag, list<Context*>& ls);
+  void finish_waiting(int mask, int result = 0);
+
+
+  // -- caps -- (new)
+  // client caps
+  map<int,Capability>& get_client_caps() { return client_caps; }
+  // the first client cap takes CINODE_PIN_CAPS; `client` must not have one yet
+  void add_client_cap(int client, Capability& cap) {
+    if (client_caps.empty())
+      get(CINODE_PIN_CAPS);
+    assert(client_caps.count(client) == 0);
+    client_caps[client] = cap;
+  }
+  void remove_client_cap(int client) {
+    assert(client_caps.count(client) == 1);
+    client_caps.erase(client);
+    if (client_caps.empty())
+      put(CINODE_PIN_CAPS);
+  }
+  // returns 0 if `client` holds no capability here
+  Capability* get_client_cap(int client) {
+    if (client_caps.count(client))
+      return &client_caps[client];
+    return 0;
+  }
+  /*
+  void set_client_caps(map<int,Capability>& cl) {
+    if (client_caps.empty() && !cl.empty())
+      get(CINODE_PIN_CAPS);
+    client_caps.clear();
+    client_caps = cl;
+  }
+  */
+  // hand all client caps over to `cl`, leaving this inode with none
+  void take_client_caps(map<int,Capability>& cl) {
+    if (!client_caps.empty())
+      put(CINODE_PIN_CAPS);
+    cl = client_caps;
+    client_caps.clear();
+  }
+  // fold `cl` into our caps; every client id seen is added to new_client_caps
+  void merge_client_caps(map<int,Capability>& cl, set<int>& new_client_caps) {
+    if (client_caps.empty() && !cl.empty())
+      get(CINODE_PIN_CAPS);
+    for (map<int,Capability>::iterator it = cl.begin();
+         it != cl.end();
+         it++) {
+      new_client_caps.insert(it->first);
+      if (client_caps.count(it->first)) {
+        // merge
+        client_caps[it->first].merge(it->second);
+      } else {
+        // new
+        client_caps[it->first] = it->second;
+      }
+    }
+  }
+
+  // caps issued, wanted
+  // (bitwise OR over all clients)
+  int get_caps_issued() {
+    int c = 0;
+    for (map<int,Capability>::iterator it = client_caps.begin();
+         it != client_caps.end();
+         it++)
+      c |= it->second.issued();
+    return c;
+  }
+  // OR of what every client wants, plus (on the auth) what other mds's want
+  int get_caps_wanted() {
+    int w = 0;
+    for (map<int,Capability>::iterator it = client_caps.begin();
+         it != client_caps.end();
+         it++) {
+      w |= it->second.wanted();
+      //cout << " get_caps_wanted client " << it->first << " " << cap_string(it->second.wanted()) << endl;
+    }
+    if (is_auth())
+      for (map<int,int>::iterator it = mds_caps_wanted.begin();
+           it != mds_caps_wanted.end();
+           it++) {
+        w |= it->second;
+        //cout << " get_caps_wanted mds " << it->first << " " << cap_string(it->second) << endl;
+      }
+    return w;
+  }
+
+
+  // Called on the auth just before the first replica is created: move
+  // unused LOCK_LOCK locks to LOCK_SYNC.  The file lock is left alone
+  // while it is in use or any client has been issued CAP_FILE_WR.
+  void replicate_relax_locks() {
+    assert(is_auth());
+    assert(!is_cached_by_anyone());
+    dout(10) << " relaxing locks on " << *this << endl;
+
+    if (hardlock.get_state() == LOCK_LOCK &&
+        !hardlock.is_used()) {
+      dout(10) << " hard now sync " << *this << endl;
+      hardlock.set_state(LOCK_SYNC);
+    }
+    if (filelock.get_state() == LOCK_LOCK) {
+      if (!filelock.is_used() &&
+          (get_caps_issued() & CAP_FILE_WR) == 0) {
+        filelock.set_state(LOCK_SYNC);
+        dout(10) << " file now sync " << *this << endl;
+      } else {
+        dout(10) << " can't relax filelock on " << *this << endl;
+      }
+    }
+  }
+
+
+  // -- authority --
+  int authority();
+
+
+  // -- auth pins --
+  int is_auth_pinned() {
+    return auth_pins;
+  }
+  int adjust_nested_auth_pins(int a);
+  bool can_auth_pin();
+  void auth_pin();
+  void auth_unpin();
+
+
+  // -- freeze --
+  bool is_frozen();
+  bool is_frozen_dir();
+  bool is_freezing();
+
+
+  // -- reference counting --
+
+  /* these can be pinned any # of times, and are
+     linked to an active_request, so they're automatically cleaned
+     up when a request is finished.  pin at will! */
+  void request_pin_get() {
+    if (num_request_pins == 0) get(CINODE_PIN_REQUEST);
+    num_request_pins++;
+  }
+  void request_pin_put() {
+    num_request_pins--;
+    if (num_request_pins == 0) put(CINODE_PIN_REQUEST);
+    assert(num_request_pins >= 0);
+  }
+
+
+  bool is_pinned() { return ref > 0; }
+  set<int>& get_ref_set() { return ref_set; }
+  // Drop the ref held for pin reason `by`.  Each reason may hold at most
+  // one ref at a time; dropping the last ref unpins us in the LRU.
+  void put(int by) {
+    cinode_pins[by]--;
+    if (ref == 0 || ref_set.count(by) != 1) {
+      dout(7) << " bad put " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+      assert(ref_set.count(by) == 1);
+      assert(ref > 0);
+    }
+    ref--;
+    ref_set.erase(by);
+    if (ref == 0)
+      lru_unpin();
+    dout(7) << " put " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+  }
+  // Take a ref for pin reason `by`; the first ref pins us in the LRU.
+  void get(int by) {
+    cinode_pins[by]++;
+    if (ref == 0)
+      lru_pin();
+    if (ref_set.count(by)) {
+      dout(7) << " bad get " << *this << " by " << by << " " << cinode_pin_names[by] << " was " << ref << " (" << ref_set << ")" << endl;
+      assert(ref_set.count(by) == 0);
+    }
+    ref++;
+    ref_set.insert(by);
+    dout(7) << " get " << *this << " by " << by << " " << cinode_pin_names[by] << " now " << ref << " (" << ref_set << ")" << endl;
+  }
+  bool is_pinned_by(int by) {
+    return ref_set.count(by);
+  }
+
+  // -- hierarchy stuff --
+  void set_primary_parent(CDentry *p) {
+    parent = p;
+  }
+  void remove_primary_parent(CDentry *dn) {
+    assert(dn == parent);
+    parent = 0;
+  }
+  void add_remote_parent(CDentry *p) {
+    remote_parents.insert(p);
+  }
+  void remove_remote_parent(CDentry *p) {
+    remote_parents.erase(p);
+  }
+  int num_remote_parents() {
+    return remote_parents.size();
+  }
+
+
+  /*
+  // for giving to clients
+  void get_dist_spec(set<int>& ls, int auth, timepair_t& now) {
+    if (( is_dir() && popularity[MDS_POP_CURDOM].get(now) > g_conf.mds_bal_replicate_threshold) ||
+        (!is_dir() && popularity[MDS_POP_JUSTME].get(now) > g_conf.mds_bal_replicate_threshold)) {
+      //if (!cached_by.empty() && inode.ino > 1) dout(1) << "distributed spec for " << *this << endl;
+      ls = cached_by;
+    }
+  }
+  */
+
+  // dbg
+  void dump(int d = 0);
+};
+
+
+
+
+// -- encoded state
+
+// discover
+
+// Snapshot sent to an mds that is about to hold a replica: the inode_t,
+// the nonce issued to that replica, and the replica-side lock states.
+class CInodeDiscover {
+
+  inode_t    inode;
+  int        replica_nonce;
+
+  int        hardlock_state;
+  int        filelock_state;
+
+ public:
+  CInodeDiscover() {}
+  CInodeDiscover(CInode *in, int nonce) {
+    inode = in->inode;
+    replica_nonce = nonce;
+
+    hardlock_state = in->hardlock.get_replica_state();
+    filelock_state = in->filelock.get_replica_state();
+  }
+
+  inodeno_t get_ino() { return inode.ino; }
+  int get_replica_nonce() { return replica_nonce; }
+
+  // apply this snapshot to the (replica) inode `in`
+  void update_inode(CInode *in) {
+    in->inode = inode;
+
+    in->replica_nonce = replica_nonce;
+    in->hardlock.set_state(hardlock_state);
+    in->filelock.set_state(filelock_state);
+  }
+
+  // raw, fixed-order encoding: inode, nonce, hardlock state, filelock state
+  void _encode(bufferlist& bl) {
+    bl.append((char*)&inode, sizeof(inode));
+    bl.append((char*)&replica_nonce, sizeof(replica_nonce));
+    bl.append((char*)&hardlock_state, sizeof(hardlock_state));
+    bl.append((char*)&filelock_state, sizeof(filelock_state));
+  }
+
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off,sizeof(inode_t), (char*)&inode);
+    off += sizeof(inode_t);
+    bl.copy(off, sizeof(int), (char*)&replica_nonce);
+    off += sizeof(int);
+    bl.copy(off, sizeof(hardlock_state), (char*)&hardlock_state);
+    off += sizeof(hardlock_state);
+    bl.copy(off, sizeof(filelock_state), (char*)&filelock_state);
+    off += sizeof(filelock_state);
+  }
+
+};
+
+
+// export
+
+// Everything carried along when an inode is exported: inode_t,
+// popularity, dirty flag, replica set with nonces, both locks, and the
+// client capabilities.
+class CInodeExport {
+
+  struct {
+    inode_t        inode;
+    meta_load_t    popularity_justme;
+    meta_load_t    popularity_curdom;
+    bool           is_dirty;       // dirty inode?
+
+    int            num_caps;
+  } st;
+
+  set<int>             cached_by;
+  map<int,int>         cached_by_nonce;
+  map<int,Capability>  cap_map;
+
+  CLock                hardlock,filelock;
+  //int                remaining_issued;
+
+public:
+  CInodeExport() {}
+  // Capture the state of `in`.  This also takes the popularity counters
+  // and the client caps away from `in`.
+  CInodeExport(CInode *in) {
+    st.inode = in->inode;
+    st.is_dirty = in->is_dirty();
+    cached_by = in->cached_by;
+    cached_by_nonce = in->cached_by_nonce;
+
+    hardlock = in->hardlock;
+    filelock = in->filelock;
+
+    st.popularity_justme.take( in->popularity[MDS_POP_JUSTME] );
+    st.popularity_curdom.take( in->popularity[MDS_POP_CURDOM] );
+    in->popularity[MDS_POP_ANYDOM] -= st.popularity_curdom;
+    in->popularity[MDS_POP_NESTED] -= st.popularity_curdom;
+
+    // steal WRITER caps from inode
+    in->take_client_caps(cap_map);
+    //remaining_issued = in->get_caps_issued();
+  }
+  ~CInodeExport() {
+  }
+
+  inodeno_t get_ino() { return st.inode.ino; }
+
+  // Install the exported state into `in`; client ids whose caps were
+  // merged are reported through new_client_caps.
+  void update_inode(CInode *in, set<int>& new_client_caps) {
+    in->inode = st.inode;
+
+    in->popularity[MDS_POP_JUSTME] += st.popularity_justme;
+    in->popularity[MDS_POP_CURDOM] += st.popularity_curdom;
+    in->popularity[MDS_POP_ANYDOM] += st.popularity_curdom;
+    in->popularity[MDS_POP_NESTED] += st.popularity_curdom;
+
+    if (st.is_dirty) {
+      in->mark_dirty();
+    }
+
+    in->cached_by.clear();
+    in->cached_by = cached_by;
+    in->cached_by_nonce = cached_by_nonce;
+    if (!cached_by.empty())
+      in->get(CINODE_PIN_CACHED);
+
+    in->hardlock = hardlock;
+    in->filelock = filelock;
+
+    // caps
+    in->merge_client_caps(cap_map, new_client_caps);
+  }
+
+  void _encode(bufferlist& bl) {
+    st.num_caps = cap_map.size();
+    bl.append((char*)&st, sizeof(st));
+
+    // cached_by + nonce
+    ::_encode(cached_by, bl);
+    ::_encode(cached_by_nonce, bl);
+
+    hardlock.encode_state(bl);
+    filelock.encode_state(bl);
+
+    // caps
+    for (map<int,Capability>::iterator it = cap_map.begin();
+         it != cap_map.end();
+         it++) {
+      bl.append((char*)&it->first, sizeof(it->first));
+      it->second._encode(bl);
+    }
+  }
+
+  int _decode(bufferlist& bl, int off = 0) {
+    bl.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+
+
::_decode(cached_by, bl, off);
+    ::_decode(cached_by_nonce, bl, off);
+
+    hardlock.decode_state(bl, off);
+    filelock.decode_state(bl, off);
+
+    // caps
+    for (int i=0; i
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef __CAPABILITY_H
+#define __CAPABILITY_H
+
+#include "include/buffer.h"
+
+// NOTE(review): header name lost in extraction; <map> restored because
+// this file declares a std::map member -- confirm.
+#include <map>
+using namespace std;
+
+#include "config.h"
+
+
+// definite caps
+#define CAP_FILE_RDCACHE   1    // client can safely cache reads
+#define CAP_FILE_RD        2    // client can read
+#define CAP_FILE_WR        4    // client can write
+#define CAP_FILE_WREXTEND  8    // client can extend file
+#define CAP_FILE_WRBUFFER  16   // client can safely buffer writes
+#define CAP_FILE_LAZYIO    32   // client can perform lazy io
+
+
+// heuristics
+//#define CAP_FILE_DELAYFLUSH 32
+
+// Render a CAP_FILE_* bitmask as "[ name name ... ]" for debug output.
+inline string cap_string(int cap)
+{
+  string s;
+  s = "[";
+  if (cap & CAP_FILE_RDCACHE) s += " rdcache";
+  if (cap & CAP_FILE_RD) s += " rd";
+  if (cap & CAP_FILE_WR) s += " wr";
+  if (cap & CAP_FILE_WRBUFFER) s += " wrbuffer";
+  // was testing CAP_FILE_WRBUFFER a second time, so "wrextend" mirrored
+  // the wrbuffer bit and the real WREXTEND bit was never shown
+  if (cap & CAP_FILE_WREXTEND) s += " wrextend";
+  if (cap & CAP_FILE_LAZYIO) s += " lazyio";
+  s += " ]";
+  return s;
+}
+
+
+// Capability state for one client on one inode: what the client wants,
+// plus a history of cap masks keyed by the sequence number they were
+// issued under.  Entries between last_recv and last_sent are masks the
+// client may still be acting on.
+class Capability {
+  int wanted_caps;                // what the client wants (ideally)
+
+  map<long, int> cap_history;     // seq -> cap
+  long last_sent, last_recv;
+
+  bool suppress;
+
+public:
+  Capability(int want=0) :
+    wanted_caps(want),
+    last_sent(0),
+    last_recv(0),
+    suppress(false) {
+    //cap_history[last_sent] = 0;
+  }
+
+
+  bool is_suppress() { return suppress; }
+  void set_suppress(bool b) { suppress = b; }
+
+  bool is_null() { return cap_history.empty(); }
+
+  // most recently issued caps.
+  int pending() {
+    if (cap_history.count(last_sent))
+      return cap_history[ last_sent ];
+    return 0;
+  }
+
+  // caps client has confirmed receipt of
+  int confirmed() {
+    if (cap_history.count(last_recv))
+      return cap_history[ last_recv ];
+    return 0;
+  }
+
+  // caps potentially issued
+  // (OR of every history entry from last_recv through last_sent)
+  int issued() {
+    int c = 0;
+    for (long seq = last_recv; seq <= last_sent; seq++) {
+      if (cap_history.count(seq)) {
+        c |= cap_history[seq];
+        dout(10) << " cap issued: seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(c) << endl;
+      }
+    }
+    return c;
+  }
+
+  // caps this client wants to hold
+  int wanted() { return wanted_caps; }
+  void set_wanted(int w) {
+    wanted_caps = w;
+  }
+
+  // needed
+  static int needed(int from) {
+    // strip out wrbuffer, rdcache
+    return from & (CAP_FILE_WR|CAP_FILE_RD);
+  }
+  int needed() { return needed(wanted_caps); }
+
+  // conflicts
+  // (the set of caps that conflict with holding any cap in `from`)
+  static int conflicts(int from) {
+    int c = 0;
+    if (from & CAP_FILE_WRBUFFER) c |= CAP_FILE_RDCACHE|CAP_FILE_RD;
+    if (from & CAP_FILE_WR) c |= CAP_FILE_RDCACHE;
+    if (from & CAP_FILE_RD) c |= CAP_FILE_WRBUFFER;
+    if (from & CAP_FILE_RDCACHE) c |= CAP_FILE_WRBUFFER|CAP_FILE_WR;
+    return c;
+  }
+  int wanted_conflicts() { return conflicts(wanted()); }
+  int needed_conflicts() { return conflicts(needed()); }
+  int issued_conflicts() { return conflicts(issued()); }
+
+  // issue caps; return seq number.
+  long issue(int c) {
+    //int was = pending();
+    //no! if (c == was && last_sent) return -1; // repeat of previous?
+
+    ++last_sent;
+    cap_history[last_sent] = c;
+
+    /* no!
+    // not recalling, just adding?
+    if (c & ~was &&
+        cap_history.count(last_sent-1)) {
+      cap_history.erase(last_sent-1);
+    }
+    */
+    return last_sent;
+  }
+  long get_last_seq() { return last_sent; }
+
+  // Fold another Capability's state into this one.
+  void merge(Capability& other) {
+    // issued + pending
+    int newpending = other.pending() | pending();
+    // keep anything `other` may still have outstanding visible in our
+    // history before recording the combined pending mask
+    if (other.issued() & ~newpending)
+      issue(other.issued() | newpending);
+    issue(newpending);
+
+    // wanted
+    wanted_caps = wanted_caps | other.wanted();
+  }
+
+  // confirm receipt of a previous sent/issued seq.
+  // Returns the OR of the caps found in the discarded older entries plus
+  // any bits dropped from entry `seq` itself.
+  int confirm_receipt(long seq, int caps) {
+    int r = 0;
+
+    // old seqs
+    while (last_recv < seq) {
+      dout(10) << " cap.confirm_receipt forgetting seq " << last_recv << " " << cap_string(cap_history[last_recv]) << endl;
+      r |= cap_history[last_recv];
+      cap_history.erase(last_recv);
+      ++last_recv;
+    }
+
+    // release current?
+    if (cap_history.count(seq) &&
+        cap_history[seq] != caps) {
+      dout(10) << " cap.confirm_receipt revising seq " << seq << " " << cap_string(cap_history[seq]) << " -> " << cap_string(caps) << endl;
+      // note what we're releasing..
+      assert(cap_history[seq] & ~caps);
+      r |= cap_history[seq] & ~caps;
+
+      cap_history[seq] = caps; // confirmed() now less than before..
+    }
+
+    // null?
+    if (caps == 0 &&
+        cap_history.size() == 1 &&
+        cap_history.count(seq)) {
+      cap_history.clear();  // voila, null!
+    }
+
+    return r;
+  }
+
+  // serializers
+  // (the `suppress` flag is not part of the encoding)
+  void _encode(bufferlist& bl) {
+    bl.append((char*)&wanted_caps, sizeof(wanted_caps));
+    bl.append((char*)&last_sent, sizeof(last_sent));
+    bl.append((char*)&last_recv, sizeof(last_recv));
+    ::_encode(cap_history, bl);
+  }
+  void _decode(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(wanted_caps), (char*)&wanted_caps);
+    off += sizeof(wanted_caps);
+    bl.copy(off, sizeof(last_sent), (char*)&last_sent);
+    off += sizeof(last_sent);
+    bl.copy(off, sizeof(last_recv), (char*)&last_recv);
+    off += sizeof(last_recv);
+    ::_decode(cap_history, bl, off);
+  }
+
+};
+
+
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/ClientMap.h b/branches/sage/cephmds2/mds/ClientMap.h
new file mode 100644
index 0000000000000..63f310358cae8
--- /dev/null
+++ b/branches/sage/cephmds2/mds/ClientMap.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __CLIENTMAP_H
+#define __CLIENTMAP_H
+
+#include "msg/Message.h"
+
+// NOTE(review): both header names were lost in extraction; restored from
+// the `set` / `hash_map` members and the __gnu_cxx using-directive below.
+#include <set>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+// Tracks which clients are known to this node and where they live.
+// An entry is reference counted: one ref per mount and one per open.
+// The entity_inst_t is dropped when the last ref goes away.
+class ClientMap {
+  hash_map<int,entity_inst_t> client_inst;   // client id -> instance
+  set<int>                    client_mount;  // clients that are mounted
+  hash_map<int,int>           client_ref;    // client id -> ref count
+
+  // take a ref on `client`; a known client must present the same inst
+  void inc_ref(int client, const entity_inst_t& inst) {
+    if (client_inst.count(client)) {
+      assert(client_inst[client] == inst);
+      assert(client_ref.count(client));
+    } else {
+      client_inst[client] = inst;
+    }
+    client_ref[client]++;
+  }
+  // drop a ref on `client`; forget the client entirely at zero
+  void dec_ref(int client) {
+    assert(client_ref.count(client));
+    assert(client_ref[client] > 0);
+    client_ref[client]--;
+    if (client_ref[client] == 0) {
+      client_ref.erase(client);
+      client_inst.erase(client);
+    }
+  }
+
+public:
+  // instance of a known client; asserts that the client is known
+  const entity_inst_t& get_inst(int client) {
+    assert(client_inst.count(client));
+    return client_inst[client];
+  }
+  const set<int>& get_mount_set() { return client_mount; }
+
+  void add_mount(int client, const entity_inst_t& inst) {
+    inc_ref(client, inst);
+    client_mount.insert(client);
+  }
+  void rem_mount(int client) {
+    dec_ref(client);
+    client_mount.erase(client);
+  }
+
+
+  void add_open(int client, const entity_inst_t& inst) {
+    inc_ref(client, inst);
+  }
+  void dec_open(int client) {
+    dec_ref(client);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/IdAllocator.cc b/branches/sage/cephmds2/mds/IdAllocator.cc
new file mode 100644
index 0000000000000..fba33d599de40
--- /dev/null
+++ b/branches/sage/cephmds2/mds/IdAllocator.cc
@@ -0,0 +1,188 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ * + */ + + + +#define DBLEVEL 20 + +#include "IdAllocator.h" +#include "MDS.h" +#include "MDLog.h" +#include "events/EAlloc.h" + +#include "osdc/Filer.h" + +#include "include/types.h" + +#include "config.h" +#undef dout +#define dout(x) if (x <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".idalloc: " + + +idno_t IdAllocator::alloc_id(bool replay) +{ + assert(is_active()); + + // pick one + idno_t id = free.start(); + free.erase(id); + dout(10) << "idalloc " << this << ": alloc id " << id << endl; + + version++; + + // log it + if (!replay) + mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_ALLOC, version)); + + return id; +} + +void IdAllocator::reclaim_id(idno_t id, bool replay) +{ + assert(is_active()); + + dout(10) << "idalloc " << this << ": reclaim id " << id << endl; + free.insert(id); + + version++; + + if (!replay) + mds->mdlog->submit_entry(new EAlloc(IDTYPE_INO, id, EALLOC_EV_FREE, version)); +} + + + +class C_ID_Save : public Context { + IdAllocator *ida; + version_t version; +public: + C_ID_Save(IdAllocator *i, version_t v) : ida(i), version(v) {} + void finish(int r) { + ida->save_2(version); + } +}; + +void IdAllocator::save(Context *onfinish, version_t v) +{ + if (v > 0 && v <= committing_version) { + dout(10) << "save v " << version << " - already saving " + << committing_version << " >= needed " << v << endl; + waitfor_save[v].push_back(onfinish); + return; + } + + dout(10) << "save v " << version << endl; + assert(is_active()); + + bufferlist bl; + + bl.append((char*)&version, sizeof(version)); + ::_encode(free.m, bl); + + committing_version = version; + + if (onfinish) + waitfor_save[version].push_back(onfinish); + + // write (async) + mds->filer->write(inode, + 0, bl.length(), bl, + 0, + 0, new C_ID_Save(this, version)); +} + +void IdAllocator::save_2(version_t v) +{ + dout(10) << "save_2 v " << v << endl; + + committed_version = v; + + list ls; + while (!waitfor_save.empty()) { + if 
(waitfor_save.begin()->first > v) break; + ls.splice(ls.end(), waitfor_save.begin()->second); + waitfor_save.erase(waitfor_save.begin()); + } + finish_contexts(ls,0); +} + + +void IdAllocator::reset() +{ + free.clear(); + + // use generic range FIXME THIS IS CRAP + free.insert((long long)0x1000000 * (long long)(mds->get_nodeid()+1), + (long long)0x1000000 * (long long)(mds->get_nodeid()+2) - 1LL); + //free[ID_INO].dump(); + + //free[ID_FH].map_insert(10000000LL * (mds->get_nodeid()+1), + //10000000LL * (mds->get_nodeid()+2) - 1); + //free[ID_FH].dump(); + + state = STATE_ACTIVE; +} + + + +// ----------------------- + +class C_ID_Load : public Context { +public: + IdAllocator *ida; + Context *onfinish; + bufferlist bl; + C_ID_Load(IdAllocator *i, Context *o) : ida(i), onfinish(o) {} + void finish(int r) { + ida->load_2(r, bl, onfinish); + } +}; + +void IdAllocator::load(Context *onfinish) +{ + dout(10) << "load" << endl; + + assert(is_undef()); + state = STATE_OPENING; + + C_ID_Load *c = new C_ID_Load(this, onfinish); + mds->filer->read(inode, + 0, inode.layout.stripe_size, + &c->bl, + c); +} + +void IdAllocator::load_2(int r, bufferlist& bl, Context *onfinish) +{ + assert(is_opening()); + state = STATE_ACTIVE; + + if (r > 0) { + dout(10) << "load_2 got " << bl.length() << " bytes" << endl; + int off = 0; + bl.copy(off, sizeof(version), (char*)&version); + off += sizeof(version); + ::_decode(free.m, bl, off); + committed_version = version; + } + else { + dout(10) << "load_2 found no alloc file" << endl; + assert(0); // this shouldn't happen if mkfs finished. 
+ reset(); + } + + if (onfinish) { + onfinish->finish(0); + delete onfinish; + } +} diff --git a/branches/sage/cephmds2/mds/IdAllocator.h b/branches/sage/cephmds2/mds/IdAllocator.h new file mode 100644 index 0000000000000..745d863be99d3 --- /dev/null +++ b/branches/sage/cephmds2/mds/IdAllocator.h @@ -0,0 +1,78 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __IDALLOCATOR_H +#define __IDALLOCATOR_H + +#include "include/types.h" +#include "include/interval_set.h" +#include "include/buffer.h" +#include "include/Context.h" + +class MDS; + +#define IDTYPE_INO 1 +typedef inodeno_t idno_t; + +class IdAllocator { + MDS *mds; + inode_t inode; + + static const int STATE_UNDEF = 0; + static const int STATE_OPENING = 1; + static const int STATE_ACTIVE = 2; + //static const int STATE_COMMITTING = 3; + int state; + + version_t version, committing_version, committed_version; + + interval_set free; // unused ids + + map > waitfor_save; + + public: + IdAllocator(MDS *m, inode_t i) : + mds(m), + inode(i), + state(STATE_UNDEF), + version(0), committing_version(0), committed_version(0) + { + } + + // alloc or reclaim ids + idno_t alloc_id(bool replay=false); + void reclaim_id(idno_t id, bool replay=false); + + version_t get_version() { return version; } + version_t get_committed_version() { return committed_version; } + + // load/save from disk (hack) + bool is_undef() { return state == STATE_UNDEF; } + bool is_active() { return state == STATE_ACTIVE; } + bool is_opening() { return state == STATE_OPENING; } + + void reset(); + void save(Context *onfinish=0, version_t need=0); + void save_2(version_t v); + + void shutdown() { + if 
(is_active()) save(0); + } + + void load(Context *onfinish); + void load_2(int, bufferlist&, Context *onfinish); + +}; + +#endif diff --git a/branches/sage/cephmds2/mds/Lock.h b/branches/sage/cephmds2/mds/Lock.h new file mode 100644 index 0000000000000..faf648ed3b07f --- /dev/null +++ b/branches/sage/cephmds2/mds/Lock.h @@ -0,0 +1,311 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __LOCK_H +#define __LOCK_H + +#include +#include +using namespace std; + +#include "include/buffer.h" + +#include "Capability.h" + +// states and such. +// C = cache reads, R = read, W = write, A = append, B = buffer writes, L = lazyio + +// basic lock -----auth-------- ---replica------- +#define LOCK_SYNC 0 // AR R . / C R . . . L R . / C R . . . L stat() +#define LOCK_LOCK 1 // AR R W / C . . . . . . . / C . . . . . truncate() +#define LOCK_GLOCKR 2 // AR R . / C . . . . . . . / C . . . . . + +// file lock states +#define LOCK_GLOCKL 3 // A . . / . . . . . . loner -> lock +#define LOCK_GLOCKM 4 // A . . / . . . . . . +#define LOCK_MIXED 5 // AR . . / . R W A . L . . / . R . . . L +#define LOCK_GMIXEDR 6 // AR R . / . R . . . L . . / . R . . . L +#define LOCK_GMIXEDL 7 // A . . / . . . . . L loner -> mixed + +#define LOCK_LONER 8 // A . . / C R W A B L (lock) +#define LOCK_GLONERR 9 // A . . / . R . . . L +#define LOCK_GLONERM 10 // A . . / . R W A . L + +#define LOCK_GSYNCL 11 // A . . / C ? . . . L loner -> sync (*) FIXME: let old loner keep R, somehow... +#define LOCK_GSYNCM 12 // A . . / . R . . . 
L
+
+// 4 stable
+// +9 transition
+// 13 total
+
+/* no append scenarios:
+
+loner + truncate():
+  - loner needs to lose A (?unless it's the loner doing the truncate?)
+loner + statlite(size):
+  - loner needs to lose A
+
+any + statlite(size)
+  - all lose A
+
+any + statlite(mtime)
+  - all lose W
+
+-> we need to add lonerfixed and mixedfixed states (and associated transitions)
+   in order to efficiently support statlite(size) and truncate(). until then,
+   we have to LOCK.
+
+ */
+
+// -- lock... hard or file
+
+// State for one inode lock (used for both hardlock and filelock): the
+// current LOCK_* state, the set of nodes still being gathered from, and
+// read/write reference counts.
+class CLock {
+ protected:
+  // lock state
+  char state;
+  set<int> gather_set;  // auth
+  int nread, nwrite;
+
+
+ public:
+  CLock() :
+    state(LOCK_LOCK),
+    nread(0),
+    nwrite(0) {
+  }
+
+  // encode/decode
+  // Wire order is: state, nread, nwrite, gather_set. decode_state() must
+  // read the fields in the same order and advances `off` as it goes.
+  void encode_state(bufferlist& bl) {
+    bl.append((char*)&state, sizeof(state));
+    bl.append((char*)&nread, sizeof(nread));
+    bl.append((char*)&nwrite, sizeof(nwrite));
+
+    _encode(gather_set, bl);
+  }
+  void decode_state(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(state), (char*)&state);
+    off += sizeof(state);
+    bl.copy(off, sizeof(nread), (char*)&nread);
+    off += sizeof(nread);
+    bl.copy(off, sizeof(nwrite), (char*)&nwrite);
+    off += sizeof(nwrite);
+
+    _decode(gather_set, bl, off);
+  }
+
+  char get_state() { return state; }
+  // Set the state and return it; asserts that a stable state is never
+  // entered while a gather is still outstanding.
+  char set_state(char s) {
+    state = s;
+    assert(!is_stable() || gather_set.size() == 0);  // gather should be empty in stable states.
+    return s;
+  }
+
+  // Map the current state to the state returned for replicas.
+  // Asserts on a state value outside the LOCK_* set.
+  char get_replica_state() {
+    switch (state) {
+    case LOCK_LOCK:
+    case LOCK_GLOCKM:
+    case LOCK_GLOCKL:
+    case LOCK_GLOCKR:
+    case LOCK_LONER:
+    case LOCK_GLONERR:
+    case LOCK_GLONERM:
+      return LOCK_LOCK;
+    case LOCK_MIXED:
+    case LOCK_GMIXEDR:
+      return LOCK_MIXED;
+    case LOCK_SYNC:
+      return LOCK_SYNC;
+
+      // after gather auth will bc LOCK_AC_MIXED or whatever
+    case LOCK_GSYNCM:
+      return LOCK_MIXED;
+    case LOCK_GSYNCL:
+    case LOCK_GMIXEDL:     // ** LOCK isn't exact right state, but works.
+      return LOCK_LOCK;
+
+    default:
+      assert(0);
+    }
+    return 0;
+  }
+
+  // gather set
+  set<int>& get_gather_set() { return gather_set; }
+  void init_gather(set<int>& i) {
+    gather_set = i;
+  }
+  bool is_gathering(int i) {
+    return gather_set.count(i);
+  }
+  void clear_gather() {
+    gather_set.clear();
+  }
+
+  // ref counting
+  // get_* increments and returns the new count; put_* asserts the count is
+  // positive, then decrements and returns the new count.
+  int get_read() { return ++nread; }
+  int put_read() {
+    assert(nread>0);
+    return --nread;
+  }
+  int get_nread() { return nread; }
+
+  int get_write() { return ++nwrite; }
+  int put_write() {
+    assert(nwrite>0);
+    return --nwrite;
+  }
+  int get_nwrite() { return nwrite; }
+  bool is_used() {
+    return (nwrite+nread)>0 ? true:false;
+  }
+
+
+  // stable
+  bool is_stable() {
+    return (state == LOCK_SYNC) ||
+      (state == LOCK_LOCK) ||
+      (state == LOCK_MIXED) ||
+      (state == LOCK_LONER);
+  }
+
+  // read/write access
+  // `auth` selects the authoritative-node rules; replicas may only read in
+  // LOCK_SYNC and may never write.
+  bool can_read(bool auth) {
+    if (auth)
+      return (state == LOCK_SYNC) || (state == LOCK_GMIXEDR)
+        || (state == LOCK_GLOCKR) || (state == LOCK_LOCK);
+    else
+      return (state == LOCK_SYNC);
+  }
+  bool can_read_soon(bool auth) {
+    if (auth)
+      return (state == LOCK_GLOCKL);
+    else
+      return false;
+  }
+
+  bool can_write(bool auth) {
+    if (auth)
+      return (state == LOCK_LOCK);
+    else
+      return false;
+  }
+  bool can_write_soon(bool auth) {
+    if (auth)
+      return (state == LOCK_GLOCKR) || (state == LOCK_GLOCKL)
+        || (state == LOCK_GLOCKM);
+    else
+      return false;
+  }
+
+  // client caps allowed
+  // caps_allowed_ever(): every cap bit this node could hand out in any state.
+  int caps_allowed_ever(bool auth) {
+    if (auth)
+      return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO;
+    else
+      return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
+  }
+  // caps_allowed(): cap bits permitted in the current state. A state with
+  // no case in the selected switch falls through to assert(0).
+  int caps_allowed(bool auth) {
+    if (auth)
+      switch (state) {
+      case LOCK_SYNC:
+        return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
+      case LOCK_LOCK:
+      case LOCK_GLOCKR:
+        return CAP_FILE_RDCACHE;
+
+      case LOCK_GLOCKL:
+      case LOCK_GLOCKM:
+        return 0;
+
+      case LOCK_MIXED:
+        return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO;
+      case LOCK_GMIXEDR:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+      case LOCK_GMIXEDL:
+        return 0;
+
+      case LOCK_LONER:  // single client writer, of course.
+        return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_WRBUFFER | CAP_FILE_LAZYIO;
+      case LOCK_GLONERR:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+      case LOCK_GLONERM:
+        return CAP_FILE_RD | CAP_FILE_WR | CAP_FILE_WREXTEND | CAP_FILE_LAZYIO;
+
+      case LOCK_GSYNCL:
+        return CAP_FILE_RDCACHE | CAP_FILE_LAZYIO;
+      case LOCK_GSYNCM:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+      }
+    else
+      switch (state) {
+      case LOCK_SYNC:
+        return CAP_FILE_RDCACHE | CAP_FILE_RD | CAP_FILE_LAZYIO;
+      case LOCK_LOCK:
+      case LOCK_GLOCKR:
+        return CAP_FILE_RDCACHE;
+      case LOCK_GMIXEDR:
+      case LOCK_MIXED:
+        return CAP_FILE_RD | CAP_FILE_LAZYIO;
+      }
+    assert(0);
+    return 0;
+  }
+
+  friend class MDCache;
+  friend class Locker;
+  friend class Migrator;
+};
+
+//ostream& operator<<(ostream& out, CLock& l);
+// Print a lock as "(<state> g=<gather set> <n>r <n>w)"; the gather set and
+// the counts are omitted when empty/zero.
+inline ostream& operator<<(ostream& out, CLock& l)
+{
+  // Indexed by LOCK_* value (0..12). const: the entries are string literals.
+  static const char* lock_state_names[] = {
+    "sync",
+    "lock",
+    "glockr",
+    "glockl",
+    "glockm",
+    "mixed",
+    "gmixedr",
+    "gmixedl",
+    "loner",
+    "glonerr",
+    "glonerm",
+    "gsyncl",
+    "gsyncm"
+  };
+
+  out << "(" << lock_state_names[(int)l.get_state()];
+
+  if (!l.get_gather_set().empty()) out << " g=" << l.get_gather_set();
+
+  if (l.get_nread())
+    out << " " << l.get_nread() << "r";
+  if (l.get_nwrite())
+    out << " " << l.get_nwrite() << "w";
+
+  // rw?
+  /*
+  out << " ";
+  if (l.can_read(true)) out << "r[" << l.get_nread() << "]";
+  if (l.can_write(true)) out << "w[" << l.get_nwrite() << "]";
+  out << "/";
+  if (l.can_read(false)) out << "r[" << l.get_nread() << "]";
+  if (l.can_write(false)) out << "w[" << l.get_nwrite() << "]";
+  */
+  out << ")";
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/mds/Locker.cc b/branches/sage/cephmds2/mds/Locker.cc
new file mode 100644
index 0000000000000..0b4418fe2262d
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Locker.cc
@@ -0,0 +1,2286 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#include "MDS.h"
+#include "MDCache.h"
+#include "Locker.h"
+#include "Server.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "CDentry.h"
+#include "Migrator.h"
+
+#include "MDBalancer.h"
+#include "MDLog.h"
+#include "MDSMap.h"
+
+#include "include/filepath.h"
+
+#include "events/EInodeUpdate.h"
+#include "events/EDirUpdate.h"
+#include "events/EUnlink.h"
+
+#include "msg/Messenger.h"
+
+#include "messages/MGenericMessage.h"
+#include "messages/MDiscover.h"
+#include "messages/MDiscoverReply.h"
+
+#include "messages/MDirUpdate.h"
+
+#include "messages/MInodeFileCaps.h"
+
+#include "messages/MInodeLink.h"
+#include "messages/MInodeLinkAck.h"
+#include "messages/MInodeUnlink.h"
+#include "messages/MInodeUnlinkAck.h"
+
+#include "messages/MLock.h"
+#include "messages/MDentryUnlink.h"
+
+#include "messages/MClientRequest.h"
+#include "messages/MClientFileCaps.h"
+
+// NOTE(review): the two header names below were lost in extraction (the
+// angle-bracket contents are missing) -- restore them from the repository.
+#include
+#include
+
+#include "config.h"
+#undef dout
+#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".locker "
+
+
+
+// Route an incoming message to its handler by message type.
+// Any other type is a programming error and asserts.
+void Locker::dispatch(Message *m)
+{
+  switch (m->get_type()) {
+
+    // locking
+  case MSG_MDS_LOCK:
+    handle_lock((MLock*)m);
+    break;
+
+    // cache fun
+  case MSG_MDS_INODEFILECAPS:
+    handle_inode_file_caps((MInodeFileCaps*)m);
+    break;
+
+  case MSG_CLIENT_FILECAPS:
+    handle_client_file_caps((MClientFileCaps*)m);
+    break;
+
+
+
+  default:
+    assert(0);
+  }
+}
+
+
+
+
+// file i/o -----------------------------------------
+
+// Log and return the inode's current file_data_version.
+__uint64_t Locker::issue_file_data_version(CInode *in)
+{
+  dout(7) << "issue_file_data_version on " << *in << endl;
+  return in->inode.file_data_version;
+}
+
+
+// Register, or widen, the requesting client's capability on `in` for an
+// open in `mode`. Cap messages to that client are suppressed while the file
+// lock is re-evaluated and caps are issued.
+//   in   - inode being opened
+//   mode - FILE_MODE_R / FILE_MODE_W bits of the open
+//   req  - client request; supplies the client id and client instance
+Capability* Locker::issue_new_caps(CInode *in,
+                                   int mode,
+                                   MClientRequest *req)
+{
+  dout(7) << "issue_new_caps for mode " << mode << " on " << *in << endl;
+
+  // my needs
+  int my_client = req->get_client();
+  int my_want = 0;
+  if (mode & FILE_MODE_R) my_want |= CAP_FILE_RDCACHE | CAP_FILE_RD;
+  if (mode & FILE_MODE_W) my_want |= CAP_FILE_WRBUFFER | CAP_FILE_WR;
+
+  // register a capability
+  Capability *cap = in->get_client_cap(my_client);
+  if (!cap) {
+    // new cap
+    Capability c(my_want);
+    in->add_client_cap(my_client, c);
+    cap = in->get_client_cap(my_client);
+
+    // note client addr
+    mds->clientmap.add_open(my_client, req->get_client_inst());
+
+  } else {
+    // make sure it has sufficient caps
+    // Widen only when this open wants a bit the cap does not already want.
+    // The previous test (wanted & ~my_want) fired in the opposite case and
+    // skipped the update exactly when new bits were needed.
+    if (my_want & ~cap->wanted()) {
+      // augment wanted caps for this client
+      cap->set_wanted( cap->wanted() | my_want );
+    }
+  }
+
+  // suppress file cap messages for this guy for a few moments (we'll bundle with the open() reply)
+  cap->set_suppress(true);
+  int before = cap->pending();
+
+  if (in->is_auth()) {
+    // [auth] twiddle mode?
+    inode_file_eval(in);
+  } else {
+    // [replica] tell auth about any new caps wanted
+    request_inode_file_caps(in);
+  }
+
+  // issue caps (pot. incl new one)
+  issue_caps(in);  // note: _eval above may have done this already...
+
+  // re-issue whatever we can
+  cap->issue(cap->pending());
+
+  // ok, stop suppressing.
+  cap->set_suppress(false);
+
+  int now = cap->pending();
+  // Placeholder: detects the cap newly gaining CAP_FILE_WR but does nothing yet.
+  if (before != now &&
+      (before & CAP_FILE_WR) == 0 &&
+      (now & CAP_FILE_WR)) {
+    // FIXME FIXME FIXME
+  }
+
+  // twiddle file_data_version?
+  // Bump the version when this client newly gains CAP_FILE_WRBUFFER.
+  if ((before & CAP_FILE_WRBUFFER) == 0 &&
+      (now & CAP_FILE_WRBUFFER)) {
+    in->inode.file_data_version++;
+    dout(7) << " incrementing file_data_version, now " << in->inode.file_data_version << " for " << *in << endl;
+  }
+
+  return cap;
+}
+
+
+
+// Re-issue client capabilities on `in` so each client's issued set matches
+// (wanted & allowed), where `allowed` comes from the file lock state.
+// Returns true when nothing had to be re-issued.
+bool Locker::issue_caps(CInode *in)
+{
+  // allowed caps are determined by the lock mode.
+  int allowed = in->filelock.caps_allowed(in->is_auth());
+  dout(7) << "issue_caps filelock allows=" << cap_string(allowed)
+          << " on " << *in << endl;
+
+  // count conflicts with
+  int nissued = 0;
+
+  // client caps
+  // NOTE(review): the template arguments of `map` were lost in extraction;
+  // from the uses of it->first / it->second this is presumably
+  // map<int, Capability>::iterator -- confirm against CInode.h.
+  for (map::iterator it = in->client_caps.begin();
+       it != in->client_caps.end();
+       it++) {
+    if (it->second.issued() != (it->second.wanted() & allowed)) {
+      // issue
+      nissued++;
+
+      int before = it->second.pending();
+      long seq = it->second.issue(it->second.wanted() & allowed);
+      int after = it->second.pending();
+
+      // twiddle file_data_version?
+      if (!(before & CAP_FILE_WRBUFFER) &&
+          (after & CAP_FILE_WRBUFFER)) {
+        dout(7) << " incrementing file_data_version for " << *in << endl;
+        in->inode.file_data_version++;
+      }
+
+      // Notify the client unless issue() returned a non-positive seq or
+      // messages to this client are currently suppressed.
+      if (seq > 0 &&
+          !it->second.is_suppress()) {
+        dout(7) << " sending MClientFileCaps to client" << it->first << " seq " << it->second.get_last_seq() << " new pending " << cap_string(it->second.pending()) << " was " << cap_string(before) << endl;
+        mds->messenger->send_message(new MClientFileCaps(in->inode,
+                                                         it->second.get_last_seq(),
+                                                         it->second.pending(),
+                                                         it->second.wanted()),
+                                     MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first),
+                                     0, MDS_PORT_LOCKER);
+      }
+    }
+  }
+
+  return (nissued == 0);  // true if no re-issued, no callbacks
+}
+
+
+
+// On a replica, tell the authoritative MDS which caps this node's clients
+// want on `in`, when that differs from what was last reported. A drop to
+// zero wanted caps is deferred via replica_caps_wanted_keep_until.
+void Locker::request_inode_file_caps(CInode *in)
+{
+  int wanted = in->get_caps_wanted();
+  if (wanted != in->replica_caps_wanted) {
+
+    if (wanted == 0) {
+      // NOTE(review): this branch fires while keep_until is still in the
+      // future, and the "wait longer" branch is reached once it has passed.
+      // That looks inverted relative to the comments -- confirm intent.
+      if (in->replica_caps_wanted_keep_until > g_clock.recent_now()) {
+        // ok, release them finally!
+        in->replica_caps_wanted_keep_until.sec_ref() = 0;
+        dout(7) << "request_inode_file_caps " << cap_string(wanted)
+                 << " was " << cap_string(in->replica_caps_wanted)
+                 << " no keeping anymore "
+                 << " on " << *in
+                 << endl;
+      }
+      else if (in->replica_caps_wanted_keep_until.sec() == 0) {
+        // No timer running: start one 2 seconds out and report nothing yet.
+        in->replica_caps_wanted_keep_until = g_clock.recent_now();
+        in->replica_caps_wanted_keep_until.sec_ref() += 2;
+
+        dout(7) << "request_inode_file_caps " << cap_string(wanted)
+                 << " was " << cap_string(in->replica_caps_wanted)
+                 << " keeping until " << in->replica_caps_wanted_keep_until
+                 << " on " << *in
+                 << endl;
+        return;
+      } else {
+        // wait longer
+        return;
+      }
+    } else {
+      // Something is wanted again: cancel any pending release timer.
+      in->replica_caps_wanted_keep_until.sec_ref() = 0;
+    }
+    assert(!in->is_auth());
+
+    int auth = in->authority();
+    dout(7) << "request_inode_file_caps " << cap_string(wanted)
+            << " was " << cap_string(in->replica_caps_wanted)
+            << " on " << *in << " to mds" << auth << endl;
+    assert(!in->is_auth());
+
+    in->replica_caps_wanted
= wanted;
+    mds->send_message_mds(new MInodeFileCaps(in->ino(), mds->get_nodeid(),
+                                             in->replica_caps_wanted),
+                          auth, MDS_PORT_LOCKER);
+  } else {
+    // Wanted set unchanged: clear any pending release timer.
+    in->replica_caps_wanted_keep_until.sec_ref() = 0;
+  }
+}
+
+// Handle a replica MDS reporting the caps its clients want on an inode.
+// A proxy forwards the message to the current authority; the authority
+// records (or erases, when zero) the replica's wanted caps, re-evaluates
+// the file lock, and frees the message.
+void Locker::handle_inode_file_caps(MInodeFileCaps *m)
+{
+  CInode *in = mdcache->get_inode(m->get_ino());
+  assert(in);
+  assert(in->is_auth() || in->is_proxy());
+
+  dout(7) << "handle_inode_file_caps replica mds" << m->get_from() << " wants caps " << cap_string(m->get_caps()) << " on " << *in << endl;
+
+  if (in->is_proxy()) {
+    dout(7) << "proxy, fw" << endl;
+    // The message is handed on, so it is not deleted here.
+    mds->send_message_mds(m, in->authority(), MDS_PORT_LOCKER);
+    return;
+  }
+
+  if (m->get_caps())
+    in->mds_caps_wanted[m->get_from()] = m->get_caps();
+  else
+    in->mds_caps_wanted.erase(m->get_from());
+
+  inode_file_eval(in);
+  delete m;
+}
+
+
+/*
+ * note: we only get these from the client if
+ *   - we are calling back previously issued caps (fewer than the client previously had)
+ *   - or if the client releases (any of) its caps on its own
+ */
+// The message is dropped (and freed) when the inode is unknown or the
+// sending client holds no capability on it.
+void Locker::handle_client_file_caps(MClientFileCaps *m)
+{
+  int client = MSG_ADDR_NUM(m->get_source());
+  CInode *in = mdcache->get_inode(m->get_ino());
+  Capability *cap = 0;
+  if (in)
+    cap = in->get_client_cap(client);
+
+  if (!in || !cap) {
+    if (!in) {
+      dout(7) << "handle_client_file_caps on unknown ino " << m->get_ino() << ", dropping" << endl;
+    } else {
+      dout(7) << "handle_client_file_caps no cap for client" << client << " on " << *in << endl;
+    }
+    delete m;
+    return;
+  }
+
+  assert(cap);
+
+  // filter wanted based on what we could ever give out (given auth/replica status)
+  int wanted = m->get_wanted() & in->filelock.caps_allowed_ever(in->is_auth());
+
+  dout(7) << "handle_client_file_caps seq " << m->get_seq()
+          << " confirms caps " << cap_string(m->get_caps())
+          << " wants " << cap_string(wanted)
+          << " from client" << client
+          << " on " << *in
+          << endl;
+
+  // update wanted
+  if (cap->wanted() != wanted)
+    cap->set_wanted(wanted);
+
+  //
confirm caps + int had = cap->confirm_receipt(m->get_seq(), m->get_caps()); + int has = cap->confirmed(); + if (cap->is_null()) { + dout(7) << " cap for client" << client << " is now null, removing from " << *in << endl; + in->remove_client_cap(client); + if (!in->is_auth()) + request_inode_file_caps(in); + + // dec client addr counter + mds->clientmap.dec_open(client); + + // tell client. + MClientFileCaps *r = new MClientFileCaps(in->inode, + 0, 0, 0, + MClientFileCaps::FILECAP_RELEASE); + mds->messenger->send_message(r, m->get_source(), m->get_source_inst(), 0, MDS_PORT_LOCKER); + } + + // merge in atime? + if (m->get_inode().atime > in->inode.atime) { + dout(7) << " taking atime " << m->get_inode().atime << " > " + << in->inode.atime << " for " << *in << endl; + in->inode.atime = m->get_inode().atime; + } + + if ((has|had) & CAP_FILE_WR) { + bool dirty = false; + + // mtime + if (m->get_inode().mtime > in->inode.mtime) { + dout(7) << " taking mtime " << m->get_inode().mtime << " > " + << in->inode.mtime << " for " << *in << endl; + in->inode.mtime = m->get_inode().mtime; + dirty = true; + } + // size + if (m->get_inode().size > in->inode.size) { + dout(7) << " taking size " << m->get_inode().size << " > " + << in->inode.size << " for " << *in << endl; + in->inode.size = m->get_inode().size; + dirty = true; + } + + if (dirty) + mds->mdlog->submit_entry(new EInodeUpdate(in)); + } + + // reevaluate, waiters + inode_file_eval(in); + in->finish_waiting(CINODE_WAIT_CAPS, 0); + + delete m; +} + + + + + + + + + + +// locks ---------------------------------------------------------------- + +/* + + +INODES: + += two types of inode metadata: + hard - uid/gid, mode + file - mtime, size + ? atime - atime (*) <-- we want a lazy update strategy? + += correspondingly, two types of inode locks: + hardlock - hard metadata + filelock - file metadata + + -> These locks are completely orthogonal! 
+ += metadata ops and how they affect inode metadata: + sma=size mtime atime + HARD FILE OP + files: + R RRR stat + RW chmod/chown + R W touch ?ctime + R openr + W read atime + R openw + Wc openwc ?ctime + WW write size mtime + close + + dirs: + R W readdir atime + RRR ( + implied stats on files) + Rc WW mkdir (ctime on new dir, size+mtime on parent dir) + R WW link/unlink/rename/rmdir (size+mtime on dir) + + + += relationship to client (writers): + + - ops in question are + - stat ... need reasonable value for mtime (+ atime?) + - maybe we want a "quicksync" type operation instead of full lock + - truncate ... need to stop writers for the atomic truncate operation + - need a full lock + + + + += modes + - SYNC + Rauth Rreplica Wauth Wreplica + sync + + + + + +ALSO: + + dirlock - no dir changes (prior to unhashing) + denlock - dentry lock (prior to unlink, rename) + + +*/ + + +void Locker::handle_lock(MLock *m) +{ + switch (m->get_otype()) { + case LOCK_OTYPE_IHARD: + handle_lock_inode_hard(m); + break; + + case LOCK_OTYPE_IFILE: + handle_lock_inode_file(m); + break; + + case LOCK_OTYPE_DIR: + handle_lock_dir(m); + break; + + case LOCK_OTYPE_DN: + handle_lock_dn(m); + break; + + default: + dout(7) << "handle_lock got otype " << m->get_otype() << endl; + assert(0); + break; + } +} + + + +// =============================== +// hard inode metadata + +bool Locker::inode_hard_read_try(CInode *in, Context *con) +{ + dout(7) << "inode_hard_read_try on " << *in << endl; + + // can read? grab ref. + if (in->hardlock.can_read(in->is_auth())) + return true; + + assert(!in->is_auth()); + + // wait! + dout(7) << "inode_hard_read_try waiting on " << *in << endl; + in->add_waiter(CINODE_WAIT_HARDR, con); + return false; +} + +bool Locker::inode_hard_read_start(CInode *in, MClientRequest *m) +{ + dout(7) << "inode_hard_read_start on " << *in << endl; + + // can read? grab ref. 
+ if (in->hardlock.can_read(in->is_auth())) { + in->hardlock.get_read(); + return true; + } + + // can't read, and replicated. + assert(!in->is_auth()); + + // wait! + dout(7) << "inode_hard_read_start waiting on " << *in << endl; + in->add_waiter(CINODE_WAIT_HARDR, new C_MDS_RetryRequest(mds, m, in)); + return false; +} + + +void Locker::inode_hard_read_finish(CInode *in) +{ + // drop ref + assert(in->hardlock.can_read(in->is_auth())); + in->hardlock.put_read(); + + dout(7) << "inode_hard_read_finish on " << *in << endl; + + //if (in->hardlock.get_nread() == 0) in->finish_waiting(CINODE_WAIT_HARDNORD); +} + + +bool Locker::inode_hard_write_start(CInode *in, MClientRequest *m) +{ + dout(7) << "inode_hard_write_start on " << *in << endl; + + // if not replicated, i can twiddle lock at will + if (in->is_auth() && + !in->is_cached_by_anyone() && + in->hardlock.get_state() != LOCK_LOCK) + in->hardlock.set_state(LOCK_LOCK); + + // can write? grab ref. + if (in->hardlock.can_write(in->is_auth())) { + assert(in->is_auth()); + if (!in->can_auth_pin()) { + dout(7) << "inode_hard_write_start waiting for authpinnable on " << *in << endl; + in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, m, in)); + return false; + } + + in->auth_pin(); // ugh, can't condition this on nwrite==0 bc we twiddle that in handle_lock_* + in->hardlock.get_write(); + return true; + } + + // can't write, replicated. 
+ if (in->is_auth()) { + // auth + if (in->hardlock.can_write_soon(in->is_auth())) { + // just wait + } else { + // initiate lock + inode_hard_lock(in); + } + + dout(7) << "inode_hard_write_start waiting on " << *in << endl; + in->add_waiter(CINODE_WAIT_HARDW, new C_MDS_RetryRequest(mds, m, in)); + + return false; + } else { + // replica + // fw to auth + int auth = in->authority(); + dout(7) << "inode_hard_write_start " << *in << " on replica, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mdcache->request_forward(m, auth); + return false; + } +} + + +void Locker::inode_hard_write_finish(CInode *in) +{ + // drop ref + assert(in->hardlock.can_write(in->is_auth())); + in->hardlock.put_write(); + in->auth_unpin(); + dout(7) << "inode_hard_write_finish on " << *in << endl; + + // drop lock? + if (in->hardlock.get_nwrite() == 0) { + + // auto-sync if alone. + if (in->is_auth() && + !in->is_cached_by_anyone() && + in->hardlock.get_state() != LOCK_SYNC) + in->hardlock.set_state(LOCK_SYNC); + + inode_hard_eval(in); + } +} + + +void Locker::inode_hard_eval(CInode *in) +{ + // finished gather? + if (in->is_auth() && + !in->hardlock.is_stable() && + in->hardlock.gather_set.empty()) { + dout(7) << "inode_hard_eval finished gather on " << *in << endl; + switch (in->hardlock.get_state()) { + case LOCK_GLOCKR: + in->hardlock.set_state(LOCK_LOCK); + + // waiters + in->hardlock.get_write(); + in->finish_waiting(CINODE_WAIT_HARDRWB|CINODE_WAIT_HARDSTABLE); + in->hardlock.put_write(); + break; + + default: + assert(0); + } + } + if (!in->hardlock.is_stable()) return; + + if (in->is_auth()) { + + // sync? 
+ if (in->is_cached_by_anyone() && + in->hardlock.get_nwrite() == 0 && + in->hardlock.get_state() != LOCK_SYNC) { + dout(7) << "inode_hard_eval stable, syncing " << *in << endl; + inode_hard_sync(in); + } + + } else { + // replica + } +} + + +// mid + +void Locker::inode_hard_sync(CInode *in) +{ + dout(7) << "inode_hard_sync on " << *in << endl; + assert(in->is_auth()); + + // check state + if (in->hardlock.get_state() == LOCK_SYNC) + return; // already sync + if (in->hardlock.get_state() == LOCK_GLOCKR) + assert(0); // um... hmm! + assert(in->hardlock.get_state() == LOCK_LOCK); + + // hard data + bufferlist harddata; + in->encode_hard_state(harddata); + + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IHARD); + m->set_data(harddata); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + + // change lock + in->hardlock.set_state(LOCK_SYNC); + + // waiters? 
+ in->finish_waiting(CINODE_WAIT_HARDSTABLE); +} + +void Locker::inode_hard_lock(CInode *in) +{ + dout(7) << "inode_hard_lock on " << *in << " hardlock=" << in->hardlock << endl; + assert(in->is_auth()); + + // check state + if (in->hardlock.get_state() == LOCK_LOCK || + in->hardlock.get_state() == LOCK_GLOCKR) + return; // already lock or locking + assert(in->hardlock.get_state() == LOCK_SYNC); + + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IHARD); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + + // change lock + in->hardlock.set_state(LOCK_GLOCKR); + in->hardlock.init_gather(in->get_cached_by()); +} + + + + + +// messenger + +void Locker::handle_lock_inode_hard(MLock *m) +{ + assert(m->get_otype() == LOCK_OTYPE_IHARD); + + mds->logger->inc("lih"); + + int from = m->get_asker(); + CInode *in = mdcache->get_inode(m->get_ino()); + + if (LOCK_AC_FOR_AUTH(m->get_action())) { + // auth + assert(in); + assert(in->is_auth() || in->is_proxy()); + dout(7) << "handle_lock_inode_hard " << *in << " hardlock=" << in->hardlock << endl; + + if (in->is_proxy()) { + // fw + int newauth = in->authority(); + assert(newauth >= 0); + if (from == newauth) { + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; + delete m; + } else { + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; + mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); + } + return; + } + } else { + // replica + if (!in) { + dout(7) << "handle_lock_inode_hard " << m->get_ino() << ": don't have it anymore" << endl; + /* do NOT nak.. if we go that route we need to duplicate all the nonce funkiness + to keep gather_set a proper/correct subset of cached_by. better to use the existing + cacheexpire mechanism instead! 
+ */ + delete m; + return; + } + + assert(!in->is_auth()); + } + + dout(7) << "handle_lock_inode_hard a=" << m->get_action() << " from " << from << " " << *in << " hardlock=" << in->hardlock << endl; + + CLock *lock = &in->hardlock; + + switch (m->get_action()) { + // -- replica -- + case LOCK_AC_SYNC: + assert(lock->get_state() == LOCK_LOCK); + + { // assim data + int off = 0; + in->decode_hard_state(m->get_data(), off); + } + + // update lock + lock->set_state(LOCK_SYNC); + + // no need to reply + + // waiters + in->finish_waiting(CINODE_WAIT_HARDR|CINODE_WAIT_HARDSTABLE); + break; + + case LOCK_AC_LOCK: + assert(lock->get_state() == LOCK_SYNC); + //|| lock->get_state() == LOCK_GLOCKR); + + // wait for readers to finish? + if (lock->get_nread() > 0) { + dout(7) << "handle_lock_inode_hard readers, waiting before ack on " << *in << endl; + lock->set_state(LOCK_GLOCKR); + in->add_waiter(CINODE_WAIT_HARDNORD, + new C_MDS_RetryMessage(mds,m)); + assert(0); // does this ever happen? (if so, fix hard_read_finish, and CInodeExport.update_inode!) 
+ return; + } else { + + // update lock and reply + lock->set_state(LOCK_LOCK); + + { + MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IHARD); + mds->send_message_mds(reply, from, MDS_PORT_LOCKER); + } + } + break; + + + // -- auth -- + case LOCK_AC_LOCKACK: + assert(lock->state == LOCK_GLOCKR); + assert(lock->gather_set.count(from)); + lock->gather_set.erase(from); + + if (lock->gather_set.size()) { + dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + } else { + dout(7) << "handle_lock_inode_hard " << *in << " from " << from << ", last one" << endl; + inode_hard_eval(in); + } + } + delete m; +} + + + + +// ===================== +// soft inode metadata + + +bool Locker::inode_file_read_start(CInode *in, MClientRequest *m) +{ + dout(7) << "inode_file_read_start " << *in << " filelock=" << in->filelock << endl; + + // can read? grab ref. + if (in->filelock.can_read(in->is_auth())) { + in->filelock.get_read(); + return true; + } + + // can't read, and replicated. + if (in->filelock.can_read_soon(in->is_auth())) { + // wait + dout(7) << "inode_file_read_start can_read_soon " << *in << endl; + } else { + if (in->is_auth()) { + // auth + + // FIXME or qsync? 
+ + if (in->filelock.is_stable()) { + inode_file_lock(in); // lock, bc easiest to back off + + if (in->filelock.can_read(in->is_auth())) { + in->filelock.get_read(); + + in->filelock.get_write(); + in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); + in->filelock.put_write(); + return true; + } + } else { + dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; + in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); + return false; + } + } else { + // replica + if (in->filelock.is_stable()) { + + // fw to auth + int auth = in->authority(); + dout(7) << "inode_file_read_start " << *in << " on replica and async, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mdcache->request_forward(m, auth); + return false; + + } else { + // wait until stable + dout(7) << "inode_file_read_start waiting until stable on " << *in << ", filelock=" << in->filelock << endl; + in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); + return false; + } + } + } + + // wait + dout(7) << "inode_file_read_start waiting on " << *in << ", filelock=" << in->filelock << endl; + in->add_waiter(CINODE_WAIT_FILER, new C_MDS_RetryRequest(mds, m, in)); + + return false; +} + + +void Locker::inode_file_read_finish(CInode *in) +{ + // drop ref + assert(in->filelock.can_read(in->is_auth())); + in->filelock.put_read(); + + dout(7) << "inode_file_read_finish on " << *in << ", filelock=" << in->filelock << endl; + + if (in->filelock.get_nread() == 0) { + in->finish_waiting(CINODE_WAIT_FILENORD); + inode_file_eval(in); + } +} + + +bool Locker::inode_file_write_start(CInode *in, MClientRequest *m) +{ + // can write? grab ref. + if (in->filelock.can_write(in->is_auth())) { + in->filelock.get_write(); + return true; + } + + // can't write, replicated. 
+ if (in->is_auth()) { + // auth + if (in->filelock.can_write_soon(in->is_auth())) { + // just wait + } else { + if (!in->filelock.is_stable()) { + dout(7) << "inode_file_write_start on auth, waiting for stable on " << *in << endl; + in->add_waiter(CINODE_WAIT_FILESTABLE, new C_MDS_RetryRequest(mds, m, in)); + return false; + } + + // initiate lock + inode_file_lock(in); + + if (in->filelock.can_write(in->is_auth())) { + in->filelock.get_write(); + + in->filelock.get_read(); + in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); + in->filelock.put_read(); + return true; + } + } + + dout(7) << "inode_file_write_start on auth, waiting for write on " << *in << endl; + in->add_waiter(CINODE_WAIT_FILEW, new C_MDS_RetryRequest(mds, m, in)); + return false; + } else { + // replica + // fw to auth + int auth = in->authority(); + dout(7) << "inode_file_write_start " << *in << " on replica, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mdcache->request_forward(m, auth); + return false; + } +} + + +void Locker::inode_file_write_finish(CInode *in) +{ + // drop ref + assert(in->filelock.can_write(in->is_auth())); + in->filelock.put_write(); + dout(7) << "inode_file_write_finish on " << *in << ", filelock=" << in->filelock << endl; + + // drop lock? + if (in->filelock.get_nwrite() == 0) { + in->finish_waiting(CINODE_WAIT_FILENOWR); + inode_file_eval(in); + } +} + + +/* + * ... + * + * also called after client caps are acked to us + * - checks if we're in unstable sfot state and can now move on to next state + * - checks if soft state should change (eg bc last writer closed) + */ + +void Locker::inode_file_eval(CInode *in) +{ + int issued = in->get_caps_issued(); + + // [auth] finished gather? 
+ if (in->is_auth() && + !in->filelock.is_stable() && + in->filelock.gather_set.size() == 0) { + dout(7) << "inode_file_eval finished mds gather on " << *in << endl; + + switch (in->filelock.get_state()) { + // to lock + case LOCK_GLOCKR: + case LOCK_GLOCKM: + case LOCK_GLOCKL: + if (issued == 0) { + in->filelock.set_state(LOCK_LOCK); + + // waiters + in->filelock.get_read(); + in->filelock.get_write(); + in->finish_waiting(CINODE_WAIT_FILERWB|CINODE_WAIT_FILESTABLE); + in->filelock.put_read(); + in->filelock.put_write(); + } + break; + + // to mixed + case LOCK_GMIXEDR: + if ((issued & ~(CAP_FILE_RD)) == 0) { + in->filelock.set_state(LOCK_MIXED); + in->finish_waiting(CINODE_WAIT_FILESTABLE); + } + break; + + case LOCK_GMIXEDL: + if ((issued & ~(CAP_FILE_WR)) == 0) { + in->filelock.set_state(LOCK_MIXED); + + if (in->is_cached_by_anyone()) { + // data + bufferlist softdata; + in->encode_file_state(softdata); + + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + m->set_data(softdata); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + } + + in->finish_waiting(CINODE_WAIT_FILESTABLE); + } + break; + + // to loner + case LOCK_GLONERR: + if (issued == 0) { + in->filelock.set_state(LOCK_LONER); + in->finish_waiting(CINODE_WAIT_FILESTABLE); + } + break; + + case LOCK_GLONERM: + if ((issued & ~CAP_FILE_WR) == 0) { + in->filelock.set_state(LOCK_LONER); + in->finish_waiting(CINODE_WAIT_FILESTABLE); + } + break; + + // to sync + case LOCK_GSYNCL: + case LOCK_GSYNCM: + if ((issued & ~(CAP_FILE_RD)) == 0) { + in->filelock.set_state(LOCK_SYNC); + + { // bcast data to replicas + bufferlist softdata; + in->encode_file_state(softdata); + + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *reply = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + reply->set_ino(in->ino(), 
LOCK_OTYPE_IFILE); + reply->set_data(softdata); + mds->send_message_mds(reply, *it, MDS_PORT_LOCKER); + } + } + + // waiters + in->filelock.get_read(); + in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); + in->filelock.put_read(); + } + break; + + default: + assert(0); + } + + issue_caps(in); + } + + // [replica] finished caps gather? + if (!in->is_auth() && + !in->filelock.is_stable()) { + switch (in->filelock.get_state()) { + case LOCK_GMIXEDR: + if ((issued & ~(CAP_FILE_RD)) == 0) { + in->filelock.set_state(LOCK_MIXED); + + // ack + MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER); + } + break; + + case LOCK_GLOCKR: + if (issued == 0) { + in->filelock.set_state(LOCK_LOCK); + + // ack + MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(reply, in->authority(), MDS_PORT_LOCKER); + } + break; + + default: + assert(0); + } + } + + // !stable -> do nothing. + if (!in->filelock.is_stable()) return; + + + // stable. + assert(in->filelock.is_stable()); + + if (in->is_auth()) { + // [auth] + int wanted = in->get_caps_wanted(); + bool loner = (in->client_caps.size() == 1) && in->mds_caps_wanted.empty(); + dout(7) << "inode_file_eval wanted=" << cap_string(wanted) + << " filelock=" << in->filelock + << " loner=" << loner + << endl; + + // * -> loner? + if (in->filelock.get_nread() == 0 && + in->filelock.get_nwrite() == 0 && + (wanted & CAP_FILE_WR) && + loner && + in->filelock.get_state() != LOCK_LONER) { + dout(7) << "inode_file_eval stable, bump to loner " << *in << ", filelock=" << in->filelock << endl; + inode_file_loner(in); + } + + // * -> mixed? 
+ else if (in->filelock.get_nread() == 0 && + in->filelock.get_nwrite() == 0 && + (wanted & CAP_FILE_RD) && + (wanted & CAP_FILE_WR) && + !(loner && in->filelock.get_state() == LOCK_LONER) && + in->filelock.get_state() != LOCK_MIXED) { + dout(7) << "inode_file_eval stable, bump to mixed " << *in << ", filelock=" << in->filelock << endl; + inode_file_mixed(in); + } + + // * -> sync? + else if (in->filelock.get_nwrite() == 0 && + !(wanted & CAP_FILE_WR) && + ((wanted & CAP_FILE_RD) || + in->is_cached_by_anyone() || + (!loner && in->filelock.get_state() == LOCK_LONER)) && + in->filelock.get_state() != LOCK_SYNC) { + dout(7) << "inode_file_eval stable, bump to sync " << *in << ", filelock=" << in->filelock << endl; + inode_file_sync(in); + } + + // * -> lock? (if not replicated or open) + else if (!in->is_cached_by_anyone() && + wanted == 0 && + in->filelock.get_state() != LOCK_LOCK) { + inode_file_lock(in); + } + + } else { + // replica + // recall? check wiaters? XXX + } +} + + +// mid + +bool Locker::inode_file_sync(CInode *in) +{ + dout(7) << "inode_file_sync " << *in << " filelock=" << in->filelock << endl; + + assert(in->is_auth()); + + // check state + if (in->filelock.get_state() == LOCK_SYNC || + in->filelock.get_state() == LOCK_GSYNCL || + in->filelock.get_state() == LOCK_GSYNCM) + return true; + + assert(in->filelock.is_stable()); + + int issued = in->get_caps_issued(); + + assert((in->get_caps_wanted() & CAP_FILE_WR) == 0); + + if (in->filelock.get_state() == LOCK_LOCK) { + if (in->is_cached_by_anyone()) { + // soft data + bufferlist softdata; + in->encode_file_state(softdata); + + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + m->set_data(softdata); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + } + + // change lock + in->filelock.set_state(LOCK_SYNC); + + // reissue caps + 
issue_caps(in); + return true; + } + + else if (in->filelock.get_state() == LOCK_MIXED) { + // writers? + if (issued & CAP_FILE_WR) { + // gather client write caps + in->filelock.set_state(LOCK_GSYNCM); + issue_caps(in); + } else { + // no writers, go straight to sync + + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + } + + // change lock + in->filelock.set_state(LOCK_SYNC); + } + return false; + } + + else if (in->filelock.get_state() == LOCK_LONER) { + // writers? + if (issued & CAP_FILE_WR) { + // gather client write caps + in->filelock.set_state(LOCK_GSYNCL); + issue_caps(in); + } else { + // no writers, go straight to sync + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + } + + // change lock + in->filelock.set_state(LOCK_SYNC); + } + return false; + } + else + assert(0); // wtf. 
+ + return false; +} + + +void Locker::inode_file_lock(CInode *in) +{ + dout(7) << "inode_file_lock " << *in << " filelock=" << in->filelock << endl; + + assert(in->is_auth()); + + // check state + if (in->filelock.get_state() == LOCK_LOCK || + in->filelock.get_state() == LOCK_GLOCKR || + in->filelock.get_state() == LOCK_GLOCKM || + in->filelock.get_state() == LOCK_GLOCKL) + return; // lock or locking + + assert(in->filelock.is_stable()); + + int issued = in->get_caps_issued(); + + if (in->filelock.get_state() == LOCK_SYNC) { + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + in->filelock.init_gather(in->get_cached_by()); + + // change lock + in->filelock.set_state(LOCK_GLOCKR); + + // call back caps + if (issued) + issue_caps(in); + } else { + if (issued) { + // call back caps + in->filelock.set_state(LOCK_GLOCKR); + issue_caps(in); + } else { + in->filelock.set_state(LOCK_LOCK); + } + } + } + + else if (in->filelock.get_state() == LOCK_MIXED) { + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + in->filelock.init_gather(in->get_cached_by()); + + // change lock + in->filelock.set_state(LOCK_GLOCKM); + + // call back caps + issue_caps(in); + } else { + //assert(issued); // ??? 
-sage 2/19/06 + if (issued) { + // change lock + in->filelock.set_state(LOCK_GLOCKM); + + // call back caps + issue_caps(in); + } else { + in->filelock.set_state(LOCK_LOCK); + } + } + + } + else if (in->filelock.get_state() == LOCK_LONER) { + if (issued & CAP_FILE_WR) { + // change lock + in->filelock.set_state(LOCK_GLOCKL); + + // call back caps + issue_caps(in); + } else { + in->filelock.set_state(LOCK_LOCK); + } + } + else + assert(0); // wtf. +} + +
+/*
+ * inode_file_mixed -- [auth only] start moving the inode's filelock toward LOCK_MIXED.
+ *
+ * No-op if the lock is already MIXED or already gathering toward it
+ * (GMIXEDR / GMIXEDL).  Otherwise the lock must be stable, and the
+ * transition depends on the current state:
+ *   SYNC  -> tell replicas (LOCK_AC_MIXED) and gather their acks in GMIXEDR;
+ *            with no replicas, go via GMIXEDR only if client caps are issued.
+ *   LOCK  -> send the encoded file state to replicas and switch immediately.
+ *   LONER -> gather in GMIXEDL while CAP_FILE_WRBUFFER is issued, otherwise
+ *            tell replicas (no data) and switch immediately.
+ * Any other state is a logic error (assert).
+ */
+void Locker::inode_file_mixed(CInode *in)
+{
+  dout(7) << "inode_file_mixed " << *in << " filelock=" << in->filelock << endl;
+
+  assert(in->is_auth());
+
+  // check state
+  // LOCK_MIXED itself must be covered here: it is stable, so without this
+  // test it would pass the assert below, match no branch and hit assert(0).
+  if (in->filelock.get_state() == LOCK_MIXED ||
+      in->filelock.get_state() == LOCK_GMIXEDR ||
+      in->filelock.get_state() == LOCK_GMIXEDL)
+    return;     // mixed or mixing
+
+  assert(in->filelock.is_stable());
+
+  int issued = in->get_caps_issued();
+
+  if (in->filelock.get_state() == LOCK_SYNC) {
+    if (in->is_cached_by_anyone()) {
+      // bcast to replicas
+      for (set<int>::iterator it = in->cached_by_begin();
+           it != in->cached_by_end();
+           it++) {
+        MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+      // wait for an ack from every replica before the lock becomes MIXED
+      in->filelock.init_gather(in->get_cached_by());
+
+      in->filelock.set_state(LOCK_GMIXEDR);
+      issue_caps(in);
+    } else {
+      if (issued) {
+        // no replicas, but client caps are outstanding: re-issue and gather
+        in->filelock.set_state(LOCK_GMIXEDR);
+        issue_caps(in);
+      } else {
+        in->filelock.set_state(LOCK_MIXED);
+      }
+    }
+  }
+
+  else if (in->filelock.get_state() == LOCK_LOCK) {
+    if (in->is_cached_by_anyone()) {
+      // data
+      bufferlist softdata;
+      in->encode_file_state(softdata);
+
+      // bcast to replicas
+      for (set<int>::iterator it = in->cached_by_begin();
+           it != in->cached_by_end();
+           it++) {
+        MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        m->set_data(softdata);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+    }
+
+    // change lock
+    in->filelock.set_state(LOCK_MIXED);
+    issue_caps(in);
+  }
+
+  else if (in->filelock.get_state() == LOCK_LONER) {
+    if (issued & CAP_FILE_WRBUFFER) {
+      // gather up WRBUFFER caps
+      in->filelock.set_state(LOCK_GMIXEDL);
+      issue_caps(in);
+    }
+    else if (in->is_cached_by_anyone()) {
+      // bcast to replicas
+      for (set<int>::iterator it = in->cached_by_begin();
+           it != in->cached_by_end();
+           it++) {
+        MLock *m = new MLock(LOCK_AC_MIXED, mds->get_nodeid());
+        m->set_ino(in->ino(), LOCK_OTYPE_IFILE);
+        mds->send_message_mds(m, *it, MDS_PORT_LOCKER);
+      }
+      in->filelock.set_state(LOCK_MIXED);
+      issue_caps(in);
+    } else {
+      in->filelock.set_state(LOCK_MIXED);
+      issue_caps(in);
+    }
+  }
+
+  else
+    assert(0); // wtf.
+}
+ + +void Locker::inode_file_loner(CInode *in) +{ + dout(7) << "inode_file_loner " << *in << " filelock=" << in->filelock << endl; + + assert(in->is_auth()); + + // check state + if (in->filelock.get_state() == LOCK_LONER || + in->filelock.get_state() == LOCK_GLONERR || + in->filelock.get_state() == LOCK_GLONERM) + return; + + assert(in->filelock.is_stable()); + assert((in->client_caps.size() == 1) && in->mds_caps_wanted.empty()); + + if (in->filelock.get_state() == LOCK_SYNC) { + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + in->filelock.init_gather(in->get_cached_by()); + + // change lock + in->filelock.set_state(LOCK_GLONERR); + } else { + // only one guy with file open, who gets it all, so + in->filelock.set_state(LOCK_LONER); + issue_caps(in); + } + } + + else if (in->filelock.get_state() == LOCK_LOCK) { + // change lock. ignore replicas; they don't know about LONER.
+ in->filelock.set_state(LOCK_LONER); + issue_caps(in); + } + + else if (in->filelock.get_state() == LOCK_MIXED) { + if (in->is_cached_by_anyone()) { + // bcast to replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + in->filelock.init_gather(in->get_cached_by()); + + // change lock + in->filelock.set_state(LOCK_GLONERM); + } else { + in->filelock.set_state(LOCK_LONER); + issue_caps(in); + } + } + + else + assert(0); +} + +// messenger + +void Locker::handle_lock_inode_file(MLock *m) +{ + assert(m->get_otype() == LOCK_OTYPE_IFILE); + + mds->logger->inc("lif"); + + CInode *in = mdcache->get_inode(m->get_ino()); + int from = m->get_asker(); + + if (LOCK_AC_FOR_AUTH(m->get_action())) { + // auth + assert(in); + assert(in->is_auth() || in->is_proxy()); + dout(7) << "handle_lock_inode_file " << *in << " hardlock=" << in->hardlock << endl; + + if (in->is_proxy()) { + // fw + int newauth = in->authority(); + assert(newauth >= 0); + if (from == newauth) { + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, but from new auth, dropping" << endl; + delete m; + } else { + dout(7) << "handle_lock " << m->get_ino() << " from " << from << ": proxy, fw to " << newauth << endl; + mds->send_message_mds(m, newauth, MDS_PORT_LOCKER); + } + return; + } + } else { + // replica + if (!in) { + // drop it. don't nak. 
+ dout(7) << "handle_lock " << m->get_ino() << ": don't have it anymore" << endl; + delete m; + return; + } + + assert(!in->is_auth()); + } + + dout(7) << "handle_lock_inode_file a=" << m->get_action() << " from " << from << " " << *in << " filelock=" << in->filelock << endl; + + CLock *lock = &in->filelock; + int issued = in->get_caps_issued(); + + switch (m->get_action()) { + // -- replica -- + case LOCK_AC_SYNC: + assert(lock->get_state() == LOCK_LOCK || + lock->get_state() == LOCK_MIXED); + + { // assim data + int off = 0; + in->decode_file_state(m->get_data(), off); + } + + // update lock + lock->set_state(LOCK_SYNC); + + // no need to reply. + + // waiters + in->filelock.get_read(); + in->finish_waiting(CINODE_WAIT_FILER|CINODE_WAIT_FILESTABLE); + in->filelock.put_read(); + inode_file_eval(in); + break; + + case LOCK_AC_LOCK: + assert(lock->get_state() == LOCK_SYNC || + lock->get_state() == LOCK_MIXED); + + // call back caps? + if (issued & CAP_FILE_RD) { + dout(7) << "handle_lock_inode_file client readers, gathering caps on " << *in << endl; + issue_caps(in); + } + if (lock->get_nread() > 0) { + dout(7) << "handle_lock_inode_file readers, waiting before ack on " << *in << endl; + in->add_waiter(CINODE_WAIT_FILENORD, + new C_MDS_RetryMessage(mds,m)); + lock->set_state(LOCK_GLOCKR); + assert(0);// i am broken.. why retry message when state captures all the info i need? + return; + } + if (issued & CAP_FILE_RD) { + lock->set_state(LOCK_GLOCKR); + break; + } + + // nothing to wait for, lock and ack. 
+ { + lock->set_state(LOCK_LOCK); + + MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(reply, from, MDS_PORT_LOCKER); + } + break; + + case LOCK_AC_MIXED: + assert(lock->get_state() == LOCK_SYNC || + lock->get_state() == LOCK_LOCK); + + if (lock->get_state() == LOCK_SYNC) { + // MIXED + if (issued & CAP_FILE_RD) { + // call back client caps + lock->set_state(LOCK_GMIXEDR); + issue_caps(in); + break; + } else { + // no clients, go straight to mixed + lock->set_state(LOCK_MIXED); + + // ack + MLock *reply = new MLock(LOCK_AC_MIXEDACK, mds->get_nodeid()); + reply->set_ino(in->ino(), LOCK_OTYPE_IFILE); + mds->send_message_mds(reply, from, MDS_PORT_LOCKER); + } + } else { + // LOCK + lock->set_state(LOCK_MIXED); + + // no ack needed. + } + + issue_caps(in); + + // waiters + in->filelock.get_write(); + in->finish_waiting(CINODE_WAIT_FILEW|CINODE_WAIT_FILESTABLE); + in->filelock.put_write(); + inode_file_eval(in); + break; + + + + + // -- auth -- + case LOCK_AC_LOCKACK: + assert(lock->state == LOCK_GLOCKR || + lock->state == LOCK_GLOCKM || + lock->state == LOCK_GLONERM || + lock->state == LOCK_GLONERR); + assert(lock->gather_set.count(from)); + lock->gather_set.erase(from); + + if (lock->gather_set.size()) { + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + } else { + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; + inode_file_eval(in); + } + break; + + case LOCK_AC_SYNCACK: + assert(lock->state == LOCK_GSYNCM); + assert(lock->gather_set.count(from)); + lock->gather_set.erase(from); + + /* not used currently + { + // merge data (keep largest size, mtime, etc.) 
+ int off = 0; + in->decode_merge_file_state(m->get_data(), off); + } + */ + + if (lock->gather_set.size()) { + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + } else { + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; + inode_file_eval(in); + } + break; + + case LOCK_AC_MIXEDACK: + assert(lock->state == LOCK_GMIXEDR); + assert(lock->gather_set.count(from)); + lock->gather_set.erase(from); + + if (lock->gather_set.size()) { + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", still gathering " << lock->gather_set << endl; + } else { + dout(7) << "handle_lock_inode_file " << *in << " from " << from << ", last one" << endl; + inode_file_eval(in); + } + break; + + + default: + assert(0); + } + + delete m; +} + + + + + + + + + + + + + + +void Locker::handle_lock_dir(MLock *m) +{ + +} + + + +// DENTRY + +bool Locker::dentry_xlock_start(CDentry *dn, Message *m, CInode *ref) +{ + dout(7) << "dentry_xlock_start on " << *dn << endl; + + // locked? + if (dn->lockstate == DN_LOCK_XLOCK) { + if (dn->xlockedby == m) return true; // locked by me! + + // not by me, wait + dout(7) << "dentry " << *dn << " xlock by someone else" << endl; + dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name, + new C_MDS_RetryRequest(mds,m,ref)); + return false; + } + + // prelock? + if (dn->lockstate == DN_LOCK_PREXLOCK) { + if (dn->xlockedby == m) { + dout(7) << "dentry " << *dn << " prexlock by me" << endl; + dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name, + new C_MDS_RetryRequest(mds,m,ref)); + } else { + dout(7) << "dentry " << *dn << " prexlock by someone else" << endl; + dn->dir->add_waiter(CDIR_WAIT_DNREAD, dn->name, + new C_MDS_RetryRequest(mds,m,ref)); + } + return false; + } + + + // lockable! + assert(dn->lockstate == DN_LOCK_SYNC || + dn->lockstate == DN_LOCK_UNPINNING); + + // dir auth pinnable? 
+ if (!dn->dir->can_auth_pin()) { + dout(7) << "dentry " << *dn << " dir not pinnable, waiting" << endl; + dn->dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, + new C_MDS_RetryRequest(mds,m,ref)); + return false; + } + + // is dentry path pinned? + if (dn->is_pinned()) { + dout(7) << "dentry " << *dn << " pinned, waiting" << endl; + dn->lockstate = DN_LOCK_UNPINNING; + dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED, + dn->name, + new C_MDS_RetryRequest(mds,m,ref)); + return false; + } + + // pin path up to dentry! (if success, point of no return) + CDentry *pdn = dn->dir->inode->get_parent_dn(); + if (pdn) { + if (mdcache->active_requests[m].traces.count(pdn)) { + dout(7) << "already path pinned parent dentry " << *pdn << endl; + } else { + dout(7) << "pinning parent dentry " << *pdn << endl; + vector trace; + mdcache->make_trace(trace, pdn->inode); + assert(trace.size()); + + if (!mdcache->path_pin(trace, m, new C_MDS_RetryRequest(mds, m, ref))) return false; + + mdcache->active_requests[m].traces[trace[trace.size()-1]] = trace; + } + } + + // pin dir! + dn->dir->auth_pin(); + + // mine! + dn->xlockedby = m; + + if (dn->dir->is_open_by_anyone()) { + dn->lockstate = DN_LOCK_PREXLOCK; + + // xlock with whom? 
+ set who = dn->dir->get_open_by(); + dn->gather_set = who; + + // make path + string path; + dn->make_path(path); + dout(10) << "path is " << path << " for " << *dn << endl; + + for (set::iterator it = who.begin(); + it != who.end(); + it++) { + MLock *m = new MLock(LOCK_AC_LOCK, mds->get_nodeid()); + m->set_dn(dn->dir->ino(), dn->name); + m->set_path(path); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + + // wait + dout(7) << "dentry_xlock_start locking, waiting for replicas " << endl; + dn->dir->add_waiter(CDIR_WAIT_DNLOCK, dn->name, + new C_MDS_RetryRequest(mds, m, ref)); + return false; + } else { + dn->lockstate = DN_LOCK_XLOCK; + mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); + return true; + } +} + +void Locker::dentry_xlock_finish(CDentry *dn, bool quiet) +{ + dout(7) << "dentry_xlock_finish on " << *dn << endl; + + assert(dn->xlockedby); + if (dn->xlockedby == DN_XLOCK_FOREIGN) { + dout(7) << "this was a foreign xlock" << endl; + } else { + // remove from request record + assert(mdcache->active_requests[dn->xlockedby].xlocks.count(dn) == 1); + mdcache->active_requests[dn->xlockedby].xlocks.erase(dn); + } + + dn->xlockedby = 0; + dn->lockstate = DN_LOCK_SYNC; + + // unpin parent dir? + // -> no? because we might have xlocked 2 things in this dir. + // instead, we let request_finish clean up the mess. + + // tell replicas? + if (!quiet) { + // tell even if dn is null. 
+ if (dn->dir->is_open_by_anyone()) { + for (set::iterator it = dn->dir->open_by_begin(); + it != dn->dir->open_by_end(); + it++) { + MLock *m = new MLock(LOCK_AC_SYNC, mds->get_nodeid()); + m->set_dn(dn->dir->ino(), dn->name); + mds->send_message_mds(m, *it, MDS_PORT_LOCKER); + } + } + } + + // unpin dir + dn->dir->auth_unpin(); +} +
+/*
+ * Waiter context for a remote dentry xlock request.
+ *
+ * finish(r) hands r, together with the saved (dir, dname, req, finisher),
+ * to Locker::dentry_xlock_request_finish().  There r == 1 means the xlock
+ * request succeeded; any other value is treated as failure.  The wrapped
+ * finisher is always completed with 0, whatever r was.
+ */
+
+class C_MDC_XlockRequest : public Context {
+  Locker *mdc;
+  CDir *dir;
+  string dname;
+  Message *req;
+  Context *finisher;
+public:
+  C_MDC_XlockRequest(Locker *mdc,
+                     CDir *dir, string& dname,
+                     Message *req,
+                     Context *finisher)
+    : mdc(mdc),
+      dir(dir),
+      dname(dname),
+      req(req),
+      finisher(finisher) {}
+
+  void finish(int r) {
+    mdc->dentry_xlock_request_finish(r, dir, dname, req, finisher);
+  }
+};
+
+/*
+ * Completion of a dentry xlock request (invoked through C_MDC_XlockRequest).
+ *
+ * When r == 1 and the dentry exists and is not xlocked by anyone, record req
+ * as its xlock holder and remember it among req's foreign xlocks.  In every
+ * case the finisher is then run with 0 and deleted.
+ */
+void Locker::dentry_xlock_request_finish(int r,
+                                         CDir *dir, string& dname,
+                                         Message *req,
+                                         Context *finisher)
+{
+  dout(10) << "dentry_xlock_request_finish r = " << r << endl;
+
+  const bool granted = (r == 1);    // 1 for xlock request success
+  if (granted) {
+    CDentry *target = dir->lookup(dname);
+    const bool unclaimed = target && target->xlockedby == 0;
+    if (unclaimed) {
+      // success: our request was the winner
+      target->xlockedby = req;
+      dout(10) << "xlock request success, now xlocked by req " << req << " dn " << *target << endl;
+
+      // remember!
+      mdcache->active_requests[req].foreign_xlocks.insert(target);
+    }
+  }
+
+  // retry request (or whatever)
+  finisher->finish(0);
+  delete finisher;
+}
+
+void Locker::dentry_xlock_request(CDir *dir, string& dname, bool create, + Message *req, Context *onfinish) +{ + dout(10) << "dentry_xlock_request on dn " << dname << " create=" << create << " in " << *dir << endl; + // send request + int dauth = dir->dentry_authority(dname); + MLock *m = new MLock(create ?
LOCK_AC_REQXLOCKC:LOCK_AC_REQXLOCK, mds->get_nodeid()); + m->set_dn(dir->ino(), dname); + mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); + + // add waiter + dir->add_waiter(CDIR_WAIT_DNREQXLOCK, dname, + new C_MDC_XlockRequest(this, + dir, dname, req, + onfinish)); +} + + + + +void Locker::handle_lock_dn(MLock *m) +{ + assert(m->get_otype() == LOCK_OTYPE_DN); + + CInode *diri = mdcache->get_inode(m->get_ino()); // may be null + CDir *dir = 0; + if (diri) dir = diri->dir; // may be null + string dname = m->get_dn(); + int from = m->get_asker(); + CDentry *dn = 0; + + if (LOCK_AC_FOR_AUTH(m->get_action())) { + // auth + + // normally we have it always + if (diri && dir) { + int dauth = dir->dentry_authority(dname); + assert(dauth == mds->get_nodeid() || dir->is_proxy() || // mine or proxy, + m->get_action() == LOCK_AC_REQXLOCKACK || // or we did a REQXLOCK and this is our ack/nak + m->get_action() == LOCK_AC_REQXLOCKNAK); + + if (dir->is_proxy()) { + + assert(dauth >= 0); + + if (dauth == m->get_asker() && + (m->get_action() == LOCK_AC_REQXLOCK || + m->get_action() == LOCK_AC_REQXLOCKC)) { + dout(7) << "handle_lock_dn got reqxlock from " << dauth << " and they are auth.. dropping on floor (their import will have woken them up)" << endl; + if (mdcache->active_requests.count(m)) + mdcache->request_finish(m); + else + delete m; + return; + } + + dout(7) << "handle_lock_dn " << m << " " << m->get_ino() << " dname " << dname << " from " << from << ": proxy, fw to " << dauth << endl; + + // forward + if (mdcache->active_requests.count(m)) { + // xlock requests are requests, use request_* functions! 
+ assert(m->get_action() == LOCK_AC_REQXLOCK || + m->get_action() == LOCK_AC_REQXLOCKC); + // forward as a request + mdcache->request_forward(m, dauth, MDS_PORT_LOCKER); + } else { + // not an xlock req, or it is and we just didn't register the request yet + // forward normally + mds->send_message_mds(m, dauth, MDS_PORT_LOCKER); + } + return; + } + + dn = dir->lookup(dname); + } + + // except with.. an xlock request? + if (!dn) { + assert(dir); // we should still have the dir, though! the requester has the dir open. + switch (m->get_action()) { + + case LOCK_AC_LOCK: + dout(7) << "handle_lock_dn xlock on " << dname << ", adding (null)" << endl; + dn = dir->add_dentry(dname); + break; + + case LOCK_AC_REQXLOCK: + // send nak + if (dir->state_test(CDIR_STATE_DELETED)) { + dout(7) << "handle_lock_dn reqxlock on deleted dir " << *dir << ", nak" << endl; + } else { + dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << " dne, nak" << endl; + } + { + MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); + reply->set_dn(dir->ino(), dname); + reply->set_path(m->get_path()); + mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); + } + + // finish request (if we got that far) + if (mdcache->active_requests.count(m)) + mdcache->request_finish(m); + + delete m; + return; + + case LOCK_AC_REQXLOCKC: + dout(7) << "handle_lock_dn reqxlockc on " << dname << " in " << *dir << " dne (yet!)" << endl; + break; + + default: + assert(0); + } + } + } else { + // replica + if (dir) dn = dir->lookup(dname); + if (!dn) { + dout(7) << "handle_lock_dn " << m << " don't have " << m->get_ino() << " dname " << dname << endl; + + if (m->get_action() == LOCK_AC_REQXLOCKACK || + m->get_action() == LOCK_AC_REQXLOCKNAK) { + dout(7) << "handle_lock_dn got reqxlockack/nak, but don't have dn " << m->get_path() << ", discovering" << endl; + //assert(0); // how can this happen? tell me now! 
+ + vector trace; + filepath path = m->get_path(); + int r = mdcache->path_traverse(path, trace, true, + m, new C_MDS_RetryMessage(mds,m), + MDS_TRAVERSE_DISCOVER); + assert(r>0); + return; + } + + if (m->get_action() == LOCK_AC_LOCK) { + if (0) { // not anymore + dout(7) << "handle_lock_dn don't have " << m->get_path() << ", discovering" << endl; + + vector trace; + filepath path = m->get_path(); + int r = mdcache->path_traverse(path, trace, true, + m, new C_MDS_RetryMessage(mds,m), + MDS_TRAVERSE_DISCOVER); + assert(r>0); + } + if (1) { + // NAK + MLock *reply = new MLock(LOCK_AC_LOCKNAK, mds->get_nodeid()); + reply->set_dn(m->get_ino(), dname); + mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); + } + } else { + dout(7) << "safely ignoring." << endl; + delete m; + } + return; + } + + assert(dn); + } + + if (dn) { + dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << *dn << endl; + } else { + dout(7) << "handle_lock_dn a=" << m->get_action() << " from " << from << " " << dname << " in " << *dir << endl; + } + + switch (m->get_action()) { + // -- replica -- + case LOCK_AC_LOCK: + assert(dn->lockstate == DN_LOCK_SYNC || + dn->lockstate == DN_LOCK_UNPINNING || + dn->lockstate == DN_LOCK_XLOCK); // <-- bc the handle_lock_dn did the discover! + + if (dn->is_pinned()) { + dn->lockstate = DN_LOCK_UNPINNING; + + // wait + dout(7) << "dn pinned, waiting " << *dn << endl; + dn->dir->add_waiter(CDIR_WAIT_DNUNPINNED, + dn->name, + new C_MDS_RetryMessage(mds, m)); + return; + } else { + dn->lockstate = DN_LOCK_XLOCK; + dn->xlockedby = 0; + + // ack now + MLock *reply = new MLock(LOCK_AC_LOCKACK, mds->get_nodeid()); + reply->set_dn(diri->ino(), dname); + mds->send_message_mds(reply, from, MDS_PORT_LOCKER); + } + + // wake up waiters + dir->finish_waiting(CDIR_WAIT_DNLOCK, dname); // ? will this happen on replica ? 
+ break; + + case LOCK_AC_SYNC: + assert(dn->lockstate == DN_LOCK_XLOCK); + dn->lockstate = DN_LOCK_SYNC; + dn->xlockedby = 0; + + // null? hose it. + if (dn->is_null()) { + dout(7) << "hosing null (and now sync) dentry " << *dn << endl; + dir->remove_dentry(dn); + } + + // wake up waiters + dir->finish_waiting(CDIR_WAIT_DNREAD, dname); // will this happen either? YES: if a rename lock backs out + break; + + case LOCK_AC_REQXLOCKACK: + case LOCK_AC_REQXLOCKNAK: + { + dout(10) << "handle_lock_dn got ack/nak on a reqxlock for " << *dn << endl; + list finished; + dir->take_waiting(CDIR_WAIT_DNREQXLOCK, m->get_dn(), finished, 1); // TAKE ONE ONLY! + finish_contexts(finished, + (m->get_action() == LOCK_AC_REQXLOCKACK) ? 1:-1); + } + break; + + + // -- auth -- + case LOCK_AC_LOCKACK: + case LOCK_AC_LOCKNAK: + assert(dn->gather_set.count(from) == 1); + dn->gather_set.erase(from); + if (dn->gather_set.size() == 0) { + dout(7) << "handle_lock_dn finish gather, now xlock on " << *dn << endl; + dn->lockstate = DN_LOCK_XLOCK; + mdcache->active_requests[dn->xlockedby].xlocks.insert(dn); + dir->finish_waiting(CDIR_WAIT_DNLOCK, dname); + } + break; + + + case LOCK_AC_REQXLOCKC: + // make sure it's a _file_, if it exists. + if (dn && dn->inode && dn->inode->is_dir()) { + dout(7) << "handle_lock_dn failing, reqxlockc on dir " << *dn->inode << endl; + + // nak + string path; + dn->make_path(path); + + MLock *reply = new MLock(LOCK_AC_REQXLOCKNAK, mds->get_nodeid()); + reply->set_dn(dir->ino(), dname); + reply->set_path(path); + mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); + + // done + if (mdcache->active_requests.count(m)) + mdcache->request_finish(m); + else + delete m; + return; + } + + case LOCK_AC_REQXLOCK: + if (dn) { + dout(7) << "handle_lock_dn reqxlock on " << *dn << endl; + } else { + dout(7) << "handle_lock_dn reqxlock on " << dname << " in " << *dir << endl; + } + + + // start request? 
+ if (!mdcache->active_requests.count(m)) { + vector trace; + if (!mdcache->request_start(m, dir->inode, trace)) + return; // waiting for pin + } + + // try to xlock! + if (!dn) { + assert(m->get_action() == LOCK_AC_REQXLOCKC); + dn = dir->add_dentry(dname); + } + + if (dn->xlockedby != m) { + if (!dentry_xlock_start(dn, m, dir->inode)) { + // hose null dn if we're waiting on something + if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn); + return; // waiting for xlock + } + } else { + // successfully xlocked! on behalf of requestor. + string path; + dn->make_path(path); + + dout(7) << "handle_lock_dn reqxlock success for " << m->get_asker() << " on " << *dn << ", acking" << endl; + + // ACK xlock request + MLock *reply = new MLock(LOCK_AC_REQXLOCKACK, mds->get_nodeid()); + reply->set_dn(dir->ino(), dname); + reply->set_path(path); + mds->send_message_mds(reply, m->get_asker(), MDS_PORT_LOCKER); + + // note: keep request around in memory (to hold the xlock/pins on behalf of requester) + return; + } + break; + + case LOCK_AC_UNXLOCK: + dout(7) << "handle_lock_dn unxlock on " << *dn << endl; + { + string dname = dn->name; + Message *m = dn->xlockedby; + + // finish request + mdcache->request_finish(m); // this will drop the locks (and unpin paths!) + return; + } + break; + + default: + assert(0); + } + + delete m; +} + + + + + + + diff --git a/branches/sage/cephmds2/mds/Locker.h b/branches/sage/cephmds2/mds/Locker.h new file mode 100644 index 0000000000000..20b5a17896610 --- /dev/null +++ b/branches/sage/cephmds2/mds/Locker.h @@ -0,0 +1,123 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MDS_LOCKER_H +#define __MDS_LOCKER_H + +#include "include/types.h" + +#include +#include +#include +using std::map; +using std::list; +using std::set; + +class MDS; +class CDir; +class CInode; +class CDentry; + +class Message; + +class MDiscover; +class MDiscoverReply; +class MCacheExpire; +class MDirUpdate; +class MDentryUnlink; +class MLock; + +class MClientRequest; + + +class Anchor; +class Capability; + + +class Locker { +private: + MDS *mds; + MDCache *mdcache; + + public: + Locker(MDS *m, MDCache *c) : mds(m), mdcache(c) {} + + void dispatch(Message *m); + + // -- locks -- + // high level interface + public: + bool inode_hard_read_try(CInode *in, Context *con); + bool inode_hard_read_start(CInode *in, MClientRequest *m); + void inode_hard_read_finish(CInode *in); + bool inode_hard_write_start(CInode *in, MClientRequest *m); + void inode_hard_write_finish(CInode *in); + bool inode_file_read_start(CInode *in, MClientRequest *m); + void inode_file_read_finish(CInode *in); + bool inode_file_write_start(CInode *in, MClientRequest *m); + void inode_file_write_finish(CInode *in); + + void inode_hard_eval(CInode *in); + void inode_file_eval(CInode *in); + + protected: + void inode_hard_mode(CInode *in, int mode); + void inode_file_mode(CInode *in, int mode); + + // low level triggers + void inode_hard_sync(CInode *in); + void inode_hard_lock(CInode *in); + bool inode_file_sync(CInode *in); + void inode_file_lock(CInode *in); + void inode_file_mixed(CInode *in); + void inode_file_loner(CInode *in); + + // messengers + void handle_lock(MLock *m); + void handle_lock_inode_hard(MLock *m); + void handle_lock_inode_file(MLock *m); + + // -- file i/o -- + public: + version_t issue_file_data_version(CInode *in); + Capability* issue_new_caps(CInode *in, int mode, MClientRequest *req); + bool issue_caps(CInode *in); + + protected: + void handle_client_file_caps(class MClientFileCaps *m); + + void request_inode_file_caps(CInode *in); + void 
handle_inode_file_caps(class MInodeFileCaps *m); + + + // dirs + void handle_lock_dir(MLock *m); + + // dentry locks + public: + bool dentry_xlock_start(CDentry *dn, + Message *m, CInode *ref); + void dentry_xlock_finish(CDentry *dn, bool quiet=false); + void handle_lock_dn(MLock *m); + void dentry_xlock_request(CDir *dir, string& dname, bool create, + Message *req, Context *onfinish); + void dentry_xlock_request_finish(int r, + CDir *dir, string& dname, + Message *req, + Context *finisher); + + +}; + + +#endif diff --git a/branches/sage/cephmds2/mds/LogEvent.cc b/branches/sage/cephmds2/mds/LogEvent.cc new file mode 100644 index 0000000000000..5b15f487d77ab --- /dev/null +++ b/branches/sage/cephmds2/mds/LogEvent.cc @@ -0,0 +1,86 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "LogEvent.h" + +#include "MDS.h" + +// events i know of +#include "events/EString.h" +#include "events/EInodeUpdate.h" +#include "events/EDirUpdate.h" +#include "events/EUnlink.h" +#include "events/EAlloc.h" +#include "events/EMknod.h" +#include "events/EMkdir.h" +#include "events/EPurgeFinish.h" + +LogEvent *LogEvent::decode(bufferlist& bl) +{ + // parse type, length + int off = 0; + int type; + bl.copy(off, sizeof(type), (char*)&type); + off += sizeof(type); + + int length = bl.length() - off; + dout(15) << "decode_log_event type " << type << ", size " << length << endl; + + assert(type > 0); + + // create event + LogEvent *le; + switch (type) { + case EVENT_STRING: // string + le = new EString(); + break; + + case EVENT_INODEUPDATE: + le = new EInodeUpdate(); + break; + + case EVENT_DIRUPDATE: + le = new EDirUpdate(); + break; + + case EVENT_UNLINK: + le = new EUnlink(); + break; + + case EVENT_PURGEFINISH: + le = new EPurgeFinish(); + break; + + case EVENT_ALLOC: + le = new EAlloc(); + break; + + case EVENT_MKNOD: + le = new EMknod(); + break; + + case EVENT_MKDIR: + le = new EMkdir(); + break; + + default: + dout(1) << "uh oh, unknown event type " << type << endl; + assert(0); + } + + // decode + le->decode_payload(bl, off); + + return le; +} + diff --git a/branches/sage/cephmds2/mds/LogEvent.h b/branches/sage/cephmds2/mds/LogEvent.h new file mode 100644 index 0000000000000..0de268252036a --- /dev/null +++ b/branches/sage/cephmds2/mds/LogEvent.h @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#ifndef __LOGEVENT_H
+#define __LOGEVENT_H
+
+// journal event type tags (stored as the leading int of an encoded event)
+#define EVENT_STRING 1
+
+#define EVENT_INODEUPDATE 2
+#define EVENT_DIRUPDATE 3
+
+#define EVENT_ALLOC 10
+#define EVENT_MKNOD 11
+#define EVENT_MKDIR 12
+#define EVENT_LINK 13
+
+#define EVENT_UNLINK 20
+#define EVENT_RMDIR 21
+#define EVENT_PURGEFINISH 22
+
+
+// NOTE(review): the header name on the next #include was lost in extraction
+// (this file uses ostream) -- restore it from the original tree.
+#include
+using namespace std;
+
+#include "include/buffer.h"
+#include "include/Context.h"
+
+class MDS;
+
+// generic log event
+//
+// Base class for journal entries.  Subclasses implement payload
+// encode/decode; the virtual hooks below have defaults that treat the event
+// as already committed and already applied.
+class LogEvent {
+ private:
+  int _type;        // EVENT_* tag given to the constructor
+  off_t _end_off;   // initialized to 0 here; MDLog is a friend and may touch it
+  friend class MDLog;
+
+ public:
+  LogEvent(int t) : _type(t), _end_off(0) { }
+  virtual ~LogEvent() { }
+
+  // encoding
+  virtual void encode_payload(bufferlist& bl) = 0;
+  virtual void decode_payload(bufferlist& bl, int& off) = 0;
+  static LogEvent *decode(bufferlist &bl);
+
+
+  // default printer: just the numeric type
+  virtual void print(ostream& out) {
+    out << "event(" << _type << ")";
+  }
+
+
+  /*** live journal ***/
+
+  /* can_expire() - is this entry committed to primary store, such that
+   * we can expire it from the journal?
+   */
+  virtual bool can_expire(MDS *m) {
+    return true;
+  }
+
+  /* retire() - prod MDS into committing the relevant state so that this
+   * entry can be expired from the journal.  The default has nothing to
+   * commit, so it completes the context immediately and deletes it.
+   */
+  virtual void retire(MDS *m, Context *c) {
+    c->finish(0);
+    delete c;
+  }
+
+
+  /*** recovery ***/
+
+  /* has_happened() - true if this event has already been applied.
+   */
+  virtual bool has_happened(MDS *m) { return true; }
+
+  /* replay() - replay given event
+   * (the default asserts: subclasses that can be replayed must override)
+   */
+  virtual void replay(MDS *m) { assert(0); }
+
+};
+
+// stream a LogEvent through its virtual print()
+inline ostream& operator<<(ostream& out, LogEvent& le) {
+  le.print(out);
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/mds/MDBalancer.cc b/branches/sage/cephmds2/mds/MDBalancer.cc
new file mode 100644
index 0000000000000..0b497103183b2
--- /dev/null
+++ b/branches/sage/cephmds2/mds/MDBalancer.cc
@@ -0,0 +1,902 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "mdstypes.h"
+
+#include "MDBalancer.h"
+#include "MDS.h"
+#include "MDSMap.h"
+#include "CInode.h"
+#include "CDir.h"
+#include "MDCache.h"
+#include "Migrator.h"
+
+#include "include/Context.h"
+#include "msg/Messenger.h"
+#include "messages/MHeartbeat.h"
+
+// NOTE(review): the header names on the next two #include lines were lost in
+// extraction -- restore them from the original tree.
+#include
+#include
+using namespace std;
+
+#include "config.h"
+#undef dout
+// debug output: prints when level l is within either the global or the
+// balancer debug level; prefixes the mds id and time since logger start
+#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds_balancer) cout << "mds" << mds->get_nodeid() << ".bal " << (g_clock.recent_now() - mds->logger->get_start()) << " "
+
+#define MIN_LOAD 50 // ??
+#define MIN_REEXPORT 5 // will automatically reexport +#define MIN_OFFLOAD 10 // point at which i stop trying, close enough + + +int MDBalancer::proc_message(Message *m) +{ + switch (m->get_type()) { + + case MSG_MDS_HEARTBEAT: + handle_heartbeat((MHeartbeat*)m); + break; + + default: + dout(1) << " balancer unknown message " << m->get_type() << endl; + assert(0); + break; + } + + return 0; +} + + +class C_Bal_SendHeartbeat : public Context { +public: + MDS *mds; + C_Bal_SendHeartbeat(MDS *mds) { + this->mds = mds; + } + virtual void finish(int f) { + mds->balancer->send_heartbeat(); + } +}; + +mds_load_t MDBalancer::get_load() +{ + mds_load_t load; + if (mds->mdcache->get_root()) + load.root = + mds->mdcache->get_root()->popularity[MDS_POP_ANYDOM]; + // + + // mds->mdcache->get_root()->popularity[MDS_POP_NESTED]; + + load.req_rate = mds->get_req_rate(); + load.queue_len = mds->messenger->get_dispatch_queue_len(); + return load; +} + +void MDBalancer::send_heartbeat() +{ + if (!mds->mdcache->get_root()) { + dout(5) << "no root on send_heartbeat" << endl; + mds->mdcache->open_root(new C_Bal_SendHeartbeat(mds)); + return; + } + + mds_load.clear(); + if (mds->get_nodeid() == 0) + beat_epoch++; + + // load + mds_load_t load = get_load(); + mds_load[ mds->get_nodeid() ] = load; + + // import_map + map import_map; + + for (set::iterator it = mds->mdcache->imports.begin(); + it != mds->mdcache->imports.end(); + it++) { + CDir *im = *it; + if (im->inode->is_root()) continue; + int from = im->inode->authority(); + import_map[from] += im->popularity[MDS_POP_CURDOM].meta_load(); + } + mds_import_map[ mds->get_nodeid() ] = import_map; + + + dout(5) << "mds" << mds->get_nodeid() << " sending heartbeat " << beat_epoch << " " << load << endl; + for (map::iterator it = import_map.begin(); + it != import_map.end(); + it++) { + dout(5) << " import_map from " << it->first << " -> " << it->second << endl; + } + + + int size = mds->get_mds_map()->get_num_mds(); + for (int i = 0; 
iget_nodeid()) continue; + MHeartbeat *hb = new MHeartbeat(load, beat_epoch); + hb->get_import_map() = import_map; + mds->messenger->send_message(hb, + MSG_ADDR_MDS(i), MDS_PORT_BALANCER, + MDS_PORT_BALANCER); + } +} + +void MDBalancer::handle_heartbeat(MHeartbeat *m) +{ + dout(25) << "=== got heartbeat " << m->get_beat() << " from " << MSG_ADDR_NICE(m->get_source()) << " " << m->get_load() << endl; + + if (!mds->mdcache->get_root()) { + dout(10) << "no root on handle" << endl; + mds->mdcache->open_root(new C_MDS_RetryMessage(mds, m)); + return; + } + + int who = MSG_ADDR_NUM(m->get_source()); + + if (who == 0) { + dout(20) << " from mds0, new epoch" << endl; + beat_epoch = m->get_beat(); + send_heartbeat(); + + show_imports(); + } + + mds_load[ who ] = m->get_load(); + mds_import_map[ who ] = m->get_import_map(); + + //cout << " load is " << load << " have " << mds_load.size() << endl; + + unsigned cluster_size = mds->get_mds_map()->get_num_mds(); + if (mds_load.size() == cluster_size) { + // let's go! + //export_empties(); // no! 
+ do_rebalance(m->get_beat()); + } + + // done + delete m; +} + + +void MDBalancer::export_empties() +{ + dout(5) << "export_empties checking for empty imports" << endl; + + for (set::iterator it = mds->mdcache->imports.begin(); + it != mds->mdcache->imports.end(); + it++) { + CDir *dir = *it; + + if (!dir->inode->is_root() && dir->get_size() == 0) + mds->mdcache->migrator->export_empty_import(dir); + } +} + + + +double MDBalancer::try_match(int ex, double& maxex, + int im, double& maxim) +{ + if (maxex <= 0 || maxim <= 0) return 0.0; + + double howmuch = MIN(maxex, maxim); + if (howmuch <= 0) return 0.0; + + dout(5) << " - mds" << ex << " exports " << howmuch << " to mds" << im << endl; + + if (ex == mds->get_nodeid()) + my_targets[im] += howmuch; + + exported[ex] += howmuch; + imported[im] += howmuch; + + maxex -= howmuch; + maxim -= howmuch; + + return howmuch; +} + + + +void MDBalancer::do_hashing() +{ + if (hash_queue.empty()) { + dout(20) << "do_hashing has nothing to do" << endl; + return; + } + + dout(0) << "do_hashing " << hash_queue.size() << " dirs marked for possible hashing" << endl; + + for (set::iterator i = hash_queue.begin(); + i != hash_queue.end(); + i++) { + inodeno_t dirino = *i; + CInode *in = mds->mdcache->get_inode(dirino); + if (!in) continue; + CDir *dir = in->dir; + if (!dir) continue; + if (!dir->is_auth()) continue; + + dout(0) << "do_hashing hashing " << *dir << endl; + mds->mdcache->migrator->hash_dir(dir); + } + hash_queue.clear(); +} + + + +void MDBalancer::do_rebalance(int beat) +{ + int cluster_size = mds->get_mds_map()->get_num_mds(); + int whoami = mds->get_nodeid(); + + // reset + my_targets.clear(); + imported.clear(); + exported.clear(); + + dout(5) << " do_rebalance: cluster loads are" << endl; + + // rescale! 
turn my mds_load back into meta_load units + double load_fac = 1.0; + if (mds_load[whoami].mds_load() > 0) { + load_fac = mds_load[whoami].root.meta_load() / mds_load[whoami].mds_load(); + dout(7) << " load_fac is " << load_fac + << " <- " << mds_load[whoami].root.meta_load() << " / " << mds_load[whoami].mds_load() + << endl; + } + + double total_load = 0; + multimap load_map; + for (int i=0; i " << l << endl; + + if (whoami == i) my_load = l; + total_load += l; + + load_map.insert(pair( l, i )); + } + + // target load + target_load = total_load / (double)cluster_size; + dout(5) << "do_rebalance: my load " << my_load + << " target " << target_load + << " total " << total_load + << endl; + + // under or over? + if (my_load < target_load) { + dout(5) << " i am underloaded, doing nothing." << endl; + show_imports(); + return; + } + + dout(5) << " i am overloaded" << endl; + + + // first separate exporters and importers + multimap importers; + multimap exporters; + set importer_set; + set exporter_set; + + for (multimap::iterator it = load_map.begin(); + it != load_map.end(); + it++) { + if (it->first < target_load) { + dout(15) << " mds" << it->second << " is importer" << endl; + importers.insert(pair(it->first,it->second)); + importer_set.insert(it->second); + } else { + dout(15) << " mds" << it->second << " is exporter" << endl; + exporters.insert(pair(it->first,it->second)); + exporter_set.insert(it->second); + } + } + + + // determine load transfer mapping + + if (true) { + // analyze import_map; do any matches i can + + dout(5) << " matching exporters to import sources" << endl; + + // big -> small exporters + for (multimap::reverse_iterator ex = exporters.rbegin(); + ex != exporters.rend(); + ex++) { + double maxex = get_maxex(ex->second); + if (maxex <= .001) continue; + + // check importers. for now, just in arbitrary order (no intelligent matching). 
+ for (map::iterator im = mds_import_map[ex->second].begin(); + im != mds_import_map[ex->second].end(); + im++) { + double maxim = get_maxim(im->first); + if (maxim <= .001) continue; + try_match(ex->second, maxex, + im->first, maxim); + if (maxex <= .001) break;; + } + } + } + + + if (1) { + if (beat % 2 == 1) { + // old way + dout(5) << " matching big exporters to big importers" << endl; + // big exporters to big importers + multimap::reverse_iterator ex = exporters.rbegin(); + multimap::iterator im = importers.begin(); + while (ex != exporters.rend() && + im != importers.end()) { + double maxex = get_maxex(ex->second); + double maxim = get_maxim(im->second); + if (maxex < .001 || maxim < .001) break; + try_match(ex->second, maxex, + im->second, maxim); + if (maxex <= .001) ex++; + if (maxim <= .001) im++; + } + } else { + // new way + dout(5) << " matching small exporters to big importers" << endl; + // small exporters to big importers + multimap::iterator ex = exporters.begin(); + multimap::iterator im = importers.begin(); + while (ex != exporters.end() && + im != importers.end()) { + double maxex = get_maxex(ex->second); + double maxim = get_maxim(im->second); + if (maxex < .001 || maxim < .001) break; + try_match(ex->second, maxex, + im->second, maxim); + if (maxex <= .001) ex++; + if (maxim <= .001) im++; + } + } + } + + + + // make a sorted list of my imports + map import_pop_map; + multimap import_from_map; + for (set::iterator it = mds->mdcache->imports.begin(); + it != mds->mdcache->imports.end(); + it++) { + if ((*it)->is_hashed()) continue; + double pop = (*it)->popularity[MDS_POP_CURDOM].meta_load(); + if (pop < g_conf.mds_bal_idle_threshold && + (*it)->inode != mds->mdcache->get_root()) { + dout(-5) << " exporting idle import " << **it + << " back to mds" << (*it)->inode->authority() + << endl; + mds->mdcache->migrator->export_dir(*it, (*it)->inode->authority()); + continue; + } + import_pop_map[ pop ] = *it; + int from = (*it)->inode->authority(); + 
dout(15) << " map: i imported " << **it << " from " << from << endl; + import_from_map.insert(pair(from, *it)); + } + + + + // do my exports! + set already_exporting; + double total_sent = 0; + double total_goal = 0; + + for (map::iterator it = my_targets.begin(); + it != my_targets.end(); + it++) { + + /* + double fac = 1.0; + if (false && total_goal > 0 && total_sent > 0) { + fac = total_goal / total_sent; + dout(-5) << " total sent is " << total_sent << " / " << total_goal << " -> fac 1/ " << fac << endl; + if (fac > 1.0) fac = 1.0; + } + fac = .9 - .4 * ((float)g_conf.num_mds / 128.0); // hack magic fixme + */ + + int target = (*it).first; + double amount = (*it).second;// * load_fac; + total_goal += amount; + + if (amount < MIN_OFFLOAD) continue; + + dout(-5) << " sending " << amount << " to mds" << target + //<< " .. " << (*it).second << " * " << load_fac + << " -> " << amount + << endl;//" .. fudge is " << fudge << endl; + double have = 0; + + show_imports(); + + // search imports from target + if (import_from_map.count(target)) { + dout(5) << " aha, looking through imports from target mds" << target << endl; + pair::iterator, multimap::iterator> p = + import_from_map.equal_range(target); + while (p.first != p.second) { + CDir *dir = (*p.first).second; + dout(5) << "considering " << *dir << " from " << (*p.first).first << endl; + multimap::iterator plast = p.first++; + + if (dir->inode->is_root()) continue; + if (dir->is_hashed()) continue; + if (dir->is_freezing() || dir->is_frozen()) continue; // export pbly already in progress + double pop = dir->popularity[MDS_POP_CURDOM].meta_load(); + assert(dir->inode->authority() == target); // cuz that's how i put it in the map, dummy + + if (pop <= amount-have) { + dout(-5) << "reexporting " << *dir + << " pop " << pop + << " back to mds" << target << endl; + mds->mdcache->migrator->export_dir(dir, target); + have += pop; + import_from_map.erase(plast); + import_pop_map.erase(pop); + } else { + dout(5) << "can't 
reexport " << *dir << ", too big " << pop << endl; + } + if (amount-have < MIN_OFFLOAD) break; + } + } + if (amount-have < MIN_OFFLOAD) { + total_sent += have; + continue; + } + + // any other imports + if (false) + for (map::iterator import = import_pop_map.begin(); + import != import_pop_map.end(); + import++) { + CDir *imp = (*import).second; + if (imp->inode->is_root()) continue; + + double pop = (*import).first; + if (pop < amount-have || pop < MIN_REEXPORT) { + dout(-5) << "reexporting " << *imp + << " pop " << pop + << " back to mds" << imp->inode->authority() + << endl; + have += pop; + mds->mdcache->migrator->export_dir(imp, imp->inode->authority()); + } + if (amount-have < MIN_OFFLOAD) break; + } + if (amount-have < MIN_OFFLOAD) { + //fudge = amount-have; + total_sent += have; + continue; + } + + // okay, search for fragments of my workload + set candidates = mds->mdcache->imports; + + list exports; + + for (set::iterator pot = candidates.begin(); + pot != candidates.end(); + pot++) { + find_exports(*pot, amount, exports, have, already_exporting); + if (have > amount-MIN_OFFLOAD) { + break; + } + } + //fudge = amount - have; + total_sent += have; + + for (list::iterator it = exports.begin(); it != exports.end(); it++) { + dout(-5) << " exporting to mds" << target + << " fragment " << **it + << " pop " << (*it)->popularity[MDS_POP_CURDOM].meta_load() + << endl; + mds->mdcache->migrator->export_dir(*it, target); + } + } + + dout(5) << "rebalance done" << endl; + show_imports(); + +} + + + +void MDBalancer::find_exports(CDir *dir, + double amount, + list& exports, + double& have, + set& already_exporting) +{ + double need = amount - have; + if (need < amount * g_conf.mds_bal_min_start) + return; // good enough! 
+ double needmax = need * g_conf.mds_bal_need_max; + double needmin = need * g_conf.mds_bal_need_min; + double midchunk = need * g_conf.mds_bal_midchunk; + double minchunk = need * g_conf.mds_bal_minchunk; + + list bigger; + multimap smaller; + + double dir_pop = dir->popularity[MDS_POP_CURDOM].meta_load(); + double dir_sum = 0; + dout(-7) << " find_exports in " << dir_pop << " " << *dir << " need " << need << " (" << needmin << " - " << needmax << ")" << endl; + + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CInode *in = it->second->get_inode(); + if (!in) continue; + if (!in->is_dir()) continue; + if (!in->dir) continue; // clearly not popular + + if (in->dir->is_export()) continue; + if (in->dir->is_hashed()) continue; + if (already_exporting.count(in->dir)) continue; + + if (in->dir->is_frozen()) continue; // can't export this right now! + //if (in->dir->get_size() == 0) continue; // don't export empty dirs, even if they're not complete. for now! + + // how popular? + double pop = in->dir->popularity[MDS_POP_CURDOM].meta_load(); + dir_sum += pop; + dout(20) << " pop " << pop << " " << *in->dir << endl; + + if (pop < minchunk) continue; + + // lucky find? + if (pop > needmin && pop < needmax) { + exports.push_back(in->dir); + have += pop; + return; + } + + if (pop > need) + bigger.push_back(in->dir); + else + smaller.insert(pair(pop, in->dir)); + } + dout(7) << " .. 
sum " << dir_sum << " / " << dir_pop << endl; + + // grab some sufficiently big small items + multimap::reverse_iterator it; + for (it = smaller.rbegin(); + it != smaller.rend(); + it++) { + + if ((*it).first < midchunk) + break; // try later + + dout(7) << " taking smaller " << *(*it).second << endl; + + exports.push_back((*it).second); + already_exporting.insert((*it).second); + have += (*it).first; + if (have > needmin) + return; + } + + // apprently not enough; drill deeper into the hierarchy (if non-replicated) + for (list::iterator it = bigger.begin(); + it != bigger.end(); + it++) { + if ((*it)->is_rep()) continue; + dout(7) << " descending into " << **it << endl; + find_exports(*it, amount, exports, have, already_exporting); + if (have > needmin) + return; + } + + // ok fine, use smaller bits + for (; + it != smaller.rend(); + it++) { + + dout(7) << " taking (much) smaller " << it->first << " " << *(*it).second << endl; + + exports.push_back((*it).second); + already_exporting.insert((*it).second); + have += (*it).first; + if (have > needmin) + return; + } + + // ok fine, drill inot replicated dirs + for (list::iterator it = bigger.begin(); + it != bigger.end(); + it++) { + if (!(*it)->is_rep()) continue; + dout(7) << " descending into replicated " << **it << endl; + find_exports(*it, amount, exports, have, already_exporting); + if (have > needmin) + return; + } + +} + + + + +void MDBalancer::hit_inode(CInode *in, int type) +{ + // hit me + in->popularity[MDS_POP_JUSTME].pop[type].hit(); + in->popularity[MDS_POP_NESTED].pop[type].hit(); + if (in->is_auth()) { + in->popularity[MDS_POP_CURDOM].pop[type].hit(); + in->popularity[MDS_POP_ANYDOM].pop[type].hit(); + } + + // hit auth up to import + CDir *dir = in->get_parent_dir(); + if (dir) hit_dir(dir, type); +} + + +void MDBalancer::hit_dir(CDir *dir, int type) +{ + // hit me + float v = dir->popularity[MDS_POP_JUSTME].pop[type].hit(); + + // hit modify counter, if this was a modify + if (g_conf.num_mds > 2 && 
// FIXME >2 thing + !dir->inode->is_root() && // not root (for now at least) + dir->is_auth()) { + //dout(-20) << "hit_dir " << type << " pop is " << v << " " << *dir << endl; + + // hash this dir? (later?) + if (((v > g_conf.mds_bal_hash_rd && type == META_POP_IRD) || + //(v > g_conf.mds_bal_hash_wr && type == META_POP_IWR) || + (v > g_conf.mds_bal_hash_wr && type == META_POP_DWR)) && + !(dir->is_hashed() || dir->is_hashing()) && + hash_queue.count(dir->ino()) == 0) { + dout(0) << "hit_dir " << type << " pop is " << v << ", putting in hash_queue: " << *dir << endl; + hash_queue.insert(dir->ino()); + } + + } + + hit_recursive(dir, type); +} + + + +void MDBalancer::hit_recursive(CDir *dir, int type) +{ + bool anydom = dir->is_auth(); + bool curdom = dir->is_auth(); + + float rd_adj = 0.0; + + // replicate? + float dir_pop = dir->popularity[MDS_POP_CURDOM].pop[type].get(); // hmm?? + + if (dir->is_auth()) { + if (!dir->is_rep() && + dir_pop >= g_conf.mds_bal_replicate_threshold) { + // replicate + float rdp = dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].get(); + rd_adj = rdp / mds->get_mds_map()->get_num_mds() - rdp; + rd_adj /= 2.0; // temper somewhat + + dout(1) << "replicating dir " << *dir << " pop " << dir_pop << " .. 
rdp " << rdp << " adj " << rd_adj << endl; + + dir->dir_rep = CDIR_REP_ALL; + mds->mdcache->send_dir_updates(dir, true); + + dir->popularity[MDS_POP_JUSTME].pop[META_POP_IRD].adjust(rd_adj); + dir->popularity[MDS_POP_CURDOM].pop[META_POP_IRD].adjust(rd_adj); + } + + if (!dir->ino() != 1 && + dir->is_rep() && + dir_pop < g_conf.mds_bal_unreplicate_threshold) { + // unreplicate + dout(1) << "unreplicating dir " << *dir << " pop " << dir_pop << endl; + + dir->dir_rep = CDIR_REP_NONE; + mds->mdcache->send_dir_updates(dir); + } + } + + + while (dir) { + CInode *in = dir->inode; + + dir->popularity[MDS_POP_NESTED].pop[type].hit(); + in->popularity[MDS_POP_NESTED].pop[type].hit(); + + if (rd_adj != 0.0) dir->popularity[MDS_POP_NESTED].pop[META_POP_IRD].adjust(rd_adj); + + if (anydom) { + dir->popularity[MDS_POP_ANYDOM].pop[type].hit(); + in->popularity[MDS_POP_ANYDOM].pop[type].hit(); + } + + if (curdom) { + dir->popularity[MDS_POP_CURDOM].pop[type].hit(); + in->popularity[MDS_POP_CURDOM].pop[type].hit(); + } + + if (dir->is_import()) + curdom = false; // end of auth domain, stop hitting auth counters. 
+ dir = dir->inode->get_parent_dir(); + } +} + + +/* + * subtract off an exported chunk + */ +void MDBalancer::subtract_export(CDir *dir) +{ + meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; + + bool in_domain = !dir->is_import(); + + while (true) { + CInode *in = dir->inode; + + in->popularity[MDS_POP_ANYDOM] -= curdom; + if (in_domain) in->popularity[MDS_POP_CURDOM] -= curdom; + + dir = in->get_parent_dir(); + if (!dir) break; + + if (dir->is_import()) in_domain = false; + + dir->popularity[MDS_POP_ANYDOM] -= curdom; + if (in_domain) dir->popularity[MDS_POP_CURDOM] -= curdom; + } +} + + +void MDBalancer::add_import(CDir *dir) +{ + meta_load_t curdom = dir->popularity[MDS_POP_CURDOM]; + + bool in_domain = !dir->is_import(); + + while (true) { + CInode *in = dir->inode; + + in->popularity[MDS_POP_ANYDOM] += curdom; + if (in_domain) in->popularity[MDS_POP_CURDOM] += curdom; + + dir = in->get_parent_dir(); + if (!dir) break; + + if (dir->is_import()) in_domain = false; + + dir->popularity[MDS_POP_ANYDOM] += curdom; + if (in_domain) dir->popularity[MDS_POP_CURDOM] += curdom; + } + +} + + + + + + +void MDBalancer::show_imports(bool external) +{ + int db = 20; //debug level + return; + + if (mds->mdcache->imports.empty() && + mds->mdcache->hashdirs.empty()) { + dout(db) << "no imports/exports/hashdirs" << endl; + return; + } + dout(db) << "imports/exports/hashdirs:" << endl; + + set ecopy = mds->mdcache->exports; + + set::iterator it = mds->mdcache->hashdirs.begin(); + while (1) { + if (it == mds->mdcache->hashdirs.end()) it = mds->mdcache->imports.begin(); + if (it == mds->mdcache->imports.end() ) break; + + CDir *im = *it; + + if (im->is_import()) { + dout(db) << " + import (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; + assert( im->is_auth() ); + } + else if (im->is_hashed()) { + if (im->is_import()) continue; // if import AND hash, list as import. 
+ dout(db) << " + hash (" << im->popularity[MDS_POP_CURDOM] << "/" << im->popularity[MDS_POP_ANYDOM] << ") " << *im << endl; + } + + for (set::iterator p = mds->mdcache->nested_exports[im].begin(); + p != mds->mdcache->nested_exports[im].end(); + p++) { + CDir *exp = *p; + if (exp->is_hashed()) { + //assert(0); // we don't do it this way actually + dout(db) << " - hash (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; + assert( !exp->is_auth() ); + } else { + dout(db) << " - ex (" << exp->popularity[MDS_POP_NESTED] << ", " << exp->popularity[MDS_POP_ANYDOM] << ") " << *exp << " to " << exp->dir_auth << endl; + assert( exp->is_export() ); + assert( !exp->is_auth() ); + } + + if ( mds->mdcache->get_auth_container(exp) != im ) { + dout(1) << "uh oh, auth container is " << mds->mdcache->get_auth_container(exp) << endl; + dout(1) << "uh oh, auth container is " << *mds->mdcache->get_auth_container(exp) << endl; + assert( mds->mdcache->get_auth_container(exp) == im ); + } + + if (ecopy.count(exp) != 1) { + dout(1) << "***** nested_export " << *exp << " not in exports" << endl; + assert(0); + } + ecopy.erase(exp); + } + + it++; + } + + if (ecopy.size()) { + for (set::iterator it = ecopy.begin(); + it != ecopy.end(); + it++) + dout(1) << "***** stray item in exports: " << **it << endl; + assert(ecopy.size() == 0); + } +} + + + +/* replicate? 
+ + float dir_pop = dir->get_popularity(); + + if (dir->is_auth()) { + if (!dir->is_rep() && + dir_pop >= g_conf.mds_bal_replicate_threshold) { + // replicate + dout(5) << "replicating dir " << *in << " pop " << dir_pop << endl; + + dir->dir_rep = CDIR_REP_ALL; + mds->mdcache->send_dir_updates(dir); + } + + if (dir->is_rep() && + dir_pop < g_conf.mds_bal_unreplicate_threshold) { + // unreplicate + dout(5) << "unreplicating dir " << *in << " pop " << dir_pop << endl; + + dir->dir_rep = CDIR_REP_NONE; + mds->mdcache->send_dir_updates(dir); + } + } + +*/ diff --git a/branches/sage/cephmds2/mds/MDBalancer.h b/branches/sage/cephmds2/mds/MDBalancer.h new file mode 100644 index 0000000000000..a6129045ca3f7 --- /dev/null +++ b/branches/sage/cephmds2/mds/MDBalancer.h @@ -0,0 +1,106 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef __MDBALANCER_H +#define __MDBALANCER_H + +#include +#include +using namespace std; + +#include +#include +using namespace __gnu_cxx; + +#include "include/types.h" +#include "common/Clock.h" +#include "CInode.h" + + +class MDS; +class Message; +class MHeartbeat; +class CInode; +class Context; +class CDir; + +class MDBalancer { + protected: + MDS *mds; + + int beat_epoch; + + // todo + set hash_queue; + + // per-epoch scatter/gathered info + hash_map mds_load; + hash_map mds_meta_load; + map > mds_import_map; + + // per-epoch state + double my_load, target_load; + map my_targets; + map imported; + map exported; + + double try_match(int ex, double& maxex, + int im, double& maxim); + double get_maxim(int im) { + return target_load - mds_meta_load[im] - imported[im]; + } + double get_maxex(int ex) { + return mds_meta_load[ex] - target_load - exported[ex]; + } + + public: + MDBalancer(MDS *m) { + mds = m; + beat_epoch = 0; + } + + mds_load_t get_load(); + + int proc_message(Message *m); + + void send_heartbeat(); + void handle_heartbeat(MHeartbeat *m); + + void do_hashing(); + + void export_empties(); + void do_rebalance(int beat); + void find_exports(CDir *dir, + double amount, + list& exports, + double& have, + set& already_exporting); + + + void subtract_export(class CDir *ex); + void add_import(class CDir *im); + + void hit_inode(class CInode *in, int type=0); + void hit_dir(class CDir *dir, int type=0); + void hit_recursive(class CDir *dir, int type=0); + + + void show_imports(bool external=false); + +}; + + + +#endif diff --git a/branches/sage/cephmds2/mds/MDCache.cc b/branches/sage/cephmds2/mds/MDCache.cc new file mode 100644 index 0000000000000..02e2a9cd1417d --- /dev/null +++ b/branches/sage/cephmds2/mds/MDCache.cc @@ -0,0 +1,2580 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "MDCache.h" +#include "MDStore.h" +#include "MDS.h" +#include "Server.h" +#include "Locker.h" +#include "MDLog.h" +#include "MDBalancer.h" +#include "AnchorClient.h" +#include "Migrator.h" +#include "Renamer.h" + +#include "MDSMap.h" + +#include "CInode.h" +#include "CDir.h" + +#include "include/filepath.h" + +#include "msg/Message.h" +#include "msg/Messenger.h" + +#include "common/Logger.h" + +#include "osdc/Filer.h" + +#include "events/EUnlink.h" +#include "events/EPurgeFinish.h" + +#include "messages/MGenericMessage.h" +#include "messages/MDiscover.h" +#include "messages/MDiscoverReply.h" + +//#include "messages/MInodeUpdate.h" +#include "messages/MDirUpdate.h" +#include "messages/MCacheExpire.h" + +#include "messages/MInodeFileCaps.h" + +#include "messages/MInodeLink.h" +#include "messages/MInodeLinkAck.h" +#include "messages/MInodeUnlink.h" +#include "messages/MInodeUnlinkAck.h" + +#include "messages/MLock.h" +#include "messages/MDentryUnlink.h" + +#include "messages/MClientRequest.h" +#include "messages/MClientFileCaps.h" + +#include "IdAllocator.h" + +#include "common/Timer.h" + +#include +#include +#include +#include +#include +using namespace std; + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".cache " + + + + +MDCache::MDCache(MDS *m) +{ + mds = m; + migrator = new Migrator(mds, this); + renamer = new Renamer(mds, this); + root = NULL; + lru.lru_set_max(g_conf.mds_cache_size); + lru.lru_set_midpoint(g_conf.mds_cache_mid); + + did_shutdown_exports = false; + shutdown_commits = 0; +} + +MDCache::~MDCache() +{ + delete migrator; + delete renamer; +} + + +void MDCache::log_stat(Logger *logger) +{ + if (get_root()) { + logger->set("popanyd", 
(int)get_root()->popularity[MDS_POP_ANYDOM].meta_load()); + logger->set("popnest", (int)get_root()->popularity[MDS_POP_NESTED].meta_load()); + } + logger->set("c", lru.lru_get_size()); + logger->set("cpin", lru.lru_get_num_pinned()); + logger->set("ctop", lru.lru_get_top()); + logger->set("cbot", lru.lru_get_bot()); + logger->set("cptail", lru.lru_get_pintail()); +} + + +// + +bool MDCache::shutdown() +{ + if (lru.lru_get_size() > 0) { + dout(7) << "WARNING: mdcache shutodwn with non-empty cache" << endl; + //show_cache(); + show_imports(); + //dump(); + } + return true; +} + + +// MDCache + +CInode *MDCache::create_inode() +{ + CInode *in = new CInode(this); + + // zero + memset(&in->inode, 0, sizeof(inode_t)); + + // assign ino + in->inode.ino = mds->idalloc->alloc_id(); + + in->inode.nlink = 1; // FIXME + + in->inode.layout = g_OSD_FileLayout; + + add_inode(in); // add + return in; +} + +void MDCache::destroy_inode(CInode *in) +{ + mds->idalloc->reclaim_id(in->ino()); + remove_inode(in); +} + + +void MDCache::add_inode(CInode *in) +{ + // add to lru, inode map + assert(inode_map.size() == lru.lru_get_size()); + lru.lru_insert_mid(in); + assert(inode_map.count(in->ino()) == 0); // should be no dup inos! + inode_map[ in->ino() ] = in; + assert(inode_map.size() == lru.lru_get_size()); +} + +void MDCache::remove_inode(CInode *o) +{ + dout(14) << "remove_inode " << *o << endl; + if (o->get_parent_dn()) { + // FIXME: multiple parents? + CDentry *dn = o->get_parent_dn(); + assert(!dn->is_dirty()); + if (dn->is_sync()) + dn->dir->remove_dentry(dn); // unlink inode AND hose dentry + else + dn->dir->unlink_inode(dn); // leave dentry + } + inode_map.erase(o->ino()); // remove from map + lru.lru_remove(o); // remove from lru +} + + + + +void MDCache::rename_file(CDentry *srcdn, + CDentry *destdn) +{ + CInode *in = srcdn->inode; + + // unlink src + srcdn->dir->unlink_inode(srcdn); + + // unlink old inode? 
+ if (destdn->inode) destdn->dir->unlink_inode(destdn); + + // link inode w/ dentry + destdn->dir->link_inode( destdn, in ); +} + + + +void MDCache::set_root(CInode *in) +{ + assert(root == 0); + root = in; + root->state_set(CINODE_STATE_ROOT); +} + +void MDCache::add_import(CDir *dir) +{ + imports.insert(dir); + dir->state_set(CDIR_STATE_IMPORT); + dir->get(CDIR_PIN_IMPORT); +} + + + + + +// ************** +// Inode purging -- reliably removing deleted file's objects + +class C_MDC_PurgeFinish : public Context { + MDCache *mdc; + inodeno_t ino; +public: + C_MDC_PurgeFinish(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} + void finish(int r) { + mdc->purge_inode_finish(ino); + } +}; +class C_MDC_PurgeFinish2 : public Context { + MDCache *mdc; + inodeno_t ino; +public: + C_MDC_PurgeFinish2(MDCache *c, inodeno_t i) : mdc(c), ino(i) {} + void finish(int r) { + mdc->purge_inode_finish_2(ino); + } +}; + +/* purge_inode in + * will be called by on unlink or rmdir + * caller responsible for journaling an appropriate EUnlink or ERmdir + */ +void MDCache::purge_inode(inode_t &inode) +{ + dout(10) << "purge_inode " << inode.ino << " size " << inode.size << endl; + + // take note + assert(purging.count(inode.ino) == 0); + purging[inode.ino] = inode; + + // remove + mds->filer->remove(inode, 0, inode.size, + 0, new C_MDC_PurgeFinish(this, inode.ino)); +} + +void MDCache::purge_inode_finish(inodeno_t ino) +{ + dout(10) << "purge_inode_finish " << ino << " - logging our completion" << endl; + + // log completion + mds->mdlog->submit_entry(new EPurgeFinish(ino), + new C_MDC_PurgeFinish2(this, ino)); +} + +void MDCache::purge_inode_finish_2(inodeno_t ino) +{ + dout(10) << "purge_inode_finish_2 " << ino << endl; + + // remove from purging list + purging.erase(ino); + + // tell anyone who cares (log flusher?) + list ls; + ls.swap(waiting_for_purge[ino]); + waiting_for_purge.erase(ino); + finish_contexts(ls, 0); + + // reclaim ino? 
+ +} + +void MDCache::start_recovered_purges() +{ + for (map::iterator p = purging.begin(); + p != purging.end(); + ++p) { + dout(10) << "start_recovered_purges " << p->first << " size " << p->second.size << endl; + mds->filer->remove(p->second, 0, p->second.size, + 0, new C_MDC_PurgeFinish(this, p->first)); + } +} + + + + +bool MDCache::trim(int max) +{ + // empty? short cut. + if (lru.lru_get_size() == 0) return true; + + if (max < 0) { + max = lru.lru_get_max(); + if (!max) return false; + } + + map expiremap; + + dout(7) << "trim max=" << max << " cur=" << lru.lru_get_size() << endl; + assert(expiremap.empty()); + + while (lru.lru_get_size() > (unsigned)max) { + CInode *in = (CInode*)lru.lru_expire(); + if (!in) break; //return false; + + if (in->dir) { + // notify dir authority? + int auth = in->dir->authority(); + if (auth != mds->get_nodeid()) { + dout(17) << "sending expire to mds" << auth << " on " << *in->dir << endl; + if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid()); + expiremap[auth]->add_dir(in->ino(), in->dir->replica_nonce); + } + } + + // notify inode authority? + { + int auth = in->authority(); + if (auth != mds->get_nodeid()) { + assert(!in->is_auth()); + dout(17) << "sending expire to mds" << auth << " on " << *in << endl; + if (expiremap.count(auth) == 0) expiremap[auth] = new MCacheExpire(mds->get_nodeid()); + expiremap[auth]->add_inode(in->ino(), in->replica_nonce); + } else { + assert(in->is_auth()); + } + } + CInode *diri = NULL; + if (in->parent) + diri = in->parent->dir->inode; + + if (in->is_root()) { + dout(7) << "just trimmed root, cache now empty." << endl; + root = NULL; + } + + + // last link? + if (in->inode.nlink == 0) { + dout(17) << "last link, removing file content " << *in << endl; // FIXME THIS IS WRONG PLACE FOR THIS! 
+ mds->filer->zero(in->inode, + 0, in->inode.size, + NULL, NULL); // FIXME + } + + // remove it + dout(15) << "trim removing " << *in << " " << in << endl; + remove_inode(in); + delete in; + + if (diri) { + // dir incomplete! + diri->dir->state_clear(CDIR_STATE_COMPLETE); + + // reexport? + if (diri->dir->is_import() && // import + diri->dir->get_size() == 0 && // no children + !diri->is_root()) // not root + migrator->export_empty_import(diri->dir); + + } + + mds->logger->inc("cex"); + } + + + /* hack + if (lru.lru_get_size() == max) { + int i; + dout(1) << "lru_top " << lru.lru_ntop << "/" << lru.lru_num << endl; + CInode *cur = (CInode*)lru.lru_tophead; + i = 1; + while (cur) { + dout(1) << " top " << i++ << "/" << lru.lru_ntop << " " << cur->lru_is_expireable() << " " << *cur << endl; + cur = (CInode*)cur->lru_next; + } + + dout(1) << "lru_bot " << lru.lru_nbot << "/" << lru.lru_num << endl; + cur = (CInode*)lru.lru_bothead; + i = 1; + while (cur) { + dout(1) << " bot " << i++ << "/" << lru.lru_nbot << " " << cur->lru_is_expireable() << " " << *cur << endl; + cur = (CInode*)cur->lru_next; + } + + } + */ + + // send expires + for (map::iterator it = expiremap.begin(); + it != expiremap.end(); + it++) { + dout(7) << "sending cache_expire to " << it->first << endl; + mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); + } + + + return true; +} + +class C_MDC_ShutdownCommit : public Context { + MDCache *mdc; +public: + C_MDC_ShutdownCommit(MDCache *mdc) { + this->mdc = mdc; + } + void finish(int r) { + mdc->shutdown_commits--; + } +}; + +class C_MDC_ShutdownCheck : public Context { + MDCache *mdc; + Mutex *lock; +public: + C_MDC_ShutdownCheck(MDCache *m, Mutex *l) : mdc(m), lock(l) {} + void finish(int) { + lock->Lock(); + mdc->shutdown_check(); + lock->Unlock(); + } +}; + +void MDCache::shutdown_check() +{ + dout(0) << "shutdown_check at " << g_clock.now() << endl; + + // cache + int o = g_conf.debug_mds; + g_conf.debug_mds = 10; + show_cache(); + 
g_conf.debug_mds = o; + g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this, &mds->mds_lock)); + + // this + dout(0) << "lru size now " << lru.lru_get_size() << endl; + dout(0) << "log len " << mds->mdlog->get_num_events() << endl; + + + if (exports.size()) + dout(0) << "still have " << exports.size() << " exports" << endl; + + if (mds->filer->is_active()) + dout(0) << "filer still active" << endl; +} + +void MDCache::shutdown_start() +{ + dout(1) << "shutdown_start" << endl; + + if (g_conf.mds_shutdown_check) + g_timer.add_event_after(g_conf.mds_shutdown_check, new C_MDC_ShutdownCheck(this, &mds->mds_lock)); +} + + + +bool MDCache::shutdown_pass() +{ + dout(7) << "shutdown_pass" << endl; + //assert(mds->is_shutting_down()); + if (mds->is_stopped()) { + dout(7) << " already shut down" << endl; + show_cache(); + show_imports(); + return true; + } + + // unhash dirs? + if (!hashdirs.empty()) { + // unhash any of my dirs? + for (set::iterator it = hashdirs.begin(); + it != hashdirs.end(); + it++) { + CDir *dir = *it; + if (!dir->is_auth()) continue; + if (dir->is_unhashing()) continue; + migrator->unhash_dir(dir); + } + + dout(7) << "waiting for dirs to unhash" << endl; + return false; + } + + // commit dirs? + if (g_conf.mds_commit_on_shutdown) { + + if (shutdown_commits < 0) { + dout(1) << "shutdown_pass committing all dirty dirs" << endl; + shutdown_commits = 0; + + for (hash_map::iterator it = inode_map.begin(); + it != inode_map.end(); + it++) { + CInode *in = it->second; + + // commit any dirty dir that's ours + if (in->is_dir() && in->dir && in->dir->is_auth() && in->dir->is_dirty()) { + mds->mdstore->commit_dir(in->dir, new C_MDC_ShutdownCommit(this)); + shutdown_commits++; + } + } + } + + // commits? 
+ if (shutdown_commits > 0) { + dout(7) << "shutdown_commits still waiting for " << shutdown_commits << endl; + return false; + } + } + + // flush anything we can from the cache + trim(0); + dout(5) << "cache size now " << lru.lru_get_size() << endl; + + + // (wait for) flush log? + if (g_conf.mds_log_flush_on_shutdown && + mds->mdlog->get_num_events()) { + dout(7) << "waiting for log to flush .. " << mds->mdlog->get_num_events() << endl; + return false; + } + + // send all imports back to 0. + if (mds->get_nodeid() != 0 && !did_shutdown_exports) { + // flush what i can from the cache first.. + trim(0); + + // export to root + for (set::iterator it = imports.begin(); + it != imports.end(); + ) { + CDir *im = *it; + it++; + if (im->inode->is_root()) continue; + if (im->is_frozen() || im->is_freezing()) continue; + + dout(7) << "sending " << *im << " back to mds0" << endl; + migrator->export_dir(im,0); + } + did_shutdown_exports = true; + } + + + // waiting for imports? (e.g. root?) + if (exports.size()) { + dout(7) << "still have " << exports.size() << " exports" << endl; + //show_cache(); + return false; + } + + // filer active? + if (mds->filer->is_active()) { + dout(7) << "filer still active" << endl; + return false; + } + + // close root? + if (mds->get_nodeid() == 0 && + lru.lru_get_size() == 1 && + root && + root->dir && + root->dir->is_import() && + root->dir->get_ref() == 1) { // 1 is the import! + // un-import + dout(7) << "removing root import" << endl; + imports.erase(root->dir); + root->dir->state_clear(CDIR_STATE_IMPORT); + root->dir->put(CDIR_PIN_IMPORT); + + if (root->is_pinned_by(CINODE_PIN_DIRTY)) { + dout(7) << "clearing root dirty flag" << endl; + root->put(CINODE_PIN_DIRTY); + } + + trim(0); + assert(inode_map.size() == lru.lru_get_size()); + } + + // imports? + if (!imports.empty()) { + dout(7) << "still have " << imports.size() << " imports" << endl; + show_cache(); + return false; + } + + // done? 
+ if (lru.lru_get_size() > 0) { + dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << endl; + show_cache(); + //dump(); + return false; + } + + // done! + dout(1) << "shutdown done." << endl; + return true; +} + + + + + + + +int MDCache::open_root(Context *c) +{ + int whoami = mds->get_nodeid(); + + // open root inode + if (whoami == 0) { + // i am root inode + CInode *root = new CInode(this); + memset(&root->inode, 0, sizeof(inode_t)); + root->inode.ino = 1; + root->inode.hash_seed = 0; // not hashed! + + // make it up (FIXME) + root->inode.mode = 0755 | INODE_MODE_DIR; + root->inode.size = 0; + root->inode.ctime = 0; + root->inode.mtime = g_clock.gettime(); + + root->inode.nlink = 1; + root->inode.layout = g_OSD_MDDirLayout; + + set_root( root ); + add_inode( root ); + + // root directory too + assert(root->dir == NULL); + root->set_dir( new CDir(root, mds, true) ); + root->dir->set_dir_auth( 0 ); // me! + root->dir->dir_rep = CDIR_REP_ALL; //NONE; + + // root is sort of technically an import (from a vacuum) + imports.insert( root->dir ); + root->dir->state_set(CDIR_STATE_IMPORT); + root->dir->get(CDIR_PIN_IMPORT); + + if (c) { + c->finish(0); + delete c; + } + } else { + // request inode from root mds + if (waiting_for_root.empty()) { + dout(7) << "discovering root" << endl; + + filepath want; + MDiscover *req = new MDiscover(whoami, + 0, + want, + false); // there _is_ no base dir for the root inode + mds->send_message_mds(req, 0, MDS_PORT_CACHE); + } else { + dout(7) << "waiting for root" << endl; + } + + // wait + waiting_for_root.push_back(c); + + } + + return 0; +} + + + + + + + + +// ========= messaging ============== + + +void MDCache::dispatch(Message *m) +{ + switch (m->get_type()) { + case MSG_MDS_DISCOVER: + handle_discover((MDiscover*)m); + break; + case MSG_MDS_DISCOVERREPLY: + handle_discover_reply((MDiscoverReply*)m); + break; + + /* + case MSG_MDS_INODEUPDATE: + handle_inode_update((MInodeUpdate*)m); + break; + */ + + case 
MSG_MDS_INODELINK: + handle_inode_link((MInodeLink*)m); + break; + case MSG_MDS_INODELINKACK: + handle_inode_link_ack((MInodeLinkAck*)m); + break; + + case MSG_MDS_DIRUPDATE: + handle_dir_update((MDirUpdate*)m); + break; + + case MSG_MDS_CACHEEXPIRE: + handle_cache_expire((MCacheExpire*)m); + break; + + + + case MSG_MDS_DENTRYUNLINK: + handle_dentry_unlink((MDentryUnlink*)m); + break; + + + + + + default: + dout(7) << "cache unknown message " << m->get_type() << endl; + assert(0); + break; + } +} + + +/* path_traverse + * + * return values: + * <0 : traverse error (ENOTDIR, ENOENT) + * 0 : success + * >0 : delayed or forwarded + * + * Notes: + * onfinish context is only needed if you specify MDS_TRAVERSE_DISCOVER _and_ + * you aren't absolutely certain that the path actually exists. If it doesn't, + * the context is needed to pass a (failure) result code. + */ + +class C_MDC_TraverseDiscover : public Context { + Context *onfinish, *ondelay; + public: + C_MDC_TraverseDiscover(Context *onfinish, Context *ondelay) { + this->ondelay = ondelay; + this->onfinish = onfinish; + } + void finish(int r) { + //dout(10) << "TraverseDiscover r = " << r << endl; + if (r < 0 && onfinish) { // ENOENT on discover, pass back to caller. + onfinish->finish(r); + } else { + ondelay->finish(r); // retry as usual + } + delete onfinish; + delete ondelay; + } +}; + +int MDCache::path_traverse(filepath& origpath, + vector& trace, + bool follow_trailing_symlink, + Message *req, + Context *ondelay, + int onfail, + Context *onfinish, + bool is_client_req) // true if req is MClientRequest .. 
gross, FIXME +{ + int whoami = mds->get_nodeid(); + set< pair > symlinks_resolved; // keep a list of symlinks we touch to avoid loops + + bool noperm = false; + if (onfail == MDS_TRAVERSE_DISCOVER || + onfail == MDS_TRAVERSE_DISCOVERXLOCK) noperm = true; + + // root + CInode *cur = get_root(); + if (cur == NULL) { + dout(7) << "traverse: i don't have root" << endl; + open_root(ondelay); + if (onfinish) delete onfinish; + return 1; + } + + // start trace + trace.clear(); + + // make our own copy, since we'll modify when we hit symlinks + filepath path = origpath; + + unsigned depth = 0; + while (depth < path.depth()) { + dout(12) << "traverse: path seg depth " << depth << " = " << path[depth] << endl; + + // ENOTDIR? + if (!cur->is_dir()) { + dout(7) << "traverse: " << *cur << " not a dir " << endl; + delete ondelay; + if (onfinish) { + onfinish->finish(-ENOTDIR); + delete onfinish; + } + return -ENOTDIR; + } + + // open dir + if (!cur->dir) { + if (cur->dir_is_auth()) { + // parent dir frozen_dir? + if (cur->is_frozen_dir()) { + dout(7) << "traverse: " << *cur->get_parent_dir() << " is frozen_dir, waiting" << endl; + cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, ondelay); + if (onfinish) delete onfinish; + return 1; + } + + cur->get_or_open_dir(mds); + assert(cur->dir); + } else { + // discover dir from/via inode auth + assert(!cur->is_auth()); + if (cur->waiting_for(CINODE_WAIT_DIR)) { + dout(10) << "traverse: need dir for " << *cur << ", already doing discover" << endl; + } else { + filepath want = path.postfixpath(depth); + dout(10) << "traverse: need dir for " << *cur << ", doing discover, want " << want.get_path() << endl; + mds->send_message_mds(new MDiscover(mds->get_nodeid(), + cur->ino(), + want, + true), // need this dir too + cur->authority(), MDS_PORT_CACHE); + } + cur->add_waiter(CINODE_WAIT_DIR, ondelay); + if (onfinish) delete onfinish; + return 1; + } + } + + // frozen? + /* + if (cur->dir->is_frozen()) { + // doh! 
+ // FIXME: traverse is allowed? + dout(7) << "traverse: " << *cur->dir << " is frozen, waiting" << endl; + cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, ondelay); + if (onfinish) delete onfinish; + return 1; + } + */ + + // must read directory hard data (permissions, x bit) to traverse + if (!noperm && !mds->locker->inode_hard_read_try(cur, ondelay)) { + if (onfinish) delete onfinish; + return 1; + } + + // check permissions? + // XXX + + // ..? + if (path[depth] == "..") { + trace.pop_back(); + depth++; + cur = cur->get_parent_inode(); + dout(10) << "traverse: following .. back to " << *cur << endl; + continue; + } + + + // dentry + CDentry *dn = cur->dir->lookup(path[depth]); + + // null and last_bit and xlocked by me? + if (dn && dn->is_null() && + dn->is_xlockedbyme(req) && + depth == path.depth()-1) { + dout(10) << "traverse: hit (my) xlocked dentry at tail of traverse, succeeding" << endl; + trace.push_back(dn); + break; // done! + } + + if (dn && !dn->is_null()) { + // dentry exists. xlocked? + if (!noperm && dn->is_xlockedbyother(req)) { + dout(10) << "traverse: xlocked dentry at " << *dn << endl; + cur->dir->add_waiter(CDIR_WAIT_DNREAD, + path[depth], + ondelay); + if (onfinish) delete onfinish; + return 1; + } + + // do we have inode? + if (!dn->inode) { + assert(dn->is_remote()); + // do i have it? + CInode *in = get_inode(dn->get_remote_ino()); + if (in) { + dout(7) << "linking in remote in " << *in << endl; + dn->link_remote(in); + } else { + dout(7) << "remote link to " << dn->get_remote_ino() << ", which i don't have" << endl; + open_remote_ino(dn->get_remote_ino(), req, + ondelay); + return 1; + } + } + + // symlink? + if (dn->inode->is_symlink() && + (follow_trailing_symlink || depth < path.depth()-1)) { + // symlink, resolve! 
+ filepath sym = dn->inode->symlink; + dout(10) << "traverse: hit symlink " << *dn->inode << " to " << sym << endl; + + // break up path components + // /head/symlink/tail + filepath head = path.prefixpath(depth); + filepath tail = path.postfixpath(depth+1); + dout(10) << "traverse: path head = " << head << endl; + dout(10) << "traverse: path tail = " << tail << endl; + + if (symlinks_resolved.count(pair(dn->inode, tail.get_path()))) { + dout(10) << "already hit this symlink, bailing to avoid the loop" << endl; + return -ELOOP; + } + symlinks_resolved.insert(pair(dn->inode, tail.get_path())); + + // start at root? + if (dn->inode->symlink[0] == '/') { + // absolute + trace.clear(); + depth = 0; + path = tail; + dout(10) << "traverse: absolute symlink, path now " << path << " depth " << depth << endl; + } else { + // relative + path = head; + path.append(sym); + path.append(tail); + dout(10) << "traverse: relative symlink, path now " << path << " depth " << depth << endl; + } + continue; + } else { + // keep going. + + // forwarder wants replicas? 
+ if (is_client_req && ((MClientRequest*)req)->get_mds_wants_replica_in_dirino()) { + dout(30) << "traverse: REP is here, " << ((MClientRequest*)req)->get_mds_wants_replica_in_dirino() << " vs " << cur->dir->ino() << endl; + + if (((MClientRequest*)req)->get_mds_wants_replica_in_dirino() == cur->dir->ino() && + cur->dir->is_auth() && + cur->dir->is_rep() && + cur->dir->is_open_by(req->get_source().num()) && + dn->get_inode()->is_auth() + ) { + assert(req->get_source().is_mds()); + int from = req->get_source().num(); + + if (dn->get_inode()->is_cached_by(from)) { + dout(15) << "traverse: REP would replicate to mds" << from << ", but already cached_by " + << MSG_ADDR_NICE(req->get_source()) << " dn " << *dn << endl; + } else { + dout(10) << "traverse: REP replicating to " << MSG_ADDR_NICE(req->get_source()) << " dn " << *dn << endl; + MDiscoverReply *reply = new MDiscoverReply(cur->dir->ino()); + reply->add_dentry( dn->get_name(), !dn->can_read()); + reply->add_inode( dn->inode->replicate_to( from ) ); + mds->send_message_mds(reply, req->get_source().num(), MDS_PORT_CACHE); + } + } + } + + trace.push_back(dn); + cur = dn->inode; + touch_inode(cur); + depth++; + continue; + } + } + + // MISS. don't have it. + + int dauth = cur->dir->dentry_authority( path[depth] ); + dout(12) << "traverse: miss on dentry " << path[depth] << " dauth " << dauth << " in " << *cur->dir << endl; + + + if (dauth == whoami) { + // dentry is mine. + if (cur->dir->is_complete()) { + // file not found + delete ondelay; + if (onfinish) { + onfinish->finish(-ENOENT); + delete onfinish; + } + return -ENOENT; + } else { + + //wrong? + //if (onfail == MDS_TRAVERSE_DISCOVER) + // return -1; + + // directory isn't complete; reload + dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << endl; + touch_inode(cur); + mds->mdstore->fetch_dir(cur->dir, ondelay); + + mds->logger->inc("cmiss"); + + if (onfinish) delete onfinish; + return 1; + } + } else { + // dentry is not mine. 
+ + /* no, let's let auth handle the discovery/replication .. + if (onfail == MDS_TRAVERSE_FORWARD && + onfinish == 0 && // no funnyness + cur->dir->is_rep()) { + dout(5) << "trying to discover in popular dir " << *cur->dir << endl; + onfail = MDS_TRAVERSE_DISCOVER; + } + */ + + if ((onfail == MDS_TRAVERSE_DISCOVER || + onfail == MDS_TRAVERSE_DISCOVERXLOCK)) { + // discover + + filepath want = path.postfixpath(depth); + if (cur->dir->waiting_for(CDIR_WAIT_DENTRY, path[depth])) { + dout(7) << "traverse: already waiting for discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; + } else { + dout(7) << "traverse: discover on " << *cur << " for " << want.get_path() << " to mds" << dauth << endl; + + touch_inode(cur); + + mds->send_message_mds(new MDiscover(mds->get_nodeid(), + cur->ino(), + want, + false), + dauth, MDS_PORT_CACHE); + mds->logger->inc("dis"); + } + + // delay processing of current request. + // delay finish vs ondelay until result of traverse, so that ENOENT can be + // passed to onfinish if necessary + cur->dir->add_waiter(CDIR_WAIT_DENTRY, + path[depth], + new C_MDC_TraverseDiscover(onfinish, ondelay)); + + mds->logger->inc("cmiss"); + return 1; + } + if (onfail == MDS_TRAVERSE_FORWARD) { + // forward + dout(7) << "traverse: not auth for " << path << " at " << path[depth] << ", fwd to mds" << dauth << endl; + + if (is_client_req && cur->dir->is_rep()) { + dout(15) << "traverse: REP fw to mds" << dauth << ", requesting rep under " << *cur->dir << " req " << *(MClientRequest*)req << endl; + ((MClientRequest*)req)->set_mds_wants_replica_in_dirino(cur->dir->ino()); + req->clear_payload(); // reencode! 
+        }
+
+        mds->send_message_mds(req, dauth, req->get_dest_port());
+        //show_imports();
+
+        mds->logger->inc("cfw");
+        if (onfinish) delete onfinish;
+        delete ondelay;
+        return 2;
+      }
+      if (onfail == MDS_TRAVERSE_FAIL) {
+        delete ondelay;
+        if (onfinish) {
+          onfinish->finish(-ENOENT);  // -ENOENT, but only because i'm not the authority!
+          delete onfinish;
+        }
+        return -ENOENT;  // not necessarily exactly true....
+      }
+    }
+
+    assert(0);  // i shouldn't get here
+  }
+
+  // success.
+  delete ondelay;
+  if (onfinish) {
+    onfinish->finish(0);
+    delete onfinish;
+  }
+  return 0;
+}
+
+
+
+void MDCache::open_remote_dir(CInode *diri,
+                              Context *fin)
+{
+  dout(10) << "open_remote_dir on " << *diri << endl;
+
+  assert(diri->is_dir());
+  assert(!diri->dir_is_auth());
+  assert(!diri->is_auth());
+  assert(diri->dir == 0);
+
+  filepath want;  // no dentries, i just want the dir open
+  mds->send_message_mds(new MDiscover(mds->get_nodeid(),
+                                      diri->ino(),
+                                      want,
+                                      true),  // need the dir open
+                        diri->authority(), MDS_PORT_CACHE);
+
+  diri->add_waiter(CINODE_WAIT_DIR, fin);
+}
+
+
+
+class C_MDC_OpenRemoteInoLookup : public Context {
+  MDCache *mdc;
+  inodeno_t ino;
+  Message *req;
+  Context *onfinish;
+public:
+  vector anchortrace;
+  C_MDC_OpenRemoteInoLookup(MDCache *mdc, inodeno_t ino, Message *req, Context *onfinish) {
+    this->mdc = mdc;
+    this->ino = ino;
+    this->req = req;
+    this->onfinish = onfinish;
+  }
+  void finish(int r) {
+    assert(r == 0);
+    if (r == 0)
+      mdc->open_remote_ino_2(ino, req, anchortrace, onfinish);
+    else {
+      onfinish->finish(r);
+      delete onfinish;
+    }
+  }
+};
+
+void MDCache::open_remote_ino(inodeno_t ino,
+                              Message *req,
+                              Context *onfinish)
+{
+  dout(7) << "open_remote_ino on " << ino << endl;
+
+  C_MDC_OpenRemoteInoLookup *c = new C_MDC_OpenRemoteInoLookup(this, ino, req, onfinish);
+  mds->anchorclient->lookup(ino, c->anchortrace, c);
+}
+
+// Second half of open_remote_ino(): build a path from the anchor trace and
+// traverse it in discover mode.  onfinish is completed with the traverse
+// result, either right away or once a delayed traverse is woken up.
+void MDCache::open_remote_ino_2(inodeno_t ino,
+                                Message *req,
+                                vector& anchortrace,
+                                Context *onfinish)
+{
+  dout(7) << "open_remote_ino_2 on " << ino << ", trace depth is " << anchortrace.size() << endl;
+
+  // construct path
+  filepath path;
+  for (unsigned i=0; iref_dn);
+
+  dout(7) << " path is " << path << endl;
+
+  // path_traverse() owns its ondelay context: it deletes it on success and
+  // on its ENOTDIR/ENOENT returns.  Handing it onfinish directly would leave
+  // us finishing (and deleting) a freed context below, so give it a small
+  // forwarder instead.  If the traverse is delayed, the forwarder completes
+  // onfinish when the waiter fires.
+  class C_MDC_OpenRemoteInoForward : public Context {
+    Context *fin;
+  public:
+    C_MDC_OpenRemoteInoForward(Context *f) : fin(f) {}
+    void finish(int r) {
+      fin->finish(r);
+      delete fin;
+    }
+  };
+
+  vector trace;
+  int r = path_traverse(path, trace, false,
+                        req,
+                        new C_MDC_OpenRemoteInoForward(onfinish),  // delay actually
+                        MDS_TRAVERSE_DISCOVER);
+  if (r > 0) return;
+
+  // finished (or failed) immediately; onfinish is still ours
+  onfinish->finish(r);
+  delete onfinish;
+}
+
+
+
+
+// path pins
+
+bool MDCache::path_pin(vector& trace,
+                       Message *m,
+                       Context *c)
+{
+  // verify everything is pinnable
+  for (vector::iterator it = trace.begin();
+       it != trace.end();
+       it++) {
+    CDentry *dn = *it;
+    if (!dn->is_pinnable(m)) {
+      // wait
+      if (c) {
+        dout(10) << "path_pin can't pin " << *dn << ", waiting" << endl;
+        dn->dir->add_waiter(CDIR_WAIT_DNPINNABLE,
+                            dn->name,
+                            c);
+      } else {
+        dout(10) << "path_pin can't pin, no waiter, failing." << endl;
+      }
+      return false;
+    }
+  }
+
+  // pin!
+  for (vector::iterator it = trace.begin();
+       it != trace.end();
+       it++) {
+    (*it)->pin(m);
+    dout(11) << "path_pinned " << *(*it) << endl;
+  }
+
+  delete c;
+  return true;
+}
+
+
+void MDCache::path_unpin(vector& trace,
+                         Message *m)
+{
+  for (vector::iterator it = trace.begin();
+       it != trace.end();
+       it++) {
+    CDentry *dn = *it;
+    dn->unpin(m);
+    dout(11) << "path_unpinned " << *dn << endl;
+
+    // did we completely unpin a waiter?
+    if (dn->lockstate == DN_LOCK_UNPINNING && !dn->is_pinned()) {
+      // return state to sync, in case the unpinner flails
+      dn->lockstate = DN_LOCK_SYNC;
+
+      // run finisher right now to give them a fair shot.
+ dn->dir->finish_waiting(CDIR_WAIT_DNUNPINNED, dn->name); + } + } +} + + +void MDCache::make_trace(vector& trace, CInode *in) +{ + CInode *parent = in->get_parent_inode(); + if (parent) { + make_trace(trace, parent); + + CDentry *dn = in->get_parent_dn(); + dout(15) << "make_trace adding " << *dn << endl; + trace.push_back(dn); + } +} + + +bool MDCache::request_start(Message *req, + CInode *ref, + vector& trace) +{ + assert(active_requests.count(req) == 0); + + // pin path + if (trace.size()) { + if (!path_pin(trace, req, new C_MDS_RetryMessage(mds,req))) return false; + } + + dout(7) << "request_start " << *req << endl; + + // add to map + active_requests[req].ref = ref; + if (trace.size()) active_requests[req].traces[trace[trace.size()-1]] = trace; + + // request pins + request_pin_inode(req, ref); + + mds->logger->inc("req"); + + return true; +} + + +void MDCache::request_pin_inode(Message *req, CInode *in) +{ + if (active_requests[req].request_pins.count(in) == 0) { + in->request_pin_get(); + active_requests[req].request_pins.insert(in); + } +} + +void MDCache::request_pin_dir(Message *req, CDir *dir) +{ + if (active_requests[req].request_dir_pins.count(dir) == 0) { + dir->request_pin_get(); + active_requests[req].request_dir_pins.insert(dir); + } +} + + +void MDCache::request_cleanup(Message *req) +{ + assert(active_requests.count(req) == 1); + + // leftover xlocks? + if (active_requests[req].xlocks.size()) { + set dns = active_requests[req].xlocks; + + for (set::iterator it = dns.begin(); + it != dns.end(); + it++) { + CDentry *dn = *it; + + dout(7) << "request_cleanup leftover xlock " << *dn << endl; + + mds->locker->dentry_xlock_finish(dn); + + // queue finishers + dn->dir->take_waiting(CDIR_WAIT_ANY, dn->name, mds->finished_queue); + + // remove clean, null dentry? 
(from a failed rename or whatever) + if (dn->is_null() && dn->is_sync() && !dn->is_dirty()) { + dn->dir->remove_dentry(dn); + } + } + + assert(active_requests[req].xlocks.empty()); // we just finished finished them + } + + // foreign xlocks? + if (active_requests[req].foreign_xlocks.size()) { + set dns = active_requests[req].foreign_xlocks; + active_requests[req].foreign_xlocks.clear(); + + for (set::iterator it = dns.begin(); + it != dns.end(); + it++) { + CDentry *dn = *it; + + dout(7) << "request_cleanup sending unxlock for foreign xlock on " << *dn << endl; + assert(dn->is_xlocked()); + int dauth = dn->dir->dentry_authority(dn->name); + MLock *m = new MLock(LOCK_AC_UNXLOCK, mds->get_nodeid()); + m->set_dn(dn->dir->ino(), dn->name); + mds->send_message_mds(m, dauth, MDS_PORT_CACHE); + } + } + + // unpin paths + for (map< CDentry*, vector >::iterator it = active_requests[req].traces.begin(); + it != active_requests[req].traces.end(); + it++) { + path_unpin(it->second, req); + } + + // request pins + for (set::iterator it = active_requests[req].request_pins.begin(); + it != active_requests[req].request_pins.end(); + it++) { + (*it)->request_pin_put(); + } + for (set::iterator it = active_requests[req].request_dir_pins.begin(); + it != active_requests[req].request_dir_pins.end(); + it++) { + (*it)->request_pin_put(); + } + + // remove from map + active_requests.erase(req); + + + // log some stats ***** + mds->logger->set("c", lru.lru_get_size()); + mds->logger->set("cpin", lru.lru_get_num_pinned()); + mds->logger->set("ctop", lru.lru_get_top()); + mds->logger->set("cbot", lru.lru_get_bot()); + mds->logger->set("cptail", lru.lru_get_pintail()); + //mds->logger->set("buf",buffer_total_alloc); + + if (g_conf.log_pins) { + // pin + for (int i=0; ilogger2->set(cinode_pin_names[i], + cinode_pins[i]); + } + /* + for (map::iterator it = cdir_pins.begin(); + it != cdir_pins.end(); + it++) { + //string s = "D"; + //s += cdir_pin_names[it->first]; + mds->logger2->set(//s, + 
cdir_pin_names[it->first], + it->second); + } + */ + } + +} + +void MDCache::request_finish(Message *req) +{ + dout(7) << "request_finish " << *req << endl; + request_cleanup(req); + delete req; // delete req + + mds->logger->inc("reply"); + + + //dump(); +} + + +void MDCache::request_forward(Message *req, int who, int port) +{ + if (!port) port = MDS_PORT_SERVER; + + dout(7) << "request_forward to " << who << " req " << *req << endl; + request_cleanup(req); + mds->send_message_mds(req, who, port); + + mds->logger->inc("fw"); +} + + + +// ANCHORS + +class C_MDC_AnchorInode : public Context { + CInode *in; + +public: + C_MDC_AnchorInode(CInode *in) { + this->in = in; + } + void finish(int r) { + if (r == 0) { + assert(in->inode.anchored == false); + in->inode.anchored = true; + + in->state_clear(CINODE_STATE_ANCHORING); + in->put(CINODE_PIN_ANCHORING); + + in->mark_dirty(); + } + + // trigger + in->finish_waiting(CINODE_WAIT_ANCHORED, r); + } +}; + +void MDCache::anchor_inode(CInode *in, Context *onfinish) +{ + assert(in->is_auth()); + + // already anchoring? 
+  if (in->state_test(CINODE_STATE_ANCHORING)) {
+    dout(7) << "anchor_inode already anchoring " << *in << endl;
+
+    // wait
+    in->add_waiter(CINODE_WAIT_ANCHORED,
+                   onfinish);
+
+  } else {
+    dout(7) << "anchor_inode anchoring " << *in << endl;
+
+    // auth: do it
+    in->state_set(CINODE_STATE_ANCHORING);
+    in->get(CINODE_PIN_ANCHORING);
+
+    // wait
+    in->add_waiter(CINODE_WAIT_ANCHORED,
+                   onfinish);
+
+    // make trace
+    vector trace;
+    in->make_anchor_trace(trace);
+
+    // do it
+    mds->anchorclient->create(in->ino(), trace,
+                              new C_MDC_AnchorInode( in ));
+  }
+}
+
+
+/** handle_inode_link(m)
+ * A peer asks us to add a link to inode m->get_ino().
+ *  - not auth: assert we are a proxy and forward m to in->authority().
+ *  - not yet anchored: start anchor_inode() and retry m when it completes.
+ *  - otherwise: bump nlink once, mark the inode dirty, ack to m->get_from(),
+ *    and delete m.
+ */
+void MDCache::handle_inode_link(MInodeLink *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+
+  if (!in->is_auth()) {
+    assert(in->is_proxy());
+    dout(7) << "handle_inode_link not auth for " << *in << ", fw to auth" << endl;
+    mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE);
+    return;
+  }
+
+  dout(7) << "handle_inode_link on " << *in << endl;
+
+  if (!in->is_anchored()) {
+    assert(in->inode.nlink == 1);
+    dout(7) << "needs anchor, nlink=" << in->inode.nlink << ", creating anchor" << endl;
+
+    anchor_inode(in,
+                 new C_MDS_RetryMessage(mds, m));
+    return;
+  }
+
+  in->inode.nlink++;
+  in->mark_dirty();
+
+  // reply
+  // (only print nlink here; a second ++ inside the debug statement would
+  //  count the new link twice)
+  dout(7) << " nlink++, now " << in->inode.nlink << endl;
+
+  mds->send_message_mds(new MInodeLinkAck(m->get_ino(), true), m->get_from(), MDS_PORT_CACHE);
+  delete m;
+}
+
+
+/** handle_inode_link_ack(m)
+ * Wake everyone waiting on CINODE_WAIT_LINK for inode m->get_ino(),
+ * passing 1 on success and -1 otherwise, then delete m.
+ */
+void MDCache::handle_inode_link_ack(MInodeLinkAck *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+
+  dout(7) << "handle_inode_link_ack success = " << m->is_success() << " on " << *in << endl;
+  in->finish_waiting(CINODE_WAIT_LINK,
+                     m->is_success() ? 1:-1);
+
+  // done with the message (handle_inode_link deletes its message the same way)
+  delete m;
+}
+
+
+
+// REPLICAS
+
+
+void MDCache::handle_discover(MDiscover *dis)
+{
+  int whoami = mds->get_nodeid();
+
+  // from me to me?
+  if (dis->get_asker() == whoami) {
+    dout(7) << "discover for " << dis->get_want().get_path() << " bounced back to me, dropping."
<< endl;
+    delete dis;
+    return;
+  }
+
+  CInode *cur = 0;
+  MDiscoverReply *reply = 0;
+  //filepath fullpath;
+
+  // get started.
+  if (dis->get_base_ino() == 0) {
+    // wants root
+    dout(7) << "discover from mds" << dis->get_asker() << " wants root + " << dis->get_want().get_path() << endl;
+
+    assert(mds->get_nodeid() == 0);
+    assert(root->is_auth());
+
+    //fullpath = dis->get_want();
+
+
+    // add root
+    reply = new MDiscoverReply(0);
+    reply->add_inode( root->replicate_to( dis->get_asker() ) );
+    dout(10) << "added root " << *root << endl;
+
+    cur = root;
+
+  } else {
+    // there's a base inode
+    cur = get_inode(dis->get_base_ino());
+    assert(cur);
+
+    if (dis->wants_base_dir()) {
+      dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur << " wants dir+" << dis->get_want().get_path() << endl;
+    } else if (cur->dir) {
+      dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur->dir << " wants " << dis->get_want().get_path() << endl;
+    } else {
+      // the dir may not be open yet (the !cur->dir cases are handled just
+      // below), so don't dereference cur->dir for the log line; print the
+      // inode instead.
+      dout(7) << "discover from mds" << dis->get_asker() << " has " << *cur << " wants " << dis->get_want().get_path() << endl;
+    }
+
+    assert(cur->is_dir());
+
+    // crazyness?
+    if (!cur->dir && !cur->is_auth()) {
+      int iauth = cur->authority();
+      dout(7) << "no dir and not inode auth; fwd to auth " << iauth << endl;
+      mds->send_message_mds( dis, iauth, MDS_PORT_CACHE);
+      return;
+    }
+
+    // frozen_dir?
+ if (!cur->dir && cur->is_frozen_dir()) { + dout(7) << "is frozen_dir, waiting" << endl; + cur->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryMessage(mds, dis)); + return; + } + + if (!cur->dir) + cur->get_or_open_dir(mds); + assert(cur->dir); + + dout(10) << "dir is " << *cur->dir << endl; + + // create reply + reply = new MDiscoverReply(cur->ino()); + } + + assert(reply); + assert(cur); + + /* + // first traverse and make sure we won't have to do any waiting + dout(10) << "traversing full discover path = " << fullpath << endl; + vector trav; + int r = path_traverse(fullpath, trav, dis, MDS_TRAVERSE_FAIL); + if (r > 0) + return; // fw or delay + dout(10) << "traverse finish w/o blocking, continuing" << endl; + // ok, now we know we won't block on dentry locks or readdir. + */ + + + // add content + // do some fidgeting to include a dir if they asked for the base dir, or just root. + for (unsigned i = 0; i < dis->get_want().depth() || dis->get_want().depth() == 0; i++) { + // add dir + if (reply->is_empty() && !dis->wants_base_dir()) { + dout(7) << "they don't want the base dir" << endl; + } else { + // is it actaully a dir at all? + if (!cur->is_dir()) { + dout(7) << "not a dir " << *cur << endl; + reply->set_flag_error_dir(); + break; + } + + // add dir + if (!cur->dir_is_auth()) { + dout(7) << *cur << " dir auth is someone else, i'm done" << endl; + break; + } + + // did we hit a frozen_dir? 
+ if (!cur->dir && cur->is_frozen_dir()) { + dout(7) << *cur << " is frozen_dir, stopping" << endl; + break; + } + + if (!cur->dir) cur->get_or_open_dir(mds); + + reply->add_dir( new CDirDiscover( cur->dir, + cur->dir->open_by_add( dis->get_asker() ) ) ); + dout(7) << "added dir " << *cur->dir << endl; + } + if (dis->get_want().depth() == 0) break; + + // lookup dentry + int dentry_auth = cur->dir->dentry_authority( dis->get_dentry(i) ); + if (dentry_auth != mds->get_nodeid()) { + dout(7) << *cur->dir << "dentry " << dis->get_dentry(i) << " auth " << dentry_auth << ", i'm done." << endl; + break; // that's it for us! + } + + // get inode + CDentry *dn = cur->dir->lookup( dis->get_dentry(i) ); + + /* + if (dn && !dn->can_read()) { // xlocked? + dout(7) << "waiting on " << *dn << endl; + cur->dir->add_waiter(CDIR_WAIT_DNREAD, + dn->name, + new C_MDS_RetryMessage(mds, dis)); + return; + } + */ + + if (dn) { + if (!dn->inode && dn->is_sync()) { + dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " null in " << *cur->dir << ", returning error" << endl; + reply->set_flag_error_dn( dis->get_dentry(i) ); + break; // don't replicate null but non-locked dentries. + } + + reply->add_dentry( dis->get_dentry(i), !dn->can_read() ); + dout(7) << "added dentry " << *dn << endl; + + if (!dn->inode) break; // we're done. + } + + if (dn && dn->inode) { + CInode *next = dn->inode; + assert(next->is_auth()); + + // add inode + //int nonce = next->cached_by_add(dis->get_asker()); + reply->add_inode( next->replicate_to( dis->get_asker() ) ); + dout(7) << "added inode " << *next << endl;// " nonce=" << nonce<< endl; + + // descend + cur = next; + } else { + // don't have inode? 
+      if (cur->dir->is_complete()) {
+        // set error flag in reply
+        dout(7) << "mds" << whoami << " dentry " << dis->get_dentry(i) << " not found in " << *cur->dir << ", returning error" << endl;
+        reply->set_flag_error_dn( dis->get_dentry(i) );
+        break;
+      } else {
+        // readdir
+        dout(7) << "mds" << whoami << " incomplete dir contents for " << *cur->dir << ", fetching" << endl;
+
+        //mds->mdstore->fetch_dir(cur->dir, NULL); //new C_MDS_RetryMessage(mds, dis));
+        //break; // send what we have so far
+
+        mds->mdstore->fetch_dir(cur->dir, new C_MDS_RetryMessage(mds, dis));
+
+        // the whole discover is retried once the fetch completes and a fresh
+        // reply is built then; this partial one is never sent, so free it
+        // here instead of leaking it.
+        delete reply;
+        return;
+      }
+    }
+  }
+
+  // how did we do.
+  if (reply->is_empty()) {
+
+    // discard empty reply
+    delete reply;
+
+    if ((cur->is_auth() || cur->is_proxy() || cur->dir->is_proxy()) &&
+        !cur->dir->is_auth()) {
+      // fwd to dir auth
+      int dirauth = cur->dir->authority();
+      if (dirauth == dis->get_asker()) {
+        dout(7) << "from (new?) dir auth, dropping (obsolete) discover on floor." << endl;  // XXX FIXME is this right?
+        //assert(dis->get_asker() == dis->get_source());  //might be a weird other loop.  either way, asker has it.
+        delete dis;
+      } else {
+        dout(7) << "fwd to dir auth " << dirauth << endl;
+        mds->send_message_mds( dis, dirauth, MDS_PORT_CACHE );
+      }
+      return;
+    }
+
+    dout(7) << "i'm not auth or proxy, dropping (this empty reply).  i bet i just exported." << endl;
+    //assert(0);
+
+  } else {
+    // send back to asker
+    dout(7) << "sending result back to asker mds" << dis->get_asker() << endl;
+    mds->send_message_mds(reply, dis->get_asker(), MDS_PORT_CACHE);
+  }
+
+  // done.
+  delete dis;
+}
+
+
+void MDCache::handle_discover_reply(MDiscoverReply *m)
+{
+  // starting point
+  CInode *cur;
+  list finished, error;
+
+  if (m->has_root()) {
+    // nowhere!
+ dout(7) << "discover_reply root + " << m->get_path() << " " << m->get_num_inodes() << " inodes" << endl; + assert(!root); + assert(m->get_base_ino() == 0); + assert(!m->has_base_dentry()); + assert(!m->has_base_dir()); + + // add in root + cur = new CInode(this, false); + + m->get_inode(0).update_inode(cur); + + // root + set_root( cur ); + add_inode( cur ); + dout(7) << " got root: " << *cur << endl; + + // take waiters + finished.swap(waiting_for_root); + } else { + // grab inode + cur = get_inode(m->get_base_ino()); + + if (!cur) { + dout(7) << "discover_reply don't have base ino " << m->get_base_ino() << ", dropping" << endl; + delete m; + return; + } + + dout(7) << "discover_reply " << *cur << " + " << m->get_path() << ", have " << m->get_num_inodes() << " inodes" << endl; + } + + // fyi + if (m->is_flag_error_dir()) dout(7) << " flag error, dir" << endl; + if (m->is_flag_error_dn()) dout(7) << " flag error, dentry = " << m->get_error_dentry() << endl; + dout(10) << "depth is " << m->get_depth() << ", has_root = " << m->has_root() << endl; + + // loop over discover results. + // indexese follow each ([[dir] dentry] inode) + // can start, end with any type. + + for (int i=m->has_root(); iget_depth(); i++) { + dout(10) << "discover_reply i=" << i << " cur " << *cur << endl; + + // dir + if ((i > 0) || + (i == 0 && m->has_base_dir())) { + if (cur->dir) { + // had it + /* this is strange, but it happens when: + we discover multiple dentries under a dir. + bc, no flag to indicate a dir discover is underway, (as there is w/ a dentry one). + this is actually good, since (dir aside) they're asking for different information. 
+ */ + dout(7) << "had " << *cur->dir; + m->get_dir(i).update_dir(cur->dir); + dout2(7) << ", now " << *cur->dir << endl; + } else { + // add it (_replica_) + cur->set_dir( new CDir(cur, mds, false) ); + m->get_dir(i).update_dir(cur->dir); + dout(7) << "added " << *cur->dir << " nonce " << cur->dir->replica_nonce << endl; + + // get waiters + cur->take_waiting(CINODE_WAIT_DIR, finished); + } + } + + // dentry error? + if (i == m->get_depth()-1 && + m->is_flag_error_dn()) { + // error! + assert(cur->is_dir()); + if (cur->dir) { + dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dentry?" << endl; + cur->dir->take_waiting(CDIR_WAIT_DENTRY, + m->get_error_dentry(), + error); + } else { + dout(7) << " flag_error on dentry " << m->get_error_dentry() << ", triggering dir?" << endl; + cur->take_waiting(CINODE_WAIT_DIR, error); + } + break; + } + + if (i >= m->get_num_dentries()) break; + + // dentry + dout(7) << "i = " << i << " dentry is " << m->get_dentry(i) << endl; + + CDentry *dn = 0; + if (i > 0 || + m->has_base_dentry()) { + dn = cur->dir->lookup( m->get_dentry(i) ); + + if (dn) { + dout(7) << "had " << *dn << endl; + } else { + dn = cur->dir->add_dentry( m->get_dentry(i) ); + if (m->get_dentry_xlock(i)) { + dout(7) << " new dentry is xlock " << *dn << endl; + dn->lockstate = DN_LOCK_XLOCK; + dn->xlockedby = 0; + } + dout(7) << "added " << *dn << endl; + } + + cur->dir->take_waiting(CDIR_WAIT_DENTRY, + m->get_dentry(i), + finished); + } + + if (i >= m->get_num_inodes()) break; + + // inode + dout(7) << "i = " << i << " ino is " << m->get_ino(i) << endl; + CInode *in = get_inode( m->get_inode(i).get_ino() ); + assert(dn); + + if (in) { + dout(7) << "had " << *in << endl; + + // fix nonce + dout(7) << " my nonce is " << in->replica_nonce << ", taking from discover, which has " << m->get_inode(i).get_replica_nonce() << endl; + in->replica_nonce = m->get_inode(i).get_replica_nonce(); + + if (dn && in != dn->inode) { + dout(7) << " but it's 
not linked via dentry " << *dn << endl; + // link + if (dn->inode) { + dout(7) << "dentry WAS linked to " << *dn->inode << endl; + assert(0); // WTF. + } + dn->dir->link_inode(dn, in); + } + } + else { + assert(dn->inode == 0); // better not be something else linked to this dentry... + + // didn't have it. + in = new CInode(this, false); + + m->get_inode(i).update_inode(in); + + // link in + add_inode( in ); + dn->dir->link_inode(dn, in); + + dout(7) << "added " << *in << " nonce " << in->replica_nonce << endl; + } + + // onward! + cur = in; + } + + // dir error at the end there? + if (m->is_flag_error_dir()) { + dout(7) << " flag_error on dir " << *cur << endl; + assert(!cur->is_dir()); + cur->take_waiting(CINODE_WAIT_DIR, error); + } + + // finish errors directly + finish_contexts(error, -ENOENT); + + mds->queue_finished(finished); + + // done + delete m; +} + + + + + + + + +/* +int MDCache::send_inode_updates(CInode *in) +{ + assert(in->is_auth()); + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + dout(7) << "sending inode_update on " << *in << " to " << *it << endl; + assert(*it != mds->get_nodeid()); + mds->send_message_mds(new MInodeUpdate(in, in->get_cached_by_nonce(*it)), *it, MDS_PORT_CACHE); + } + + return 0; +} + + +void MDCache::handle_inode_update(MInodeUpdate *m) +{ + inodeno_t ino = m->get_ino(); + CInode *in = get_inode(m->get_ino()); + if (!in) { + //dout(7) << "inode_update on " << m->get_ino() << ", don't have it, ignoring" << endl; + dout(7) << "inode_update on " << m->get_ino() << ", don't have it, sending expire" << endl; + MCacheExpire *expire = new MCacheExpire(mds->get_nodeid()); + expire->add_inode(m->get_ino(), m->get_nonce()); + mds->send_message_mds(expire, m->get_source().num(), MDS_PORT_CACHE); + goto out; + } + + if (in->is_auth()) { + dout(7) << "inode_update on " << *in << ", but i'm the authority!" 
<< endl; + assert(0); // this should never happen + } + + dout(7) << "inode_update on " << *in << endl; + + // update! NOTE dir_auth is unaffected by this. + in->decode_basic_state(m->get_payload()); + + out: + // done + delete m; +} +*/ + + + +void MDCache::handle_cache_expire(MCacheExpire *m) +{ + int from = m->get_from(); + int source = MSG_ADDR_NUM(m->get_source()); + map proxymap; + + if (m->get_from() == source) { + dout(7) << "cache_expire from " << from << endl; + } else { + dout(7) << "cache_expire from " << from << " via " << source << endl; + } + + // inodes + for (map::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + CInode *in = get_inode(it->first); + int nonce = it->second; + + if (!in) { + dout(0) << "inode_expire on " << it->first << " from " << from << ", don't have it" << endl; + assert(in); // i should be authority, or proxy .. and pinned + } + if (!in->is_auth()) { + int newauth = in->authority(); + dout(7) << "proxy inode expire on " << *in << " to " << newauth << endl; + assert(newauth >= 0); + if (!in->state_test(CINODE_STATE_PROXY)) dout(0) << "missing proxy bit on " << *in << endl; + assert(in->state_test(CINODE_STATE_PROXY)); + if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from); + proxymap[newauth]->add_inode(it->first, it->second); + continue; + } + + // check nonce + if (from == mds->get_nodeid()) { + // my cache_expire, and the export_dir giving auth back to me crossed paths! + // we can ignore this. no danger of confusion since the two parties are both me. + dout(7) << "inode_expire on " << *in << " from mds" << from << " .. ME! ignoring." << endl; + } + else if (nonce == in->get_cached_by_nonce(from)) { + // remove from our cached_by + dout(7) << "inode_expire on " << *in << " from mds" << from << " cached_by was " << in->cached_by << endl; + in->cached_by_remove(from); + in->mds_caps_wanted.erase(from); + + // note: this code calls _eval more often than it needs to! 
+      // fix lock
+      if (in->hardlock.is_gathering(from)) {
+        in->hardlock.gather_set.erase(from);
+        if (in->hardlock.gather_set.size() == 0)
+          mds->locker->inode_hard_eval(in);
+      }
+      if (in->filelock.is_gathering(from)) {
+        in->filelock.gather_set.erase(from);
+        if (in->filelock.gather_set.size() == 0)
+          mds->locker->inode_file_eval(in);
+      }
+
+      // alone now?
+      if (!in->is_cached_by_anyone()) {
+        mds->locker->inode_hard_eval(in);
+        mds->locker->inode_file_eval(in);
+      }
+
+    }
+    else {
+      // this is an old nonce, ignore expire.
+      dout(7) << "inode_expire on " << *in << " from mds" << from << " with old nonce " << nonce << " (current " << in->get_cached_by_nonce(from) << "), dropping" << endl;
+      assert(in->get_cached_by_nonce(from) > nonce);
+    }
+  }
+
+  // dirs
+  for (map::iterator it = m->get_dirs().begin();
+       it != m->get_dirs().end();
+       it++) {
+    CInode *diri = get_inode(it->first);
+    // get_inode() returns NULL for an ino we don't have; don't dereference
+    // it, so the "don't have it" message and assert below are what fires.
+    CDir *dir = diri ? diri->dir : 0;
+    int nonce = it->second;
+
+    if (!dir) {
+      dout(0) << "dir_expire on " << it->first << " from " << from << ", don't have it" << endl;
+      assert(dir);  // i should be authority, or proxy ... and pinned
+    }
+    if (!dir->is_auth()) {
+      int newauth = dir->authority();
+      dout(7) << "proxy dir expire on " << *dir << " to " << newauth << endl;
+      if (!dir->is_proxy()) dout(0) << "nonproxy dir expire? " << *dir << " .. auth is " << newauth << " .. expire is from " << from << endl;
+      assert(dir->is_proxy());
+      assert(newauth >= 0);
+      assert(dir->state_test(CDIR_STATE_PROXY));
+      if (proxymap.count(newauth) == 0) proxymap[newauth] = new MCacheExpire(from);
+      proxymap[newauth]->add_dir(it->first, it->second);
+      continue;
+    }
+
+    // check nonce
+    if (from == mds->get_nodeid()) {
+      dout(7) << "dir_expire on " << *dir << " from mds" << from << " .. ME!
ignoring" << endl; + } + else if (nonce == dir->get_open_by_nonce(from)) { + // remove from our cached_by + dout(7) << "dir_expire on " << *dir << " from mds" << from << " open_by was " << dir->open_by << endl; + dir->open_by_remove(from); + } + else { + // this is an old nonce, ignore expire. + dout(7) << "dir_expire on " << *dir << " from mds" << from << " with old nonce " << nonce << " (current " << dir->get_open_by_nonce(from) << "), dropping" << endl; + assert(dir->get_open_by_nonce(from) > nonce); + } + } + + // send proxy forwards + for (map::iterator it = proxymap.begin(); + it != proxymap.end(); + it++) { + dout(7) << "sending proxy forward to " << it->first << endl; + mds->send_message_mds(it->second, it->first, MDS_PORT_CACHE); + } + + // done + delete m; +} + + + +int MDCache::send_dir_updates(CDir *dir, bool bcast) +{ + // this is an FYI, re: replication + + set who = dir->open_by; + if (bcast) + who = mds->get_mds_map()->get_mds(); + + dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << endl; + + string path; + dir->inode->make_path(path); + + int whoami = mds->get_nodeid(); + for (set::iterator it = who.begin(); + it != who.end(); + it++) { + if (*it == whoami) continue; + //if (*it == except) continue; + dout(7) << "sending dir_update on " << *dir << " to " << *it << endl; + + mds->send_message_mds(new MDirUpdate(dir->ino(), + dir->dir_rep, + dir->dir_rep_by, + path, + bcast), + *it, MDS_PORT_CACHE); + } + + return 0; +} + + +void MDCache::handle_dir_update(MDirUpdate *m) +{ + CInode *in = get_inode(m->get_ino()); + if (!in || !in->dir) { + dout(5) << "dir_update on " << m->get_ino() << ", don't have it" << endl; + + // discover it? + if (m->should_discover()) { + m->tried_discover(); // only once! 
+ vector trace; + filepath path = m->get_path(); + + dout(5) << "trying discover on dir_update for " << path << endl; + + int r = path_traverse(path, trace, true, + m, new C_MDS_RetryMessage(mds, m), + MDS_TRAVERSE_DISCOVER); + if (r > 0) + return; + if (r == 0) { + assert(in); + open_remote_dir(in, new C_MDS_RetryMessage(mds, m)); + return; + } + assert(0); + } + + goto out; + } + + // update + dout(5) << "dir_update on " << *in->dir << endl; + in->dir->dir_rep = m->get_dir_rep(); + in->dir->dir_rep_by = m->get_dir_rep_by(); + + // done + out: + delete m; +} + + + + + +class C_MDC_DentryUnlink : public Context { +public: + MDCache *mdc; + CDentry *dn; + CDir *dir; + Context *c; + C_MDC_DentryUnlink(MDCache *mdc, CDentry *dn, CDir *dir, Context *c) { + this->mdc = mdc; + this->dn = dn; + this->dir = dir; + this->c = c; + } + void finish(int r) { + assert(r == 0); + mdc->dentry_unlink_finish(dn, dir, c); + } +}; + + +// NAMESPACE FUN + +void MDCache::dentry_unlink(CDentry *dn, Context *c) +{ + CDir *dir = dn->dir; + string dname = dn->name; + + assert(dn->lockstate == DN_LOCK_XLOCK); + + // i need the inode to do any of this properly + assert(dn->inode); + + // log it + if (dn->inode) dn->inode->mark_unsafe(); // XXX ??? FIXME + mds->mdlog->submit_entry(new EUnlink(dir, dn, dn->inode), + NULL); // FIXME FIXME FIXME + + // tell replicas + if (dir->is_open_by_anyone()) { + for (set::iterator it = dir->open_by_begin(); + it != dir->open_by_end(); + it++) { + dout(7) << "inode_unlink sending DentryUnlink to " << *it << endl; + + mds->send_message_mds(new MDentryUnlink(dir->ino(), dn->name), *it, MDS_PORT_CACHE); + } + + // don't need ack. + } + + + // inode deleted? + if (dn->is_primary()) { + assert(dn->inode->is_auth()); + dn->inode->inode.nlink--; + + if (dn->inode->is_dir()) assert(dn->inode->inode.nlink == 0); // no hard links on dirs + + // last link? 
+ if (dn->inode->inode.nlink == 0) { + // truly dangling + if (dn->inode->dir) { + // mark dir clean too, since it now dne! + assert(dn->inode->dir->is_auth()); + dn->inode->dir->state_set(CDIR_STATE_DELETED); + dn->inode->dir->remove_null_dentries(); + dn->inode->dir->mark_clean(); + } + + // mark it clean, it's dead + if (dn->inode->is_dirty()) + dn->inode->mark_clean(); + + } else { + // migrate to inode file + dout(7) << "removed primary, but there are remote links, moving to inode file: " << *dn->inode << endl; + + // dangling but still linked. + assert(dn->inode->is_anchored()); + + // unlink locally + CInode *in = dn->inode; + dn->dir->unlink_inode( dn ); + dn->mark_dirty(); + + // mark it dirty! + in->mark_dirty(); + + // update anchor to point to inode file+mds + vector atrace; + in->make_anchor_trace(atrace); + assert(atrace.size() == 1); // it's dangling + mds->anchorclient->update(in->ino(), atrace, + new C_MDC_DentryUnlink(this, dn, dir, c)); + return; + } + } + else if (dn->is_remote()) { + // need to dec nlink on primary + if (dn->inode->is_auth()) { + // awesome, i can do it + dout(7) << "remote target is local, nlink--" << endl; + dn->inode->inode.nlink--; + dn->inode->mark_dirty(); + + if (( dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 0) || + (!dn->inode->state_test(CINODE_STATE_DANGLING) && dn->inode->inode.nlink == 1)) { + dout(7) << "nlink=1+primary or 0+dangling, removing anchor" << endl; + + // remove anchor (async) + mds->anchorclient->destroy(dn->inode->ino(), NULL); + } + } else { + int auth = dn->inode->authority(); + dout(7) << "remote target is remote, sending unlink request to " << auth << endl; + + mds->send_message_mds(new MInodeUnlink(dn->inode->ino(), mds->get_nodeid()), + auth, MDS_PORT_CACHE); + + // unlink locally + CInode *in = dn->inode; + dn->dir->unlink_inode( dn ); + dn->mark_dirty(); + + // add waiter + in->add_waiter(CINODE_WAIT_UNLINK, c); + return; + } + } + else + assert(0); // unlink on 
null dentry?? + + // unlink locally + dn->dir->unlink_inode( dn ); + dn->mark_dirty(); + + // finish! + dentry_unlink_finish(dn, dir, c); +} + + +void MDCache::dentry_unlink_finish(CDentry *dn, CDir *dir, Context *c) +{ + dout(7) << "dentry_unlink_finish on " << *dn << endl; + string dname = dn->name; + + // unpin dir / unxlock + mds->locker->dentry_xlock_finish(dn, true); // quiet, no need to bother replicas since they're already unlinking + + // did i empty out an imported dir? + if (dir->is_import() && !dir->inode->is_root() && dir->get_size() == 0) + migrator->export_empty_import(dir); + + // wake up any waiters + dir->take_waiting(CDIR_WAIT_ANY, dname, mds->finished_queue); + + c->finish(0); +} + + + + +void MDCache::handle_dentry_unlink(MDentryUnlink *m) +{ + CInode *diri = get_inode(m->get_dirino()); + CDir *dir = 0; + if (diri) dir = diri->dir; + + if (!diri || !dir) { + dout(7) << "handle_dentry_unlink don't have dir " << m->get_dirino() << endl; + } + else { + CDentry *dn = dir->lookup(m->get_dn()); + if (!dn) { + dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << endl; + } else { + dout(7) << "handle_dentry_unlink on " << *dn << endl; + + // dir? + if (dn->inode) { + if (dn->inode->dir) { + dn->inode->dir->state_set(CDIR_STATE_DELETED); + dn->inode->dir->remove_null_dentries(); + } + } + + string dname = dn->name; + + // unlink + dn->dir->remove_dentry(dn); + + // wake up + //dir->finish_waiting(CDIR_WAIT_DNREAD, dname); + dir->take_waiting(CDIR_WAIT_DNREAD, dname, mds->finished_queue); + } + } + + delete m; + return; +} + + +void MDCache::handle_inode_unlink(MInodeUnlink *m) +{ + CInode *in = get_inode(m->get_ino()); + assert(in); + + // proxy? + if (in->is_proxy()) { + dout(7) << "handle_inode_unlink proxy on " << *in << endl; + mds->send_message_mds(m, in->authority(), MDS_PORT_CACHE); + return; + } + assert(in->is_auth()); + + // do it. 
+  dout(7) << "handle_inode_unlink nlink=" << in->inode.nlink << " on " << *in << endl;
+  assert(in->inode.nlink > 0);
+  in->inode.nlink--;
+
+  if (in->state_test(CINODE_STATE_DANGLING)) {
+    // already dangling.
+    // last link?
+    if (in->inode.nlink == 0) {
+      dout(7) << "last link, marking clean and removing anchor" << endl;
+
+      in->mark_clean();       // mark it clean.
+
+      // remove anchor (async)
+      mds->anchorclient->destroy(in->ino(), NULL);
+    }
+    else {
+      in->mark_dirty();
+    }
+  } else {
+    // has primary link still.
+    assert(in->inode.nlink >= 1);
+    in->mark_dirty();
+
+    if (in->inode.nlink == 1) {
+      dout(7) << "nlink=1, removing anchor" << endl;
+
+      // remove anchor (async)
+      mds->anchorclient->destroy(in->ino(), NULL);
+    }
+  }
+
+  // ack
+  mds->send_message_mds(new MInodeUnlinkAck(m->get_ino()), m->get_from(), MDS_PORT_CACHE);
+
+  // request fully handled (the proxy case above forwarded m and returned),
+  // so free it here, as handle_inode_link does after sending its ack.
+  delete m;
+}
+
+/** handle_inode_unlink_ack(m)
+ * Wake everyone waiting on CINODE_WAIT_UNLINK for inode m->get_ino()
+ * with result 0, then delete m.
+ */
+void MDCache::handle_inode_unlink_ack(MInodeUnlinkAck *m)
+{
+  CInode *in = get_inode(m->get_ino());
+  assert(in);
+
+  dout(7) << "handle_inode_unlink_ack on " << *in << endl;
+  in->finish_waiting(CINODE_WAIT_UNLINK, 0);
+
+  // done with the message
+  delete m;
+}
+
+
+
+
+
+
+
+
+
+
+/*
+ * some import/export helpers
+ */
+
+/** con = get_auth_container(dir)
+ * Returns the directory in which authority is delegated for *dir.
+ * This may be because a directory is an import, or because it is hashed
+ * and we are nested underneath an inode in that dir (that hashes to us).
+ * Thus do not assume con->is_auth()!  It is_auth() || is_hashed().
+ */ +CDir *MDCache::get_auth_container(CDir *dir) +{ + CDir *imp = dir; // might be *dir + + // find the underlying import or hash that delegates dir + while (true) { + if (imp->is_import()) break; // import + imp = imp->get_parent_dir(); + assert(imp); + if (imp->is_hashed()) break; // hash + } + + return imp; +} + + +void MDCache::find_nested_exports(CDir *dir, set& s) +{ + CDir *import = get_auth_container(dir); + find_nested_exports_under(import, dir, s); +} + +void MDCache::find_nested_exports_under(CDir *import, CDir *dir, set& s) +{ + dout(10) << "find_nested_exports for " << *dir << endl; + dout(10) << "find_nested_exports_under import " << *import << endl; + + if (import == dir) { + // yay, my job is easy! + for (set::iterator p = nested_exports[import].begin(); + p != nested_exports[import].end(); + p++) { + CDir *nested = *p; + s.insert(nested); + dout(10) << "find_nested_exports " << *dir << " " << *nested << endl; + } + return; + } + + // ok, my job is annoying. + for (set::iterator p = nested_exports[import].begin(); + p != nested_exports[import].end(); + p++) { + CDir *nested = *p; + + dout(12) << "find_nested_exports checking " << *nested << endl; + + // trace back to import, or dir + CDir *cur = nested->get_parent_dir(); + while (!cur->is_import() || cur == dir) { + if (cur == dir) { + s.insert(nested); + dout(10) << "find_nested_exports " << *dir << " " << *nested << endl; + break; + } else { + cur = cur->get_parent_dir(); + } + } + } +} + + + + + + + + + + + + + + + + + + +// ============================================================== +// debug crap + + +void MDCache::show_imports() +{ + mds->balancer->show_imports(); +} + + +void MDCache::show_cache() +{ + dout(7) << "show_cache" << endl; + for (hash_map::iterator it = inode_map.begin(); + it != inode_map.end(); + it++) { + dout(7) << *((*it).second) << endl; + + CDentry *dn = (*it).second->get_parent_dn(); + if (dn) + dout(7) << " dn " << *dn << endl; + if ((*it).second->dir) + dout(7) << " 
subdir " << *(*it).second->dir << endl; + } +} + diff --git a/branches/sage/cephmds2/mds/MDCache.h b/branches/sage/cephmds2/mds/MDCache.h new file mode 100644 index 0000000000000..e62113312447f --- /dev/null +++ b/branches/sage/cephmds2/mds/MDCache.h @@ -0,0 +1,282 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#ifndef __MDCACHE_H +#define __MDCACHE_H + +#include +#include +#include +#include +#include + +#include "include/types.h" +#include "include/filepath.h" + +#include "CInode.h" +#include "CDentry.h" +#include "CDir.h" +#include "Lock.h" + + +class MDS; +class Migrator; +class Renamer; + +class Logger; + +class Message; + +class MDiscover; +class MDiscoverReply; +class MCacheExpire; +class MDirUpdate; +class MDentryUnlink; +class MLock; + + +class MClientRequest; + + +// MDCache + +//typedef const char* pchar; + + + +/** active_request_t + * state we track for requests we are currently processing. + * mostly information about locks held, so that we can drop them all + * the request is finished or forwarded. see request_*(). 
+ */ +typedef struct { + CInode *ref; // reference inode + set< CInode* > request_pins; + set< CDir* > request_dir_pins; + map< CDentry*, vector > traces; // path pins held + set< CDentry* > xlocks; // xlocks (local) + set< CDentry* > foreign_xlocks; // xlocks on foreign hosts +} active_request_t; + +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(const Message *p) const { + static hash H; + return H((unsigned long)p); + } + }; +} + +class MDCache { + protected: + // my master + MDS *mds; + + // the cache + CInode *root; // root inode + LRU lru; // lru for expiring items + hash_map inode_map; // map of inodes by ino + + // root + list waiting_for_root; + + // imports, exports, and hashes. + set imports; // includes root (on mds0) + set exports; + set hashdirs; + map > nested_exports; // exports nested under imports _or_ hashdirs + + // active MDS requests + hash_map active_requests; + + // inode purging + map purging; + map > waiting_for_purge; + + // shutdown crap + int shutdown_commits; + bool did_shutdown_exports; + friend class C_MDC_ShutdownCommit; + + friend class CInode; + friend class Locker; + friend class Migrator; + friend class Renamer; + friend class MDBalancer; + + public: + // subsystems + Migrator *migrator; + Renamer *renamer; + + public: + MDCache(MDS *m); + ~MDCache(); + + // debug + void log_stat(Logger *logger); + + // root inode + CInode *get_root() { return root; } + void set_root(CInode *r); + + void add_import(CDir *dir); + void remove_import(CDir *dir); + + // cache + void set_cache_size(size_t max) { lru.lru_set_max(max); } + size_t get_cache_size() { return lru.lru_get_size(); } + bool trim(int max = -1); // trim cache + + // shutdown + void shutdown_start(); + void shutdown_check(); + bool shutdown_pass(); + bool shutdown(); // clear cache (ie at shutodwn) + + // inode_map + bool have_inode( inodeno_t ino ) { return inode_map.count(ino) ? 
true:false; } + CInode* get_inode( inodeno_t ino ) { + if (have_inode(ino)) + return inode_map[ ino ]; + return NULL; + } + + public: + CInode *create_inode(); + void add_inode(CInode *in); + + protected: + void remove_inode(CInode *in); + void destroy_inode(CInode *in); + void touch_inode(CInode *in) { + // touch parent(s) too + if (in->get_parent_dir()) touch_inode(in->get_parent_dir()->inode); + + // top or mid, depending on whether i'm auth + if (in->is_auth()) + lru.lru_touch(in); + else + lru.lru_midtouch(in); + } + void rename_file(CDentry *srcdn, CDentry *destdn); + + public: + // inode purging + void purge_inode(inode_t& inode); + void purge_inode_finish(inodeno_t ino); + void purge_inode_finish_2(inodeno_t ino); + void waitfor_purge(inodeno_t ino, Context *c); + void start_recovered_purges(); + + + protected: + // private methods + CDir *get_auth_container(CDir *in); + void find_nested_exports(CDir *dir, set& s); + void find_nested_exports_under(CDir *import, CDir *dir, set& s); + + + public: + int open_root(Context *c); + int path_traverse(filepath& path, vector& trace, bool follow_trailing_sym, + Message *req, Context *ondelay, + int onfail, + Context *onfinish=0, + bool is_client_req = false); + void open_remote_dir(CInode *diri, Context *fin); + void open_remote_ino(inodeno_t ino, Message *req, Context *fin); + void open_remote_ino_2(inodeno_t ino, Message *req, + vector& anchortrace, + Context *onfinish); + + bool path_pin(vector& trace, Message *m, Context *c); + void path_unpin(vector& trace, Message *m); + void make_trace(vector& trace, CInode *in); + + bool request_start(Message *req, + CInode *ref, + vector& trace); + void request_cleanup(Message *req); + void request_finish(Message *req); + void request_forward(Message *req, int mds, int port=0); + void request_pin_inode(Message *req, CInode *in); + void request_pin_dir(Message *req, CDir *dir); + + // anchors + void anchor_inode(CInode *in, Context *onfinish); + //void unanchor_inode(CInode 
*in, Context *c); + + void handle_inode_link(class MInodeLink *m); + void handle_inode_link_ack(class MInodeLinkAck *m); + + // == messages == + public: + void dispatch(Message *m); + + protected: + // -- replicas -- + void handle_discover(MDiscover *dis); + void handle_discover_reply(MDiscoverReply *m); + + + // -- namespace -- + // these handle logging, cache sync themselves. + // UNLINK + public: + void dentry_unlink(CDentry *in, Context *c); + protected: + void dentry_unlink_finish(CDentry *in, CDir *dir, Context *c); + void handle_dentry_unlink(MDentryUnlink *m); + void handle_inode_unlink(class MInodeUnlink *m); + void handle_inode_unlink_ack(class MInodeUnlinkAck *m); + friend class C_MDC_DentryUnlink; + + + + // -- misc auth -- + int ino_proxy_auth(inodeno_t ino, + int frommds, + map >& inomap); + void do_ino_proxy(CInode *in, Message *m); + void do_dir_proxy(CDir *dir, Message *m); + + + + + // -- updates -- + //int send_inode_updates(CInode *in); + //void handle_inode_update(MInodeUpdate *m); + + int send_dir_updates(CDir *in, bool bcast=false); + void handle_dir_update(MDirUpdate *m); + + void handle_cache_expire(MCacheExpire *m); + + + + // == crap fns == + public: + void dump() { + if (root) root->dump(); + } + + void show_imports(); + void show_cache(); + +}; + + +#endif diff --git a/branches/sage/cephmds2/mds/MDLog.cc b/branches/sage/cephmds2/mds/MDLog.cc new file mode 100644 index 0000000000000..b272eb9a176d6 --- /dev/null +++ b/branches/sage/cephmds2/mds/MDLog.cc @@ -0,0 +1,371 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "MDLog.h" +#include "MDS.h" +#include "LogEvent.h" + +#include "osdc/Journaler.h" + +#include "common/LogType.h" +#include "common/Logger.h" + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " +#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds_log) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".log " + +// cons/des + +LogType mdlog_logtype; + +MDLog::MDLog(MDS *m) +{ + mds = m; + num_events = 0; + waiting_for_read = false; + + max_events = g_conf.mds_log_max_len; + + unflushed = 0; + + // logger + char name[80]; + sprintf(name, "mds%d.log", mds->get_nodeid()); + logger = new Logger(name, &mdlog_logtype); + + static bool didit = false; + if (!didit) { + mdlog_logtype.add_inc("add"); + mdlog_logtype.add_inc("retire"); + mdlog_logtype.add_inc("obs"); + mdlog_logtype.add_inc("trim"); + mdlog_logtype.add_set("size"); + mdlog_logtype.add_set("read"); + mdlog_logtype.add_set("append"); + mdlog_logtype.add_inc("lsum"); + mdlog_logtype.add_inc("lnum"); + } + + // inode + memset(&log_inode, 0, sizeof(log_inode)); + log_inode.ino = MDS_INO_LOG_OFFSET + mds->get_nodeid(); + log_inode.layout = g_OSD_MDLogLayout; + + if (g_conf.mds_local_osd) { + log_inode.layout.object_layout = OBJECT_LAYOUT_STARTOSD; + log_inode.layout.osd = mds->get_nodeid() + 10000; // hack + } + + // log streamer + journaler = new Journaler(log_inode, mds->objecter, logger); + +} + + +MDLog::~MDLog() +{ + if (journaler) { delete journaler; journaler = 0; } + if (logger) { delete logger; logger = 0; } +} + + +void MDLog::reset() +{ + journaler->reset(); +} + +void MDLog::open(Context *c) +{ + dout(5) << "open discovering log bounds" << endl; + journaler->recover(c); +} + +void MDLog::write_head(Context *c) +{ + journaler->write_head(c); +} + + +void MDLog::submit_entry( LogEvent *le, + Context *c ) +{ + dout(5) << "submit_entry " << 
journaler->get_write_pos() << " : " << *le << endl; + + if (g_conf.mds_log) { + // encode it, with event type + bufferlist bl; + bl.append((char*)&le->_type, sizeof(le->_type)); + le->encode_payload(bl); + + // journal it. + journaler->append_entry(bl); + + delete le; + num_events++; + + logger->inc("add"); + logger->set("size", num_events); + logger->set("append", journaler->get_write_pos()); + + if (c) { + unflushed = 0; + journaler->flush(c); + } + else + unflushed++; + + } else { + // hack: log is disabled. + if (c) { + c->finish(0); + delete c; + } + } +} + +void MDLog::wait_for_sync( Context *c ) +{ + if (g_conf.mds_log) { + // wait + journaler->flush(c); + } else { + // hack: bypass. + c->finish(0); + delete c; + } +} + +void MDLog::flush() +{ + if (unflushed) + journaler->flush(); + unflushed = 0; + + // trim + trim(NULL); +} + + + + +// trim + +class C_MDL_Trimmed : public Context { +public: + MDLog *mdl; + LogEvent *le; + + C_MDL_Trimmed(MDLog *mdl, LogEvent *le) { + this->mdl = mdl; + this->le = le; + } + void finish(int res) { + mdl->_trimmed(le); + } +}; + +class C_MDL_Reading : public Context { +public: + MDLog *mdl; + C_MDL_Reading(MDLog *m) { + mdl = m; + } + void finish(int res) { + mdl->_did_read(); + } +}; + + +void MDLog::_did_read() +{ + dout(5) << "_did_read()" << endl; + waiting_for_read = false; + trim(0); +} + +void MDLog::_trimmed(LogEvent *le) +{ + dout(7) << " trimmed " << *le << endl; + + assert(le->can_expire(mds)); + + if (trimming.begin()->first == le->_end_off) { + // front! we can expire the log a bit + journaler->set_expire_pos(le->_end_off); + } + + trimming.erase(le->_end_off); + delete le; + + logger->set("trim", trimming.size()); + logger->set("read", journaler->get_read_pos()); + + trim(0); +} + + + +void MDLog::trim(Context *c) +{ + // add waiter + if (c) + trim_waiters.push_back(c); + + // trim! 
+ while (num_events > max_events) { + + off_t gap = journaler->get_write_pos() - journaler->get_read_pos(); + dout(5) << "trim num_events " << num_events << " > max " << max_events + << ", trimming " << trimming.size() + << ", byte gap " << gap + << endl; + + if ((int)trimming.size() >= g_conf.mds_log_max_trimming) { + dout(7) << "trim already trimming max, waiting" << endl; + return; + } + + bufferlist bl; + if (journaler->try_read_entry(bl)) { + // decode logevent + LogEvent *le = LogEvent::decode(bl); + le->_end_off = journaler->get_read_pos(); + num_events--; + + // we just read an event. + if (le->can_expire(mds) == true) { + // obsolete + dout(7) << "trim obsolete: " << *le << endl; + delete le; + logger->inc("obs"); + } else { + assert ((int)trimming.size() < g_conf.mds_log_max_trimming); + + // trim! + dout(7) << "trim trimming: " << *le << endl; + trimming[le->_end_off] = le; + le->retire(mds, new C_MDL_Trimmed(this, le)); + logger->inc("retire"); + logger->set("trim", trimming.size()); + } + logger->set("read", journaler->get_read_pos()); + logger->set("size", num_events); + } else { + // need to read! + if (!waiting_for_read) { + waiting_for_read = true; + dout(7) << "trim waiting for read" << endl; + journaler->wait_for_readable(new C_MDL_Reading(this)); + } else { + dout(7) << "trim already waiting for read" << endl; + } + return; + } + } + + dout(5) << "trim num_events " << num_events << " <= max " << max_events + << ", trimming " << trimming.size() + << ", done for now." + << endl; + + // trimmed! + std::list finished; + finished.swap(trim_waiters); + finish_contexts(finished, 0); +} + + +void MDLog::replay(Context *c) +{ + assert(journaler->is_active()); + + // start reading at the last known expire point. + journaler->set_read_pos( journaler->get_expire_pos() ); + + // empty? + if (journaler->get_read_pos() == journaler->get_write_pos()) { + dout(10) << "replay - journal empty, done." 
<< endl; + if (c) { + c->finish(0); + delete c; + } + return; + } + + // add waiter + if (c) + waitfor_replay.push_back(c); + + // go! + dout(10) << "replay start, from " << journaler->get_read_pos() + << " to " << journaler->get_write_pos() << endl; + + assert(num_events == 0); + + _replay(); +} + +class C_MDL_Replay : public Context { + MDLog *mdlog; +public: + C_MDL_Replay(MDLog *l) : mdlog(l) {} + void finish(int r) { mdlog->_replay(); } +}; + +void MDLog::_replay() +{ + // read what's buffered + while (journaler->is_readable() && + journaler->get_read_pos() < journaler->get_write_pos()) { + // read it + off_t pos = journaler->get_read_pos(); + bufferlist bl; + bool r = journaler->try_read_entry(bl); + assert(r); + + // unpack event + LogEvent *le = LogEvent::decode(bl); + num_events++; + + if (le->has_happened(mds)) { + dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() + << " : " << *le << " : already happened" << endl; + } else { + dout(10) << "_replay " << pos << " / " << journaler->get_write_pos() + << " : " << *le << " : applying" << endl; + le->replay(mds); + } + delete le; + } + + // wait for read? + if (journaler->get_read_pos() < journaler->get_write_pos()) { + journaler->wait_for_readable(new C_MDL_Replay(this)); + return; + } + + // done! 
+ assert(journaler->get_read_pos() == journaler->get_write_pos()); + dout(10) << "_replay - complete" << endl; + + // move read pointer _back_ to expire pos, for eventual trimming + journaler->set_read_pos(journaler->get_expire_pos()); + + // kick waiter(s) + list ls; + ls.swap(waitfor_replay); + finish_contexts(ls,0); +} + + diff --git a/branches/sage/cephmds2/mds/MDLog.h b/branches/sage/cephmds2/mds/MDLog.h new file mode 100644 index 0000000000000..37329a164e781 --- /dev/null +++ b/branches/sage/cephmds2/mds/MDLog.h @@ -0,0 +1,91 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MDLOG_H +#define __MDLOG_H + +#include "include/types.h" +#include "include/Context.h" + +#include + +//#include +//using __gnu_cxx::hash_mapset; + +class Journaler; +class LogEvent; +class MDS; + +class Logger; + +/* +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(const LogEvent *p) const { + static hash H; + return H((unsigned long)p); + } + }; +} +*/ + +class MDLog { + protected: + MDS *mds; + size_t num_events; // in events + size_t max_events; + + int unflushed; + + inode_t log_inode; + Journaler *journaler; + + + //hash_map trimming; // events currently being trimmed + map trimming; + std::list trim_waiters; // contexts waiting for trim + bool trim_reading; + + bool waiting_for_read; + friend class C_MDL_Reading; + + Logger *logger; + + list waitfor_replay; + + public: + MDLog(MDS *m); + ~MDLog(); + + void set_max_events(size_t max) { max_events = max; } + size_t get_max_events() { return max_events; } + size_t get_num_events() { return num_events + trimming.size(); } + + void submit_entry( LogEvent *e, Context 
*c = 0 ); + void wait_for_sync( Context *c ); + void flush(); + + void trim(Context *c); + void _did_read(); + void _trimmed(LogEvent *le); + + void reset(); // fresh, empty log! + void open(Context *onopen); + void write_head(Context *onfinish); + + void replay(Context *onfinish); + void _replay(); +}; + +#endif diff --git a/branches/sage/cephmds2/mds/MDS.cc b/branches/sage/cephmds2/mds/MDS.cc new file mode 100644 index 0000000000000..a487d6469eb7a --- /dev/null +++ b/branches/sage/cephmds2/mds/MDS.cc @@ -0,0 +1,692 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "include/types.h" +#include "common/Clock.h" + +#include "msg/Messenger.h" + +#include "osd/OSDMap.h" +#include "osdc/Objecter.h" +#include "osdc/Filer.h" + +#include "MDSMap.h" + +#include "MDS.h" +#include "Server.h" +#include "Locker.h" +#include "MDCache.h" +#include "MDStore.h" +#include "MDLog.h" +#include "MDBalancer.h" +#include "IdAllocator.h" +#include "Migrator.h" +#include "Renamer.h" + +#include "AnchorTable.h" +#include "AnchorClient.h" + +#include "common/Logger.h" +#include "common/LogType.h" + +#include "common/Timer.h" + +#include "messages/MMDSMap.h" +#include "messages/MMDSBoot.h" + +#include "messages/MPing.h" +#include "messages/MPingAck.h" +#include "messages/MGenericMessage.h" + +#include "messages/MOSDMap.h" +#include "messages/MOSDGetMap.h" + + +LogType mds_logtype, mds_cache_logtype; + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << " " +#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << 
whoami << " " + + + + + +// cons/des +MDS::MDS(int whoami, Messenger *m, MonMap *mm) { + this->whoami = whoami; + + monmap = mm; + messenger = m; + + mdsmap = new MDSMap; + osdmap = new OSDMap; + + objecter = new Objecter(messenger, monmap, osdmap); + filer = new Filer(objecter); + + mdcache = new MDCache(this); + mdstore = new MDStore(this); + mdlog = new MDLog(this); + balancer = new MDBalancer(this); + + anchorclient = new AnchorClient(messenger, mdsmap); + + // alloc + { + inode_t id_inode; + memset(&id_inode, 0, sizeof(id_inode)); + id_inode.ino = MDS_INO_IDS_OFFSET + whoami; + id_inode.layout = g_OSD_FileLayout; + idalloc = new IdAllocator(this, id_inode); + } + + // hack: anchortable on mds0. + if (whoami == 0) + anchormgr = new AnchorTable(this); + else + anchormgr = 0; + + + server = new Server(this); + locker = new Locker(this, mdcache); + + + req_rate = 0; + + state = STATE_BOOTING; + + last_balancer_hash = last_balancer_heartbeat = g_clock.recent_now(); + + // log + string name; + name = "mds"; + int w = whoami; + if (w >= 1000) name += ('0' + ((w/1000)%10)); + if (w >= 100) name += ('0' + ((w/100)%10)); + if (w >= 10) name += ('0' + ((w/10)%10)); + name += ('0' + ((w/1)%10)); + + logger = new Logger(name, (LogType*)&mds_logtype); + + mds_logtype.add_inc("req"); + mds_logtype.add_inc("reply"); + mds_logtype.add_inc("fw"); + mds_logtype.add_inc("cfw"); + + mds_logtype.add_set("l"); + mds_logtype.add_set("q"); + mds_logtype.add_set("popanyd"); + mds_logtype.add_set("popnest"); + + mds_logtype.add_inc("lih"); + mds_logtype.add_inc("lif"); + + mds_logtype.add_set("c"); + mds_logtype.add_set("ctop"); + mds_logtype.add_set("cbot"); + mds_logtype.add_set("cptail"); + mds_logtype.add_set("cpin"); + mds_logtype.add_inc("cex"); + mds_logtype.add_inc("dis"); + mds_logtype.add_inc("cmiss"); + + mds_logtype.add_set("buf"); + mds_logtype.add_inc("cdir"); + mds_logtype.add_inc("fdir"); + + mds_logtype.add_inc("iex"); + mds_logtype.add_inc("iim"); + 
mds_logtype.add_inc("ex"); + mds_logtype.add_inc("im"); + mds_logtype.add_inc("imex"); + mds_logtype.add_set("nex"); + mds_logtype.add_set("nim"); + + + char n[80]; + sprintf(n, "mds%d.cache", whoami); + logger2 = new Logger(n, (LogType*)&mds_cache_logtype); + + + // i'm ready! + messenger->set_dispatcher(this); +} + +MDS::~MDS() { + if (mdcache) { delete mdcache; mdcache = NULL; } + if (mdstore) { delete mdstore; mdstore = NULL; } + if (mdlog) { delete mdlog; mdlog = NULL; } + if (balancer) { delete balancer; balancer = NULL; } + if (idalloc) { delete idalloc; idalloc = NULL; } + if (anchormgr) { delete anchormgr; anchormgr = NULL; } + if (anchorclient) { delete anchorclient; anchorclient = NULL; } + if (osdmap) { delete osdmap; osdmap = 0; } + + if (filer) { delete filer; filer = 0; } + if (objecter) { delete objecter; objecter = 0; } + if (messenger) { delete messenger; messenger = NULL; } + + if (logger) { delete logger; logger = 0; } + if (logger2) { delete logger2; logger2 = 0; } + +} + + +void MDS::send_message_mds(Message *m, int mds, int port, int fromport) +{ + if (port && !fromport) + fromport = port; + messenger->send_message(m, MSG_ADDR_MDS(mds), mdsmap->get_inst(mds), port, fromport); +} + + +int MDS::init() +{ + // request osd map + dout(5) << "requesting mds and osd maps from mon" << endl; + int mon = monmap->pick_mon(); + messenger->send_message(new MMDSBoot, MSG_ADDR_MON(mon), monmap->get_inst(mon)); + return 0; +} + + +void MDS::handle_mds_map(MMDSMap *m) +{ + map::reverse_iterator p = m->maps.rbegin(); + + dout(1) << "handle_mds_map epoch " << p->first << endl; + mdsmap->decode(p->second); + + delete m; + + if (is_booting()) { + // we need an osdmap too. + int mon = monmap->pick_mon(); + messenger->send_message(new MOSDGetMap(0), + MSG_ADDR_MON(mon), monmap->get_inst(mon)); + } +} + +void MDS::handle_osd_map(MOSDMap *m) +{ + // process locally + objecter->handle_osd_map(m); + + if (is_booting()) { + // we got our maps. mkfs for recovery? 
+ if (g_conf.mkfs) + boot_mkfs(); + else + boot_recover(); + } + + // pass on to clients + for (set::iterator it = clientmap.get_mount_set().begin(); + it != clientmap.get_mount_set().end(); + it++) { + MOSDMap *n = new MOSDMap; + n->maps = m->maps; + n->incremental_maps = m->incremental_maps; + messenger->send_message(n, MSG_ADDR_CLIENT(*it), clientmap.get_inst(*it)); + } +} + + +class C_MDS_MkfsFinish : public Context { + MDS *mds; +public: + C_MDS_MkfsFinish(MDS *m) : mds(m) {} + void finish(int r) { mds->boot_mkfs_finish(); } +}; + +void MDS::boot_mkfs() +{ + dout(3) << "boot_mkfs" << endl; + + C_Gather *fin = new C_Gather(new C_MDS_MkfsFinish(this)); + + if (whoami == 0) { + dout(3) << "boot_mkfs - creating root inode and dir" << endl; + + // create root inode. + mdcache->open_root(0); + CInode *root = mdcache->get_root(); + assert(root); + + // force empty root dir + CDir *dir = root->dir; + dir->mark_complete(); + dir->mark_dirty(); + + // save it + mdstore->commit_dir(dir, fin->new_sub()); + } + + // start with a fresh journal + dout(10) << "boot_mkfs creating fresh journal" << endl; + mdlog->reset(); + mdlog->write_head(fin->new_sub()); + + // fixme: fake out idalloc (reset, pretend loaded) + dout(10) << "boot_mkfs creating fresh idalloc table" << endl; + idalloc->reset(); + idalloc->save(fin->new_sub()); + + // fixme: fake out anchortable + if (mdsmap->get_anchortable() == whoami) { + dout(10) << "boot_mkfs creating fresh anchortable" << endl; + anchormgr->reset(); + anchormgr->save(fin->new_sub()); + } +} + +void MDS::boot_mkfs_finish() +{ + dout(3) << "boot_mkfs_finish" << endl; + mark_active(); +} + + +class C_MDS_BootRecover : public Context { + MDS *mds; + int nextstep; +public: + C_MDS_BootRecover(MDS *m, int n) : mds(m), nextstep(n) {} + void finish(int r) { mds->boot_recover(nextstep); } +}; + +void MDS::boot_recover(int step) +{ + if (is_booting()) + state = STATE_RECOVERING; + + switch (step) { + case 0: + if (whoami == 0) { + dout(2) << 
"boot_recover " << step << ": creating root inode" << endl; + mdcache->open_root(0); + step = 1; + // fall-thru + } else { + // FIXME + assert(0); + } + + case 1: + dout(2) << "boot_recover " << step << ": opening idalloc" << endl; + idalloc->load(new C_MDS_BootRecover(this, 2)); + break; + + case 2: + if (mdsmap->get_anchortable() == whoami) { + dout(2) << "boot_recover " << step << ": opening anchor table" << endl; + anchormgr->load(new C_MDS_BootRecover(this, 3)); + break; + } else { + dout(2) << "boot_recover " << step << ": i have no anchor table" << endl; + step++; + } + // fall-thru + + case 3: + dout(2) << "boot_recover " << step << ": opening mds log" << endl; + mdlog->open(new C_MDS_BootRecover(this, 4)); + break; + + case 4: + dout(2) << "boot_recover " << step << ": replaying mds log" << endl; + mdlog->replay(new C_MDS_BootRecover(this, 5)); + break; + + case 5: + dout(2) << "boot_recover " << step << ": restarting any recovered purges" << endl; + mdcache->start_recovered_purges(); + step++; + // fall-thru + + case 6: + dout(2) << "boot_recover " << step << ": done." 
<< endl; + mark_active(); + } +} + + + +void MDS::mark_active() +{ + dout(3) << "mark_active" << endl; + state = STATE_ACTIVE; + finish_contexts(waitfor_active); // kick waiters +} + + + + + +int MDS::shutdown_start() +{ + dout(1) << "shutdown_start" << endl; + derr(0) << "mds shutdown start" << endl; + + for (set::iterator p = mdsmap->get_mds().begin(); + p != mdsmap->get_mds().end(); + p++) { + dout(1) << "sending MShutdownStart to mds" << *p << endl; + send_message_mds(new MGenericMessage(MSG_MDS_SHUTDOWNSTART), + *p, MDS_PORT_MAIN); + } + + if (idalloc) idalloc->shutdown(); + + handle_shutdown_start(NULL); + return 0; +} + + +void MDS::handle_shutdown_start(Message *m) +{ + dout(1) << " handle_shutdown_start" << endl; + + // set flag + state = STATE_STOPPING; + + mdcache->shutdown_start(); + + // save anchor table + if (whoami == 0) + anchormgr->save(0); // FIXME FIXME + + // flush log + mdlog->set_max_events(0); + mdlog->trim(NULL); + + if (m) delete m; + + //g_conf.debug_mds = 10; +} + + + +int MDS::shutdown_final() +{ + dout(1) << "shutdown" << endl; + + state = STATE_STOPPED; + + // shut down cache + mdcache->shutdown(); + + // tell monitor + messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), + MSG_ADDR_MON(0), monmap->get_inst(0)); + + // shut down messenger + messenger->shutdown(); + + return 0; +} + + + + +void MDS::dispatch(Message *m) +{ + // make sure we advacne the clock + g_clock.now(); + + // process + mds_lock.Lock(); + my_dispatch(m); + mds_lock.Unlock(); +} + + + +void MDS::my_dispatch(Message *m) +{ + + switch (m->get_dest_port()) { + + case MDS_PORT_ANCHORMGR: + anchormgr->dispatch(m); + break; + case MDS_PORT_ANCHORCLIENT: + anchorclient->dispatch(m); + break; + + case MDS_PORT_CACHE: + mdcache->dispatch(m); + break; + case MDS_PORT_LOCKER: + locker->dispatch(m); + break; + + case MDS_PORT_MIGRATOR: + mdcache->migrator->dispatch(m); + break; + case MDS_PORT_RENAMER: + mdcache->renamer->dispatch(m); + break; + + case MDS_PORT_BALANCER: 
+ balancer->proc_message(m); + break; + + case MDS_PORT_MAIN: + proc_message(m); + break; + + case MDS_PORT_SERVER: + server->dispatch(m); + break; + + default: + dout(1) << "MDS dispatch unknown message port" << m->get_dest_port() << endl; + assert(0); + } + + + // HACK FOR NOW + /* + static bool did_heartbeat_hack = false; + if (!shutting_down && !shut_down && + false && + !did_heartbeat_hack) { + osdmonitor->initiate_heartbeat(); + did_heartbeat_hack = true; + } + */ + + + if (is_active()) { + // flush log to disk after every op. for now. + mdlog->flush(); + + // trim cache + mdcache->trim(); + } + + // finish any triggered contexts + if (finished_queue.size()) { + dout(7) << "mds has " << finished_queue.size() << " queued contexts" << endl; + list ls; + ls.splice(ls.begin(), finished_queue); + assert(finished_queue.empty()); + finish_contexts(ls); + } + + + + // hash root? + if (false && + mdcache->get_root() && + mdcache->get_root()->dir && + !(mdcache->get_root()->dir->is_hashed() || + mdcache->get_root()->dir->is_hashing())) { + dout(0) << "hashing root" << endl; + mdcache->migrator->hash_dir(mdcache->get_root()->dir); + } + + + // periodic crap (1-second resolution) + static utime_t last_log = g_clock.recent_now(); + utime_t now = g_clock.recent_now(); + if (is_active() && + last_log.sec() != now.sec()) { + + // log + last_log = now; + mds_load_t load = balancer->get_load(); + + req_rate = logger->get("req"); + + logger->set("l", (int)load.mds_load()); + logger->set("q", messenger->get_dispatch_queue_len()); + logger->set("buf", buffer_total_alloc); + + mdcache->log_stat(logger); + + + // balance? 
+ static int num_bal_times = g_conf.mds_bal_max; + static utime_t first = g_clock.recent_now(); + utime_t elapsed = now; + elapsed -= first; + if (true && + whoami == 0 && + (num_bal_times || (g_conf.mds_bal_max_until >= 0 && elapsed.sec() > g_conf.mds_bal_max_until)) && + !is_stopping() && !is_stopped() && + now.sec() - last_balancer_heartbeat.sec() >= g_conf.mds_bal_interval) { + last_balancer_heartbeat = now; + balancer->send_heartbeat(); + num_bal_times--; + } + + // hash? + if (true && + g_conf.num_mds > 1 && + now.sec() - last_balancer_hash.sec() > g_conf.mds_bal_hash_interval) { + last_balancer_hash = now; + balancer->do_hashing(); + } + + + + // HACK to test hashing stuff + if (false) { + static map didhash; + if (elapsed.sec() > 15 && !didhash[whoami]) { + CInode *in = mdcache->get_inode(100000010); + if (in && in->dir) { + if (in->dir->is_auth()) + mdcache->migrator->hash_dir(in->dir); + didhash[whoami] = 1; + } + } + if (0 && elapsed.sec() > 25 && didhash[whoami] == 1) { + CInode *in = mdcache->get_inode(100000010); + if (in && in->dir) { + if (in->dir->is_auth() && in->dir->is_hashed()) + mdcache->migrator->unhash_dir(in->dir); + didhash[whoami] = 2; + } + } + } + + + + } + + // HACK to force export to test foreign renames + if (false && whoami == 0) { + static bool didit = false; + + // 7 to 1 + CInode *in = mdcache->get_inode(1001); + if (in && in->is_dir() && !didit) { + CDir *dir = in->get_or_open_dir(this); + if (dir->is_auth()) { + dout(1) << "FORCING EXPORT" << endl; + mdcache->migrator->export_dir(dir,1); + didit = true; + } + } + } + + + + // shut down? 
+ if (is_stopping()) { + if (mdcache->shutdown_pass()) { + dout(7) << "shutdown_pass=true, finished w/ shutdown" << endl; + shutdown_final(); + } + } + +} + + +void MDS::proc_message(Message *m) +{ + switch (m->get_type()) { + // OSD =============== + /* + case MSG_OSD_MKFS_ACK: + handle_osd_mkfs_ack(m); + return; + */ + case MSG_OSD_OPREPLY: + objecter->handle_osd_op_reply((class MOSDOpReply*)m); + return; + case MSG_OSD_MAP: + handle_osd_map((MOSDMap*)m); + return; + + + // MDS + case MSG_MDS_MAP: + handle_mds_map((MMDSMap*)m); + return; + + case MSG_MDS_SHUTDOWNSTART: // mds0 -> mds1+ + handle_shutdown_start(m); + return; + + + + case MSG_PING: + handle_ping((MPing*)m); + return; + } + +} + + + + + + +void MDS::handle_ping(MPing *m) +{ + dout(10) << " received ping from " << MSG_ADDR_NICE(m->get_source()) << " with seq " << m->seq << endl; + + messenger->send_message(new MPingAck(m), + m->get_source(), m->get_source_inst()); + + delete m; +} + diff --git a/branches/sage/cephmds2/mds/MDS.h b/branches/sage/cephmds2/mds/MDS.h new file mode 100644 index 0000000000000..1581d9c4049ca --- /dev/null +++ b/branches/sage/cephmds2/mds/MDS.h @@ -0,0 +1,252 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef __MDS_H +#define __MDS_H + +#include +#include +#include +#include +#include +using namespace std; + +#include +using namespace __gnu_cxx; + +#include "msg/Dispatcher.h" +#include "include/types.h" +#include "include/Context.h" +#include "common/DecayCounter.h" +#include "common/Logger.h" +#include "common/Mutex.h" + +#include "mon/MonMap.h" + +#include "ClientMap.h" + + +#define MDS_PORT_MAIN 0 +#define MDS_PORT_SERVER 1 +#define MDS_PORT_CACHE 2 +#define MDS_PORT_LOCKER 3 +#define MDS_PORT_STORE 4 +#define MDS_PORT_BALANCER 5 +#define MDS_PORT_MIGRATOR 6 +#define MDS_PORT_RENAMER 7 + +#define MDS_PORT_ANCHORCLIENT 10 +#define MDS_PORT_ANCHORMGR 11 + + +#define MDS_INO_ROOT 1 +#define MDS_INO_PGTABLE 2 +#define MDS_INO_LOG_OFFSET 0x100 +#define MDS_INO_IDS_OFFSET 0x200 +#define MDS_INO_INODEFILE_OFFSET 0x300 +#define MDS_INO_ANCHORTABLE 0x400 +#define MDS_INO_BASE 0x1000 + +#define MDS_TRAVERSE_FORWARD 1 +#define MDS_TRAVERSE_DISCOVER 2 // skips permissions checks etc. +#define MDS_TRAVERSE_DISCOVERXLOCK 3 // succeeds on (foreign?) null, xlocked dentries. 
+#define MDS_TRAVERSE_FAIL 4 + + +class filepath; + +class MDSMap; +class OSDMap; +class Objecter; +class Filer; + +class Server; +class Locker; +class AnchorTable; +class AnchorClient; +class MDCache; +class MDStore; +class MDLog; +class MDBalancer; +class IdAllocator; + +class CInode; +class CDir; +class CDentry; + +class Messenger; +class Message; + +class MClientRequest; +class MClientReply; +class MHashReaddir; +class MHashReaddirReply; + + + + +class MDS : public Dispatcher { + public: + Mutex mds_lock; + + protected: + int whoami; + + public: + Messenger *messenger; + MDSMap *mdsmap; + MonMap *monmap; + OSDMap *osdmap; + Objecter *objecter; + Filer *filer; // for reading/writing to/from osds + + ClientMap clientmap; + + // sub systems + Server *server; + MDCache *mdcache; + Locker *locker; + MDStore *mdstore; + MDLog *mdlog; + MDBalancer *balancer; + + IdAllocator *idalloc; + + AnchorTable *anchormgr; + AnchorClient *anchorclient; + + Logger *logger, *logger2; + + + + protected: + // -- MDS state -- + static const int STATE_BOOTING = 1; // fetching mds and osd maps + static const int STATE_MKFS = 2; // creating a file system + static const int STATE_RECOVERING = 3; // recovering mds log + static const int STATE_ACTIVE = 4; // up and active! 
+ static const int STATE_STOPPING = 5; + static const int STATE_STOPPED = 6; + + int state; + list waitfor_active; + +public: + void queue_waitfor_active(Context *c) { waitfor_active.push_back(c); } + + bool is_booting() { return state == STATE_BOOTING; } + bool is_recovering() { return state == STATE_RECOVERING; } + bool is_active() { return state == STATE_ACTIVE; } + bool is_stopping() { return state == STATE_STOPPING; } + bool is_stopped() { return state == STATE_STOPPED; } + + void mark_active(); + + + // -- waiters -- + list finished_queue; + + void queue_finished(Context *c) { + finished_queue.push_back(c); + } + void queue_finished(list& ls) { + finished_queue.splice( finished_queue.end(), ls ); + } + + + + // shutdown crap + int req_rate; + + // ino's and fh's + public: + + int get_req_rate() { return req_rate; } + + protected: + + friend class MDStore; + + + public: + + protected: + utime_t last_balancer_heartbeat, last_balancer_hash; + + public: + MDS(int whoami, Messenger *m, MonMap *mm); + ~MDS(); + + // who am i etc + int get_nodeid() { return whoami; } + MDSMap *get_mds_map() { return mdsmap; } + OSDMap *get_osd_map() { return osdmap; } + + void send_message_mds(Message *m, int mds, int port=0, int fromport=0); + + // start up, shutdown + int init(); + + void boot_mkfs(); + void boot_mkfs_finish(); + void boot_recover(int step=0); + + int shutdown_start(); + int shutdown_final(); + + int hash_dentry(inodeno_t ino, const string& s) { + return 0; // fixme + } + + + // messages + void proc_message(Message *m); + virtual void dispatch(Message *m); + void my_dispatch(Message *m); + + // special message types + void handle_ping(class MPing *m); + + void handle_mds_map(class MMDSMap *m); + + void handle_shutdown_start(Message *m); + + // osds + void handle_osd_getmap(Message *m); + void handle_osd_map(class MOSDMap *m); + +}; + + + +class C_MDS_RetryMessage : public Context { + Message *m; + MDS *mds; +public: + C_MDS_RetryMessage(MDS *mds, Message *m) { + 
assert(m); + this->m = m; + this->mds = mds; + } + virtual void finish(int r) { + mds->my_dispatch(m); + } +}; + + +ostream& operator<<(ostream& out, MDS& mds); + + +#endif diff --git a/branches/sage/cephmds2/mds/MDSMap.h b/branches/sage/cephmds2/mds/MDSMap.h new file mode 100644 index 0000000000000..6117e6943d3c7 --- /dev/null +++ b/branches/sage/cephmds2/mds/MDSMap.h @@ -0,0 +1,103 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MDSMAP_H +#define __MDSMAP_H + +#include "common/Clock.h" +#include "msg/Message.h" + +#include "include/types.h" + +#include +#include +#include +using namespace std; + +class MDSMap { + protected: + epoch_t epoch; + utime_t ctime; + + int anchortable; + + set all_mds; + set down_mds; + map mds_inst; + + friend class MDSMonitor; + + public: + MDSMap() : epoch(0), anchortable(0) {} + + epoch_t get_epoch() const { return epoch; } + void inc_epoch() { epoch++; } + + const utime_t& get_ctime() const { return ctime; } + + int get_anchortable() const { return anchortable; } + + int get_num_mds() const { return all_mds.size(); } + int get_num_up_mds() const { return all_mds.size() - down_mds.size(); } + + const set& get_mds() const { return all_mds; } + const set& get_down_mds() const { return down_mds; } + + bool is_down(int m) const { return down_mds.count(m); } + bool is_up(int m) const { return !is_down(m); } + + const entity_inst_t& get_inst(int m) { + assert(mds_inst.count(m)); + return mds_inst[m]; + } + bool get_inst(int m, entity_inst_t& inst) { + if (mds_inst.count(m)) { + inst = mds_inst[m]; + return true; + } + return false; + } + + // serialize, unserialize + void 
encode(bufferlist& blist) { + blist.append((char*)&epoch, sizeof(epoch)); + blist.append((char*)&ctime, sizeof(ctime)); + blist.append((char*)&anchortable, sizeof(anchortable)); + + _encode(all_mds, blist); + _encode(down_mds, blist); + _encode(mds_inst, blist); + } + + void decode(bufferlist& blist) { + int off = 0; + blist.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + blist.copy(off, sizeof(ctime), (char*)&ctime); + off += sizeof(ctime); + blist.copy(off, sizeof(anchortable), (char*)&anchortable); + off += sizeof(anchortable); + + _decode(all_mds, blist, off); + _decode(down_mds, blist, off); + _decode(mds_inst, blist, off); + } + + + /*** mapping functions ***/ + + int hash_dentry( inodeno_t dirino, const string& dn ); +}; + +#endif diff --git a/branches/sage/cephmds2/mds/MDStore.cc b/branches/sage/cephmds2/mds/MDStore.cc new file mode 100644 index 0000000000000..432d56751b643 --- /dev/null +++ b/branches/sage/cephmds2/mds/MDStore.cc @@ -0,0 +1,786 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include "MDStore.h" +#include "MDS.h" +#include "MDCache.h" +#include "CInode.h" +#include "CDir.h" +#include "CDentry.h" +#include "MDSMap.h" + +#include "osd/OSDMap.h" +#include "osdc/Filer.h" + +#include "msg/Message.h" + +#include +#include +using namespace std; + + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".store " + + +/* + * separate hashed dir slices into "regions" + */ +size_t get_hash_offset(int hashcode) { + if (hashcode < 0) + return 0; // not hashed + else + return (size_t)(1<<30) * (size_t)(1+hashcode); +} + + + + +// ========================================================================== +// FETCH + + +class C_MDS_Fetch : public Context { + protected: + MDStore *ms; + inodeno_t ino; + + public: + C_MDS_Fetch(MDStore *ms, inodeno_t ino) : Context() { + this->ms = ms; + this->ino = ino; + } + + void finish(int result) { + ms->fetch_dir_2( result, ino ); + } +}; + +/** fetch_dir(dir, context) + * public call to fetch a dir. + */ +void MDStore::fetch_dir( CDir *dir, + Context *c ) +{ + dout(7) << "fetch_dir " << *dir << " context is " << c << endl; + assert(dir->is_auth() || + dir->is_hashed()); + + // wait + if (c) dir->add_waiter(CDIR_WAIT_COMPLETE, c); + + // already fetching? + if (dir->state_test(CDIR_STATE_FETCHING)) { + dout(7) << "already fetching " << *dir << "; waiting" << endl; + return; + } + + // state + dir->state_set(CDIR_STATE_FETCHING); + + // stats + mds->logger->inc("fdir"); + + // create return context + Context *fin = new C_MDS_Fetch( this, dir->ino() ); + if (dir->is_hashed()) + fetch_dir_hash( dir, fin, mds->get_nodeid()); // hashed + else + fetch_dir_hash( dir, fin ); // normal +} + +/* + * called by low level fn when it's fetched. + * fix up dir state. 
+ */ +void MDStore::fetch_dir_2( int result, + inodeno_t ino) +{ + CInode *idir = mds->mdcache->get_inode(ino); + + if (!idir || result < 0) return; // hmm! nevermind i guess. + + assert(idir); + CDir *dir = idir->dir; + assert(dir); + + // dir is now complete + dir->state_set(CDIR_STATE_COMPLETE); + dir->state_clear(CDIR_STATE_FETCHING); + + // finish + list finished; + dir->take_waiting(CDIR_WAIT_COMPLETE|CDIR_WAIT_DENTRY, finished); + finish_contexts(finished, result); +} + + +/** low level methods **/ + +class C_MDS_FetchHash : public Context { +protected: + MDS *mds; + inode_t inode; + int hashcode; + Context *context; + +public: + bufferlist bl; + bufferlist bl2; + + C_MDS_FetchHash(MDS *mds, inode_t inode, Context *c, int hashcode) : Context() { + this->mds = mds; + this->inode = inode; + this->hashcode = hashcode; + this->context = c; + } + + void finish(int result) { + assert(result>0); + + // combine bufferlists bl + bl2 -> bl + bl.claim_append(bl2); + + // did i get the whole thing? + size_t size; + bl.copy(0, sizeof(size_t), (char*)&size); + size_t got = bl.length() - sizeof(size); + size_t left = size - got; + size_t from = bl.length(); + + // what part of dir are we getting? + from += get_hash_offset(hashcode); + + if (got >= size) { + // done. + mds->mdstore->fetch_dir_hash_2( bl, inode, context, hashcode ); + } + else { + // read the rest! + dout(12) << "fetch_dir_hash_2 dir size is " << size << ", got " << got << ", reading remaniing " << left << " from off " << from << endl; + + // create return context + C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, inode, context, hashcode ); + fin->bl.claim( bl ); + mds->filer->read(inode, + from, left, + &fin->bl2, + fin ); + return; + } + } +}; + +/** fetch_dir_hash + * low level method. + * fetch part of a dir. either the whole thing if hashcode is -1, or a specific + * hash segment. 
+ */ +void MDStore::fetch_dir_hash( CDir *dir, + Context *c, + int hashcode) +{ + dout(11) << "fetch_dir_hash hashcode " << hashcode << " " << *dir << endl; + + // create return context + C_MDS_FetchHash *fin = new C_MDS_FetchHash( mds, dir->get_inode()->inode, c, hashcode ); + + // grab first stripe bit (which had better be more than 16 bytes!) + assert(dir->get_inode()->inode.layout.stripe_size >= 16); + mds->filer->read(dir->get_inode()->inode, + get_hash_offset(hashcode), dir->get_inode()->inode.layout.stripe_size, + &fin->bl, + fin ); +} + +void MDStore::fetch_dir_hash_2( bufferlist& bl, + inode_t& inode, + Context *c, + int hashcode) +{ + CInode *idir = mds->mdcache->get_inode(inode.ino); + if (!idir) { + dout(7) << "fetch_dir_hash_2 on ino " << inode.ino << " but no longer in our cache!" << endl; + c->finish(-1); + delete c; + return; + } + + if (!idir->dir_is_auth() || + !idir->dir) { + dout(7) << "fetch_dir_hash_2 on " << *idir << ", but i'm not auth, or dir not open" << endl; + c->finish(-1); + delete c; + return; + } + + // make sure we have a CDir + CDir *dir = idir->get_or_open_dir(mds); + + // do it + dout(7) << "fetch_dir_hash_2 hashcode " << hashcode << " dir " << *dir << endl; + + // parse buffer contents into cache + dout(15) << "bl is " << bl << endl; + + int off = 0; + size_t size; + __uint32_t num; + version_t got_version; + int got_hashcode; + bl.copy(off, sizeof(size), (char*)&size); + off += sizeof(size); + assert(bl.length() >= size + sizeof(size)); + bl.copy(off, sizeof(num), (char*)&num); + off += sizeof(num); + bl.copy(off, sizeof(got_version), (char*)&got_version); + off += sizeof(got_version); + bl.copy(off, sizeof(got_hashcode), (char*)&got_hashcode); + off += sizeof(got_hashcode); + + assert(got_hashcode == hashcode); + + int buflen = bl.length(); + + dout(10) << " " << num << " items in " << size << " bytes" << endl; + + unsigned parsed = 0; + while (parsed < num) { + assert(off < buflen && num > 0); + parsed++; + + dout(24) << " " 
<< parsed << "/" << num << " pos " << off << endl; + + // dentry + string dname; + ::_decode(dname, bl, off); + dout(24) << "parse filename '" << dname << "'" << endl; + + CDentry *dn = dir->lookup(dname); // existing dentry? + + char type = bl[off]; + ++off; + if (type == 'L') { + // hard link + inodeno_t ino; + bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + + // what to do? + if (hashcode >= 0) { + int dentryhashcode = mds->hash_dentry( dir->ino(), dname ); + assert(dentryhashcode == hashcode); + } + + if (dn) { + if (dn->get_inode() == 0) { + // negative dentry? + dout(12) << "readdir had NEG dentry " << dname << endl; + } else { + // had dentry + dout(12) << "readdir had dentry " << dname << endl; + } + continue; + } + + // (remote) link + CDentry *dn = dir->add_dentry( dname, ino ); + + // link to inode? + CInode *in = mds->mdcache->get_inode(ino); // we may or may not have it. + if (in) { + dn->link_remote(in); + dout(12) << "readdir got remote link " << ino << " which we have " << *in << endl; + } else { + dout(12) << "readdir got remote link " << ino << " (dont' have it)" << endl; + } + } + else if (type == 'I') { + // inode + + // parse out inode + inode_t inode; + bl.copy(off, sizeof(inode), (char*)&inode); + off += sizeof(inode); + + string symlink; + if (inode.is_symlink()) + ::_decode(symlink, bl, off); + + // what to do? + if (hashcode >= 0) { + int dentryhashcode = mds->hash_dentry( dir->ino(), dname ); + assert(dentryhashcode == hashcode); + } + + if (dn) { + if (dn->get_inode() == 0) { + // negative dentry? + dout(12) << "readdir had NEG dentry " << dname << endl; + } else { + // had dentry + dout(12) << "readdir had dentry " << dname << endl; + + // under water? 
+ if (dn->get_inode()->get_parent_dir_version() <= got_version) { + dout(10) << "readdir had underwater dentry " << dname << " and inode, marking clean" << endl; + dn->get_inode()->mark_clean(); + dn->mark_clean(); + } + } + continue; + } + + // add inode + CInode *in = 0; + if (mds->mdcache->have_inode(inode.ino)) { + in = mds->mdcache->get_inode(inode.ino); + dout(12) << "readdir got (but i already had) " << *in + << " mode " << in->inode.mode + << " mtime " << in->inode.mtime << endl; + } else { + // inode + in = new CInode(mds->mdcache); + in->inode = inode; + + // symlink? + if (in->is_symlink()) { + in->symlink = symlink; + } + + // add + mds->mdcache->add_inode( in ); + } + + // link + dir->add_dentry( dname, in ); + dout(12) << "readdir got " << *in << " mode " << in->inode.mode << " mtime " << in->inode.mtime << endl; + } + else { + dout(1) << "corrupt directory, i got tag char '" << type << "' val " << (int)(type) + << " at pos " << off << endl; + assert(0); + } + } + dout(15) << "parsed " << parsed << endl; + + if (c) { + c->finish(0); + delete c; + } +} + + + + +// ================================================================== +// COMMIT + +class C_MDS_CommitDirVerify : public Context { +public: + MDS *mds; + inodeno_t ino; + version_t version; + Context *c; + + C_MDS_CommitDirVerify( MDS *mds, + inodeno_t ino, + version_t version, + Context *c) { + this->mds = mds; + this->c = c; + this->version = version; + this->ino = ino; + } + + virtual void finish(int r) { + + if (r >= 0) { + CInode *in = mds->mdcache->get_inode(ino); + assert(in && in->dir); + if (in && in->dir && in->dir->is_auth()) { + dout(7) << "CommitDirVerify: current version = " << in->dir->get_version() << endl; + dout(7) << "CommitDirVerify: last committed = " << in->dir->get_last_committed_version() << endl; + dout(7) << "CommitDirVerify: required = " << version << endl; + + if (in->dir->get_last_committed_version() >= version) { + dout(7) << "my required version is safe, done." 
<< endl; + } else { + dout(7) << "my required version is still not safe, committing again." << endl; + + // what was requested isn't committed yet. + mds->mdstore->commit_dir(in->dir, + version, + c); + return; + } + } + } + + // must have exported ors omethign! + dout(7) << "can't retry commit dir on " << ino << ", must have exported?" << endl; + if (c) { + c->finish(-1); + delete c; + } + } +}; + +class C_MDS_CommitDirFinish : public Context { + protected: + MDStore *ms; + CDir *dir; + version_t version; + + public: + + C_MDS_CommitDirFinish(MDStore *ms, CDir *dir) : Context() { + this->ms = ms; + this->dir = dir; + this->version = dir->get_version(); // just for sanity check later + } + + void finish(int result) { + ms->commit_dir_2( result, dir, version ); + } +}; + + +void MDStore::commit_dir( CDir *dir, + Context *c ) +{ + assert(dir->is_dirty()); + + // commit thru current version + commit_dir(dir, dir->get_version(), c); +} + +void MDStore::commit_dir( CDir *dir, + version_t version, + Context *c ) +{ + assert(dir->is_auth() || + dir->is_hashed()); + + // already committing? + if (dir->state_test(CDIR_STATE_COMMITTING)) { + // already mid-commit! + dout(7) << "commit_dir " << *dir << " mid-commit of " << dir->get_committing_version() << endl; + dout(7) << " current version = " << dir->get_version() << endl; + dout(7) << "requested version = " << version << endl; + + assert(version >= dir->get_last_committed_version()); // why would we request _old_ one? + + dir->add_waiter(CDIR_WAIT_COMMITTED, + new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); + return; + } + + if (!dir->can_auth_pin()) { + // something must be frozen up the hiearchy! + dout(7) << "commit_dir " << *dir << " can't auth_pin, waiting" << endl; + dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, + new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); + return; + } + + + // is it complete? 
+ if (!dir->is_complete()) { + dout(7) << "commit_dir " << *dir << " not complete, fetching first" << endl; + // fetch dir first + fetch_dir(dir, + new C_MDS_CommitDirVerify(mds, dir->ino(), version, c) ); + return; + } + + + // ok go + dout(7) << "commit_dir " << *dir << " version " << dir->get_version() << endl; + + // add waiter + if (c) dir->add_waiter(CDIR_WAIT_COMMITTED, c); + + // get continuation ready + Context *fin = new C_MDS_CommitDirFinish(this, dir); + + // state + dir->state_set(CDIR_STATE_COMMITTING); + dir->set_committing_version(); + + // stats + mds->logger->inc("cdir"); + + if (dir->is_hashed()) { + // hashed + commit_dir_slice( dir, fin, mds->get_nodeid() ); + } else { + // non-hashed + commit_dir_slice( dir, fin ); + } +} + +void MDStore::commit_dir_2( int result, + CDir *dir, + version_t committed_version) +{ + dout(5) << "commit_dir_2 " << *dir << " committed " << committed_version << ", current version " << dir->get_version() << endl; + assert(committed_version == dir->get_committing_version()); + + // remember which version is now safe + dir->set_last_committed_version(committed_version); + + // is the dir now clean? 
+ if (committed_version == dir->get_version()) + dir->mark_clean(); + + dir->state_clear(CDIR_STATE_COMMITTING); + + // finish + dir->finish_waiting(CDIR_WAIT_COMMITTED); +} + + + + +// low-level committer (hashed or normal) + +class C_MDS_CommitSlice : public Context { + protected: + MDStore *ms; + CDir *dir; + Context *c; + int hashcode; + version_t version; + +public: + bufferlist bl; + + C_MDS_CommitSlice(MDStore *ms, CDir *dir, Context *c, int w) : Context() { + this->ms = ms; + this->dir = dir; + this->c = c; + this->hashcode = w; + version = dir->get_version(); + } + + void finish(int result) { + ms->commit_dir_slice_2( result, dir, c, version, hashcode ); + } +}; + + +void MDStore::commit_dir_slice( CDir *dir, + Context *c, + int hashcode) +{ + if (hashcode >= 0) { + assert(dir->is_hashed()); + dout(10) << "commit_dir_slice hashcode " << hashcode << " " << *dir << " version " << dir->get_version() << endl; + } else { + assert(dir->is_auth()); + dout(10) << "commit_dir_slice (whole dir) " << *dir << " version " << dir->get_version() << endl; + } + + // get continuation ready + C_MDS_CommitSlice *fin = new C_MDS_CommitSlice(this, dir, c, hashcode); + + // fill buffer + __uint32_t num = 0; + + bufferlist dirdata; + + version_t v = dir->get_version(); + dirdata.append((char*)&v, sizeof(v)); + dirdata.append((char*)&hashcode, sizeof(hashcode)); + + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + + if (hashcode >= 0) { + int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode != hashcode) continue; + } + + // put dentry in this version + if (dn->is_dirty()) { + dn->float_parent_dir_version( dir->get_version() ); + dout(12) << " dirty dn " << *dn << " now " << dn->get_parent_dir_version() << endl; + } + + if (dn->is_null()) continue; // skipping negative entry + + // primary or remote? 
+ if (dn->is_remote()) { + + inodeno_t ino = dn->get_remote_ino(); + dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' remote ino " << ino << endl; + + // name, marker, ion + dirdata.append( it->first.c_str(), it->first.length() + 1); + dirdata.append( "L", 1 ); // remote link + dirdata.append((char*)&ino, sizeof(ino)); + + } else { + // primary link + CInode *in = dn->get_inode(); + assert(in); + + dout(14) << " pos " << dirdata.length() << " dn '" << it->first << "' inode " << *in << endl; + + // name, marker, inode, [symlink string] + dirdata.append( it->first.c_str(), it->first.length() + 1); + dirdata.append( "I", 1 ); // inode + dirdata.append( (char*) &in->inode, sizeof(inode_t)); + + if (in->is_symlink()) { + // include symlink destination! + dout(18) << " inlcuding symlink ptr " << in->symlink << endl; + dirdata.append( (char*) in->symlink.c_str(), in->symlink.length() + 1); + } + + // put inode in this dir version + if (in->is_dirty()) { + in->float_parent_dir_version( dir->get_version() ); + dout(12) << " dirty inode " << *in << " now " << in->get_parent_dir_version() << endl; + + in->set_committing_version( in->get_version() ); + assert(in->get_last_committed_version() < in->get_committing_version()); + } else { + assert(in->get_committing_version() == in->get_version()); + } + + } + + num++; + } + dout(14) << "num " << num << endl; + + // put count in buffer + //bufferlist bl; + size_t size = sizeof(num) + dirdata.length(); + fin->bl.append((char*)&size, sizeof(size)); + fin->bl.append((char*)&num, sizeof(num)); + fin->bl.claim_append(dirdata); //.c_str(), dirdata.length()); + assert(fin->bl.length() == size + sizeof(size)); + + // pin inode + dir->auth_pin(); + + // submit to osd + mds->filer->write( dir->get_inode()->inode, + 0, fin->bl.length(), + fin->bl, + 0, //OSD_OP_FLAGS_TRUNCATE, // truncate file/object after end of this write + NULL, fin ); // on safe +} + + +void MDStore::commit_dir_slice_2( int result, + CDir *dir, + 
Context *c, + version_t committed_version, + int hashcode ) +{ + dout(11) << "commit_dir_slice_2 hashcode " << hashcode << " " << *dir << " v " << committed_version << endl; + + // mark inodes and dentries clean too (if we committed them!) + list null_clean; + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); ) { + CDentry *dn = it->second; + it++; + + if (hashcode >= 0) { + int dentryhashcode = mds->hash_dentry( dir->ino(), dn->get_name() ); + if (dentryhashcode != hashcode) continue; + } + + // dentry + if (committed_version > dn->get_parent_dir_version()) { + dout(15) << " dir " << committed_version << " > dn " << dn->get_parent_dir_version() << " still clean " << *dn << endl; + assert(!dn->is_dirty()); + } + else if (dn->get_parent_dir_version() == committed_version) { + dout(15) << " dir " << committed_version << " == dn " << dn->get_parent_dir_version() << " now clean " << *dn << endl; + if (dn->is_dirty()) + dn->mark_clean(); // might not but could be dirty + + // remove, if it's null and unlocked + if (dn->is_null() && dn->is_sync()) { + dout(15) << " removing clean and null " << *dn << endl; + null_clean.push_back(dn); + continue; + } + } else { + dout(15) << " dir " << committed_version << " < dn " << dn->get_parent_dir_version() << " still dirty " << *dn << endl; + assert(committed_version < dn->get_parent_dir_version()); + //assert(dn->is_dirty() || !dn->is_sync()); // -OR- we did a fetch_dir in order to do a newer commit... + } + + // only do primary... 
+ if (!dn->is_primary()) continue; + + CInode *in = dn->get_inode(); + assert(in); + assert(in->is_auth()); + + if (in->get_committing_version()) + in->set_committed_version(); + + if (committed_version > in->get_parent_dir_version()) { + dout(15) << " dir " << committed_version << " > inode " << in->get_parent_dir_version() << " still clean " << *(in) << endl; + assert(!in->is_dirty()); + } + else if (in->get_parent_dir_version() == committed_version) { + dout(15) << " dir " << committed_version << " == inode " << in->get_parent_dir_version() << " now clean " << *(in) << endl; + in->mark_clean(); // might not but could be dirty + } else { + dout(15) << " dir " << committed_version << " < inode " << in->get_parent_dir_version() << " still dirty " << *(in) << endl; + assert(committed_version < in->get_parent_dir_version()); + //assert(in->is_dirty()); // -OR- we did a fetch_dir in order to do a newer commit... + } + } + + // remove null clean dentries + for (list::iterator it = null_clean.begin(); + it != null_clean.end(); + it++) + dir->remove_dentry(*it); + + // unpin + dir->auth_unpin(); + + // finish + if (c) { + c->finish(0); + delete c; + } +} + + + + + + + + + + + + diff --git a/branches/sage/cephmds2/mds/MDStore.h b/branches/sage/cephmds2/mds/MDStore.h new file mode 100644 index 0000000000000..fe7553608a975 --- /dev/null +++ b/branches/sage/cephmds2/mds/MDStore.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef __MDSTORE_H +#define __MDSTORE_H + +#include "include/types.h" +#include "include/buffer.h" + +class MDS; +class CDir; +class Context; + +class MDStore { + protected: + MDS *mds; + + + public: + MDStore(MDS *m) { + mds = m; + } + + + // fetch + public: + void fetch_dir( CDir *dir, Context *c ); + protected: + void fetch_dir_2( int result, inodeno_t ino ); + + void fetch_dir_hash( CDir *dir, + Context *c, + int hashcode = -1); + void fetch_dir_hash_2( bufferlist &bl, + inode_t& inode, + Context *c, + int which); + friend class C_MDS_Fetch; + friend class C_MDS_FetchHash; + + // commit + public: + void commit_dir( CDir *dir, Context *c ); // commit current dir version to disk. + void commit_dir( CDir *dir, __uint64_t version, Context *c ); // commit specified version to disk + protected: + void commit_dir_2( int result, CDir *dir, __uint64_t committed_version ); + + // low level committers + void commit_dir_slice( CDir *dir, + Context *c, + int hashcode = -1); + void commit_dir_slice_2( int result, + CDir *dir, + Context *c, + __uint64_t version, + int hashcode ); + + friend class C_MDS_CommitDirFinish; + friend class C_MDS_CommitSlice; +}; + + +#endif diff --git a/branches/sage/cephmds2/mds/Migrator.cc b/branches/sage/cephmds2/mds/Migrator.cc new file mode 100644 index 0000000000000..bde26ae72dced --- /dev/null +++ b/branches/sage/cephmds2/mds/Migrator.cc @@ -0,0 +1,3192 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include "MDS.h" +#include "MDCache.h" +#include "CInode.h" +#include "CDir.h" +#include "CDentry.h" +#include "Migrator.h" +#include "Locker.h" + +#include "MDBalancer.h" +#include "MDLog.h" +#include "MDSMap.h" + +#include "include/filepath.h" + +#include "events/EInodeUpdate.h" +#include "events/EDirUpdate.h" + +#include "msg/Messenger.h" + +#include "messages/MClientFileCaps.h" + +#include "messages/MExportDirDiscover.h" +#include "messages/MExportDirDiscoverAck.h" +#include "messages/MExportDirPrep.h" +#include "messages/MExportDirPrepAck.h" +#include "messages/MExportDirWarning.h" +#include "messages/MExportDir.h" +#include "messages/MExportDirNotify.h" +#include "messages/MExportDirNotifyAck.h" +#include "messages/MExportDirFinish.h" + +#include "messages/MHashDirDiscover.h" +#include "messages/MHashDirDiscoverAck.h" +#include "messages/MHashDirPrep.h" +#include "messages/MHashDirPrepAck.h" +#include "messages/MHashDir.h" +#include "messages/MHashDirNotify.h" +#include "messages/MHashDirAck.h" + +#include "messages/MUnhashDirPrep.h" +#include "messages/MUnhashDirPrepAck.h" +#include "messages/MUnhashDir.h" +#include "messages/MUnhashDirAck.h" +#include "messages/MUnhashDirNotify.h" +#include "messages/MUnhashDirNotifyAck.h" + + + +void Migrator::dispatch(Message *m) +{ + switch (m->get_type()) { + // import + case MSG_MDS_EXPORTDIRDISCOVER: + handle_export_dir_discover((MExportDirDiscover*)m); + break; + case MSG_MDS_EXPORTDIRPREP: + handle_export_dir_prep((MExportDirPrep*)m); + break; + case MSG_MDS_EXPORTDIR: + handle_export_dir((MExportDir*)m); + break; + case MSG_MDS_EXPORTDIRFINISH: + handle_export_dir_finish((MExportDirFinish*)m); + break; + + // export + case MSG_MDS_EXPORTDIRDISCOVERACK: + handle_export_dir_discover_ack((MExportDirDiscoverAck*)m); + break; + case MSG_MDS_EXPORTDIRPREPACK: + handle_export_dir_prep_ack((MExportDirPrepAck*)m); + break; + case MSG_MDS_EXPORTDIRNOTIFYACK: + 
handle_export_dir_notify_ack((MExportDirNotifyAck*)m); + break; + + // export 3rd party (inode authority) + case MSG_MDS_EXPORTDIRWARNING: + handle_export_dir_warning((MExportDirWarning*)m); + break; + case MSG_MDS_EXPORTDIRNOTIFY: + handle_export_dir_notify((MExportDirNotify*)m); + break; + + + // hashing + case MSG_MDS_HASHDIRDISCOVER: + handle_hash_dir_discover((MHashDirDiscover*)m); + break; + case MSG_MDS_HASHDIRDISCOVERACK: + handle_hash_dir_discover_ack((MHashDirDiscoverAck*)m); + break; + case MSG_MDS_HASHDIRPREP: + handle_hash_dir_prep((MHashDirPrep*)m); + break; + case MSG_MDS_HASHDIRPREPACK: + handle_hash_dir_prep_ack((MHashDirPrepAck*)m); + break; + case MSG_MDS_HASHDIR: + handle_hash_dir((MHashDir*)m); + break; + case MSG_MDS_HASHDIRACK: + handle_hash_dir_ack((MHashDirAck*)m); + break; + case MSG_MDS_HASHDIRNOTIFY: + handle_hash_dir_notify((MHashDirNotify*)m); + break; + + // unhashing + case MSG_MDS_UNHASHDIRPREP: + handle_unhash_dir_prep((MUnhashDirPrep*)m); + break; + case MSG_MDS_UNHASHDIRPREPACK: + handle_unhash_dir_prep_ack((MUnhashDirPrepAck*)m); + break; + case MSG_MDS_UNHASHDIR: + handle_unhash_dir((MUnhashDir*)m); + break; + case MSG_MDS_UNHASHDIRACK: + handle_unhash_dir_ack((MUnhashDirAck*)m); + break; + case MSG_MDS_UNHASHDIRNOTIFY: + handle_unhash_dir_notify((MUnhashDirNotify*)m); + break; + case MSG_MDS_UNHASHDIRNOTIFYACK: + handle_unhash_dir_notify_ack((MUnhashDirNotifyAck*)m); + break; + + default: + assert(0); + } +} + + +class C_MDC_EmptyImport : public Context { + Migrator *mig; + CDir *dir; +public: + C_MDC_EmptyImport(Migrator *m, CDir *d) : mig(m), dir(d) {} + void finish(int r) { + mig->export_empty_import(dir); + } +}; + + +void Migrator::export_empty_import(CDir *dir) +{ + dout(7) << "export_empty_import " << *dir << endl; + + return; // hack fixme + + if (!dir->is_import()) { + dout(7) << "not import (anymore?)" << endl; + return; + } + if (dir->inode->is_root()) { + dout(7) << "root" << endl; + return; + } + + if 
(dir->get_size() > 0) { + dout(7) << "not actually empty" << endl; + return; + } + + // is it really empty? + if (!dir->is_complete()) { + dout(7) << "not complete, fetching." << endl; + mds->mdstore->fetch_dir(dir, + new C_MDC_EmptyImport(this,dir)); + return; + } + + int dest = dir->inode->authority(); + + // comment this out ot wreak havoc? + //if (mds->is_shutting_down()) dest = 0; // this is more efficient. + + dout(7) << "really empty, exporting to " << dest << endl; + assert (dest != mds->get_nodeid()); + + dout(-7) << "exporting to mds" << dest + << " empty import " << *dir << endl; + export_dir( dir, dest ); +} + + +// ========================================================== +// IMPORT/EXPORT + + +class C_MDC_ExportFreeze : public Context { + Migrator *mig; + CDir *ex; // dir i'm exporting + int dest; + +public: + C_MDC_ExportFreeze(Migrator *m, CDir *e, int d) : + mig(m), ex(e), dest(d) {} + virtual void finish(int r) { + mig->export_dir_frozen(ex, dest); + } +}; + + + +/** export_dir(dir, dest) + * public method to initiate an export. + * will fail if the directory is freezing, frozen, unpinnable, or root. + */ +void Migrator::export_dir(CDir *dir, + int dest) +{ + dout(7) << "export_dir " << *dir << " to " << dest << endl; + assert(dest != mds->get_nodeid()); + assert(!dir->is_hashed()); + + if (dir->inode->is_root()) { + dout(7) << "i won't export root" << endl; + assert(0); + return; + } + + if (dir->is_frozen() || + dir->is_freezing()) { + dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << endl; + return; + } + if (dir->is_hashed()) { + dout(7) << "can't export hashed dir right now. implement me carefully later." << endl; + return; + } + + + // pin path? + vector trace; + cache->make_trace(trace, dir->inode); + if (!cache->path_pin(trace, 0, 0)) { + dout(7) << "export_dir couldn't pin path, failing." << endl; + return; + } + + // ok, let's go. 
+ + // send ExportDirDiscover (ask target) + export_gather[dir].insert(dest); + mds->send_message_mds(new MExportDirDiscover(dir->inode), dest, MDS_PORT_MIGRATOR); + dir->auth_pin(); // pin dir, to hang up our freeze (unpin on prep ack) + + // take away the popularity we're sending. FIXME: do this later? + mds->balancer->subtract_export(dir); + + + // freeze the subtree + dir->freeze_tree(new C_MDC_ExportFreeze(this, dir, dest)); +} + + +/* + * called on receipt of MExportDirDiscoverAck + * the importer now has the directory's _inode_ in memory, and pinned. + */ +void Migrator::handle_export_dir_discover_ack(MExportDirDiscoverAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + int from = MSG_ADDR_NUM(m->get_source()); + assert(export_gather[dir].count(from)); + export_gather[dir].erase(from); + + if (export_gather[dir].empty()) { + dout(7) << "export_dir_discover_ack " << *dir << ", releasing auth_pin" << endl; + dir->auth_unpin(); // unpin to allow freeze to complete + } else { + dout(7) << "export_dir_discover_ack " << *dir << ", still waiting for " << export_gather[dir] << endl; + } + + delete m; // done +} + + +void Migrator::export_dir_frozen(CDir *dir, + int dest) +{ + // subtree is now frozen! + dout(7) << "export_dir_frozen on " << *dir << " to " << dest << endl; + + show_imports(); + + MExportDirPrep *prep = new MExportDirPrep(dir->inode); + + // include spanning tree for all nested exports. + // these need to be on the destination _before_ the final export so that + // dir_auth updates on any nested exports are properly absorbed. + + set inodes_added; + + // include base dir + prep->add_dir( new CDirDiscover(dir, dir->open_by_add(dest)) ); + + // also include traces to all nested exports. 
+ set my_nested; + cache->find_nested_exports(dir, my_nested); + for (set::iterator it = my_nested.begin(); + it != my_nested.end(); + it++) { + CDir *exp = *it; + + dout(7) << " including nested export " << *exp << " in prep" << endl; + + prep->add_export( exp->ino() ); + + /* first assemble each trace, in trace order, and put in message */ + list inode_trace; + + // trace to dir + CDir *cur = exp; + while (cur != dir) { + // don't repeat ourselves + if (inodes_added.count(cur->ino())) break; // did already! + inodes_added.insert(cur->ino()); + + CDir *parent_dir = cur->get_parent_dir(); + + // inode? + assert(cur->inode->is_auth()); + inode_trace.push_front(cur->inode); + dout(7) << " will add " << *cur->inode << endl; + + // include dir? note: this'll include everything except the nested exports themselves, + // since someone else is obviously auth. + if (cur->is_auth()) { + prep->add_dir( new CDirDiscover(cur, cur->open_by_add(dest)) ); // yay! + dout(7) << " added " << *cur << endl; + } + + cur = parent_dir; + } + + for (list::iterator it = inode_trace.begin(); + it != inode_trace.end(); + it++) { + CInode *in = *it; + dout(7) << " added " << *in << endl; + prep->add_inode( in->parent->dir->ino(), + in->parent->name, + in->replicate_to(dest) ); + } + + } + + // send it! + mds->send_message_mds(prep, dest, MDS_PORT_MIGRATOR); +} + +void Migrator::handle_export_dir_prep_ack(MExportDirPrepAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "export_dir_prep_ack " << *dir << ", starting export" << endl; + + // start export. 
+ export_dir_go(dir, MSG_ADDR_NUM(m->get_source())); + + // done + delete m; +} + + +void Migrator::export_dir_go(CDir *dir, + int dest) +{ + dout(7) << "export_dir_go " << *dir << " to " << dest << endl; + + show_imports(); + + + // build export message + MExportDir *req = new MExportDir(dir->inode); // include pop + + + // update imports/exports + CDir *containing_import = cache->get_auth_container(dir); + + if (containing_import == dir) { + dout(7) << " i'm rexporting a previous import" << endl; + assert(dir->is_import()); + cache->imports.erase(dir); + dir->state_clear(CDIR_STATE_IMPORT); + dir->put(CDIR_PIN_IMPORT); // unpin, no longer an import + + // discard nested exports (that we're handing off + for (set::iterator p = cache->nested_exports[dir].begin(); + p != cache->nested_exports[dir].end(); ) { + CDir *nested = *p; + p++; + + // add to export message + req->add_export(nested); + + // nested beneath our new export *in; remove! + dout(7) << " export " << *nested << " was nested beneath us; removing from export list(s)" << endl; + assert(cache->exports.count(nested) == 1); + cache->nested_exports[dir].erase(nested); + } + + } else { + dout(7) << " i'm a subdir nested under import " << *containing_import << endl; + cache->exports.insert(dir); + cache->nested_exports[containing_import].insert(dir); + + dir->state_set(CDIR_STATE_EXPORT); + dir->get(CDIR_PIN_EXPORT); // i must keep it pinned + + // discard nested exports (that we're handing off) + for (set::iterator p = cache->nested_exports[containing_import].begin(); + p != cache->nested_exports[containing_import].end(); ) { + CDir *nested = *p; + p++; + if (nested == dir) continue; // ignore myself + + // container of parent; otherwise we get ourselves. 
+ CDir *containing_export = nested->get_parent_dir(); + while (containing_export && !containing_export->is_export()) + containing_export = containing_export->get_parent_dir(); + if (!containing_export) continue; + + if (containing_export == dir) { + // nested beneath our new export *in; remove! + dout(7) << " export " << *nested << " was nested beneath us; removing from nested_exports" << endl; + cache->nested_exports[containing_import].erase(nested); + // exports.erase(nested); _walk does this + + // add to msg + req->add_export(nested); + } else { + dout(12) << " export " << *nested << " is under other export " << *containing_export << ", which is unrelated" << endl; + assert(cache->get_auth_container(containing_export) != containing_import); + } + } + } + + // note new authority (locally) + if (dir->inode->authority() == dest) + dir->set_dir_auth( CDIR_AUTH_PARENT ); + else + dir->set_dir_auth( dest ); + + // make list of nodes i expect an export_dir_notify_ack from + // (everyone w/ this dir open, but me!) + assert(export_notify_ack_waiting[dir].empty()); + for (set::iterator it = dir->open_by.begin(); + it != dir->open_by.end(); + it++) { + if (*it == mds->get_nodeid()) continue; + export_notify_ack_waiting[dir].insert( *it ); + + // send warning to all but dest + if (*it != dest) { + dout(10) << " sending export_dir_warning to mds" << *it << endl; + mds->send_message_mds(new MExportDirWarning( dir->ino() ), *it, MDS_PORT_MIGRATOR); + } + } + assert(export_notify_ack_waiting[dir].count( dest )); + + // fill export message with cache data + C_Contexts *fin = new C_Contexts; + int num_exported_inodes = export_dir_walk( req, + fin, + dir, // base + dir, // recur start point + dest ); + + // send the export data! 
+ mds->send_message_mds(req, dest, MDS_PORT_MIGRATOR); + + // queue up the finisher + dir->add_waiter( CDIR_WAIT_UNFREEZE, fin ); + + + // stats + mds->logger->inc("ex"); + mds->logger->inc("iex", num_exported_inodes); + + show_imports(); +} + + +/** encode_export_inode + * update our local state for this inode to export. + * encode relevant state to be sent over the wire. + * used by: export_dir_walk, file_rename (if foreign) + */ +void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state, int new_auth) +{ + in->inode.version++; // so local log entries are ignored, etc. (FIXME ??) + + // tell (all) clients about migrating caps.. mark STALE + for (map::iterator it = in->client_caps.begin(); + it != in->client_caps.end(); + it++) { + dout(7) << "encode_export_inode " << *in << " telling client" << it->first << " stale caps" << endl; + MClientFileCaps *m = new MClientFileCaps(in->inode, + it->second.get_last_seq(), + it->second.pending(), + it->second.wanted(), + MClientFileCaps::FILECAP_STALE); + mds->messenger->send_message(m, MSG_ADDR_CLIENT(it->first), mds->clientmap.get_inst(it->first), + 0, MDS_PORT_CACHE); + } + + // relax locks? + if (!in->is_cached_by_anyone()) + in->replicate_relax_locks(); + + // add inode + assert(in->cached_by.count(mds->get_nodeid()) == 0); + CInodeExport istate( in ); + istate._encode( enc_state ); + + // we're export this inode; fix inode state + dout(7) << "encode_export_inode " << *in << endl; + + if (in->is_dirty()) in->mark_clean(); + + // clear/unpin cached_by (we're no longer the authority) + in->cached_by_clear(); + + // twiddle lock states for auth -> replica transition + // hard + in->hardlock.clear_gather(); + if (in->hardlock.get_state() == LOCK_GLOCKR) + in->hardlock.set_state(LOCK_LOCK); + + // file : we lost all our caps, so move to stable state! 
+ in->filelock.clear_gather(); + if (in->filelock.get_state() == LOCK_GLOCKR || + in->filelock.get_state() == LOCK_GLOCKM || + in->filelock.get_state() == LOCK_GLOCKL || + in->filelock.get_state() == LOCK_GLONERR || + in->filelock.get_state() == LOCK_GLONERM || + in->filelock.get_state() == LOCK_LONER) + in->filelock.set_state(LOCK_LOCK); + if (in->filelock.get_state() == LOCK_GMIXEDR) + in->filelock.set_state(LOCK_MIXED); + // this looks like a step backwards, but it's what we want! + if (in->filelock.get_state() == LOCK_GSYNCM) + in->filelock.set_state(LOCK_MIXED); + if (in->filelock.get_state() == LOCK_GSYNCL) + in->filelock.set_state(LOCK_LOCK); + if (in->filelock.get_state() == LOCK_GMIXEDL) + in->filelock.set_state(LOCK_LOCK); + //in->filelock.set_state(LOCK_MIXED); + + // mark auth + assert(in->is_auth()); + in->set_auth(false); + in->replica_nonce = CINODE_EXPORT_NONCE; + + // *** other state too? + + // move to end of LRU so we drop out of cache quickly! + cache->lru.lru_bottouch(in); +} + + +int Migrator::export_dir_walk(MExportDir *req, + C_Contexts *fin, + CDir *basedir, + CDir *dir, + int newauth) +{ + int num_exported = 0; + + dout(7) << "export_dir_walk " << *dir << " " << dir->nitems << " items" << endl; + + // dir + bufferlist enc_dir; + + CDirExport dstate(dir); + dstate._encode( enc_dir ); + + // release open_by + dir->open_by_clear(); + + // mark + assert(dir->is_auth()); + dir->state_clear(CDIR_STATE_AUTH); + dir->replica_nonce = CDIR_NONCE_EXPORT; + + // proxy + dir->state_set(CDIR_STATE_PROXY); + dir->get(CDIR_PIN_PROXY); + export_proxy_dirinos[basedir].push_back(dir->ino()); + + list subdirs; + + if (dir->is_hashed()) { + // fix state + dir->state_clear( CDIR_STATE_AUTH ); + + } else { + + if (dir->is_dirty()) + dir->mark_clean(); + + // discard most dir state + dir->state &= CDIR_MASK_STATE_EXPORT_KEPT; // i only retain a few things. 
+ + // suck up all waiters + list waiting; + dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters + fin->take(waiting); + + // inodes + + CDir_map_t::iterator it; + for (it = dir->begin(); it != dir->end(); it++) { + CDentry *dn = it->second; + CInode *in = dn->inode; + + num_exported++; + + // -- dentry + dout(7) << "export_dir_walk exporting " << *dn << endl; + _encode(it->first, enc_dir); + + if (dn->is_dirty()) + enc_dir.append("D", 1); // dirty + else + enc_dir.append("C", 1); // clean + + // null dentry? + if (dn->is_null()) { + enc_dir.append("N", 1); // null dentry + assert(dn->is_sync()); + continue; + } + + if (dn->is_remote()) { + // remote link + enc_dir.append("L", 1); // remote link + + inodeno_t ino = dn->get_remote_ino(); + enc_dir.append((char*)&ino, sizeof(ino)); + continue; + } + + // primary link + // -- inode + enc_dir.append("I", 1); // inode dentry + + encode_export_inode(in, enc_dir, newauth); // encode, and (update state for) export + + // directory? + if (in->is_dir() && in->dir) { + if (in->dir->is_auth()) { + // nested subdir + assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT); + subdirs.push_back(in->dir); // it's ours, recurse (later) + + } else { + // nested export + assert(in->dir->get_dir_auth() >= 0); + dout(7) << " encountered nested export " << *in->dir << " dir_auth " << in->dir->get_dir_auth() << "; removing from exports" << endl; + assert(cache->exports.count(in->dir) == 1); + cache->exports.erase(in->dir); // discard nested export (nested_exports updated above) + + in->dir->state_clear(CDIR_STATE_EXPORT); + in->dir->put(CDIR_PIN_EXPORT); + + // simplify dir_auth? 
+ if (in->dir->get_dir_auth() == newauth) + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + } + } + + // add to proxy + export_proxy_inos[basedir].push_back(in->ino()); + in->state_set(CINODE_STATE_PROXY); + in->get(CINODE_PIN_PROXY); + + // waiters + list waiters; + in->take_waiting(CINODE_WAIT_ANY, waiters); + fin->take(waiters); + } + } + + req->add_dir( enc_dir ); + + // subdirs + for (list::iterator it = subdirs.begin(); it != subdirs.end(); it++) + num_exported += export_dir_walk(req, fin, basedir, *it, newauth); + + return num_exported; +} + + +/* + * i should get an export_dir_notify_ack from every mds that had me open, including the new auth (an ack) + */ +void Migrator::handle_export_dir_notify_ack(MExportDirNotifyAck *m) +{ + CInode *diri = cache->get_inode(m->get_ino()); + CDir *dir = diri->dir; + assert(dir); + assert(dir->is_frozen_tree_root()); // i'm exporting! + + // remove from waiting list + int from = MSG_ADDR_NUM(m->get_source()); + assert(export_notify_ack_waiting[dir].count(from)); + export_notify_ack_waiting[dir].erase(from); + + // done? + if (!export_notify_ack_waiting[dir].empty()) { + dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from + << ", still waiting for " << export_notify_ack_waiting[dir] << endl; + + } else { + dout(7) << "handle_export_dir_notify_ack on " << *dir << " from " << from + << ", last one!" << endl; + + // ok, we're finished! + export_notify_ack_waiting.erase(dir); + + // finish export (unfreeze, trigger finish context, etc.) 
+ export_dir_finish(dir); + + // unpin proxies + // inodes + for (list::iterator it = export_proxy_inos[dir].begin(); + it != export_proxy_inos[dir].end(); + it++) { + CInode *in = cache->get_inode(*it); + in->put(CINODE_PIN_PROXY); + assert(in->state_test(CINODE_STATE_PROXY)); + in->state_clear(CINODE_STATE_PROXY); + } + export_proxy_inos.erase(dir); + + // dirs + for (list::iterator it = export_proxy_dirinos[dir].begin(); + it != export_proxy_dirinos[dir].end(); + it++) { + CDir *dir = cache->get_inode(*it)->dir; + dir->put(CDIR_PIN_PROXY); + assert(dir->state_test(CDIR_STATE_PROXY)); + dir->state_clear(CDIR_STATE_PROXY); + + // hose neg dentries, too, since we're no longer auth + CDir_map_t::iterator it; + for (it = dir->begin(); it != dir->end(); ) { + CDentry *dn = it->second; + it++; + if (dn->is_null()) { + assert(dn->is_sync()); + dir->remove_dentry(dn); + } else { + //dout(10) << "export_dir_notify_ack leaving xlocked neg " << *dn << endl; + if (dn->is_dirty()) + dn->mark_clean(); + } + } + } + export_proxy_dirinos.erase(dir); + + } + + delete m; +} + + +/* + * once i get all teh notify_acks i can finish + */ +void Migrator::export_dir_finish(CDir *dir) +{ + // exported! 
+ + + // FIXME log it + + // send finish to new auth + mds->send_message_mds(new MExportDirFinish(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); + + // unfreeze + dout(7) << "export_dir_finish " << *dir << ", unfreezing" << endl; + dir->unfreeze_tree(); + + // unpin path + dout(7) << "export_dir_finish unpinning path" << endl; + vector trace; + cache->make_trace(trace, dir->inode); + cache->path_unpin(trace, 0); + + + // stats + mds->logger->set("nex", cache->exports.size()); + + show_imports(); +} + + + + + + + + + + + + +// IMPORTS + +class C_MDC_ExportDirDiscover : public Context { + Migrator *mig; + MExportDirDiscover *m; +public: + vector trace; + C_MDC_ExportDirDiscover(Migrator *mig_, MExportDirDiscover *m_) : + mig(mig_), m(m_) {} + void finish(int r) { + CInode *in = 0; + if (r >= 0) in = trace[trace.size()-1]->get_inode(); + mig->handle_export_dir_discover_2(m, in, r); + } +}; + +void Migrator::handle_export_dir_discover(MExportDirDiscover *m) +{ + assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid()); + + dout(7) << "handle_export_dir_discover on " << m->get_path() << endl; + + // must discover it! + C_MDC_ExportDirDiscover *onfinish = new C_MDC_ExportDirDiscover(this, m); + filepath fpath(m->get_path()); + cache->path_traverse(fpath, onfinish->trace, true, + m, new C_MDS_RetryMessage(mds,m), // on delay/retry + MDS_TRAVERSE_DISCOVER, + onfinish); // on completion|error +} + +void Migrator::handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r) +{ + // yay! + if (in) { + dout(7) << "handle_export_dir_discover_2 has " << *in << endl; + } + + if (r < 0 || !in->is_dir()) { + dout(7) << "handle_export_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; + + assert(0); // this shouldn't happen if the auth pins his path properly!!!! 
+ + mds->send_message_mds(new MExportDirDiscoverAck(m->get_ino(), false), + m->get_source().num(), MDS_PORT_MIGRATOR); + delete m; + return; + } + + assert(in->is_dir()); + + if (in->is_frozen()) { + dout(7) << "frozen, waiting." << endl; + in->add_waiter(CINODE_WAIT_AUTHPINNABLE, + new C_MDS_RetryMessage(mds,m)); + return; + } + + // pin inode in the cache (for now) + in->get(CINODE_PIN_IMPORTING); + + // pin auth too, until the import completes. + in->auth_pin(); + + // reply + dout(7) << " sending export_dir_discover_ack on " << *in << endl; + mds->send_message_mds(new MExportDirDiscoverAck(in->ino()), + m->get_source().num(), MDS_PORT_MIGRATOR); + delete m; +} + + + +void Migrator::handle_export_dir_prep(MExportDirPrep *m) +{ + assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid()); + + CInode *diri = cache->get_inode(m->get_ino()); + assert(diri); + + list finished; + + // assimilate root dir. + CDir *dir = diri->dir; + if (dir) { + dout(7) << "handle_export_dir_prep on " << *dir << " (had dir)" << endl; + + if (!m->did_assim()) + m->get_dir(diri->ino())->update_dir(dir); + } else { + assert(!m->did_assim()); + + // open dir i'm importing. + diri->set_dir( new CDir(diri, mds, false) ); + dir = diri->dir; + m->get_dir(diri->ino())->update_dir(dir); + + dout(7) << "handle_export_dir_prep on " << *dir << " (opening dir)" << endl; + + diri->take_waiting(CINODE_WAIT_DIR, finished); + } + assert(dir->is_auth() == false); + + show_imports(); + + // assimilate contents? + if (!m->did_assim()) { + dout(7) << "doing assim on " << *dir << endl; + m->mark_assim(); // only do this the first time! 
+ + // move pin to dir + diri->put(CINODE_PIN_IMPORTING); + dir->get(CDIR_PIN_IMPORTING); + + // auth pin too + dir->auth_pin(); + diri->auth_unpin(); + + // assimilate traces to exports + for (list::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + // inode + CInode *in = cache->get_inode( (*it)->get_ino() ); + if (in) { + (*it)->update_inode(in); + dout(7) << " updated " << *in << endl; + } else { + in = new CInode(mds->mdcache, false); + (*it)->update_inode(in); + + // link to the containing dir + CInode *condiri = cache->get_inode( m->get_containing_dirino(in->ino()) ); + assert(condiri && condiri->dir); + cache->add_inode( in ); + condiri->dir->add_dentry( m->get_dentry(in->ino()), in ); + + dout(7) << " added " << *in << endl; + } + + assert( in->get_parent_dir()->ino() == m->get_containing_dirino(in->ino()) ); + + // dir + if (m->have_dir(in->ino())) { + if (in->dir) { + m->get_dir(in->ino())->update_dir(in->dir); + dout(7) << " updated " << *in->dir << endl; + } else { + in->set_dir( new CDir(in, mds, false) ); + m->get_dir(in->ino())->update_dir(in->dir); + dout(7) << " added " << *in->dir << endl; + in->take_waiting(CINODE_WAIT_DIR, finished); + } + } + } + + // open export dirs? + for (list::iterator it = m->get_exports().begin(); + it != m->get_exports().end(); + it++) { + dout(7) << " checking dir " << hex << *it << dec << endl; + CInode *in = cache->get_inode(*it); + assert(in); + + if (!in->dir) { + dout(7) << " opening nested export on " << *in << endl; + cache->open_remote_dir(in, + new C_MDS_RetryMessage(mds, m)); + + // pin it! 
+ in->get(CINODE_PIN_OPENINGDIR); + in->state_set(CINODE_STATE_OPENINGDIR); + } + } + } else { + dout(7) << " not doing assim on " << *dir << endl; + } + + + // verify we have all exports + int waiting_for = 0; + for (list::iterator it = m->get_exports().begin(); + it != m->get_exports().end(); + it++) { + inodeno_t ino = *it; + CInode *in = cache->get_inode(ino); + if (!in) dout(0) << "** missing ino " << hex << ino << dec << endl; + assert(in); + if (in->dir) { + if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { + dout(7) << " pinning nested export " << *in->dir << endl; + in->dir->get(CDIR_PIN_IMPORTINGEXPORT); + in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); + + if (in->state_test(CINODE_STATE_OPENINGDIR)) { + in->put(CINODE_PIN_OPENINGDIR); + in->state_clear(CINODE_STATE_OPENINGDIR); + } + } else { + dout(7) << " already pinned nested export " << *in << endl; + } + } else { + dout(7) << " waiting for nested export dir on " << *in << endl; + waiting_for++; + } + } + if (waiting_for) { + dout(7) << " waiting for " << waiting_for << " nested export dir opens" << endl; + } else { + // ok! + dout(7) << " all ready, sending export_dir_prep_ack on " << *dir << endl; + mds->send_message_mds(new MExportDirPrepAck(dir->ino()), + m->get_source().num(), MDS_PORT_MIGRATOR); + + // done + delete m; + } + + // finish waiters + finish_contexts(finished, 0); +} + + + + +/* this guy waits for the pre-import discovers on hashed directory dir inodes to finish. + * if it's the last one on the dir, it reprocessed the import. + */ +/* +class C_MDS_ImportPrediscover : public Context { +public: + MDS *mds; + MExportDir *m; + inodeno_t dir_ino; + string dentry; + C_MDS_ImportPrediscover(MDS *mds, MExportDir *m, inodeno_t dir_ino, const string& dentry) { + this->mds = mds; + this->m = m; + this->dir_ino = dir_ino; + this->dentry = dentry; + } + virtual void finish(int r) { + assert(r == 0); // should never fail! 
+ + m->remove_prediscover(dir_ino, dentry); + + if (!m->any_prediscovers()) + mds->mdcache->handle_export_dir(m); + } +}; +*/ + + + +void Migrator::handle_export_dir(MExportDir *m) +{ + CInode *diri = cache->get_inode(m->get_ino()); + assert(diri); + CDir *dir = diri->dir; + assert(dir); + + int oldauth = MSG_ADDR_NUM(m->get_source()); + dout(7) << "handle_export_dir, import " << *dir << " from " << oldauth << endl; + assert(dir->is_auth() == false); + + + + show_imports(); + + // note new authority (locally) + if (dir->inode->is_auth()) + dir->set_dir_auth( CDIR_AUTH_PARENT ); + else + dir->set_dir_auth( mds->get_nodeid() ); + dout(10) << " set dir_auth to " << dir->get_dir_auth() << endl; + + // update imports/exports + CDir *containing_import; + if (cache->exports.count(dir)) { + // reimporting + dout(7) << " i'm reimporting " << *dir << endl; + cache->exports.erase(dir); + + dir->state_clear(CDIR_STATE_EXPORT); + dir->put(CDIR_PIN_EXPORT); // unpin, no longer an export + + containing_import = cache->get_auth_container(dir); + dout(7) << " it is nested under import " << *containing_import << endl; + cache->nested_exports[containing_import].erase(dir); + } else { + // new import + cache->imports.insert(dir); + dir->state_set(CDIR_STATE_IMPORT); + dir->get(CDIR_PIN_IMPORT); // must keep it pinned + + containing_import = dir; // imported exports nested under *in + + dout(7) << " new import at " << *dir << endl; + } + + + // take out my temp pin + dir->put(CDIR_PIN_IMPORTING); + + // add any inherited exports + for (list::iterator it = m->get_exports().begin(); + it != m->get_exports().end(); + it++) { + CInode *exi = cache->get_inode(*it); + assert(exi && exi->dir); + CDir *ex = exi->dir; + + dout(15) << " nested export " << *ex << endl; + + // remove our pin + ex->put(CDIR_PIN_IMPORTINGEXPORT); + ex->state_clear(CDIR_STATE_IMPORTINGEXPORT); + + + // add... 
+ if (ex->is_import()) { + dout(7) << " importing my import " << *ex << endl; + cache->imports.erase(ex); + ex->state_clear(CDIR_STATE_IMPORT); + + mds->logger->inc("imex"); + + // move nested exports under containing_import + for (set::iterator it = cache->nested_exports[ex].begin(); + it != cache->nested_exports[ex].end(); + it++) { + dout(7) << " moving nested export " << **it << " under " << *containing_import << endl; + cache->nested_exports[containing_import].insert(*it); + } + cache->nested_exports.erase(ex); // de-list under old import + + ex->set_dir_auth( CDIR_AUTH_PARENT ); + ex->put(CDIR_PIN_IMPORT); // imports are pinned, no longer import + + } else { + dout(7) << " importing export " << *ex << endl; + + // add it + ex->state_set(CDIR_STATE_EXPORT); + ex->get(CDIR_PIN_EXPORT); // all exports are pinned + cache->exports.insert(ex); + cache->nested_exports[containing_import].insert(ex); + mds->logger->inc("imex"); + } + + } + + + // add this crap to my cache + list imported_subdirs; + bufferlist dir_state; + dir_state.claim( m->get_state() ); + int off = 0; + int num_imported_inodes = 0; + + for (int i = 0; i < m->get_ndirs(); i++) { + num_imported_inodes += + import_dir_block(dir_state, + off, + oldauth, + dir, // import root + imported_subdirs); + } + dout(10) << " " << imported_subdirs.size() << " imported subdirs" << endl; + dout(10) << " " << m->get_exports().size() << " imported nested exports" << endl; + + + // adjust popularity + mds->balancer->add_import(dir); + + // send notify's etc. 
+ dout(7) << "sending notifyack for " << *dir << " to old auth " << MSG_ADDR_NUM(m->get_source()) << endl; + mds->send_message_mds(new MExportDirNotifyAck(dir->inode->ino()), + m->get_source().num(), MDS_PORT_MIGRATOR); + + dout(7) << "sending notify to others" << endl; + for (set::iterator it = dir->open_by.begin(); + it != dir->open_by.end(); + it++) { + assert( *it != mds->get_nodeid() ); + if ( *it == MSG_ADDR_NUM(m->get_source()) ) continue; // not to old auth. + + MExportDirNotify *notify = new MExportDirNotify(dir->ino(), MSG_ADDR_NUM(m->get_source()), mds->get_nodeid()); + notify->copy_exports(m->get_exports()); + + if (g_conf.mds_verify_export_dirauth) + notify->copy_subdirs(imported_subdirs); // copy subdir list (DEBUG) + + mds->send_message_mds(notify, *it, MDS_PORT_MIGRATOR); + } + + // done + delete m; + + show_imports(); + + + // is it empty? + if (dir->get_size() == 0 && + !dir->inode->is_auth()) { + // reexport! + export_empty_import(dir); + } + + + // some stats + mds->logger->inc("im"); + mds->logger->inc("iim", num_imported_inodes); + mds->logger->set("nim", cache->imports.size()); + + + // FIXME LOG IT + + /* + stupid hashing crap, FIXME + + // wait for replicas in hashed dirs? 
+ if (import_hashed_replicate_waiting.count(m->get_ino())) { + // it'll happen later!, when i get my inodegetreplicaack's back + } else { + // finish now + //not anymoreimport_dir_finish(dir); + } + */ + +} + + + +void Migrator::handle_export_dir_finish(MExportDirFinish *m) +{ + CInode *diri = cache->get_inode(m->get_ino()); + CDir *dir = diri->dir; + assert(dir); + + dout(7) << "handle_export_dir_finish on " << *dir << endl; + assert(dir->is_auth()); + + dout(5) << "done with import of " << *dir << endl; + show_imports(); + mds->logger->set("nex", cache->exports.size()); + mds->logger->set("nim", cache->imports.size()); + + // un auth pin (other exports can now proceed) + dir->auth_unpin(); + + // ok now finish contexts + dout(5) << "finishing any waiters on imported data" << endl; + dir->finish_waiting(CDIR_WAIT_IMPORTED); + + delete m; +} + + +void Migrator::decode_import_inode(CDentry *dn, bufferlist& bl, int& off, int oldauth) +{ + CInodeExport istate; + off = istate._decode(bl, off); + dout(15) << "got a cinodeexport " << endl; + + bool added = false; + CInode *in = cache->get_inode(istate.get_ino()); + if (!in) { + in = new CInode(mds->mdcache); + added = true; + } else { + in->set_auth(true); + } + + // link before state + if (dn->inode != in) { + assert(!dn->inode); + dn->dir->link_inode(dn, in); + } + + // state after link + set merged_client_caps; + istate.update_inode(in, merged_client_caps); + + + // add inode? 
+ if (added) { + cache->add_inode(in); + dout(10) << "added " << *in << endl; + } else { + dout(10) << " had " << *in << endl; + } + + + // cached_by + assert(!in->is_cached_by(oldauth)); + in->cached_by_add( oldauth, CINODE_EXPORT_NONCE ); + if (in->is_cached_by(mds->get_nodeid())) + in->cached_by_remove(mds->get_nodeid()); + + // twiddle locks + // hard + if (in->hardlock.get_state() == LOCK_GLOCKR) { + in->hardlock.gather_set.erase(mds->get_nodeid()); + in->hardlock.gather_set.erase(oldauth); + if (in->hardlock.gather_set.empty()) + mds->locker->inode_hard_eval(in); + } + + // caps + for (set::iterator it = merged_client_caps.begin(); + it != merged_client_caps.end(); + it++) { + MClientFileCaps *caps = new MClientFileCaps(in->inode, + in->client_caps[*it].get_last_seq(), + in->client_caps[*it].pending(), + in->client_caps[*it].wanted(), + MClientFileCaps::FILECAP_REAP); + caps->set_mds( oldauth ); // reap from whom? + mds->messenger->send_message(caps, + MSG_ADDR_CLIENT(*it), mds->clientmap.get_inst(*it), + 0, MDS_PORT_CACHE); + } + + // filelock + if (!in->filelock.is_stable()) { + // take me and old auth out of gather set + in->filelock.gather_set.erase(mds->get_nodeid()); + in->filelock.gather_set.erase(oldauth); + if (in->filelock.gather_set.empty()) // necessary but not suffient... 
+ mds->locker->inode_file_eval(in); + } + + // other + if (in->is_dirty()) { + dout(10) << "logging dirty import " << *in << endl; + mds->mdlog->submit_entry(new EInodeUpdate(in)); + } +} + + +int Migrator::import_dir_block(bufferlist& bl, + int& off, + int oldauth, + CDir *import_root, + list& imported_subdirs) +{ + // set up dir + CDirExport dstate; + off = dstate._decode(bl, off); + + CInode *diri = cache->get_inode(dstate.get_ino()); + assert(diri); + CDir *dir = diri->get_or_open_dir(mds); + assert(dir); + + dout(7) << " import_dir_block " << *dir << " have " << dir->nitems << " items, importing " << dstate.get_nden() << " dentries" << endl; + + // add to list + if (dir != import_root) + imported_subdirs.push_back(dir->ino()); + + // assimilate state + dstate.update_dir( dir ); + if (diri->is_auth()) + dir->set_dir_auth( CDIR_AUTH_PARENT ); // update_dir may hose dir_auth + + // mark (may already be marked from get_or_open_dir() above) + if (!dir->is_auth()) + dir->state_set(CDIR_STATE_AUTH); + + // open_by + assert(!dir->is_open_by(oldauth)); + dir->open_by_add(oldauth); + if (dir->is_open_by(mds->get_nodeid())) + dir->open_by_remove(mds->get_nodeid()); + + if (dir->is_hashed()) { + + // do nothing; dir is hashed + return 0; + } else { + // take all waiters on this dir + // NOTE: a pass of imported data is guaranteed to get all of my waiters because + // a replica's presense in my cache implies/forces it's presense in authority's. 
+ list waiters; + + dir->take_waiting(CDIR_WAIT_ANY, waiters); + for (list::iterator it = waiters.begin(); + it != waiters.end(); + it++) + import_root->add_waiter(CDIR_WAIT_IMPORTED, *it); + + dout(15) << "doing contents" << endl; + + // contents + int num_imported = 0; + long nden = dstate.get_nden(); + + for (; nden>0; nden--) { + + num_imported++; + + // dentry + string dname; + _decode(dname, bl, off); + dout(15) << "dname is " << dname << endl; + + char dirty; + bl.copy(off, 1, &dirty); + off++; + + char icode; + bl.copy(off, 1, &icode); + off++; + + CDentry *dn = dir->lookup(dname); + if (!dn) + dn = dir->add_dentry(dname); // null + + // mark dn dirty _after_ we link the inode (scroll down) + + if (icode == 'N') { + // null dentry + assert(dn->is_null()); + + // fall thru + } + else if (icode == 'L') { + // remote link + inodeno_t ino; + bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + dir->link_inode(dn, ino); + } + else if (icode == 'I') { + // inode + decode_import_inode(dn, bl, off, oldauth); + } + + // mark dentry dirty? (only _after_ we link the inode!) + if (dirty == 'D') dn->mark_dirty(); + + } + + if (dir->is_dirty()) + mds->mdlog->submit_entry(new EDirUpdate(dir)); + + return num_imported; + } +} + + + + + +// authority bystander + +void Migrator::handle_export_dir_warning(MExportDirWarning *m) +{ + // add to warning list + stray_export_warnings.insert( m->get_ino() ); + + // did i already see the notify? + if (stray_export_notifies.count(m->get_ino())) { + // i did, we're good. + dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". already got notify." << endl; + + // process the notify + map::iterator it = stray_export_notifies.find(m->get_ino()); + handle_export_dir_notify(it->second); + stray_export_notifies.erase(it); + } else { + dout(7) << "handle_export_dir_warning on " << m->get_ino() << ". waiting for notify." 
<< endl; + } + + // done + delete m; +} + + +void Migrator::handle_export_dir_notify(MExportDirNotify *m) +{ + CDir *dir = 0; + CInode *in = cache->get_inode(m->get_ino()); + if (in) dir = in->dir; + + // did i see the warning yet? + if (!stray_export_warnings.count(m->get_ino())) { + // wait for it. + dout(7) << "export_dir_notify on " << m->get_ino() << ", waiting for warning." << endl; + stray_export_notifies.insert(pair( m->get_ino(), m )); + return; + } + + // i did, we're all good. + dout(7) << "export_dir_notify on " << m->get_ino() << ", already saw warning." << endl; + + // update dir_auth! + if (dir) { + dout(7) << "export_dir_notify on " << *dir << " new_auth " << m->get_new_auth() << " (old_auth " << m->get_old_auth() << ")" << endl; + + // update bounds first + for (list::iterator it = m->get_exports().begin(); + it != m->get_exports().end(); + it++) { + CInode *n = cache->get_inode(*it); + if (!n) continue; + CDir *ndir = n->dir; + if (!ndir) continue; + + int boundauth = ndir->authority(); + dout(7) << "export_dir_notify bound " << *ndir << " was dir_auth " << ndir->get_dir_auth() << " (" << boundauth << ")" << endl; + if (ndir->get_dir_auth() == CDIR_AUTH_PARENT) { + if (boundauth != m->get_new_auth()) + ndir->set_dir_auth( boundauth ); + else assert(dir->authority() == m->get_new_auth()); // apparently we already knew! 
+ } else { + if (boundauth == m->get_new_auth()) + ndir->set_dir_auth( CDIR_AUTH_PARENT ); + } + } + + // update dir_auth + if (in->authority() == m->get_new_auth()) { + dout(7) << "handle_export_dir_notify on " << *in << ": inode auth is the same, setting dir_auth -1" << endl; + dir->set_dir_auth( CDIR_AUTH_PARENT ); + assert(!in->is_auth()); + assert(!dir->is_auth()); + } else { + dir->set_dir_auth( m->get_new_auth() ); + } + assert(dir->authority() != mds->get_nodeid()); + assert(!dir->is_auth()); + + // DEBUG: verify subdirs + if (g_conf.mds_verify_export_dirauth) { + + dout(7) << "handle_export_dir_notify on " << *dir << " checking " << m->num_subdirs() << " subdirs" << endl; + for (list::iterator it = m->subdirs_begin(); + it != m->subdirs_end(); + it++) { + CInode *diri = cache->get_inode(*it); + if (!diri) continue; // don't have it, don't care + if (!diri->dir) continue; + dout(10) << "handle_export_dir_notify checking subdir " << *diri->dir << " is auth " << diri->dir->get_dir_auth() << endl; + assert(diri->dir != dir); // base shouldn't be in subdir list + if (diri->dir->get_dir_auth() != CDIR_AUTH_PARENT) { + dout(7) << "*** weird value for dir_auth " << diri->dir->get_dir_auth() << " on " << *diri->dir << ", should have been -1 probably??? ******************" << endl; + assert(0); // bad news! 
+ //dir->set_dir_auth( CDIR_AUTH_PARENT ); + } + assert(diri->dir->authority() == m->get_new_auth()); + } + } + } + + // send notify ack to old auth + dout(7) << "handle_export_dir_notify sending ack to old_auth " << m->get_old_auth() << endl; + mds->send_message_mds(new MExportDirNotifyAck(m->get_ino()), + m->get_old_auth(), MDS_PORT_MIGRATOR); + + + // done + stray_export_warnings.erase( m->get_ino() ); + delete m; +} + + + + + +// ======================================================================= +// HASHING + + +void Migrator::import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth) +{ + int off = 0; + + for (; nden>0; nden--) { + // dentry + string dname; + _decode(dname, bl, off); + dout(15) << "dname is " << dname << endl; + + char icode; + bl.copy(off, 1, &icode); + off++; + + CDentry *dn = dir->lookup(dname); + if (!dn) + dn = dir->add_dentry(dname); // null + + // mark dn dirty _after_ we link the inode (scroll down) + + if (icode == 'N') { + + // null dentry + assert(dn->is_null()); + + // fall thru + } + else if (icode == 'L') { + // remote link + inodeno_t ino; + bl.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + dir->link_inode(dn, ino); + } + else if (icode == 'I') { + // inode + decode_import_inode(dn, bl, off, oldauth); + + // fix up subdir export? + if (dn->inode->dir) { + assert(dn->inode->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)); + dn->inode->dir->put(CDIR_PIN_IMPORTINGEXPORT); + dn->inode->dir->state_clear(CDIR_STATE_IMPORTINGEXPORT); + + if (dn->inode->dir->is_auth()) { + // mine. must have been an import. 
+ assert(dn->inode->dir->is_import()); + dout(7) << "unimporting subdir now that inode is mine " << *dn->inode->dir << endl; + dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); + cache->imports.erase(dn->inode->dir); + dn->inode->dir->put(CDIR_PIN_IMPORT); + dn->inode->dir->state_clear(CDIR_STATE_IMPORT); + + // move nested under hashdir + for (set::iterator it = cache->nested_exports[dn->inode->dir].begin(); + it != cache->nested_exports[dn->inode->dir].end(); + it++) + cache->nested_exports[dir].insert(*it); + cache->nested_exports.erase(dn->inode->dir); + + // now it matches the inode + dn->inode->dir->set_dir_auth( CDIR_AUTH_PARENT ); + } + else { + // not mine. make it an export. + dout(7) << "making subdir into export " << *dn->inode->dir << endl; + dn->inode->dir->get(CDIR_PIN_EXPORT); + dn->inode->dir->state_set(CDIR_STATE_EXPORT); + cache->exports.insert(dn->inode->dir); + cache->nested_exports[dir].insert(dn->inode->dir); + + if (dn->inode->dir->get_dir_auth() == CDIR_AUTH_PARENT) + dn->inode->dir->set_dir_auth( oldauth ); // no longer matches inode + assert(dn->inode->dir->get_dir_auth() >= 0); + } + } + } + + // mark dentry dirty? (only _after_ we link the inode!) + dn->mark_dirty(); + } +} + +/* + + notes on interaction of hashing and export/import: + + - dir->is_auth() is completely independent of hashing. for a hashed dir, + - all nodes are partially authoritative + - all nodes dir->is_hashed() == true + - all nodes dir->inode->dir_is_hashed() == true + - one node dir->is_auth() == true, the rest == false + - dir_auth for all subdirs in a hashed dir will (likely?) be explicit. + + - remember simple rule: dir auth follows inode, unless dir_auth is explicit. + + - export_dir_walk and import_dir_block take care with dir_auth: (for import/export) + - on export, -1 is changed to mds->get_nodeid() + - on import, nothing special, actually. + + - hashed dir files aren't included in export; subdirs are converted to imports + or exports as necessary. 
+ - hashed dir subdirs are discovered on export. this is important + because dirs are needed to tie together auth hierarchy, for auth to know about + imports/exports, etc. + + - dir state is maintained on auth. + - COMPLETE and HASHED are transfered to importers. + - DIRTY is set everywhere. + + - hashed dir is like an import: hashed dir used for nested_exports map. + - nested_exports is updated appropriately on auth and replicas. + - a subtree terminates as a hashed dir, since the hashing explicitly + redelegates all inodes. thus export_dir_walk includes hashed dirs, but + not their inodes. +*/ + +// HASH on auth + +class C_MDC_HashFreeze : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_HashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} + virtual void finish(int r) { + mig->hash_dir_frozen(dir); + } +}; + +class C_MDC_HashComplete : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_HashComplete(Migrator *mig, CDir *dir) { + this->mig = mig; + this->dir = dir; + } + virtual void finish(int r) { + mig->hash_dir_complete(dir); + } +}; + + +/** hash_dir(dir) + * start hashing a directory. + */ +void Migrator::hash_dir(CDir *dir) +{ + dout(-7) << "hash_dir " << *dir << endl; + + assert(!dir->is_hashed()); + assert(dir->is_auth()); + + if (dir->is_frozen() || + dir->is_freezing()) { + dout(7) << " can't hash, freezing|frozen." << endl; + return; + } + + // pin path? + vector trace; + cache->make_trace(trace, dir->inode); + if (!cache->path_pin(trace, 0, 0)) { + dout(7) << "hash_dir couldn't pin path, failing." 
<< endl; + return; + } + + // ok, go + dir->state_set(CDIR_STATE_HASHING); + dir->get(CDIR_PIN_HASHING); + assert(dir->hashed_subset.empty()); + + // discover on all mds + assert(hash_gather.count(dir) == 0); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; // except me + hash_gather[dir].insert(i); + mds->send_message_mds(new MHashDirDiscover(dir->inode), i, MDS_PORT_MIGRATOR); + } + dir->auth_pin(); // pin until discovers are all acked. + + // start freeze + dir->freeze_dir(new C_MDC_HashFreeze(this, dir)); + + // make complete + if (!dir->is_complete()) { + dout(7) << "hash_dir " << *dir << " not complete, fetching" << endl; + mds->mdstore->fetch_dir(dir, + new C_MDC_HashComplete(this, dir)); + } else + hash_dir_complete(dir); +} + + +/* + * wait for everybody to discover and open the hashing dir + * then auth_unpin, to let the freeze happen + */ +void Migrator::handle_hash_dir_discover_ack(MHashDirDiscoverAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + int from = MSG_ADDR_NUM(m->get_source()); + assert(hash_gather[dir].count(from)); + hash_gather[dir].erase(from); + + if (hash_gather[dir].empty()) { + hash_gather.erase(dir); + dout(7) << "hash_dir_discover_ack " << *dir << ", releasing auth_pin" << endl; + dir->auth_unpin(); // unpin to allow freeze to complete + } else { + dout(7) << "hash_dir_discover_ack " << *dir << ", still waiting for " << hash_gather[dir] << endl; + } + + delete m; // done +} + + + +/* + * once the dir is completely in memory, + * mark all migrating inodes dirty (to pin in cache) + */ +void Migrator::hash_dir_complete(CDir *dir) +{ + dout(7) << "hash_dir_complete " << *dir << ", dirtying inodes" << endl; + + assert(!dir->is_hashed()); + assert(dir->is_auth()); + + // mark dirty to pin in cache + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CInode *in = it->second->inode; + in->mark_dirty(); + } + 
+ if (dir->is_frozen_dir()) + hash_dir_go(dir); +} + + +/* + * once the dir is frozen, + * make sure it's complete + * send the prep messages! + */ +void Migrator::hash_dir_frozen(CDir *dir) +{ + dout(7) << "hash_dir_frozen " << *dir << endl; + + assert(!dir->is_hashed()); + assert(dir->is_auth()); + assert(dir->is_frozen_dir()); + + if (!dir->is_complete()) { + dout(7) << "hash_dir_frozen !complete, waiting still on " << *dir << endl; + return; + } + + // send prep messages w/ export directories to open + vector msgs(mds->get_mds_map()->get_num_mds()); + + // check for subdirs + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->inode; + + if (!in->is_dir()) continue; + if (!in->dir) continue; + + int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode == mds->get_nodeid()) continue; + + // msg? + if (msgs[dentryhashcode] == 0) { + msgs[dentryhashcode] = new MHashDirPrep(dir->ino()); + } + msgs[dentryhashcode]->add_inode(it->first, in->replicate_to(dentryhashcode)); + } + + // send them! + assert(hash_gather[dir].empty()); + for (unsigned i=0; isend_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); + hash_gather[dir].insert(i); + } + } + + if (hash_gather[dir].empty()) { + // no subdirs! continue! + hash_gather.erase(dir); + hash_dir_go(dir); + } else { + // wait! 
+ } +} + +/* + * wait for peers to open all subdirs + */ +void Migrator::handle_hash_dir_prep_ack(MHashDirPrepAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + int from = MSG_ADDR_NUM(m->get_source()); + + assert(hash_gather[dir].count(from) == 1); + hash_gather[dir].erase(from); + + if (hash_gather[dir].empty()) { + hash_gather.erase(dir); + dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", last one" << endl; + hash_dir_go(dir); + } else { + dout(7) << "handle_hash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; + } + + delete m; +} + + +/* + * once the dir is frozen, + * make sure it's complete + * do the hashing! + */ +void Migrator::hash_dir_go(CDir *dir) +{ + dout(7) << "hash_dir_go " << *dir << endl; + + assert(!dir->is_hashed()); + assert(dir->is_auth()); + assert(dir->is_frozen_dir()); + + // get messages to other nodes ready + vector msgs(mds->get_mds_map()->get_num_mds()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + msgs[i] = new MHashDir(dir->ino()); + } + + // pick a hash seed. + dir->inode->inode.hash_seed = 1;//dir->ino(); + + // suck up all waiters + C_Contexts *fin = new C_Contexts; + list waiting; + dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters + fin->take(waiting); + + // get containing import. might be me. + CDir *containing_import = cache->get_auth_container(dir); + assert(containing_import != dir || dir->is_import()); + + // divy up contents + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->inode; + + int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode == mds->get_nodeid()) { + continue; // still mine! 
+ } + + bufferlist *bl = msgs[dentryhashcode]->get_state_ptr(); + assert(bl); + + // -- dentry + dout(7) << "hash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; + _encode(it->first, *bl); + + // null dentry? + if (dn->is_null()) { + bl->append("N", 1); // null dentry + assert(dn->is_sync()); + continue; + } + + if (dn->is_remote()) { + // remote link + bl->append("L", 1); // remote link + + inodeno_t ino = dn->get_remote_ino(); + bl->append((char*)&ino, sizeof(ino)); + continue; + } + + // primary link + // -- inode + bl->append("I", 1); // inode dentry + + encode_export_inode(in, *bl, dentryhashcode); // encode, and (update state for) export + msgs[dentryhashcode]->inc_nden(); + + if (dn->is_dirty()) + dn->mark_clean(); + + // add to proxy + hash_proxy_inos[dir].push_back(in); + in->state_set(CINODE_STATE_PROXY); + in->get(CINODE_PIN_PROXY); + + // fix up subdirs + if (in->dir) { + if (in->dir->is_auth()) { + // mine. make it into an import. + dout(7) << "making subdir into import " << *in->dir << endl; + in->dir->set_dir_auth( mds->get_nodeid() ); + cache->imports.insert(in->dir); + in->dir->get(CDIR_PIN_IMPORT); + in->dir->state_set(CDIR_STATE_IMPORT); + + // fix nested bits + for (set::iterator it = cache->nested_exports[containing_import].begin(); + it != cache->nested_exports[containing_import].end(); ) { + CDir *ex = *it; + it++; + if (cache->get_auth_container(ex) == in->dir) { + dout(10) << "moving nested export " << *ex << endl; + cache->nested_exports[containing_import].erase(ex); + cache->nested_exports[in->dir].insert(ex); + } + } + } + else { + // not mine. 
+ dout(7) << "un-exporting subdir that's being hashed away " << *in->dir << endl; + assert(in->dir->is_export()); + in->dir->put(CDIR_PIN_EXPORT); + in->dir->state_clear(CDIR_STATE_EXPORT); + cache->exports.erase(in->dir); + cache->nested_exports[containing_import].erase(in->dir); + if (in->dir->authority() == dentryhashcode) + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + else + in->dir->set_dir_auth( in->dir->authority() ); + } + } + + // waiters + list waiters; + in->take_waiting(CINODE_WAIT_ANY, waiters); + fin->take(waiters); + } + + // dir state + dir->state_set(CDIR_STATE_HASHED); + dir->get(CDIR_PIN_HASHED); + cache->hashdirs.insert(dir); + dir->mark_dirty(); + mds->mdlog->submit_entry(new EDirUpdate(dir)); + + // inode state + if (dir->inode->is_auth()) { + dir->inode->mark_dirty(); + mds->mdlog->submit_entry(new EInodeUpdate(dir->inode)); + } + + // fix up nested_exports? + if (containing_import != dir) { + dout(7) << "moving nested exports under hashed dir" << endl; + for (set::iterator it = cache->nested_exports[containing_import].begin(); + it != cache->nested_exports[containing_import].end(); ) { + CDir *ex = *it; + it++; + if (cache->get_auth_container(ex) == dir) { + dout(7) << " moving nested export under hashed dir: " << *ex << endl; + cache->nested_exports[containing_import].erase(ex); + cache->nested_exports[dir].insert(ex); + } else { + dout(7) << " NOT moving nested export under hashed dir: " << *ex << endl; + } + } + } + + // send hash messages + assert(hash_gather[dir].empty()); + assert(hash_notify_gather[dir].empty()); + assert(dir->hashed_subset.empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + // all nodes hashed locally.. 
+ dir->hashed_subset.insert(i); + + if (i == mds->get_nodeid()) continue; + + // init hash_gather and hash_notify_gather sets + hash_gather[dir].insert(i); + + assert(hash_notify_gather[dir][i].empty()); + for (int j=0; jget_mds_map()->get_num_mds(); j++) { + if (j == mds->get_nodeid()) continue; + if (j == i) continue; + hash_notify_gather[dir][i].insert(j); + } + + mds->send_message_mds(msgs[i], i, MDS_PORT_MIGRATOR); + } + + // wait for all the acks. +} + + +void Migrator::handle_hash_dir_ack(MHashDirAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + assert(dir->is_hashed()); + assert(dir->is_hashing()); + + int from = MSG_ADDR_NUM(m->get_source()); + assert(hash_gather[dir].count(from) == 1); + hash_gather[dir].erase(from); + + if (hash_gather[dir].empty()) { + dout(7) << "handle_hash_dir_ack on " << *dir << ", last one" << endl; + + if (hash_notify_gather[dir].empty()) { + dout(7) << "got notifies too, all done" << endl; + hash_dir_finish(dir); + } else { + dout(7) << "waiting on notifies " << endl; + } + + } else { + dout(7) << "handle_hash_dir_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; + } + + delete m; +} + + +void Migrator::hash_dir_finish(CDir *dir) +{ + dout(7) << "hash_dir_finish finishing " << *dir << endl; + assert(dir->is_hashed()); + assert(dir->is_hashing()); + + // dir state + hash_gather.erase(dir); + dir->state_clear(CDIR_STATE_HASHING); + dir->put(CDIR_PIN_HASHING); + dir->hashed_subset.clear(); + + // unproxy inodes + // this _could_ happen sooner, on a per-peer basis, but no harm in waiting a few more seconds. 
+ for (list::iterator it = hash_proxy_inos[dir].begin(); + it != hash_proxy_inos[dir].end(); + it++) { + CInode *in = *it; + assert(in->state_test(CINODE_STATE_PROXY)); + in->state_clear(CINODE_STATE_PROXY); + in->put(CINODE_PIN_PROXY); + } + hash_proxy_inos.erase(dir); + + // unpin path + vector trace; + cache->make_trace(trace, dir->inode); + cache->path_unpin(trace, 0); + + // unfreeze + dir->unfreeze_dir(); + + show_imports(); + assert(hash_gather.count(dir) == 0); + + // stats + //mds->logger->inc("nh", 1); + +} + + + + +// HASH on auth and non-auth + +void Migrator::handle_hash_dir_notify(MHashDirNotify *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + assert(dir->is_hashing()); + + dout(5) << "handle_hash_dir_notify " << *dir << endl; + int from = m->get_from(); + + int source = MSG_ADDR_NUM(m->get_source()); + if (dir->is_auth()) { + // gather notifies + assert(dir->is_hashed()); + + assert( hash_notify_gather[dir][from].count(source) ); + hash_notify_gather[dir][from].erase(source); + + if (hash_notify_gather[dir][from].empty()) { + dout(7) << "last notify from " << from << endl; + hash_notify_gather[dir].erase(from); + + if (hash_notify_gather[dir].empty()) { + dout(7) << "last notify!" 
<< endl; + hash_notify_gather.erase(dir); + + if (hash_gather[dir].empty()) { + dout(7) << "got acks too, all done" << endl; + hash_dir_finish(dir); + } else { + dout(7) << "still waiting on acks from " << hash_gather[dir] << endl; + } + } else { + dout(7) << "still waiting for notify gathers from " << hash_notify_gather[dir].size() << " others" << endl; + } + } else { + dout(7) << "still waiting for notifies from " << from << " via " << hash_notify_gather[dir][from] << endl; + } + + // delete msg + delete m; + } else { + // update dir hashed_subset + assert(dir->hashed_subset.count(from) == 0); + dir->hashed_subset.insert(from); + + // update open subdirs + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->get_inode(); + if (!in) continue; + if (!in->dir) continue; + + int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode != from) continue; // we'll import these in a minute + + if (in->dir->authority() != dentryhashcode) + in->dir->set_dir_auth( in->dir->authority() ); + else + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + } + + // remove from notify gather set + assert(hash_gather[dir].count(from)); + hash_gather[dir].erase(from); + + // last notify? 
+ if (hash_gather[dir].empty()) { + dout(7) << "gathered all the notifies, finishing hash of " << *dir << endl; + hash_gather.erase(dir); + + dir->state_clear(CDIR_STATE_HASHING); + dir->put(CDIR_PIN_HASHING); + dir->hashed_subset.clear(); + } else { + dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; + } + + // fw notify to auth + mds->send_message_mds(m, dir->authority(), MDS_PORT_MIGRATOR); + } +} + + + + +// HASH on non-auth + +/* + * discover step: + * each peer needs to open up the directory and pin it before we start + */ +class C_MDC_HashDirDiscover : public Context { + Migrator *mig; + MHashDirDiscover *m; +public: + vector trace; + C_MDC_HashDirDiscover(Migrator *mig, MHashDirDiscover *m) { + this->mig = mig; + this->m = m; + } + void finish(int r) { + CInode *in = 0; + if (r >= 0) { + if (trace.size()) + in = trace[trace.size()-1]->get_inode(); + else + in = mig->cache->get_root(); + } + mig->handle_hash_dir_discover_2(m, in, r); + } +}; + +void Migrator::handle_hash_dir_discover(MHashDirDiscover *m) +{ + assert(MSG_ADDR_NUM(m->get_source()) != mds->get_nodeid()); + + dout(7) << "handle_hash_dir_discover on " << m->get_path() << endl; + + // must discover it! + C_MDC_HashDirDiscover *onfinish = new C_MDC_HashDirDiscover(this, m); + filepath fpath(m->get_path()); + cache->path_traverse(fpath, onfinish->trace, true, + m, new C_MDS_RetryMessage(mds,m), // on delay/retry + MDS_TRAVERSE_DISCOVER, + onfinish); // on completion|error +} + +void Migrator::handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r) +{ + // yay! + if (in) { + dout(7) << "handle_hash_dir_discover_2 has " << *in << endl; + } + + if (r < 0 || !in->is_dir()) { + dout(7) << "handle_hash_dir_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << endl; + assert(0); // this shouldn't happen if the auth pins his path properly!!!! + } + assert(in->is_dir()); + + // is dir open? 
+ if (!in->dir) { + dout(7) << "handle_hash_dir_discover_2 opening dir " << *in << endl; + cache->open_remote_dir(in, + new C_MDS_RetryMessage(mds, m)); + return; + } + CDir *dir = in->dir; + + // pin dir, set hashing flag + dir->state_set(CDIR_STATE_HASHING); + dir->get(CDIR_PIN_HASHING); + assert(dir->hashed_subset.empty()); + + // inode state + dir->inode->inode.hash_seed = 1;// dir->ino(); + if (dir->inode->is_auth()) { + dir->inode->mark_dirty(); + mds->mdlog->submit_entry(new EInodeUpdate(dir->inode)); + } + + // get gather set ready for notifies + assert(hash_gather[dir].empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + if (i == dir->authority()) continue; + hash_gather[dir].insert(i); + } + + // reply + dout(7) << " sending hash_dir_discover_ack on " << *dir << endl; + mds->send_message_mds(new MHashDirDiscoverAck(dir->ino()), + m->get_source().num(), MDS_PORT_MIGRATOR); + delete m; +} + +/* + * prep step: + * peers need to open up all subdirs of the hashed dir + */ + +void Migrator::handle_hash_dir_prep(MHashDirPrep *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_hash_dir_prep " << *dir << endl; + + if (!m->did_assim()) { + m->mark_assim(); // only do this the first time! + + // assimilate dentry+inodes for exports + for (map::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + CInode *in = cache->get_inode( it->second->get_ino() ); + if (in) { + it->second->update_inode(in); + dout(5) << " updated " << *in << endl; + } else { + in = new CInode(mds->mdcache, false); + it->second->update_inode(in); + cache->add_inode(in); + + // link + dir->add_dentry( it->first, in ); + dout(5) << " added " << *in << endl; + } + + // open! + if (!in->dir) { + dout(5) << " opening nested export on " << *in << endl; + cache->open_remote_dir(in, + new C_MDS_RetryMessage(mds, m)); + } + } + } + + // verify! 
+ int waiting_for = 0; + for (map::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + CInode *in = cache->get_inode( it->second->get_ino() ); + assert(in); + + if (in->dir) { + if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { + dout(5) << " pinning nested export " << *in->dir << endl; + in->dir->get(CDIR_PIN_IMPORTINGEXPORT); + in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); + } else { + dout(5) << " already pinned nested export " << *in << endl; + } + } else { + dout(5) << " waiting for nested export dir on " << *in << endl; + waiting_for++; + } + } + + if (waiting_for) { + dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; + return; + } + + // ack! + mds->send_message_mds(new MHashDirPrepAck(dir->ino()), + m->get_source().num(), MDS_PORT_MIGRATOR); + + // done. + delete m; +} + + +/* + * hash step: + */ + +void Migrator::handle_hash_dir(MHashDir *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + assert(!dir->is_auth()); + assert(!dir->is_hashed()); + assert(dir->is_hashing()); + + dout(5) << "handle_hash_dir " << *dir << endl; + int oldauth = MSG_ADDR_NUM(m->get_source()); + + // content + import_hashed_content(dir, m->get_state(), m->get_nden(), oldauth); + + // dir state + dir->state_set(CDIR_STATE_HASHED); + dir->get(CDIR_PIN_HASHED); + cache->hashdirs.insert(dir); + dir->hashed_subset.insert(mds->get_nodeid()); + + // dir is complete + dir->mark_complete(); + dir->mark_dirty(); + mds->mdlog->submit_entry(new EDirUpdate(dir)); + + // commit + mds->mdstore->commit_dir(dir, 0); + + // send notifies + dout(7) << "sending notifies" << endl; + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + if (i == MSG_ADDR_NUM(m->get_source())) continue; + mds->send_message_mds(new MHashDirNotify(dir->ino(), mds->get_nodeid()), + i, MDS_PORT_MIGRATOR); + } + + // ack + dout(7) << "acking" << endl; + 
mds->send_message_mds(new MHashDirAck(dir->ino()), + m->get_source().num(), MDS_PORT_MIGRATOR); + + // done. + delete m; + + show_imports(); +} + + + + + +// UNHASH on auth + +class C_MDC_UnhashFreeze : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_UnhashFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} + virtual void finish(int r) { + mig->unhash_dir_frozen(dir); + } +}; + +class C_MDC_UnhashComplete : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_UnhashComplete(Migrator *m, CDir *d) : mig(m), dir(d) {} + virtual void finish(int r) { + mig->unhash_dir_complete(dir); + } +}; + + +void Migrator::unhash_dir(CDir *dir) +{ + dout(-7) << "unhash_dir " << *dir << endl; + + assert(dir->is_hashed()); + assert(!dir->is_unhashing()); + assert(dir->is_auth()); + assert(hash_gather.count(dir)==0); + + // pin path? + vector trace; + cache->make_trace(trace, dir->inode); + if (!cache->path_pin(trace, 0, 0)) { + dout(7) << "unhash_dir couldn't pin path, failing." << endl; + return; + } + + // twiddle state + dir->state_set(CDIR_STATE_UNHASHING); + + // first, freeze the dir. 
+ dir->freeze_dir(new C_MDC_UnhashFreeze(this, dir)); + + // make complete + if (!dir->is_complete()) { + dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; + mds->mdstore->fetch_dir(dir, + new C_MDC_UnhashComplete(this, dir)); + } else + unhash_dir_complete(dir); + +} + +void Migrator::unhash_dir_frozen(CDir *dir) +{ + dout(7) << "unhash_dir_frozen " << *dir << endl; + + assert(dir->is_hashed()); + assert(dir->is_auth()); + assert(dir->is_frozen_dir()); + + if (!dir->is_complete()) { + dout(7) << "unhash_dir_frozen !complete, waiting still on " << *dir << endl; + } else + unhash_dir_prep(dir); +} + + +/* + * ask peers to freeze and complete hashed dir + */ +void Migrator::unhash_dir_prep(CDir *dir) +{ + dout(7) << "unhash_dir_prep " << *dir << endl; + assert(dir->is_hashed()); + assert(dir->is_auth()); + assert(dir->is_frozen_dir()); + assert(dir->is_complete()); + + if (!hash_gather[dir].empty()) return; // already been here..freeze must have been instantaneous + + // send unhash prep to all peers + assert(hash_gather[dir].empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + hash_gather[dir].insert(i); + mds->send_message_mds(new MUnhashDirPrep(dir->ino()), + i, MDS_PORT_MIGRATOR); + } +} + +/* + * wait for peers to freeze and complete hashed dirs + */ +void Migrator::handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + int from = MSG_ADDR_NUM(m->get_source()); + dout(7) << "handle_unhash_dir_prep_ack from " << from << " " << *dir << endl; + + if (!m->did_assim()) { + m->mark_assim(); // only do this the first time! 
+ + // assimilate dentry+inodes for exports + for (map::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + CInode *in = cache->get_inode( it->second->get_ino() ); + if (in) { + it->second->update_inode(in); + dout(5) << " updated " << *in << endl; + } else { + in = new CInode(mds->mdcache, false); + it->second->update_inode(in); + cache->add_inode(in); + + // link + dir->add_dentry( it->first, in ); + dout(5) << " added " << *in << endl; + } + + // open! + if (!in->dir) { + dout(5) << " opening nested export on " << *in << endl; + cache->open_remote_dir(in, + new C_MDS_RetryMessage(mds, m)); + } + } + } + + // verify! + int waiting_for = 0; + for (map::iterator it = m->get_inodes().begin(); + it != m->get_inodes().end(); + it++) { + CInode *in = cache->get_inode( it->second->get_ino() ); + assert(in); + + if (in->dir) { + if (!in->dir->state_test(CDIR_STATE_IMPORTINGEXPORT)) { + dout(5) << " pinning nested export " << *in->dir << endl; + in->dir->get(CDIR_PIN_IMPORTINGEXPORT); + in->dir->state_set(CDIR_STATE_IMPORTINGEXPORT); + } else { + dout(5) << " already pinned nested export " << *in << endl; + } + } else { + dout(5) << " waiting for nested export dir on " << *in << endl; + waiting_for++; + } + } + + if (waiting_for) { + dout(5) << "waiting for " << waiting_for << " dirs to open" << endl; + return; + } + + // ok, done with this PrepAck + assert(hash_gather[dir].count(from) == 1); + hash_gather[dir].erase(from); + + if (hash_gather[dir].empty()) { + hash_gather.erase(dir); + dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", last one" << endl; + unhash_dir_go(dir); + } else { + dout(7) << "handle_unhash_dir_prep_ack on " << *dir << ", waiting for " << hash_gather[dir] << endl; + } + + delete m; +} + + +/* + * auth: + * send out MHashDir's to peers + */ +void Migrator::unhash_dir_go(CDir *dir) +{ + dout(7) << "unhash_dir_go " << *dir << endl; + assert(dir->is_hashed()); + assert(dir->is_auth()); + 
assert(dir->is_frozen_dir()); + assert(dir->is_complete()); + + // send unhash prep to all peers + assert(hash_gather[dir].empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + hash_gather[dir].insert(i); + mds->send_message_mds(new MUnhashDir(dir->ino()), + i, MDS_PORT_MIGRATOR); + } +} + +/* + * auth: + * assimilate unhashing content + */ +void Migrator::handle_unhash_dir_ack(MUnhashDirAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir_ack " << *dir << endl; + assert(dir->is_hashed()); + + // assimilate content + int from = MSG_ADDR_NUM(m->get_source()); + import_hashed_content(dir, m->get_state(), m->get_nden(), from); + delete m; + + // done? + assert(hash_gather[dir].count(from)); + hash_gather[dir].erase(from); + + if (!hash_gather[dir].empty()) { + dout(7) << "still waiting for unhash acks from " << hash_gather[dir] << endl; + return; + } + + // done! + + // fix up nested_exports + CDir *containing_import = cache->get_auth_container(dir); + if (containing_import != dir) { + for (set::iterator it = cache->nested_exports[dir].begin(); + it != cache->nested_exports[dir].end(); + it++) { + dout(7) << "moving nested export out from under hashed dir : " << **it << endl; + cache->nested_exports[containing_import].insert(*it); + } + cache->nested_exports.erase(dir); + } + + // dir state + //dir->state_clear(CDIR_STATE_UNHASHING); //later + dir->state_clear(CDIR_STATE_HASHED); + dir->put(CDIR_PIN_HASHED); + cache->hashdirs.erase(dir); + + // commit! 
+ assert(dir->is_complete()); + //dir->mark_complete(); + dir->mark_dirty(); + mds->mdstore->commit_dir(dir, 0); + + // inode state + dir->inode->inode.hash_seed = 0; + if (dir->inode->is_auth()) { + dir->inode->mark_dirty(); + mds->mdlog->submit_entry(new EInodeUpdate(dir->inode)); + } + + // notify + assert(hash_gather[dir].empty()); + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == mds->get_nodeid()) continue; + + hash_gather[dir].insert(i); + + mds->send_message_mds(new MUnhashDirNotify(dir->ino()), + i, MDS_PORT_MIGRATOR); + } +} + + +/* + * sent by peer to flush mds links. unfreeze when all gathered. + */ +void Migrator::handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir_ack " << *dir << endl; + assert(!dir->is_hashed()); + assert(dir->is_unhashing()); + assert(dir->is_frozen_dir()); + + // done? + int from = MSG_ADDR_NUM(m->get_source()); + assert(hash_gather[dir].count(from)); + hash_gather[dir].erase(from); + delete m; + + if (!hash_gather[dir].empty()) { + dout(7) << "still waiting for notifyack from " << hash_gather[dir] << " on " << *dir << endl; + } else { + unhash_dir_finish(dir); + } +} + + +/* + * all mds links are flushed. unfreeze dir! + */ +void Migrator::unhash_dir_finish(CDir *dir) +{ + dout(7) << "unhash_dir_finish " << *dir << endl; + hash_gather.erase(dir); + + // unpin path + vector trace; + cache->make_trace(trace, dir->inode); + cache->path_unpin(trace, 0); + + // state + dir->state_clear(CDIR_STATE_UNHASHING); + + // unfreeze + dir->unfreeze_dir(); + +} + + + +// UNHASH on all + +/* + * hashed dir is complete. 
+ * mark all migrating inodes dirty (to pin in cache) + * if frozen too, then go to next step (depending on auth) + */ +void Migrator::unhash_dir_complete(CDir *dir) +{ + dout(7) << "unhash_dir_complete " << *dir << ", dirtying inodes" << endl; + + assert(dir->is_hashed()); + assert(dir->is_complete()); + + // mark dirty to pin in cache + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CInode *in = it->second->inode; + if (in->is_auth()) { + in->mark_dirty(); + mds->mdlog->submit_entry(new EInodeUpdate(in)); + } + } + + if (!dir->is_frozen_dir()) { + dout(7) << "dir complete but !frozen, waiting " << *dir << endl; + } else { + if (dir->is_auth()) + unhash_dir_prep(dir); // auth + else + unhash_dir_prep_finish(dir); // nonauth + } +} + + +// UNHASH on non-auth + +class C_MDC_UnhashPrepFreeze : public Context { +public: + Migrator *mig; + CDir *dir; + C_MDC_UnhashPrepFreeze(Migrator *m, CDir *d) : mig(m), dir(d) {} + virtual void finish(int r) { + mig->unhash_dir_prep_frozen(dir); + } +}; + + +/* + * peers need to freeze their dir and make them complete + */ +void Migrator::handle_unhash_dir_prep(MUnhashDirPrep *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir_prep " << *dir << endl; + assert(dir->is_hashed()); + + // freeze + dir->freeze_dir(new C_MDC_UnhashPrepFreeze(this, dir)); + + // make complete + if (!dir->is_complete()) { + dout(7) << "unhash_dir " << *dir << " not complete, fetching" << endl; + mds->mdstore->fetch_dir(dir, + new C_MDC_UnhashComplete(this, dir)); + } else { + unhash_dir_complete(dir); + } + + delete m; +} + +/* + * peer has hashed dir frozen. + * complete too? 
+ */ +void Migrator::unhash_dir_prep_frozen(CDir *dir) +{ + dout(7) << "unhash_dir_prep_frozen " << *dir << endl; + + assert(dir->is_hashed()); + assert(dir->is_frozen_dir()); + assert(!dir->is_auth()); + + if (!dir->is_complete()) { + dout(7) << "unhash_dir_prep_frozen !complete, waiting still on " << *dir << endl; + } else + unhash_dir_prep_finish(dir); +} + +/* + * peer has hashed dir complete and frozen. ack. + */ +void Migrator::unhash_dir_prep_finish(CDir *dir) +{ + dout(7) << "unhash_dir_prep_finish " << *dir << endl; + assert(dir->is_hashed()); + assert(!dir->is_auth()); + assert(dir->is_frozen()); + assert(dir->is_complete()); + + // twiddle state + if (dir->is_unhashing()) + return; // already replied. + dir->state_set(CDIR_STATE_UNHASHING); + + // send subdirs back to auth + MUnhashDirPrepAck *ack = new MUnhashDirPrepAck(dir->ino()); + int auth = dir->authority(); + + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->inode; + + if (!in->is_dir()) continue; + if (!in->dir) continue; + + int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode != mds->get_nodeid()) continue; + + // msg? + ack->add_inode(it->first, in->replicate_to(auth)); + } + + // ack + mds->send_message_mds(ack, auth, MDS_PORT_MIGRATOR); +} + + + +/* + * peer needs to send hashed dir content back to auth. + * unhash dir. + */ +void Migrator::handle_unhash_dir(MUnhashDir *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir " << *dir << endl;//" .. 
hash_seed is " << dir->inode->inode.hash_seed << endl; + assert(dir->is_hashed()); + assert(dir->is_unhashing()); + assert(!dir->is_auth()); + + // get message ready + bufferlist bl; + int nden = 0; + + // suck up all waiters + C_Contexts *fin = new C_Contexts; + list waiting; + dir->take_waiting(CDIR_WAIT_ANY, waiting); // all dir waiters + fin->take(waiting); + + // divy up contents + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + CInode *in = dn->inode; + + int dentryhashcode = mds->hash_dentry( dir->ino(), it->first ); + if (dentryhashcode != mds->get_nodeid()) { + // not mine! + // twiddle dir_auth? + if (in->dir) { + if (in->dir->authority() != dir->authority()) + in->dir->set_dir_auth( in->dir->authority() ); + else + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + } + continue; + } + + // -- dentry + dout(7) << "unhash_dir_go sending to " << dentryhashcode << " dn " << *dn << endl; + _encode(it->first, bl); + + // null dentry? + if (dn->is_null()) { + bl.append("N", 1); // null dentry + assert(dn->is_sync()); + continue; + } + + if (dn->is_remote()) { + // remote link + bl.append("L", 1); // remote link + + inodeno_t ino = dn->get_remote_ino(); + bl.append((char*)&ino, sizeof(ino)); + continue; + } + + // primary link + // -- inode + bl.append("I", 1); // inode dentry + + encode_export_inode(in, bl, dentryhashcode); // encode, and (update state for) export + nden++; + + if (dn->is_dirty()) + dn->mark_clean(); + + // proxy + in->state_set(CINODE_STATE_PROXY); + in->get(CINODE_PIN_PROXY); + hash_proxy_inos[dir].push_back(in); + + if (in->dir) { + if (in->dir->is_auth()) { + // mine. make it into an import. + dout(7) << "making subdir into import " << *in->dir << endl; + in->dir->set_dir_auth( mds->get_nodeid() ); + cache->imports.insert(in->dir); + in->dir->get(CDIR_PIN_IMPORT); + in->dir->state_set(CDIR_STATE_IMPORT); + } + else { + // not mine. 
+ dout(7) << "un-exporting subdir that's being unhashed away " << *in->dir << endl; + assert(in->dir->is_export()); + in->dir->put(CDIR_PIN_EXPORT); + in->dir->state_clear(CDIR_STATE_EXPORT); + cache->exports.erase(in->dir); + cache->nested_exports[dir].erase(in->dir); + } + } + + // waiters + list waiters; + in->take_waiting(CINODE_WAIT_ANY, waiters); + fin->take(waiters); + } + + // we should have no nested exports; we're not auth for the dir! + assert(cache->nested_exports[dir].empty()); + cache->nested_exports.erase(dir); + + // dir state + //dir->state_clear(CDIR_STATE_UNHASHING); // later + dir->state_clear(CDIR_STATE_HASHED); + dir->put(CDIR_PIN_HASHED); + cache->hashdirs.erase(dir); + dir->mark_clean(); + + // inode state + dir->inode->inode.hash_seed = 0; + if (dir->inode->is_auth()) { + dir->inode->mark_dirty(); + mds->mdlog->submit_entry(new EInodeUpdate(dir->inode)); + } + + // init gather set + hash_gather[dir] = mds->get_mds_map()->get_mds(); + hash_gather[dir].erase(mds->get_nodeid()); + + // send unhash message + mds->send_message_mds(new MUnhashDirAck(dir->ino(), bl, nden), + dir->authority(), MDS_PORT_MIGRATOR); +} + + +/* + * first notify comes from auth. + * send notifies to all other peers, with peer = self + * if we get notify from peer=other, remove from our gather list. + * when we've gotten notifies from everyone, + * unpin proxies, + * send notify_ack to auth. + * this ensures that all mds links are flushed of cache_expire type messages. + */ +void Migrator::handle_unhash_dir_notify(MUnhashDirNotify *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + CDir *dir = in->dir; + assert(dir); + + dout(7) << "handle_unhash_dir_finish " << *dir << endl; + assert(!dir->is_hashed()); + assert(dir->is_unhashing()); + assert(!dir->is_auth()); + + int from = MSG_ADDR_NUM(m->get_source()); + assert(hash_gather[dir].count(from) == 1); + hash_gather[dir].erase(from); + delete m; + + // did we send our shout out? 
+ if (from == dir->authority()) { + // send notify to everyone else in weird chatter storm + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i == from) continue; + if (i == mds->get_nodeid()) continue; + mds->send_message_mds(new MUnhashDirNotify(dir->ino()), i, MDS_PORT_MIGRATOR); + } + } + + // are we done? + if (!hash_gather[dir].empty()) { + dout(7) << "still waiting for notify from " << hash_gather[dir] << endl; + return; + } + hash_gather.erase(dir); + + // all done! + dout(7) << "all mds links flushed, unpinning unhash proxies" << endl; + + // unpin proxies + for (list::iterator it = hash_proxy_inos[dir].begin(); + it != hash_proxy_inos[dir].end(); + it++) { + CInode *in = *it; + assert(in->state_test(CINODE_STATE_PROXY)); + in->state_clear(CINODE_STATE_PROXY); + in->put(CINODE_PIN_PROXY); + } + + // unfreeze + dir->unfreeze_dir(); + + // ack + dout(7) << "sending notify_ack to auth for unhash of " << *dir << endl; + mds->send_message_mds(new MUnhashDirNotifyAck(dir->ino()), dir->authority(), MDS_PORT_MIGRATOR); + +} + + + + +void Migrator::show_imports() +{ + mds->balancer->show_imports(); +} diff --git a/branches/sage/cephmds2/mds/Migrator.h b/branches/sage/cephmds2/mds/Migrator.h new file mode 100644 index 0000000000000..eac7d2046690b --- /dev/null +++ b/branches/sage/cephmds2/mds/Migrator.h @@ -0,0 +1,199 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MDS_MIGRATOR_H +#define __MDS_MIGRATOR_H + +#include "include/types.h" + +#include +#include +#include +using std::map; +using std::list; +using std::set; + + +class MDS; +class CDir; +class CInode; +class CDentry; + +class MExportDirDiscover; +class MExportDirDiscoverAck; +class MExportDirPrep; +class MExportDirPrepAck; +class MExportDirWarning; +class MExportDir; +class MExportDirNotify; +class MExportDirNotifyAck; +class MExportDirFinish; + +class MHashDirDiscover; +class MHashDirDiscoverAck; +class MHashDirPrep; +class MHashDirPrepAck; +class MHashDir; +class MHashDirAck; +class MHashDirNotify; + +class MUnhashDirPrep; +class MUnhashDirPrepAck; +class MUnhashDir; +class MUnhashDirAck; +class MUnhashDirNotify; +class MUnhashDirNotifyAck; + +class Migrator { +private: + MDS *mds; + MDCache *cache; + + // export fun + map > export_notify_ack_waiting; // nodes i am waiting to get export_notify_ack's from + map > export_proxy_inos; + map > export_proxy_dirinos; + + set stray_export_warnings; // notifies i haven't seen + map stray_export_notifies; + + // hashing madness + multimap unhash_waiting; // nodes i am waiting for UnhashDirAck's from + multimap import_hashed_replicate_waiting; // nodes i am waiting to discover to complete my import of a hashed dir + // maps frozen_dir_ino's to waiting-for-discover ino's. 
+ multimap import_hashed_frozen_waiting; // dirs i froze (for the above) + +public: + // -- cons -- + Migrator(MDS *m, MDCache *c) : mds(m), cache(c) {} + + void dispatch(Message*); + + // -- import/export -- + // exporter + public: + void export_dir(CDir *dir, + int mds); + void export_empty_import(CDir *dir); + + void encode_export_inode(CInode *in, bufferlist& enc_state, int newauth); + void decode_import_inode(CDentry *dn, bufferlist& bl, int &off, int oldauth); + + protected: + map< CDir*, set > export_gather; + void handle_export_dir_discover_ack(MExportDirDiscoverAck *m); + void export_dir_frozen(CDir *dir, int dest); + void handle_export_dir_prep_ack(MExportDirPrepAck *m); + void export_dir_go(CDir *dir, + int dest); + int export_dir_walk(MExportDir *req, + class C_Contexts *fin, + CDir *basedir, + CDir *dir, + int newauth); + void export_dir_finish(CDir *dir); + void handle_export_dir_notify_ack(MExportDirNotifyAck *m); + + + friend class C_MDC_ExportFreeze; + + // importer + void handle_export_dir_discover(MExportDirDiscover *m); + void handle_export_dir_discover_2(MExportDirDiscover *m, CInode *in, int r); + void handle_export_dir_prep(MExportDirPrep *m); + void handle_export_dir(MExportDir *m); + void import_dir_finish(CDir *dir); + void handle_export_dir_finish(MExportDirFinish *m); + int import_dir_block(bufferlist& bl, + int& off, + int oldauth, + CDir *import_root, + list& imported_subdirs); + void got_hashed_replica(CDir *import, + inodeno_t dir_ino, + inodeno_t replica_ino); + + + friend class C_MDC_ExportDirDiscover; + + // bystander + void handle_export_dir_warning(MExportDirWarning *m); + void handle_export_dir_notify(MExportDirNotify *m); + + void show_imports(); + + // -- hashed directories -- + + // HASH + public: + void hash_dir(CDir *dir); // on auth + protected: + map< CDir*, set > hash_gather; + map< CDir*, map< int, set > > hash_notify_gather; + map< CDir*, list > hash_proxy_inos; + + // hash on auth + void 
handle_hash_dir_discover_ack(MHashDirDiscoverAck *m); + void hash_dir_complete(CDir *dir); + void hash_dir_frozen(CDir *dir); + void handle_hash_dir_prep_ack(MHashDirPrepAck *m); + void hash_dir_go(CDir *dir); + void handle_hash_dir_ack(MHashDirAck *m); + void hash_dir_finish(CDir *dir); + friend class C_MDC_HashFreeze; + friend class C_MDC_HashComplete; + + // auth and non-auth + void handle_hash_dir_notify(MHashDirNotify *m); + + // hash on non-auth + void handle_hash_dir_discover(MHashDirDiscover *m); + void handle_hash_dir_discover_2(MHashDirDiscover *m, CInode *in, int r); + void handle_hash_dir_prep(MHashDirPrep *m); + void handle_hash_dir(MHashDir *m); + friend class C_MDC_HashDirDiscover; + + // UNHASH + public: + void unhash_dir(CDir *dir); // on auth + protected: + map< CDir*, list > unhash_content; + void import_hashed_content(CDir *dir, bufferlist& bl, int nden, int oldauth); + + // unhash on auth + void unhash_dir_frozen(CDir *dir); + void unhash_dir_prep(CDir *dir); + void handle_unhash_dir_prep_ack(MUnhashDirPrepAck *m); + void unhash_dir_go(CDir *dir); + void handle_unhash_dir_ack(MUnhashDirAck *m); + void handle_unhash_dir_notify_ack(MUnhashDirNotifyAck *m); + void unhash_dir_finish(CDir *dir); + friend class C_MDC_UnhashFreeze; + friend class C_MDC_UnhashComplete; + + // unhash on all + void unhash_dir_complete(CDir *dir); + + // unhash on non-auth + void handle_unhash_dir_prep(MUnhashDirPrep *m); + void unhash_dir_prep_frozen(CDir *dir); + void unhash_dir_prep_finish(CDir *dir); + void handle_unhash_dir(MUnhashDir *m); + void handle_unhash_dir_notify(MUnhashDirNotify *m); + friend class C_MDC_UnhashPrepFreeze; + + +}; + + +#endif diff --git a/branches/sage/cephmds2/mds/OSDMonitor.cc b/branches/sage/cephmds2/mds/OSDMonitor.cc new file mode 100644 index 0000000000000..0c7cadbce3a6d --- /dev/null +++ b/branches/sage/cephmds2/mds/OSDMonitor.cc @@ -0,0 +1,523 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - 
scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "OSDMonitor.h" + +#include "osd/OSDMap.h" + +#include "msg/Message.h" +#include "msg/Messenger.h" + +#include "messages/MPing.h" +#include "messages/MPingAck.h" +#include "messages/MOSDFailure.h" +#include "messages/MOSDMap.h" +#include "messages/MOSDGetMap.h" +#include "messages/MOSDBoot.h" +#include "messages/MOSDIn.h" +#include "messages/MOSDOut.h" + +#include "common/Timer.h" +#include "common/Clock.h" + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << "mon" << whoami << " e" << (osdmap ? osdmap->get_epoch():0) << " " + + +class C_OM_PingTick : public Context { +public: + Messenger *msgr; + C_OM_PingTick(Messenger *m) : msgr(m) {} + void finish(int r) { + msgr->send_message(new MPing, MSG_ADDR_MON(0)); + } +}; + +class C_OM_Faker : public Context { +public: + OSDMonitor *om; + C_OM_Faker(OSDMonitor *m) { + this->om = m; + } + void finish(int r) { + om->fake_reorg(); + } +}; + +class C_OM_FakeOSDFailure : public Context { + OSDMonitor *mon; + int osd; + bool down; +public: + C_OM_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {} + void finish(int r) { + mon->fake_osd_failure(osd,down); + } +}; + + + +void OSDMonitor::fake_osdmap_update() +{ + dout(1) << "fake_osdmap_update" << endl; + accept_pending(); + + // tell a random osd + send_incremental_map(osdmap->get_epoch()-1, // ick! 
FIXME + MSG_ADDR_OSD(rand() % g_conf.num_osd)); +} + + +void OSDMonitor::fake_reorg() +{ + int r = rand() % g_conf.num_osd; + + if (osdmap->is_out(r)) { + dout(1) << "fake_reorg marking osd" << r << " in" << endl; + pending.new_in.push_back(r); + } else { + dout(1) << "fake_reorg marking osd" << r << " out" << endl; + pending.new_out.push_back(r); + } + + accept_pending(); + + // tell him! + send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_OSD(r)); +} + + +void OSDMonitor::init() +{ + dout(1) << "init" << endl; + + + // + osdmap = new OSDMap(); + osdmap->set_pg_bits(g_conf.osd_pg_bits); + + // start at epoch 0 until all osds boot + //osdmap->inc_epoch(); // = 1 + //assert(osdmap->get_epoch() == 1); + + + //if (g_conf.mkfs) osdmap->set_mkfs(); + + Bucket *b = new UniformBucket(1, 0); + int root = osdmap->crush.add_bucket(b); + for (int i=0; iosds.insert(i); + b->add_item(i, 1); + } + + for (int i=1; i<5; i++) { + osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0)); + osdmap->crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + } + + if (g_conf.mds_local_osd) { + // add mds osds, but don't put them in the crush mapping func + for (int i=0; iosds.insert(i+10000); + } + + // + + + + if (whoami == 0 && + g_conf.num_osd > 4 && + g_conf.fake_osdmap_expand) { + dout(1) << "scheduling OSD map reorg at " << g_conf.fake_osdmap_expand << endl; + g_timer.add_event_after(g_conf.fake_osdmap_expand, + new C_OM_Faker(this)); + } + + if (whoami == 0) { + // fake osd failures + for (map::iterator i = g_fake_osd_down.begin(); + i != g_fake_osd_down.end(); + i++) { + dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl; + g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 1)); + } + for (map::iterator i = g_fake_osd_out.begin(); + i != g_fake_osd_out.end(); + i++) { + dout(0) << "will fake osd" << i->first << " OUT after " << 
i->second << endl; + g_timer.add_event_after(i->second, new C_OM_FakeOSDFailure(this, i->first, 0)); + } + } + + + // i'm ready! + messenger->set_dispatcher(this); + + // start ticker + g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger)); +} + + +void OSDMonitor::dispatch(Message *m) +{ + switch (m->get_type()) { + case MSG_OSD_FAILURE: + handle_osd_failure((MOSDFailure*)m); + break; + + case MSG_PING_ACK: + handle_ping_ack((MPingAck*)m); + break; + + case MSG_OSD_GETMAP: + handle_osd_getmap((MOSDGetMap*)m); + return; + + case MSG_OSD_BOOT: + handle_osd_boot((MOSDBoot*)m); + return; + + case MSG_OSD_IN: + handle_osd_in((MOSDIn*)m); + break; + case MSG_OSD_OUT: + handle_osd_out((MOSDOut*)m); + break; + + case MSG_SHUTDOWN: + handle_shutdown(m); + return; + + case MSG_PING: + tick(); + delete m; + return; + + default: + dout(0) << "unknown message " << *m << endl; + assert(0); + } +} + + +void OSDMonitor::handle_shutdown(Message *m) +{ + dout(1) << "shutdown from " << m->get_source() << endl; + messenger->shutdown(); + delete messenger; + delete m; +} + +void OSDMonitor::handle_ping_ack(MPingAck *m) +{ + // ... + + delete m; +} + +void OSDMonitor::handle_osd_failure(MOSDFailure *m) +{ + dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl; + + // FIXME? + + // take their word for it + int from = m->get_failed().num(); + if (osdmap->is_up(from) && + (osdmap->osd_inst.count(from) == 0 || + osdmap->osd_inst[from] == m->get_inst())) { + pending.new_down[from] = m->get_inst(); + + if (osdmap->is_in(from)) + pending_out[from] = g_clock.now(); + + //awaiting_maps[pending.epoch][m->get_source()] = + + accept_pending(); + bcast_latest_osd_map_mds(); + //bcast_latest_osd_map_osd(); // FIXME: which osds can i tell? 
+ } + + send_incremental_map(m->get_epoch(), m->get_source()); + + delete m; +} + + + +void OSDMonitor::fake_osd_failure(int osd, bool down) +{ + if (down) { + dout(1) << "fake_osd_failure DOWN osd" << osd << endl; + pending.new_down[osd] = osdmap->osd_inst[osd]; + } else { + dout(1) << "fake_osd_failure OUT osd" << osd << endl; + pending.new_out.push_back(osd); + } + accept_pending(); + bcast_latest_osd_map_osd(); + bcast_latest_osd_map_mds(); +} + + +void OSDMonitor::handle_osd_boot(MOSDBoot *m) +{ + dout(7) << "osd_boot from " << m->get_source() << endl; + assert(m->get_source().is_osd()); + int from = m->get_source().num(); + + if (osdmap->get_epoch() == 0) { + // waiting for boot! + osdmap->osd_inst[from] = m->get_source_inst(); + + if (osdmap->osd_inst.size() == osdmap->osds.size()) { + dout(-7) << "osd_boot all osds booted." << endl; + osdmap->inc_epoch(); + osdmap->encode(maps[osdmap->get_epoch()]); // 1 + pending.epoch = osdmap->get_epoch()+1; // 2 + + send_map(); + bcast_latest_osd_map_osd(); + bcast_latest_osd_map_mds(); + } else { + dout(7) << "osd_boot waiting for " + << (osdmap->osds.size() - osdmap->osd_inst.size()) + << " osds to boot" << endl; + } + return; + } + + // already up? mark down first? + if (osdmap->is_up(from)) { + assert(m->get_source_inst() > osdmap->osd_inst[from]); // this better be newer! + pending.new_down[from] = osdmap->osd_inst[from]; + accept_pending(); + } + + // mark up. + pending_out.erase(from); + assert(osdmap->is_down(from)); + pending.new_up[from] = m->get_source_inst(); + + // mark in? 
+ if (osdmap->out_osds.count(from)) + pending.new_in.push_back(from); + + accept_pending(); + + // the booting osd will spread word + send_incremental_map(m->sb.current_epoch, m->get_source()); + delete m; + + // tell mds + bcast_latest_osd_map_mds(); +} + +void OSDMonitor::handle_osd_in(MOSDIn *m) +{ + dout(7) << "osd_in from " << m->get_source() << endl; + int from = m->get_source().num(); + if (osdmap->is_out(from)) { + pending.new_in.push_back(from); + accept_pending(); + send_incremental_map(m->map_epoch, m->get_source()); + } +} + +void OSDMonitor::handle_osd_out(MOSDOut *m) +{ + dout(7) << "osd_out from " << m->get_source() << endl; + int from = m->get_source().num(); + if (osdmap->is_in(from)) { + pending.new_out.push_back(from); + accept_pending(); + send_incremental_map(m->map_epoch, m->get_source()); + } +} + + +void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) +{ + dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl; + + if (osdmap->get_epoch() == 0) { + awaiting_map[1][m->get_source()] = m->get_since(); + } else { + if (m->get_since()) + send_incremental_map(m->get_since(), m->get_source()); + else + send_full_map(m->get_source()); + } + delete m; +} + + + +void OSDMonitor::accept_pending() +{ + dout(-10) << "accept_pending " << osdmap->get_epoch() << " -> " << pending.epoch << endl; + + // accept pending into a new map! + pending.encode( inc_maps[ pending.epoch ] ); + + // advance! 
  osdmap->apply_incremental(pending);


  // tell me about it
  // (each change is written to both dout and derr; up/down changes are also
  //  passed on to the messenger)
  // NOTE(review): the template arguments of the iterator types in the four
  // loops below were lost in this copy of the patch (text between '<' and
  // '>' was stripped) -- restore from upstream.
  for (map::iterator i = pending.new_up.begin();
       i != pending.new_up.end();
       i++) {
    dout(0) << "osd" << i->first << " UP " << i->second << endl;
    derr(0) << "osd" << i->first << " UP " << i->second << endl;
    messenger->mark_up(MSG_ADDR_OSD(i->first), i->second);
  }
  for (map::iterator i = pending.new_down.begin();
       i != pending.new_down.end();
       i++) {
    dout(0) << "osd" << i->first << " DOWN " << i->second << endl;
    derr(0) << "osd" << i->first << " DOWN " << i->second << endl;
    messenger->mark_down(MSG_ADDR_OSD(i->first), i->second);
  }
  for (list::iterator i = pending.new_in.begin();
       i != pending.new_in.end();
       i++) {
    dout(0) << "osd" << *i << " IN" << endl;
    derr(0) << "osd" << *i << " IN" << endl;
  }
  for (list::iterator i = pending.new_out.begin();
       i != pending.new_out.end();
       i++) {
    dout(0) << "osd" << *i << " OUT" << endl;
    derr(0) << "osd" << *i << " OUT" << endl;
  }

  // clear new pending
  // (the new incremental is numbered one past the epoch just applied)
  OSDMap::Incremental next(osdmap->get_epoch() + 1);
  pending = next;
}

/*
 * serve everyone parked in awaiting_map for the current epoch: each waiter
 * gets the incrementals after the epoch it said it had.  the waiter list
 * is swapped out and its awaiting_map entry erased before sending.
 */
void OSDMonitor::send_map()
{
  dout(10) << "send_map " << osdmap->get_epoch() << endl;

  // NOTE(review): template arguments stripped from this copy; see
  // handle_osd_getmap() for what is stored (source address -> since epoch).
  map s;
  s.swap( awaiting_map[osdmap->get_epoch()] );
  awaiting_map.erase(osdmap->get_epoch());

  for (map::iterator i = s.begin();
       i != s.end();
       i++)
    send_incremental_map(i->second, i->first);
}


/*
 * send the complete current map to one destination.
 */
void OSDMonitor::send_full_map(msg_addr_t who)
{
  messenger->send_message(new MOSDMap(osdmap), who);
}

/*
 * send dest every map epoch in (since, current], newest first.
 *
 * for each epoch the stored incremental is preferred; if there is none the
 * stored full map is used; if neither exists the assert fires.  when
 * since >= the current epoch the loop does not run and an MOSDMap with no
 * maps attached is still sent.
 */
void OSDMonitor::send_incremental_map(epoch_t since, msg_addr_t dest)
{
  dout(-10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch()
            << " to " << dest << endl;

  MOSDMap *m = new MOSDMap;

  for (epoch_t e = osdmap->get_epoch();
       e > since;
       e--) {
    // NOTE(review): bl is never used in this loop; candidate for removal.
    bufferlist bl;
    if (inc_maps.count(e)) {
      dout(-10) << "send_incremental_map inc " << e << endl;
      m->incremental_maps[e] = inc_maps[e];
    } else if (maps.count(e)) {
      dout(-10) << "send_incremental_map full " << e << endl;
      m->maps[e] = maps[e];
      //if (!full) break;
    }
    else {
      assert(0);  // we should have all maps.
    }
  }

  messenger->send_message(m, dest);
}



/*
 * tell the mds about the latest map epoch.
 */
void OSDMonitor::bcast_latest_osd_map_mds()
{
  epoch_t e = osdmap->get_epoch();
  dout(1) << "bcast_latest_osd_map_mds epoch " << e << endl;

  // tell mds
  // NOTE(review): the loop header and the start of the call are truncated in
  // this copy of the patch (text between '<' and '>' was lost).  what is
  // left matches the tail of a send_incremental_map(osdmap->get_epoch()-1, ...)
  // call like the one in bcast_latest_osd_map_osd() -- restore from upstream.
  for (int i=0; iget_epoch()-1, MSG_ADDR_MDS(i));
  }
}

/*
 * send the newest incremental (everything after epoch-1) to every osd the
 * map does not consider down.
 */
void OSDMonitor::bcast_latest_osd_map_osd()
{
  epoch_t e = osdmap->get_epoch();
  dout(1) << "bcast_latest_osd_map_osd epoch " << e << endl;

  // tell osds
  // NOTE(review): template arguments stripped from this copy.
  set osds;
  osdmap->get_all_osds(osds);
  for (set::iterator it = osds.begin();
       it != osds.end();
       it++) {
    if (osdmap->is_down(*it)) continue;

    send_incremental_map(osdmap->get_epoch()-1, MSG_ADDR_OSD(*it));
  }
}



/*
 * periodic check: any osd that has sat in pending_out for at least
 * g_conf.mon_osd_down_out_interval seconds is marked out.
 *
 * candidates are collected first and applied in a second loop, so
 * pending_out is not modified while it is being iterated.  each osd is
 * accepted as its own map epoch (accept_pending() runs once per osd).
 */
void OSDMonitor::tick()
{
  dout(10) << "tick" << endl;

  // mark down osds out?
  utime_t now = g_clock.now();
  // NOTE(review): template arguments stripped from this copy (here and on
  // the two iterator types below).
  list mark_out;
  for (map::iterator i = pending_out.begin();
       i != pending_out.end();
       i++) {
    // how long has this osd been down?
    utime_t down = now;
    down -= i->second;

    if (down.sec() >= g_conf.mon_osd_down_out_interval) {
      dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << endl;
      mark_out.push_back(i->first);
    }
  }
  for (list::iterator i = mark_out.begin();
       i != mark_out.end();
       i++) {
    pending_out.erase(*i);
    pending.new_out.push_back( *i );
    accept_pending();
  }

  // next!
+ g_timer.add_event_after(g_conf.mon_tick_interval, new C_OM_PingTick(messenger)); +} diff --git a/branches/sage/cephmds2/mds/OSDMonitor.h b/branches/sage/cephmds2/mds/OSDMonitor.h new file mode 100644 index 0000000000000..cd8babc054225 --- /dev/null +++ b/branches/sage/cephmds2/mds/OSDMonitor.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __OSDMONITOR_H +#define __OSDMONITOR_H + +#include + +#include +#include +using namespace std; + +#include "include/types.h" +#include "msg/Messenger.h" + +#include "osd/OSDMap.h" + +class OSDMonitor : public Dispatcher { + // me + int whoami; + Messenger *messenger; + + // maps + OSDMap *osdmap; + map maps; + map inc_maps; + + OSDMap::Incremental pending; + + map > awaiting_map; + + // osd down -> out + map pending_out; + + + void tick(); // check state, take actions + + // maps + void accept_pending(); // accept pending, new map. + void send_map(); // send current map to waiters. 
+ void send_full_map(msg_addr_t dest); + void send_incremental_map(epoch_t since, msg_addr_t dest); + void bcast_latest_osd_map_mds(); + void bcast_latest_osd_map_osd(); + + + public: + OSDMonitor(int w, Messenger *m) : + whoami(w), + messenger(m), + osdmap(0) { + } + + void init(); + + void dispatch(Message *m); + void handle_shutdown(Message *m); + + void handle_osd_boot(class MOSDBoot *m); + void handle_osd_in(class MOSDIn *m); + void handle_osd_out(class MOSDOut *m); + void handle_osd_failure(class MOSDFailure *m); + void handle_osd_getmap(class MOSDGetMap *m); + + void handle_ping_ack(class MPingAck *m); + + // hack + void fake_osd_failure(int osd, bool down); + void fake_osdmap_update(); + void fake_reorg(); + +}; + +#endif diff --git a/branches/sage/cephmds2/mds/Renamer.cc b/branches/sage/cephmds2/mds/Renamer.cc new file mode 100644 index 0000000000000..dfea8d6336803 --- /dev/null +++ b/branches/sage/cephmds2/mds/Renamer.cc @@ -0,0 +1,915 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#include "MDCache.h" +#include "MDStore.h" +#include "CInode.h" +#include "CDir.h" +#include "MDS.h" +#include "MDSMap.h" +#include "MDLog.h" +#include "AnchorClient.h" +#include "Migrator.h" +#include "Renamer.h" + +#include "include/filepath.h" + +#include "msg/Message.h" +#include "msg/Messenger.h" + +#include "events/EInodeUpdate.h" +#include "events/EDirUpdate.h" +#include "events/EUnlink.h" + +#include "messages/MRenameWarning.h" +#include "messages/MRenameNotify.h" +#include "messages/MRenameNotifyAck.h" +#include "messages/MRename.h" +#include "messages/MRenameAck.h" +#include "messages/MRenameReq.h" +#include "messages/MRenamePrep.h" + + + +void Renamer::dispatch(Message *m) +{ + switch (m->get_type()) { + case MSG_MDS_RENAMEWARNING: + handle_rename_warning((MRenameWarning*)m); + break; + case MSG_MDS_RENAMENOTIFY: + handle_rename_notify((MRenameNotify*)m); + break; + case MSG_MDS_RENAMENOTIFYACK: + handle_rename_notify_ack((MRenameNotifyAck*)m); + break; + case MSG_MDS_RENAME: + handle_rename((MRename*)m); + break; + case MSG_MDS_RENAMEREQ: + handle_rename_req((MRenameReq*)m); + break; + case MSG_MDS_RENAMEPREP: + handle_rename_prep((MRenamePrep*)m); + break; + case MSG_MDS_RENAMEACK: + handle_rename_ack((MRenameAck*)m); + break; + + default: + assert(0); + } +} + + +// renaming! 
+ + +/* + fix_renamed_dir(): + + caller has already: + - relinked inode in new location + - fixed in->is_auth() + - set dir_auth, if appropriate + + caller has not: + - touched in->dir + - updated import/export tables +*/ +void Renamer::fix_renamed_dir(CDir *srcdir, + CInode *in, + CDir *destdir, + bool authchanged, // _inode_ auth + int dir_auth) // dir auth (for certain cases) +{ + dout(7) << "fix_renamed_dir on " << *in << endl; + dout(7) << "fix_renamed_dir on " << *in->dir << endl; + + if (in->dir->is_auth()) { + // dir ours + dout(7) << "dir is auth" << endl; + assert(!in->dir->is_export()); + + if (in->is_auth()) { + // inode now ours + + if (authchanged) { + // inode _was_ replica, now ours + dout(7) << "inode was replica, now ours. removing from import list." << endl; + assert(in->dir->is_import()); + + // not import anymore! + cache->imports.erase(in->dir); + in->dir->state_clear(CDIR_STATE_IMPORT); + in->dir->put(CDIR_PIN_IMPORT); + + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; + + // move my nested imports to in's containing import + CDir *con = cache->get_auth_container(in->dir); + assert(con); + for (set::iterator p = cache->nested_exports[in->dir].begin(); + p != cache->nested_exports[in->dir].end(); + p++) { + dout(7) << "moving nested export under new container " << *con << endl; + cache->nested_exports[con].insert(*p); + } + cache->nested_exports.erase(in->dir); + + } else { + // inode was ours, still ours. + dout(7) << "inode was ours, still ours." << endl; + assert(!in->dir->is_import()); + assert(in->dir->get_dir_auth() == CDIR_AUTH_PARENT); + + // move any exports nested beneath me? 
+ CDir *newcon = cache->get_auth_container(in->dir); + assert(newcon); + CDir *oldcon = cache->get_auth_container(srcdir); + assert(oldcon); + if (newcon != oldcon) { + dout(7) << "moving nested exports under new container" << endl; + set nested; + cache->find_nested_exports_under(oldcon, in->dir, nested); + for (set::iterator it = nested.begin(); + it != nested.end(); + it++) { + dout(7) << "moving nested export " << *it << " under new container" << endl; + cache->nested_exports[oldcon].erase(*it); + cache->nested_exports[newcon].insert(*it); + } + } + } + + } else { + // inode now replica + + if (authchanged) { + // inode was ours, but now replica + dout(7) << "inode was ours, now replica. adding to import list." << endl; + + // i am now an import + cache->imports.insert(in->dir); + in->dir->state_set(CDIR_STATE_IMPORT); + in->dir->get(CDIR_PIN_IMPORT); + + in->dir->set_dir_auth( mds->get_nodeid() ); + dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; + + // find old import + CDir *oldcon = cache->get_auth_container(srcdir); + assert(oldcon); + dout(7) << " oldcon is " << *oldcon << endl; + + // move nested exports under me + set nested; + cache->find_nested_exports_under(oldcon, in->dir, nested); + for (set::iterator it = nested.begin(); + it != nested.end(); + it++) { + dout(7) << "moving nested export " << *it << " under me" << endl; + cache->nested_exports[oldcon].erase(*it); + cache->nested_exports[in->dir].insert(*it); + } + + } else { + // inode was replica, still replica + dout(7) << "inode was replica, still replica. doing nothing." << endl; + assert(in->dir->is_import()); + + // verify dir_auth + assert(in->dir->get_dir_auth() == mds->get_nodeid()); // me, because i'm auth for dir. + assert(in->authority() != in->dir->get_dir_auth()); // inode not me. 
+ } + + assert(in->dir->is_import()); + } + + } else { + // dir is not ours + dout(7) << "dir is not auth" << endl; + + if (in->is_auth()) { + // inode now ours + + if (authchanged) { + // inode was replica, now ours + dout(7) << "inode was replica, now ours. now an export." << endl; + assert(!in->dir->is_export()); + + // now export + cache->exports.insert(in->dir); + in->dir->state_set(CDIR_STATE_EXPORT); + in->dir->get(CDIR_PIN_EXPORT); + + assert(dir_auth >= 0); // better be defined + in->dir->set_dir_auth( dir_auth ); + dout(7) << " fixing dir_auth to be " << in->dir->get_dir_auth() << endl; + + CDir *newcon = cache->get_auth_container(in->dir); + assert(newcon); + cache->nested_exports[newcon].insert(in->dir); + + } else { + // inode was ours, still ours + dout(7) << "inode was ours, still ours. did my import change?" << endl; + + // sanity + assert(in->dir->is_export()); + assert(in->dir->get_dir_auth() >= 0); + assert(in->dir->get_dir_auth() != in->authority()); + + // moved under new import? + CDir *oldcon = cache->get_auth_container(srcdir); + CDir *newcon = cache->get_auth_container(in->dir); + if (oldcon != newcon) { + dout(7) << "moving myself under new import " << *newcon << endl; + cache->nested_exports[oldcon].erase(in->dir); + cache->nested_exports[newcon].insert(in->dir); + } + } + + assert(in->dir->is_export()); + } else { + // inode now replica + + if (authchanged) { + // inode was ours, now replica + dout(7) << "inode was ours, now replica. removing from export list." 
<< endl; + assert(in->dir->is_export()); + + // remove from export list + cache->exports.erase(in->dir); + in->dir->state_clear(CDIR_STATE_EXPORT); + in->dir->put(CDIR_PIN_EXPORT); + + CDir *oldcon = cache->get_auth_container(srcdir); + assert(oldcon); + assert(cache->nested_exports[oldcon].count(in->dir) == 1); + cache->nested_exports[oldcon].erase(in->dir); + + // simplify dir_auth + if (in->authority() == in->dir->authority()) { + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + dout(7) << "simplified dir_auth to -1, inode auth is (also) " << in->authority() << endl; + } else { + assert(in->dir->get_dir_auth() >= 0); // someone else's export, + } + + } else { + // inode was replica, still replica + dout(7) << "inode was replica, still replica. do nothing." << endl; + + // fix dir_auth? + if (in->authority() == dir_auth) + in->dir->set_dir_auth( CDIR_AUTH_PARENT ); + else + in->dir->set_dir_auth( dir_auth ); + dout(7) << " fixing dir_auth to be " << dir_auth << endl; + + // do nothing. + } + + assert(!in->dir->is_export()); + } + } + + cache->show_imports(); +} + +/* + * when initiator gets an ack back for a foreign rename + */ + +class C_MDC_RenameNotifyAck : public Context { + Renamer *rn; + CInode *in; + int initiator; + +public: + C_MDC_RenameNotifyAck(Renamer *r, + CInode *i, int init) : rn(r), in(i), initiator(init) {} + void finish(int r) { + rn->file_rename_ack(in, initiator); + } +}; + + + +/************** initiator ****************/ + +/* + * when we get MRenameAck (and rename is done, notifies gone out+acked, etc.) 
+ */ +class C_MDC_RenameAck : public Context { + Renamer *mdc; + CDir *srcdir; + CInode *in; + Context *c; +public: + C_MDC_RenameAck(Renamer *mdc, CDir *srcdir, CInode *in, Context *c) { + this->mdc = mdc; + this->srcdir = srcdir; + this->in = in; + this->c = c; + } + void finish(int r) { + mdc->file_rename_finish(srcdir, in, c); + } +}; + + +void Renamer::file_rename(CDentry *srcdn, CDentry *destdn, Context *onfinish) +{ + assert(srcdn->is_xlocked()); // by me + assert(destdn->is_xlocked()); // by me + + CDir *srcdir = srcdn->dir; + string srcname = srcdn->name; + + CDir *destdir = destdn->dir; + string destname = destdn->name; + + CInode *in = srcdn->inode; + //Message *req = srcdn->xlockedby; + + + // determine the players + int srcauth = srcdir->dentry_authority(srcdn->name); + int destauth = destdir->dentry_authority(destname); + + + // FOREIGN rename? + if (srcauth != mds->get_nodeid() || + destauth != mds->get_nodeid()) { + dout(7) << "foreign rename. srcauth " << srcauth << ", destauth " << destauth << ", isdir " << srcdn->inode->is_dir() << endl; + + string destpath; + destdn->make_path(destpath); + + if (destauth != mds->get_nodeid()) { + // make sure dest has dir open. + dout(7) << "file_rename i'm not dest auth. sending MRenamePrep to " << destauth << endl; + + // prep dest first, they must have the dir open! rest will follow. + string srcpath; + srcdn->make_path(srcpath); + + MRenamePrep *m = new MRenamePrep(mds->get_nodeid(), // i'm the initiator + srcdir->ino(), srcname, srcpath, + destdir->ino(), destname, destpath, + srcauth); // tell dest who src is (maybe even me) + mds->send_message_mds(m, destauth, MDS_PORT_CACHE); + + cache->show_imports(); + + } + + else if (srcauth != mds->get_nodeid()) { + if (destauth == mds->get_nodeid()) { + dout(7) << "file_rename dest auth, not src auth. sending MRenameReq" << endl; + } else { + dout(7) << "file_rename neither src auth nor dest auth. 
sending MRenameReq" << endl; + } + + // srcdn not important on destauth, just request + MRenameReq *m = new MRenameReq(mds->get_nodeid(), // i'm the initiator + srcdir->ino(), srcname, + destdir->ino(), destname, destpath, destauth); // tell src who dest is (they may not know) + mds->send_message_mds(m, srcauth, MDS_PORT_CACHE); + } + + else + assert(0); + + // set waiter on the inode (is this the best place?) + in->add_waiter(CINODE_WAIT_RENAMEACK, + new C_MDC_RenameAck(this, + srcdir, in, onfinish)); + return; + } + + // LOCAL rename! + assert(srcauth == mds->get_nodeid() && destauth == mds->get_nodeid()); + dout(7) << "file_rename src and dest auth, renaming locally (easy!)" << endl; + + // update our cache + if (destdn->inode && destdn->inode->is_dirty()) + destdn->inode->mark_clean(); + + cache->rename_file(srcdn, destdn); + + // update imports/exports? + if (in->is_dir() && in->dir) + fix_renamed_dir(srcdir, in, destdir, false); // auth didnt change + + // mark dentries dirty + srcdn->mark_dirty(); + destdn->mark_dirty(); + in->mark_dirty(); + + + // local, restrict notify to ppl with open dirs + set notify = srcdir->get_open_by(); + for (set::iterator it = destdir->open_by_begin(); + it != destdir->open_by_end(); + it++) + if (notify.count(*it) == 0) notify.insert(*it); + + if (notify.size()) { + // warn + notify + file_rename_warn(in, notify); + file_rename_notify(in, srcdir, srcname, destdir, destname, notify, mds->get_nodeid()); + + // wait for MRenameNotifyAck's + in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK, + new C_MDC_RenameNotifyAck(this, in, mds->get_nodeid())); // i am initiator + + // wait for finish + in->add_waiter(CINODE_WAIT_RENAMEACK, + new C_MDC_RenameAck(this, srcdir, in, onfinish)); + } else { + // sweet, no notify necessary, we're done! 
+ file_rename_finish(srcdir, in, onfinish); + } +} + +void Renamer::handle_rename_ack(MRenameAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + + dout(7) << "handle_rename_ack on " << *in << endl; + + // all done! + in->finish_waiting(CINODE_WAIT_RENAMEACK); + + delete m; +} + +void Renamer::file_rename_finish(CDir *srcdir, CInode *in, Context *c) +{ + dout(10) << "file_rename_finish on " << *in << endl; + + // did i empty out an imported dir? FIXME this check should go somewhere else??? + if (srcdir->is_import() && !srcdir->inode->is_root() && srcdir->get_size() == 0) + cache->migrator->export_empty_import(srcdir); + + // finish our caller + if (c) { + c->finish(0); + delete c; + } +} + + +/************* src **************/ + + +/** handle_rename_req + * received by auth of src dentry (from init, or destauth if dir). + * src may not have dest dir open. + * src will export inode, unlink|rename, and send MRename to dest. + */ +void Renamer::handle_rename_req(MRenameReq *m) +{ + // i am auth, i will have it. 
+ CInode *srcdiri = cache->get_inode(m->get_srcdirino()); + CDir *srcdir = srcdiri->dir; + CDentry *srcdn = srcdir->lookup(m->get_srcname()); + assert(srcdn); + + // do it + file_rename_foreign_src(srcdn, + m->get_destdirino(), m->get_destname(), m->get_destpath(), m->get_destauth(), + m->get_initiator()); + delete m; +} + + +void Renamer::file_rename_foreign_src(CDentry *srcdn, + inodeno_t destdirino, string& destname, string& destpath, int destauth, + int initiator) +{ + dout(7) << "file_rename_foreign_src " << *srcdn << endl; + + CDir *srcdir = srcdn->dir; + string srcname = srcdn->name; + + // (we're basically exporting this inode) + CInode *in = srcdn->inode; + assert(in); + assert(in->is_auth()); + + if (in->is_dir()) cache->show_imports(); + + // encode and export inode state + bufferlist inode_state; + cache->migrator->encode_export_inode(in, inode_state, destauth); + + // send + MRename *m = new MRename(initiator, + srcdir->ino(), srcdn->name, destdirino, destname, + inode_state); + mds->send_message_mds(m, destauth, MDS_PORT_CACHE); + + // have dest? + CInode *destdiri = cache->get_inode(m->get_destdirino()); + CDir *destdir = 0; + if (destdiri) destdir = destdiri->dir; + CDentry *destdn = 0; + if (destdir) destdn = destdir->lookup(m->get_destname()); + + // discover src + if (!destdn) { + dout(7) << "file_rename_foreign_src doesn't have destdn, discovering " << destpath << endl; + + filepath destfilepath = destpath; + vector trace; + int r = cache->path_traverse(destfilepath, trace, true, + m, new C_MDS_RetryMessage(mds, m), + MDS_TRAVERSE_DISCOVER); + assert(r>0); + return; + } + + assert(destdn); + + // update our cache + cache->rename_file(srcdn, destdn); + + // update imports/exports? + if (in->is_dir() && in->dir) + fix_renamed_dir(srcdir, in, destdir, true); // auth changed + + srcdn->mark_dirty(); + + // proxy! 
+ in->state_set(CINODE_STATE_PROXY); + in->get(CINODE_PIN_PROXY); + + // generate notify list (everybody but src|dst) and send warnings + set notify; + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i != mds->get_nodeid() && // except the source + i != destauth) // and the dest + notify.insert(i); + } + file_rename_warn(in, notify); + + + // wait for MRenameNotifyAck's + in->add_waiter(CINODE_WAIT_RENAMENOTIFYACK, + new C_MDC_RenameNotifyAck(this, in, initiator)); +} + +void Renamer::file_rename_warn(CInode *in, + set& notify) +{ + // note gather list + rename_waiting_for_ack[in->ino()] = notify; + + // send + for (set::iterator it = notify.begin(); + it != notify.end(); + it++) { + dout(10) << "file_rename_warn to " << *it << " for " << *in << endl; + mds->send_message_mds(new MRenameWarning(in->ino()), *it, MDS_PORT_CACHE); + } +} + + +void Renamer::handle_rename_notify_ack(MRenameNotifyAck *m) +{ + CInode *in = cache->get_inode(m->get_ino()); + assert(in); + dout(7) << "handle_rename_notify_ack on " << *in << endl; + + int source = MSG_ADDR_NUM(m->get_source()); + rename_waiting_for_ack[in->ino()].erase(source); + if (rename_waiting_for_ack[in->ino()].empty()) { + // last one! + rename_waiting_for_ack.erase(in->ino()); + in->finish_waiting(CINODE_WAIT_RENAMENOTIFYACK, 0); + } else { + dout(7) << "still waiting for " << rename_waiting_for_ack[in->ino()] << endl; + } +} + + +void Renamer::file_rename_ack(CInode *in, int initiator) +{ + // we got all our MNotifyAck's. + + // was i proxy (if not, it's cuz this was a local rename) + if (in->state_test(CINODE_STATE_PROXY)) { + dout(10) << "file_rename_ack clearing proxy bit on " << *in << endl; + in->state_clear(CINODE_STATE_PROXY); + in->put(CINODE_PIN_PROXY); + } + + // done! 
+ if (initiator == mds->get_nodeid()) { + // it's me, finish + dout(7) << "file_rename_ack i am initiator, finishing" << endl; + in->finish_waiting(CINODE_WAIT_RENAMEACK); + } else { + // send ack + dout(7) << "file_rename_ack sending MRenameAck to initiator " << initiator << endl; + mds->send_message_mds(new MRenameAck(in->ino()), initiator, MDS_PORT_CACHE); + } +} + + + + +/************ dest *************/ + +/** handle_rename_prep + * received by auth of dest dentry to make sure they have src + dir open. + * this is so that when they get the inode and dir, they can update exports etc properly. + * will send MRenameReq to src. + */ +void Renamer::handle_rename_prep(MRenamePrep *m) +{ + // open src + filepath srcpath = m->get_srcpath(); + vector trace; + int r = cache->path_traverse(srcpath, trace, false, + m, new C_MDS_RetryMessage(mds, m), + MDS_TRAVERSE_DISCOVER); + + if (r>0) return; + + // ok! + CInode *srcin = trace[trace.size()-1]->inode; + assert(srcin); + + dout(7) << "handle_rename_prep have srcin " << *srcin << endl; + + if (srcin->is_dir()) { + if (!srcin->dir) { + dout(7) << "handle_rename_prep need to open dir" << endl; + cache->open_remote_dir(srcin, + new C_MDS_RetryMessage(mds,m)); + return; + } + + dout(7) << "handle_rename_prep have dir " << *srcin->dir << endl; + } + + // pin + srcin->get(CINODE_PIN_RENAMESRC); + + // send rename request + MRenameReq *req = new MRenameReq(m->get_initiator(), // i'm the initiator + m->get_srcdirino(), m->get_srcname(), + m->get_destdirino(), m->get_destname(), m->get_destpath(), + mds->get_nodeid()); // i am dest + mds->send_message_mds(req, m->get_srcauth(), MDS_PORT_CACHE); + delete m; + return; +} + + + +/** handle_rename + * received by auth of dest dentry. includes exported inode info. + * dest may not have srcdir open. 
+ */ +void Renamer::handle_rename(MRename *m) +{ + // srcdn (required) + CInode *srcdiri = cache->get_inode(m->get_srcdirino()); + CDir *srcdir = srcdiri->dir; + CDentry *srcdn = srcdir->lookup(m->get_srcname()); + string srcname = srcdn->name; + assert(srcdn && srcdn->inode); + + dout(7) << "handle_rename srcdn " << *srcdn << endl; + + // destdn (required). i am auth, so i will have it. + CInode *destdiri = cache->get_inode(m->get_destdirino()); + CDir *destdir = destdiri->dir; + CDentry *destdn = destdir->lookup(m->get_destname()); + string destname = destdn->name; + assert(destdn); + + dout(7) << "handle_rename destdn " << *destdn << endl; + + // note old dir auth + int old_dir_auth = -1; + if (srcdn->inode->dir) old_dir_auth = srcdn->inode->dir->authority(); + + // rename replica into position + if (destdn->inode && destdn->inode->is_dirty()) + destdn->inode->mark_clean(); + + cache->rename_file(srcdn, destdn); + + // decode + import inode (into new location start) + int off = 0; + // HACK + bufferlist bufstate; + bufstate.claim_append(m->get_inode_state()); + cache->migrator->decode_import_inode(destdn, bufstate, off, MSG_ADDR_NUM(m->get_source())); + + CInode *in = destdn->inode; + assert(in); + + // update imports/exports? + if (in->is_dir()) { + assert(in->dir); // i had better already ahve it open.. see MRenamePrep + fix_renamed_dir(srcdir, in, destdir, true, // auth changed + old_dir_auth); // src is possibly new dir auth. + } + + // mark dirty + destdn->mark_dirty(); + in->mark_dirty(); + + // unpin + in->put(CINODE_PIN_RENAMESRC); + + // ok, send notifies. 
+ set notify; + for (int i=0; iget_mds_map()->get_num_mds(); i++) { + if (i != MSG_ADDR_NUM(m->get_source()) && // except the source + i != mds->get_nodeid()) // and the dest + notify.insert(i); + } + file_rename_notify(in, srcdir, srcname, destdir, destname, notify, MSG_ADDR_NUM(m->get_source())); + + delete m; +} + + +void Renamer::file_rename_notify(CInode *in, + CDir *srcdir, string& srcname, CDir *destdir, string& destname, + set& notify, + int srcauth) +{ + /* NOTE: notify list might include myself */ + + // tell + string destdirpath; + destdir->inode->make_path(destdirpath); + + for (set::iterator it = notify.begin(); + it != notify.end(); + it++) { + dout(10) << "file_rename_notify to " << *it << " for " << *in << endl; + mds->send_message_mds(new MRenameNotify(in->ino(), + srcdir->ino(), + srcname, + destdir->ino(), + destdirpath, + destname, + srcauth), + *it, MDS_PORT_CACHE); + } +} + + + +/************** bystanders ****************/ + +void Renamer::handle_rename_warning(MRenameWarning *m) +{ + // add to warning list + stray_rename_warnings.insert( m->get_ino() ); + + // did i already see the notify? + if (stray_rename_notifies.count(m->get_ino())) { + // i did, we're good. + dout(7) << "handle_rename_warning on " << m->get_ino() << ". already got notify." << endl; + + handle_rename_notify(stray_rename_notifies[m->get_ino()]); + stray_rename_notifies.erase(m->get_ino()); + } else { + dout(7) << "handle_rename_warning on " << m->get_ino() << ". waiting for notify." << endl; + } + + // done + delete m; +} + + +void Renamer::handle_rename_notify(MRenameNotify *m) +{ + // FIXME: when we do hard links, i think we need to + // have srcdn and destdn both, or neither, always! + + // did i see the warning yet? + if (!stray_rename_warnings.count(m->get_ino())) { + // wait for it. + dout(7) << "handle_rename_notify on " << m->get_ino() << ", waiting for warning." 
<< endl; + stray_rename_notifies[m->get_ino()] = m; + return; + } + + dout(7) << "handle_rename_notify dir " << m->get_srcdirino() << " dn " << m->get_srcname() << " to dir " << m->get_destdirino() << " dname " << m->get_destname() << endl; + + // src + CInode *srcdiri = cache->get_inode(m->get_srcdirino()); + CDir *srcdir = 0; + if (srcdiri) srcdir = srcdiri->dir; + CDentry *srcdn = 0; + if (srcdir) srcdn = srcdir->lookup(m->get_srcname()); + + // dest + CInode *destdiri = cache->get_inode(m->get_destdirino()); + CDir *destdir = 0; + if (destdiri) destdir = destdiri->dir; + CDentry *destdn = 0; + if (destdir) destdn = destdir->lookup(m->get_destname()); + + // have both? + list finished; + if (srcdn && destdir) { + CInode *in = srcdn->inode; + + int old_dir_auth = -1; + if (in && in->dir) old_dir_auth = in->dir->authority(); + + if (!destdn) { + destdn = destdir->add_dentry(m->get_destname()); // create null dentry + destdn->lockstate = DN_LOCK_XLOCK; // that's xlocked! + } + + dout(7) << "handle_rename_notify renaming " << *srcdn << " to " << *destdn << endl; + + if (in) { + cache->rename_file(srcdn, destdn); + + // update imports/exports? 
+ if (in && in->is_dir() && in->dir) { + fix_renamed_dir(srcdir, in, destdir, false, old_dir_auth); // auth didnt change + } + } else { + dout(7) << " i don't have the inode (just null dentries)" << endl; + } + + } + + else if (srcdn) { + dout(7) << "handle_rename_notify no dest, but have src" << endl; + dout(7) << "srcdn is " << *srcdn << endl; + + if (destdiri) { + dout(7) << "have destdiri, opening dir " << *destdiri << endl; + cache->open_remote_dir(destdiri, + new C_MDS_RetryMessage(mds,m)); + } else { + filepath destdirpath = m->get_destdirpath(); + dout(7) << "don't have destdiri even, doing traverse+discover on " << destdirpath << endl; + + vector trace; + int r = cache->path_traverse(destdirpath, trace, true, + m, new C_MDS_RetryMessage(mds, m), + MDS_TRAVERSE_DISCOVER); + assert(r>0); + } + return; + } + + else if (destdn) { + dout(7) << "handle_rename_notify unlinking dst only " << *destdn << endl; + if (destdn->inode) { + destdir->unlink_inode(destdn); + } + } + + else { + dout(7) << "handle_rename_notify didn't have srcdn or destdn" << endl; + assert(srcdn == 0 && destdn == 0); + } + + mds->queue_finished(finished); + + + // ack + dout(10) << "sending RenameNotifyAck back to srcauth " << m->get_srcauth() << endl; + MRenameNotifyAck *ack = new MRenameNotifyAck(m->get_ino()); + mds->send_message_mds(ack, m->get_srcauth(), MDS_PORT_CACHE); + + + stray_rename_warnings.erase( m->get_ino() ); + delete m; +} + + + + diff --git a/branches/sage/cephmds2/mds/Renamer.h b/branches/sage/cephmds2/mds/Renamer.h new file mode 100644 index 0000000000000..1005971df986f --- /dev/null +++ b/branches/sage/cephmds2/mds/Renamer.h @@ -0,0 +1,98 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the 
Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MDS_RENAMER_H +#define __MDS_RENAMER_H + +#include "include/types.h" + +#include +#include +using std::map; +using std::set; + +class MDS; +class MDCache; +class CDentry; +class CInode; +class CDir; + +class Message; +class MRenameWarning; +class MRenameNotify; +class MRenameNotifyAck; +class MRename; +class MRenamePrep; +class MRenameReq; +class MRenameAck; + +class Renamer { + MDS *mds; + MDCache *cache; + + // rename fun + set stray_rename_warnings; // notifies i haven't seen + map stray_rename_notifies; + + map > rename_waiting_for_ack; + + + + void fix_renamed_dir(CDir *srcdir, + CInode *in, + CDir *destdir, + bool authchanged, // _inode_ auth changed + int dirauth=-1); // dirauth (for certain cases) + + +public: + Renamer(MDS *m, MDCache *c) : mds(m), cache(c) {} + + void dispatch(Message *m); + + // RENAME + // initiator + public: + void file_rename(CDentry *srcdn, CDentry *destdn, Context *c); + protected: + void handle_rename_ack(MRenameAck *m); // dest -> init (almost always) + void file_rename_finish(CDir *srcdir, CInode *in, Context *c); + friend class C_MDC_RenameAck; + + // src + void handle_rename_req(MRenameReq *m); // dest -> src + void file_rename_foreign_src(CDentry *srcdn, + inodeno_t destdirino, string& destname, string& destpath, int destauth, + int initiator); + void file_rename_warn(CInode *in, set& notify); + void handle_rename_notify_ack(MRenameNotifyAck *m); // bystanders -> src + void file_rename_ack(CInode *in, int initiator); + friend class C_MDC_RenameNotifyAck; + + // dest + void handle_rename_prep(MRenamePrep *m); // init -> dest + void handle_rename(MRename *m); // src -> dest + void file_rename_notify(CInode *in, + CDir *srcdir, string& srcname, CDir *destdir, string& destname, + set& notify, int srcauth); + + // bystander + void handle_rename_warning(MRenameWarning *m); // src -> bystanders + void handle_rename_notify(MRenameNotify *m); // dest -> bystanders + 
+ +}; + +#endif + + diff --git a/branches/sage/cephmds2/mds/Server.cc b/branches/sage/cephmds2/mds/Server.cc new file mode 100644 index 0000000000000..28ebb826e1a3a --- /dev/null +++ b/branches/sage/cephmds2/mds/Server.cc @@ -0,0 +1,2151 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "MDS.h" +#include "Server.h" +#include "Locker.h" +#include "MDCache.h" +#include "MDLog.h" +#include "Migrator.h" +#include "MDBalancer.h" +#include "Renamer.h" + +#include "msg/Messenger.h" + +#include "messages/MClientMount.h" +#include "messages/MClientMountAck.h" +#include "messages/MClientRequest.h" +#include "messages/MClientReply.h" +#include "messages/MHashReaddir.h" +#include "messages/MHashReaddirReply.h" + +#include "messages/MLock.h" + +#include "messages/MInodeLink.h" + +#include "events/EInodeUpdate.h" +#include "events/EDirUpdate.h" +#include "events/EMknod.h" +#include "events/EMkdir.h" + +#include "include/filepath.h" +#include "common/Timer.h" +#include "common/Logger.h" +#include "common/LogType.h" + +#include +#include + +#include +#include +using namespace std; + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << ".server " +#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << whoami << ".server " + + +void Server::dispatch(Message *m) +{ + // active? 
+ if (!mds->is_active()) { + dout(3) << "not active yet, waiting" << endl; + mds->queue_waitfor_active(new C_MDS_RetryMessage(mds, m)); + return; + } + + switch (m->get_type()) { + case MSG_CLIENT_MOUNT: + handle_client_mount((MClientMount*)m); + return; + case MSG_CLIENT_UNMOUNT: + handle_client_unmount(m); + return; + } + + + switch (m->get_type()) { + case MSG_CLIENT_REQUEST: + handle_client_request((MClientRequest*)m); + return; + + case MSG_MDS_HASHREADDIR: + handle_hash_readdir((MHashReaddir*)m); + return; + case MSG_MDS_HASHREADDIRREPLY: + handle_hash_readdir_reply((MHashReaddirReply*)m); + return; + + } + + dout(1) << " main unknown message " << m->get_type() << endl; + assert(0); +} + + + + + +void Server::handle_client_mount(MClientMount *m) +{ + int n = MSG_ADDR_NUM(m->get_source()); + dout(3) << "mount by client" << n << endl; + mds->clientmap.add_mount(n, m->get_source_inst()); + + assert(whoami == 0); // mds0 mounts/unmounts + + // ack + messenger->send_message(new MClientMountAck(m, mds->mdsmap, mds->osdmap), + m->get_source(), m->get_source_inst()); + delete m; +} + +void Server::handle_client_unmount(Message *m) +{ + int n = MSG_ADDR_NUM(m->get_source()); + dout(3) << "unmount by client" << n << endl; + + assert(whoami == 0); // mds0 mounts/unmounts + + mds->clientmap.rem_mount(n); + + if (mds->clientmap.get_mount_set().empty()) { + dout(3) << "all clients done, initiating shutdown" << endl; + mds->shutdown_start(); + } + + // ack by sending back to client + entity_inst_t srcinst = m->get_source_inst(); // make a copy! 
+ messenger->send_message(m, m->get_source(), srcinst); +} + + + +/******* + * some generic stuff for finishing off requests + */ + +/** C_MDS_CommitRequest + */ + +class C_MDS_CommitRequest : public Context { + Server *server; + MClientRequest *req; + MClientReply *reply; + CInode *tracei; // inode to include a trace for + LogEvent *event; + +public: + C_MDS_CommitRequest(Server *server, + MClientRequest *req, MClientReply *reply, CInode *tracei, + LogEvent *event=0) { + this->server = server; + this->req = req; + this->tracei = tracei; + this->reply = reply; + this->event = event; + } + void finish(int r) { + if (r != 0) { + // failure. set failure code and reply. + reply->set_result(r); + } + if (event) { + server->commit_request(req, reply, tracei, event); + } else { + // reply. + server->reply_request(req, reply, tracei); + } + } +}; + + +/* + * send generic response (just and error code) + */ +void Server::reply_request(MClientRequest *req, int r, CInode *tracei) +{ + reply_request(req, new MClientReply(req, r), tracei); +} + + +/* + * send given reply + * include a trace to tracei + */ +void Server::reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei) { + dout(10) << "reply_request r=" << reply->get_result() << " " << *req << endl; + + // include trace + if (tracei) { + reply->set_trace_dist( tracei, whoami ); + } + + // send reply + messenger->send_message(reply, + MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst()); + + // discard request + mdcache->request_finish(req); + + // stupid stats crap (FIXME) + stat_ops++; +} + + +/* + * commit event(s) to the metadata journal, then reply. 
+ * or, be sloppy and do it concurrently (see g_conf.mds_log_before_reply) + */ +void Server::commit_request(MClientRequest *req, + MClientReply *reply, + CInode *tracei, + LogEvent *event, + LogEvent *event2) +{ + // log + if (event) mdlog->submit_entry(event); + if (event2) mdlog->submit_entry(event2); + + if (g_conf.mds_log_before_reply && g_conf.mds_log && event) { + // SAFE mode! + + // pin inode so it doesn't go away! + if (tracei) mdcache->request_pin_inode(req, tracei); + + // wait for log sync + mdlog->wait_for_sync(new C_MDS_CommitRequest(this, req, reply, tracei)); + return; + } + else { + // just reply + reply_request(req, reply, tracei); + } +} + + + +/*** + * process a client request + */ + +void Server::handle_client_request(MClientRequest *req) +{ + dout(4) << "req " << *req << endl; + + // note original client addr + if (req->get_source().is_client()) { + req->set_client_inst( req->get_source_inst() ); + req->clear_payload(); + } + + if (!mds->is_active()) { + dout(5) << " not active, discarding client request." << endl; + delete req; + return; + } + + if (!mdcache->get_root()) { + dout(5) << "need to open root" << endl; + mdcache->open_root(new C_MDS_RetryMessage(mds, req)); + return; + } + + // okay, i want + CInode *ref = 0; + vector trace; // might be blank, for fh guys + + bool follow_trailing_symlink = false; + + // operations on fh's or other non-files + switch (req->get_op()) { + /* + case MDS_OP_FSTAT: + reply = handle_client_fstat(req, cur); + break; ****** fiX ME *** + */ + + case MDS_OP_TRUNCATE: + if (!req->get_ino()) break; // can be called w/ either fh OR path + + case MDS_OP_RELEASE: + case MDS_OP_FSYNC: + ref = mdcache->get_inode(req->get_ino()); // fixme someday no ino needed? 
+ + if (!ref) { + int next = whoami + 1; + if (next >= mds->mdsmap->get_num_mds()) next = 0; + dout(10) << "got request on ino we don't have, passing buck to " << next << endl; + mds->send_message_mds(req, next, MDS_PORT_SERVER); + return; + } + } + + if (!ref) { + // we need to traverse a path + filepath refpath = req->get_filepath(); + + // ops on non-existing files --> directory paths + switch (req->get_op()) { + case MDS_OP_OPEN: + if (!(req->get_iarg() & O_CREAT)) break; + + case MDS_OP_MKNOD: + case MDS_OP_MKDIR: + case MDS_OP_SYMLINK: + case MDS_OP_LINK: + case MDS_OP_UNLINK: // also wrt parent dir, NOT the unlinked inode!! + case MDS_OP_RMDIR: + case MDS_OP_RENAME: + // remove last bit of path + refpath = refpath.prefixpath(refpath.depth()-1); + break; + } + dout(10) << "refpath = " << refpath << endl; + + Context *ondelay = new C_MDS_RetryMessage(mds, req); + + if (req->get_op() == MDS_OP_LSTAT) { + follow_trailing_symlink = false; + } + + // do trace + int r = mdcache->path_traverse(refpath, trace, follow_trailing_symlink, + req, ondelay, + MDS_TRAVERSE_FORWARD, + 0, + true); // is MClientRequest + + if (r > 0) return; // delayed + if (r == -ENOENT || + r == -ENOTDIR || + r == -EISDIR) { + // error! + dout(10) << " path traverse error " << r << ", replying" << endl; + + // send error + messenger->send_message(new MClientReply(req, r), + MSG_ADDR_CLIENT(req->get_client()), req->get_client_inst()); + + // + // is this a special debug command? 
+ if (refpath.depth() - 1 == trace.size() && + refpath.last_bit().find(".ceph.") == 0) { + CDir *dir = 0; + if (trace.empty()) + dir = mdcache->get_root()->dir; + else + dir = trace[trace.size()-1]->get_inode()->dir; + + dout(1) << "** POSSIBLE CEPH DEBUG COMMAND '" << refpath.last_bit() << "' in " << *dir << endl; + + if (refpath.last_bit() == ".ceph.hash" && + refpath.depth() > 1) { + dout(1) << "got explicit hash command " << refpath << endl; + CDir *dir = trace[trace.size()-1]->get_inode()->dir; + if (!dir->is_hashed() && + !dir->is_hashing() && + dir->is_auth()) + mdcache->migrator->hash_dir(dir); + } + else if (refpath.last_bit() == ".ceph.commit") { + dout(1) << "got explicit commit command on " << *dir << endl; + mds->mdstore->commit_dir(dir, 0); + } + } + // + + + delete req; + return; + } + + if (trace.size()) + ref = trace[trace.size()-1]->inode; + else + ref = mdcache->get_root(); + } + + dout(10) << "ref is " << *ref << endl; + + // rename doesn't pin src path (initially) + if (req->get_op() == MDS_OP_RENAME) trace.clear(); + + // register + if (!mdcache->request_start(req, ref, trace)) + return; + + // process + dispatch_request(req, ref); +} + + + +void Server::dispatch_request(Message *m, CInode *ref) +{ + MClientRequest *req = 0; + + // MLock or MClientRequest? + /* this is a little weird. + client requests and mlocks both initial dentry xlocks, path pins, etc., + and thus both make use of the context C_MDS_RetryRequest. + */ + switch (m->get_type()) { + case MSG_CLIENT_REQUEST: + req = (MClientRequest*)m; + break; // continue below! + + case MSG_MDS_LOCK: + mds->locker->handle_lock_dn((MLock*)m); + return; // done + + default: + assert(0); // shouldn't get here + } + + // MClientRequest. 
+ + switch(req->get_op()) { + + // files + case MDS_OP_OPEN: + if (req->get_iarg() & O_CREAT) + handle_client_openc(req, ref); + else + handle_client_open(req, ref); + break; + case MDS_OP_TRUNCATE: + handle_client_truncate(req, ref); + break; + /* + case MDS_OP_FSYNC: + handle_client_fsync(req, ref); + break; + */ + /* + case MDS_OP_RELEASE: + handle_client_release(req, ref); + break; + */ + + // inodes + case MDS_OP_STAT: + case MDS_OP_LSTAT: + handle_client_stat(req, ref); + break; + case MDS_OP_UTIME: + handle_client_utime(req, ref); + break; + case MDS_OP_CHMOD: + handle_client_chmod(req, ref); + break; + case MDS_OP_CHOWN: + handle_client_chown(req, ref); + break; + + // namespace + case MDS_OP_READDIR: + handle_client_readdir(req, ref); + break; + case MDS_OP_MKNOD: + handle_client_mknod(req, ref); + break; + case MDS_OP_LINK: + handle_client_link(req, ref); + break; + case MDS_OP_UNLINK: + handle_client_unlink(req, ref); + break; + case MDS_OP_RENAME: + handle_client_rename(req, ref); + break; + case MDS_OP_RMDIR: + handle_client_unlink(req, ref); + break; + case MDS_OP_MKDIR: + handle_client_mkdir(req, ref); + break; + case MDS_OP_SYMLINK: + handle_client_symlink(req, ref); + break; + + + + default: + dout(1) << " unknown client op " << req->get_op() << endl; + assert(0); + } + + return; +} + + + + +// STAT + +void Server::handle_client_stat(MClientRequest *req, + CInode *ref) +{ + // do I need file info? + int mask = req->get_iarg(); + if (mask & (INODE_MASK_SIZE|INODE_MASK_MTIME)) { + // yes. do a full stat. + if (!mds->locker->inode_file_read_start(ref, req)) + return; // syncing + mds->locker->inode_file_read_finish(ref); + } else { + // nope! easy peasy. 
+ } + + mds->balancer->hit_inode(ref, META_POP_IRD); + + // reply + dout(10) << "reply to " << *req << " stat " << ref->inode.mtime << endl; + MClientReply *reply = new MClientReply(req); + + reply_request(req, reply, ref); +} + + + +// INODE UPDATES + +// utime + +void Server::handle_client_utime(MClientRequest *req, + CInode *cur) +{ + // write + if (!mds->locker->inode_file_write_start(cur, req)) + return; // fw or (wait for) sync + + // do update + cur->inode.mtime = req->get_targ(); + cur->inode.atime = req->get_targ2(); + if (cur->is_auth()) + cur->mark_dirty(); + + mds->locker->inode_file_write_finish(cur); + + mds->balancer->hit_inode(cur, META_POP_IWR); + + // init reply + MClientReply *reply = new MClientReply(req, 0); + reply->set_result(0); + + // commit + commit_request(req, reply, cur, + new EInodeUpdate(cur)); +} + + + +// HARD + +// chmod + +void Server::handle_client_chmod(MClientRequest *req, + CInode *cur) +{ + // write + if (!mds->locker->inode_hard_write_start(cur, req)) + return; // fw or (wait for) lock + + + // check permissions + + // do update + int mode = req->get_iarg(); + cur->inode.mode &= ~04777; + cur->inode.mode |= (mode & 04777); + cur->mark_dirty(); + + mds->locker->inode_hard_write_finish(cur); + + mds->balancer->hit_inode(cur, META_POP_IWR); + + // start reply + MClientReply *reply = new MClientReply(req, 0); + + // commit + commit_request(req, reply, cur, + new EInodeUpdate(cur)); +} + +// chown + +void Server::handle_client_chown(MClientRequest *req, + CInode *cur) +{ + // write + if (!mds->locker->inode_hard_write_start(cur, req)) + return; // fw or (wait for) lock + + // check permissions + + // do update + int uid = req->get_iarg(); + int gid = req->get_iarg2(); + cur->inode.uid = uid; + cur->inode.gid = gid; + cur->mark_dirty(); + + mds->locker->inode_hard_write_finish(cur); + + mds->balancer->hit_inode(cur, META_POP_IWR); + + // start reply + MClientReply *reply = new MClientReply(req, 0); + + // commit + 
commit_request(req, reply, cur, + new EInodeUpdate(cur)); +} + + + +bool Server::try_open_dir(CInode *in, MClientRequest *req) +{ + if (!in->dir && in->is_frozen_dir()) { + // doh! + dout(10) << " dir inode is frozen, can't open dir, waiting " << *in << endl; + assert(in->get_parent_dir()); + in->get_parent_dir()->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryRequest(mds, req, in)); + return false; + } + + in->get_or_open_dir(mds); + return true; +} + + +// DIRECTORY and NAMESPACE OPS + +// READDIR + +int Server::encode_dir_contents(CDir *dir, + list& inls, + list& dnls) +{ + int numfiles = 0; + + for (CDir_map_t::iterator it = dir->begin(); + it != dir->end(); + it++) { + CDentry *dn = it->second; + + // hashed? + if (dir->is_hashed() && + whoami != mds->hash_dentry( dir->ino(), it->first )) + continue; + + // is dentry readable? + if (dn->is_xlocked()) { + // ***** FIXME ***** + // ? + dout(10) << "warning, returning xlocked dentry, we _may_ be fudging on POSIX consistency" << endl; + } + + CInode *in = dn->inode; + if (!in) continue; // null dentry? + + dout(12) << "including inode " << *in << endl; + + // add this item + // note: InodeStat makes note of whether inode data is readable. + dnls.push_back( it->first ); + inls.push_back( new InodeStat(in, whoami) ); + numfiles++; + } + return numfiles; +} + + +/* + * note: this is pretty sloppy, but should work just fine i think... + */ +void Server::handle_hash_readdir(MHashReaddir *m) +{ + CInode *cur = mdcache->get_inode(m->get_ino()); + assert(cur); + + if (!cur->dir || + !cur->dir->is_hashed()) { + assert(0); + dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; + delete m; + return; + } + CDir *dir = cur->dir; + assert(dir); + assert(dir->is_hashed()); + + // complete? 
+ if (!dir->is_complete()) { + dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << endl; + mds->mdstore->fetch_dir(dir, new C_MDS_RetryMessage(mds, m)); + return; + } + + // get content + list inls; + list dnls; + int num = encode_dir_contents(dir, inls, dnls); + + // sent it back! + messenger->send_message(new MHashReaddirReply(dir->ino(), inls, dnls, num), + m->get_source(), m->get_source_inst(), MDS_PORT_CACHE); +} + + +void Server::handle_hash_readdir_reply(MHashReaddirReply *m) +{ + CInode *cur = mdcache->get_inode(m->get_ino()); + assert(cur); + + if (!cur->dir || + !cur->dir->is_hashed()) { + assert(0); + dout(7) << "handle_hash_readdir don't have dir open, or not hashed. giving up!" << endl; + delete m; + return; + } + CDir *dir = cur->dir; + assert(dir); + assert(dir->is_hashed()); + + // move items to hashed_readdir gather + int from = MSG_ADDR_NUM(m->get_source()); + assert(dir->hashed_readdir.count(from) == 0); + dir->hashed_readdir[from].first.splice(dir->hashed_readdir[from].first.begin(), + m->get_in()); + dir->hashed_readdir[from].second.splice(dir->hashed_readdir[from].second.begin(), + m->get_dn()); + delete m; + + // gather finished? + if (dir->hashed_readdir.size() < (unsigned)mds->mdsmap->get_num_mds()) { + dout(7) << "still waiting for more hashed readdir bits" << endl; + return; + } + + dout(7) << "got last bit! finishing waiters" << endl; + + // do these finishers. they'll copy the results. + list finished; + dir->take_waiting(CDIR_WAIT_THISHASHEDREADDIR, finished); + finish_contexts(finished); + + // now discard these results + for (map, list > >::iterator it = dir->hashed_readdir.begin(); + it != dir->hashed_readdir.end(); + it++) { + for (list::iterator ci = it->second.first.begin(); + ci != it->second.first.end(); + ci++) + delete *ci; + } + dir->hashed_readdir.clear(); + + // unpin dir (we're done!) 
+ dir->auth_unpin(); + + // trigger any waiters for next hashed readdir cycle + dir->take_waiting(CDIR_WAIT_NEXTHASHEDREADDIR, mds->finished_queue); +} + + +class C_MDS_HashReaddir : public Context { + Server *server; + MClientRequest *req; + CDir *dir; +public: + C_MDS_HashReaddir(Server *server, MClientRequest *req, CDir *dir) { + this->server = server; + this->req = req; + this->dir = dir; + } + void finish(int r) { + server->finish_hash_readdir(req, dir); + } +}; + +void Server::finish_hash_readdir(MClientRequest *req, CDir *dir) +{ + dout(7) << "finish_hash_readdir on " << *dir << endl; + + assert(dir->is_hashed()); + assert(dir->hashed_readdir.size() == (unsigned)mds->mdsmap->get_num_mds()); + + // reply! + MClientReply *reply = new MClientReply(req); + reply->set_result(0); + + for (int i=0; imdsmap->get_num_mds(); i++) { + reply->copy_dir_items(dir->hashed_readdir[i].first, + dir->hashed_readdir[i].second); + } + + // ok! + reply_request(req, reply, dir->inode); +} + + +void Server::handle_client_readdir(MClientRequest *req, + CInode *cur) +{ + // it's a directory, right? + if (!cur->is_dir()) { + // not a dir + dout(10) << "reply to " << *req << " readdir -ENOTDIR" << endl; + reply_request(req, -ENOTDIR); + return; + } + + // auth? + if (!cur->dir_is_auth()) { + int dirauth = cur->authority(); + if (cur->dir) + dirauth = cur->dir->authority(); + assert(dirauth >= 0); + assert(dirauth != whoami); + + // forward to authority + dout(10) << " forwarding readdir to authority " << dirauth << endl; + mdcache->request_forward(req, dirauth); + return; + } + + if (!try_open_dir(cur, req)) + return; + assert(cur->dir->is_auth()); + + // unhashing? wait! 
+ if (cur->dir->is_hashed() && + cur->dir->is_unhashing()) { + dout(10) << "unhashing, waiting" << endl; + cur->dir->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryRequest(mds, req, cur)); + return; + } + + // check perm + if (!mds->locker->inode_hard_read_start(cur,req)) + return; + mds->locker->inode_hard_read_finish(cur); + + CDir *dir = cur->dir; + assert(dir); + + if (!dir->is_complete()) { + // fetch + dout(10) << " incomplete dir contents for readdir on " << *cur->dir << ", fetching" << endl; + mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, cur)); + return; + } + + if (dir->is_hashed()) { + // HASHED + dout(7) << "hashed dir" << endl; + if (!dir->can_auth_pin()) { + dout(7) << "can't auth_pin dir " << *dir << " waiting" << endl; + dir->add_waiter(CDIR_WAIT_AUTHPINNABLE, new C_MDS_RetryRequest(mds, req, cur)); + return; + } + + if (!dir->hashed_readdir.empty()) { + dout(7) << "another readdir gather in progres, waiting" << endl; + dir->add_waiter(CDIR_WAIT_NEXTHASHEDREADDIR, new C_MDS_RetryRequest(mds, req, cur)); + return; + } + + // start new readdir gather + dout(7) << "staring new hashed readdir gather" << endl; + + // pin auth for process! + dir->auth_pin(); + + // get local bits + encode_dir_contents(cur->dir, + dir->hashed_readdir[whoami].first, + dir->hashed_readdir[whoami].second); + + // request other bits + for (int i=0; imdsmap->get_num_mds(); i++) { + if (i == whoami) continue; + mds->send_message_mds(new MHashReaddir(dir->ino()), i, MDS_PORT_SERVER); + } + + // wait + dir->add_waiter(CDIR_WAIT_THISHASHEDREADDIR, + new C_MDS_HashReaddir(this, req, dir)); + } else { + // NON-HASHED + // build dir contents + list inls; + list dnls; + int numfiles = encode_dir_contents(cur->dir, inls, dnls); + + // . 
too + dnls.push_back("."); + inls.push_back(new InodeStat(cur, whoami)); + ++numfiles; + + // yay, reply + MClientReply *reply = new MClientReply(req); + reply->take_dir_items(inls, dnls, numfiles); + + dout(10) << "reply to " << *req << " readdir " << numfiles << " files" << endl; + reply->set_result(0); + + //balancer->hit_dir(cur->dir); + + // reply + reply_request(req, reply, cur); + } +} + + +// MKNOD + +void Server::handle_client_mknod(MClientRequest *req, CInode *ref) +{ + // make dentry and inode, link. + CInode *newi = mknod(req, ref); + if (!newi) return; + + // it's a file! + newi->inode.mode = req->get_iarg(); + newi->inode.mode &= ~INODE_TYPE_MASK; + newi->inode.mode |= INODE_MODE_FILE; + + mds->balancer->hit_inode(newi, META_POP_IWR); + + // commit + commit_request(req, new MClientReply(req, 0), ref, + new EMknod(newi)); +} + +// mknod(): used by handle_client_mkdir, handle_client_mknod, which are mostly identical. + +CInode *Server::mknod(MClientRequest *req, CInode *diri, bool okexist) +{ + dout(10) << "mknod " << req->get_filepath() << " in " << *diri << endl; + + // get containing directory (without last bit) + filepath dirpath = req->get_filepath().prefixpath(req->get_filepath().depth() - 1); + string name = req->get_filepath().last_bit(); + + // did we get to parent? + dout(10) << "dirpath is " << dirpath << " depth " << dirpath.depth() << endl; + + // make sure parent is a dir? + if (!diri->is_dir()) { + dout(7) << "not a dir" << endl; + reply_request(req, -ENOTDIR); + return 0; + } + + // am i not open, not auth? 
+ if (!diri->dir && !diri->is_auth()) { + int dirauth = diri->authority(); + dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl; + mdcache->request_forward(req, dirauth); + return 0; + } + + if (!try_open_dir(diri, req)) return 0; + CDir *dir = diri->dir; + + // make sure it's my dentry + int dnauth = dir->dentry_authority(name); + if (dnauth != whoami) { + // fw + + dout(7) << "mknod on " << req->get_path() << ", dentry " << *dir << " dn " << name << " not mine, fw to " << dnauth << endl; + mdcache->request_forward(req, dnauth); + return 0; + } + // ok, done passing buck. + + + // frozen? + if (dir->is_frozen()) { + dout(7) << "dir is frozen " << *dir << endl; + dir->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryRequest(mds, req, diri)); + return 0; + } + + // make sure name doesn't already exist + CDentry *dn = dir->lookup(name); + if (dn) { + if (!dn->can_read(req)) { + dout(10) << "waiting on (existing!) dentry " << *dn << endl; + dir->add_waiter(CDIR_WAIT_DNREAD, name, new C_MDS_RetryRequest(mds, req, diri)); + return 0; + } + + if (!dn->is_null()) { + // name already exists + if (okexist) { + dout(10) << "dentry " << name << " exists in " << *dir << endl; + return dn->inode; + } else { + dout(10) << "dentry " << name << " exists in " << *dir << endl; + reply_request(req, -EEXIST); + return 0; + } + } + } + + // make sure dir is complete + if (!dir->is_complete()) { + dout(7) << " incomplete dir contents for " << *dir << ", fetching" << endl; + mds->mdstore->fetch_dir(dir, new C_MDS_RetryRequest(mds, req, diri)); + return 0; + } + + // create! 
+ CInode *newi = mdcache->create_inode(); + newi->inode.uid = req->get_caller_uid(); + newi->inode.gid = req->get_caller_gid(); + newi->inode.ctime = newi->inode.mtime = newi->inode.atime = g_clock.gettime(); // now + + // link + if (!dn) + dn = dir->add_dentry(name, newi); + else + dir->link_inode(dn, newi); + + // bump modify pop + mds->balancer->hit_dir(dir, META_POP_DWR); + + // mark dirty + dn->mark_dirty(); + newi->mark_dirty(); + + // journal it + //mdlog->submit_entry(new EMknod(newi)); + + // ok! + return newi; +} + + +// LINK + +class C_MDS_LinkTraverse : public Context { + Server *server; + MClientRequest *req; + CInode *ref; +public: + vector trace; + C_MDS_LinkTraverse(Server *server, MClientRequest *req, CInode *ref) { + this->server = server; + this->req = req; + this->ref = ref; + } + void finish(int r) { + server->handle_client_link_2(r, req, ref, trace); + } +}; + +void Server::handle_client_link(MClientRequest *req, CInode *ref) +{ + // figure out name + string dname = req->get_filepath().last_bit(); + dout(7) << "dname is " << dname << endl; + + // make sure parent is a dir? + if (!ref->is_dir()) { + dout(7) << "not a dir " << *ref << endl; + reply_request(req, -EINVAL); + return; + } + + // am i not open, not auth? + if (!ref->dir && !ref->is_auth()) { + int dirauth = ref->authority(); + dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl; + mdcache->request_forward(req, dirauth); + return; + } + + if (!try_open_dir(ref, req)) return; + CDir *dir = ref->dir; + dout(7) << "handle_client_link dir is " << *dir << endl; + + + + // make sure it's my dentry + int dauth = dir->dentry_authority(dname); + if (dauth != whoami) { + // fw + dout(7) << "link on " << req->get_path() << ", dn " << dname << " in " << *dir << " not mine, fw to " << dauth << endl; + mdcache->request_forward(req, dauth); + return; + } + // ok, done passing buck. + + + // exists? 
+ CDentry *dn = dir->lookup(dname); + if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) { + dout(7) << "handle_client_link dn exists " << *dn << endl; + reply_request(req, -EEXIST); + return; + } + + // keep src dir in memory + mdcache->request_pin_dir(req, dir); + + // discover link target + filepath target = req->get_sarg(); + + dout(7) << "handle_client_link discovering target " << target << endl; + + C_MDS_LinkTraverse *onfinish = new C_MDS_LinkTraverse(this, req, ref); + Context *ondelay = new C_MDS_RetryRequest(mds, req, ref); + + mdcache->path_traverse(target, onfinish->trace, false, + req, ondelay, + MDS_TRAVERSE_DISCOVER, //XLOCK, + onfinish); +} + + +class C_MDS_RemoteLink : public Context { + Server *server; + MClientRequest *req; + CInode *ref; + CDentry *dn; + CInode *targeti; +public: + C_MDS_RemoteLink(Server *server, MClientRequest *req, CInode *ref, CDentry *dn, CInode *targeti) { + this->server = server; + this->req = req; + this->ref = ref; + this->dn = dn; + this->targeti = targeti; + } + void finish(int r) { + if (r > 0) { // success + // yay + server->handle_client_link_finish(req, ref, dn, targeti); + } + else if (r == 0) { + // huh? retry! + assert(0); + server->dispatch_request(req, ref); + } else { + // link failed + server->reply_request(req, r); + } + } +}; + +void Server::handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector& trace) +{ + // target dne? + if (r < 0) { + dout(7) << "target " << req->get_sarg() << " dne" << endl; + reply_request(req, r); + return; + } + assert(r == 0); + + CInode *targeti = mdcache->get_root(); + if (trace.size()) targeti = trace[trace.size()-1]->inode; + assert(targeti); + + // dir? 
+ dout(7) << "target is " << *targeti << endl; + if (targeti->is_dir()) { + dout(7) << "target is a dir, failing" << endl; + reply_request(req, -EINVAL); + return; + } + + // keep target inode in memory + mdcache->request_pin_inode(req, targeti); + + dout(7) << "dir is " << *ref << endl; + + // xlock the dentry + CDir *dir = ref->dir; + assert(dir); + + string dname = req->get_filepath().last_bit(); + int dauth = dir->dentry_authority(dname); + if (whoami != dauth) { + // ugh, exported out from under us + dout(7) << "ugh, forwarded out from under us, dentry auth is " << dauth << endl; + mdcache->request_forward(req, dauth); + return; + } + + CDentry *dn = dir->lookup(dname); + if (dn && (!dn->is_null() || dn->is_xlockedbyother(req))) { + dout(7) << "handle_client_link dn exists " << *dn << endl; + reply_request(req, -EEXIST); + return; + } + + if (!dn) dn = dir->add_dentry(dname); + + if (!dn->is_xlockedbyme(req)) { + if (!mds->locker->dentry_xlock_start(dn, req, ref)) { + if (dn->is_clean() && dn->is_null() && dn->is_sync()) dir->remove_dentry(dn); + return; + } + } + + + // ok xlocked! + if (targeti->is_auth()) { + // mine + if (targeti->is_anchored()) { + dout(7) << "target anchored already (nlink=" << targeti->inode.nlink << "), sweet" << endl; + } else { + assert(targeti->inode.nlink == 1); + dout(7) << "target needs anchor, nlink=" << targeti->inode.nlink << ", creating anchor" << endl; + + mdcache->anchor_inode(targeti, + new C_MDS_RetryRequest(mds, req, ref)); + return; + } + + // ok, inc link! 
+ targeti->inode.nlink++; + dout(7) << "nlink++, now " << targeti->inode.nlink << " on " << *targeti << endl; + targeti->mark_dirty(); + + } else { + // remote: send nlink++ request, wait + dout(7) << "target is remote, sending InodeLink" << endl; + mds->send_message_mds(new MInodeLink(targeti->ino(), whoami), targeti->authority(), MDS_PORT_CACHE); + + // wait + targeti->add_waiter(CINODE_WAIT_LINK, + new C_MDS_RemoteLink(this, req, ref, dn, targeti)); + return; + } + + handle_client_link_finish(req, ref, dn, targeti); +} + +void Server::handle_client_link_finish(MClientRequest *req, CInode *ref, + CDentry *dn, CInode *targeti) +{ + // create remote link + dn->dir->link_inode(dn, targeti->ino()); + dn->link_remote( targeti ); // since we have it + dn->mark_dirty(); + + mds->balancer->hit_dir(dn->dir, META_POP_DWR); + + // done! + commit_request(req, new MClientReply(req, 0), ref, + 0); // FIXME i should log something +} + + +// UNLINK + +void Server::handle_client_unlink(MClientRequest *req, + CInode *diri) +{ + // rmdir or unlink + bool rmdir = false; + if (req->get_op() == MDS_OP_RMDIR) rmdir = true; + + // find it + if (req->get_filepath().depth() == 0) { + dout(7) << "can't rmdir root" << endl; + reply_request(req, -EINVAL); + return; + } + string name = req->get_filepath().last_bit(); + + // make sure parent is a dir? + if (!diri->is_dir()) { + dout(7) << "not a dir" << endl; + reply_request(req, -ENOTDIR); + return; + } + + // am i not open, not auth? + if (!diri->dir && !diri->is_auth()) { + int dirauth = diri->authority(); + dout(7) << "don't know dir auth, not open, auth is i think " << dirauth << endl; + mdcache->request_forward(req, dirauth); + return; + } + + if (!try_open_dir(diri, req)) return; + CDir *dir = diri->dir; + int dnauth = dir->dentry_authority(name); + + // does it exist? 
+ CDentry *dn = dir->lookup(name); + if (!dn) { + if (dnauth == whoami) { + dout(7) << "handle_client_rmdir/unlink dne " << name << " in " << *dir << endl; + reply_request(req, -ENOENT); + } else { + // send to authority! + dout(7) << "handle_client_rmdir/unlink fw, don't have " << name << " in " << *dir << endl; + mdcache->request_forward(req, dnauth); + } + return; + } + + // have it. locked? + if (!dn->can_read(req)) { + dout(10) << " waiting on " << *dn << endl; + dir->add_waiter(CDIR_WAIT_DNREAD, + name, + new C_MDS_RetryRequest(mds, req, diri)); + return; + } + + // null? + if (dn->is_null()) { + dout(10) << "unlink on null dn " << *dn << endl; + reply_request(req, -ENOENT); + return; + } + + // ok! + CInode *in = dn->inode; + assert(in); + if (rmdir) { + dout(7) << "handle_client_rmdir on dir " << *in << endl; + } else { + dout(7) << "handle_client_unlink on non-dir " << *in << endl; + } + + // dir stuff + if (in->is_dir()) { + if (rmdir) { + // rmdir + + // open dir? + if (in->is_auth() && !in->dir) { + if (!try_open_dir(in, req)) return; + } + + // not dir auth? (or not open, which implies the same!) + if (!in->dir) { + dout(7) << "handle_client_rmdir dir not open for " << *in << ", sending to dn auth " << dnauth << endl; + mdcache->request_forward(req, dnauth); + return; + } + if (!in->dir->is_auth()) { + int dirauth = in->dir->authority(); + dout(7) << "handle_client_rmdir not auth for dir " << *in->dir << ", sending to dir auth " << dnauth << endl; + mdcache->request_forward(req, dirauth); + return; + } + + assert(in->dir); + assert(in->dir->is_auth()); + + // dir size check on dir auth (but not necessarily dentry auth)? 
+ + // should be empty + if (in->dir->get_size() == 0 && !in->dir->is_complete()) { + dout(7) << "handle_client_rmdir on dir " << *in->dir << ", empty but not complete, fetching" << endl; + mds->mdstore->fetch_dir(in->dir, + new C_MDS_RetryRequest(mds, req, diri)); + return; + } + if (in->dir->get_size() > 0) { + dout(7) << "handle_client_rmdir on dir " << *in->dir << ", not empty" << endl; + reply_request(req, -ENOTEMPTY); + return; + } + + dout(7) << "handle_client_rmdir dir is empty!" << endl; + + // export sanity check + if (!in->is_auth()) { + // i should be exporting this now/soon, since the dir is empty. + dout(7) << "handle_client_rmdir dir is auth, but not inode." << endl; + if (!in->dir->is_freezing() && in->dir->is_frozen()) { + assert(in->dir->is_import()); + mdcache->migrator->export_empty_import(in->dir); + } else { + dout(7) << "apparently already exporting" << endl; + } + in->dir->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryRequest(mds, req, diri)); + return; + } + + } else { + // unlink + dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << endl; + reply_request(req, -EISDIR); + return; + } + } else { + if (rmdir) { + // unlink + dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << endl; + reply_request(req, -ENOTDIR); + return; + } + } + + // am i dentry auth? + if (dnauth != whoami) { + // not auth; forward! + dout(7) << "handle_client_unlink not auth for " << *dir << " dn " << dn->name << ", fwd to " << dnauth << endl; + mdcache->request_forward(req, dnauth); + return; + } + + dout(7) << "handle_client_unlink/rmdir on " << *in << endl; + + // xlock dentry + if (!mds->locker->dentry_xlock_start(dn, req, diri)) + return; + + // is this a remote link? 
+ if (dn->is_remote() && !dn->inode) { + CInode *in = mdcache->get_inode(dn->get_remote_ino()); + if (in) { + dn->link_remote(in); + } else { + // open inode + dout(7) << "opening target inode first, ino is " << dn->get_remote_ino() << endl; + mdcache->open_remote_ino(dn->get_remote_ino(), req, + new C_MDS_RetryRequest(mds, req, diri)); + return; + } + } + + + mds->balancer->hit_dir(dn->dir, META_POP_DWR); + + // it's locked, unlink! + MClientReply *reply = new MClientReply(req,0); + mdcache->dentry_unlink(dn, + new C_MDS_CommitRequest(this, req, reply, diri, + new EInodeUpdate(diri))); // FIXME WRONG EVENT + return; +} + + + + + + +// RENAME + +class C_MDS_RenameTraverseDst : public Context { + Server *server; + MClientRequest *req; + CInode *ref; + CInode *srcdiri; + CDir *srcdir; + CDentry *srcdn; + filepath destpath; +public: + vector trace; + + C_MDS_RenameTraverseDst(Server *server, + MClientRequest *req, + CInode *ref, + CInode *srcdiri, + CDir *srcdir, + CDentry *srcdn, + filepath& destpath) { + this->server = server; + this->req = req; + this->ref = ref; + this->srcdiri = srcdiri; + this->srcdir = srcdir; + this->srcdn = srcdn; + this->destpath = destpath; + } + void finish(int r) { + server->handle_client_rename_2(req, ref, + srcdiri, srcdir, srcdn, destpath, + trace, r); + } +}; + + +/* + + weirdness iwith rename: + - ref inode is what was originally srcdiri, but that may change by the tiem + the rename actually happens. 
for all practical purpose, ref is useless except + for C_MDS_RetryRequest + + */ +void Server::handle_client_rename(MClientRequest *req, + CInode *ref) +{ + dout(7) << "handle_client_rename on " << *req << endl; + + // sanity checks + if (req->get_filepath().depth() == 0) { + dout(7) << "can't rename root" << endl; + reply_request(req, -EINVAL); + return; + } + // mv a/b a/b/c -- meaningless + if (req->get_sarg().compare( 0, req->get_path().length(), req->get_path()) == 0 && + req->get_sarg().c_str()[ req->get_path().length() ] == '/') { + dout(7) << "can't rename to underneath myself" << endl; + reply_request(req, -EINVAL); + return; + } + + // mv blah blah -- also meaningless + if (req->get_sarg() == req->get_path()) { + dout(7) << "can't rename something to itself (or into itself)" << endl; + reply_request(req, -EINVAL); + return; + } + + // traverse to source + /* + this is abnoraml, just for rename. since we don't pin source path + (because we don't want to screw up the lock ordering) the ref inode + (normally/initially srcdiri) may move, and this may fail. + -> so, re-traverse path. and make sure we request_finish in the case of a forward! + */ + filepath refpath = req->get_filepath(); + string srcname = refpath.last_bit(); + refpath = refpath.prefixpath(refpath.depth()-1); + + dout(7) << "handle_client_rename src traversing to srcdir " << refpath << endl; + vector trace; + int r = mdcache->path_traverse(refpath, trace, true, + req, new C_MDS_RetryRequest(mds, req, ref), + MDS_TRAVERSE_FORWARD); + if (r == 2) { + dout(7) << "path traverse forwarded, ending request, doing manual request_cleanup" << endl; + dout(7) << "(pseudo) request_forward to 9999 req " << *req << endl; + mdcache->request_cleanup(req); // not _finish (deletes) or _forward (path_traverse did that) + return; + } + if (r > 0) return; + if (r < 0) { // dne or something. got renamed out from under us, probably! 
+ dout(7) << "traverse r=" << r << endl; + reply_request(req, r); + return; + } + + CInode *srcdiri; + if (trace.size()) + srcdiri = trace[trace.size()-1]->inode; + else + srcdiri = mdcache->get_root(); + + dout(7) << "handle_client_rename srcdiri is " << *srcdiri << endl; + + dout(7) << "handle_client_rename srcname is " << srcname << endl; + + // make sure parent is a dir? + if (!srcdiri->is_dir()) { + dout(7) << "srcdiri not a dir " << *srcdiri << endl; + reply_request(req, -EINVAL); + return; + } + + // am i not open, not auth? + if (!srcdiri->dir && !srcdiri->is_auth()) { + int dirauth = srcdiri->authority(); + dout(7) << "don't know dir auth, not open, srcdir auth is probably " << dirauth << endl; + mdcache->request_forward(req, dirauth); + return; + } + + if (!try_open_dir(srcdiri, req)) return; + CDir *srcdir = srcdiri->dir; + dout(7) << "handle_client_rename srcdir is " << *srcdir << endl; + + // make sure it's my dentry + int srcauth = srcdir->dentry_authority(srcname); + if (srcauth != whoami) { + // fw + dout(7) << "rename on " << req->get_path() << ", dentry " << *srcdir << " dn " << srcname << " not mine, fw to " << srcauth << endl; + mdcache->request_forward(req, srcauth); + return; + } + // ok, done passing buck. + + // src dentry + CDentry *srcdn = srcdir->lookup(srcname); + + // xlocked? 
+  if (srcdn && !srcdn->can_read(req)) {
+    dout(10) << " waiting on " << *srcdn << endl;
+    srcdir->add_waiter(CDIR_WAIT_DNREAD,
+                       srcname,
+                       new C_MDS_RetryRequest(mds, req, srcdiri));
+    return;
+  }
+
+  if ((srcdn && !srcdn->inode) ||
+      (!srcdn && srcdir->is_complete())) {
+    dout(10) << "handle_client_rename src dne " << endl;
+    reply_request(req, -EEXIST);
+    return;
+  }
+
+  if (!srcdn && !srcdir->is_complete()) {
+    dout(10) << "readding incomplete dir" << endl;
+    mds->mdstore->fetch_dir(srcdir,
+                            new C_MDS_RetryRequest(mds, req, srcdiri));
+    return;
+  }
+  assert(srcdn && srcdn->inode);
+
+
+  dout(10) << "handle_client_rename srcdn is " << *srcdn << endl;
+  dout(10) << "handle_client_rename srci is " << *srcdn->inode << endl;
+
+  // pin src in cache (so it won't expire)
+  mdcache->request_pin_inode(req, srcdn->inode);
+
+  // find the destination, normalize
+  // discover, etc. on the way... just get it on the local node.
+  filepath destpath = req->get_sarg();
+
+  C_MDS_RenameTraverseDst *onfinish = new C_MDS_RenameTraverseDst(this, req, ref, srcdiri, srcdir, srcdn, destpath);
+  Context *ondelay = new C_MDS_RetryRequest(mds, req, ref);
+
+  /*
+   * use DISCOVERXLOCK mode:
+   *   the dest may not exist, and may be xlocked from a remote host
+   *   we want to succeed if we find the xlocked dentry
+   * ??
+   */
+  mdcache->path_traverse(destpath, onfinish->trace, false,
+                         req, ondelay,
+                         MDS_TRAVERSE_DISCOVER,  //XLOCK,
+                         onfinish);
+}
+
+/*
+ * Second stage of a client rename: the destination path has been traversed
+ * (result in 'trace', status in 'r').  Works out which directory and name
+ * the rename targets, rejects the invalid combinations, and hands off to
+ * handle_client_rename_local().
+ *
+ *  req      - the client rename request
+ *  ref      - inode the request was dispatched on (used for retries)
+ *  srcdiri, srcdir, srcdn - source dir inode, source dir, source dentry
+ *  destpath - destination path; gets the source's last component appended
+ *             when the destination turns out to be an existing directory
+ *  trace    - dentries found along destpath; a trailing null dentry is dropped
+ *  r        - traverse return value (>0: traverse is still in progress)
+ *
+ * NOTE(review): presumably invoked by C_MDS_RenameTraverseDst when the
+ * traverse finishes; that class is not visible here.
+ */
+void Server::handle_client_rename_2(MClientRequest *req,
+                                    CInode *ref,
+                                    CInode *srcdiri,
+                                    CDir *srcdir,
+                                    CDentry *srcdn,
+                                    filepath& destpath,
+                                    vector<CDentry*>& trace,
+                                    int r)
+{
+  dout(7) << "handle_client_rename_2 on " << *req << endl;
+  dout(12) << " r = " << r << " trace depth " << trace.size() << " destpath depth " << destpath.depth() << endl;
+
+  CInode *srci = srcdn->inode;
+  assert(srci);
+  CDir* destdir = 0;
+  string destname;
+
+  // what is the dest? (dir or file or complete filename)
+  // note: trace includes root, destpath doesn't (include leading /)
+  if (trace.size() && trace[trace.size()-1]->inode == 0) {
+    dout(10) << "dropping null dentry from tail of trace" << endl;
+    trace.pop_back();    // drop it!
+  }
+
+  CInode *d;
+  if (trace.size())
+    d = trace[trace.size()-1]->inode;
+  else
+    d = mdcache->get_root();
+  assert(d);
+  dout(10) << "handle_client_rename_2 traced to " << *d << ", trace size = " << trace.size() << ", destpath = " << destpath.depth() << endl;
+
+  // make sure i can open the dir?
+  if (d->is_dir() && !d->dir_is_auth() && !d->dir) {
+    // discover it
+    mdcache->open_remote_dir(d,
+                             new C_MDS_RetryRequest(mds, req, ref));
+    return;
+  }
+
+  if (trace.size() == destpath.depth()) {
+    if (d->is_dir()) {
+      // mv /some/thing /to/some/dir
+      if (!try_open_dir(d, req)) return;
+      destdir = d->dir;                           // /to/some/dir
+      destname = req->get_filepath().last_bit();  // thing
+      destpath.add_dentry(destname);
+    } else {
+      // mv /some/thing /to/some/existing_filename
+      destdir = trace[trace.size()-1]->dir;       // /to/some
+      destname = destpath.last_bit();             // existing_filename
+    }
+  }
+  else if (trace.size() == destpath.depth()-1) {
+    if (d->is_dir()) {
+      // mv /some/thing /to/some/place_that_maybe_dne     (we might be replica)
+      if (!try_open_dir(d, req)) return;
+      destdir = d->dir;                           // /to/some
+      destname = destpath.last_bit();             // place_that_MAYBE_dne
+    } else {
+      dout(7) << "dest dne" << endl;
+      reply_request(req, -EINVAL);
+      return;
+    }
+  }
+  else {
+    assert(trace.size() < destpath.depth()-1);
+    // check traverse return value
+    if (r > 0) {
+      return;  // discover, readdir, etc.
+    }
+
+    // ??
+    assert(r < 0 || trace.size() == 0);  // musta been an error
+
+    // error out
+    dout(7) << " rename dest " << destpath << " dne" << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  string srcpath = req->get_path();
+  dout(10) << "handle_client_rename_2 srcpath " << srcpath << endl;
+  dout(10) << "handle_client_rename_2 destpath " << destpath << endl;
+
+  // src == dest?
+  if (srcdn->get_dir() == destdir && srcdn->name == destname) {
+    dout(7) << "rename src=dest, same file " << endl;
+    reply_request(req, -EINVAL);
+    return;
+  }
+
+  // does destination exist?  (is this an overwrite?)
+  CDentry *destdn = destdir->lookup(destname);
+  CInode  *oldin = 0;
+  if (destdn) {
+    oldin = destdn->get_inode();
+
+    if (oldin) {
+      // make sure it's also a file!
+      // this can happen, e.g. "mv /some/thing /a/dir" where /a/dir/thing exists and is a dir.
+      if (oldin->is_dir()) {
+        // fail!
+        dout(7) << "dest exists and is dir" << endl;
+        reply_request(req, -EISDIR);
+        return;
+      }
+
+      // source is a directory but the existing target is not: POSIX rename()
+      // reports ENOTDIR for this case (EISDIR is the opposite mismatch).
+      if (srcdn->inode->is_dir() &&
+          !oldin->is_dir()) {
+        dout(7) << "cannot overwrite non-directory with directory" << endl;
+        reply_request(req, -ENOTDIR);
+        return;
+      }
+    }
+
+    dout(7) << "dest exists " << *destdn << endl;
+    if (destdn->get_inode()) {
+      dout(7) << "destino is " << *destdn->get_inode() << endl;
+    } else {
+      dout(7) << "dest dn is a NULL stub" << endl;
+    }
+  } else {
+    dout(7) << "dest dn dne (yet)" << endl;
+  }
+
+
+  // local or remote?
+  int srcauth = srcdir->dentry_authority(srcdn->name);
+  int destauth = destdir->dentry_authority(destname);
+  dout(7) << "handle_client_rename_2 destname " << destname << " destdir " << *destdir << " auth " << destauth << endl;
+
+  //
+  if (srcauth != whoami ||
+      destauth != whoami) {
+    dout(7) << "rename has remote dest " << destauth << endl;
+    dout(7) << "FOREIGN RENAME" << endl;
+
+    // punt?
+    if (false && srcdn->inode->is_dir()) {
+      reply_request(req, -EINVAL);
+      return;
+    }
+
+  } else {
+    dout(7) << "rename is local" << endl;
+  }
+
+  // note: the same path is taken whether or not the rename is foreign.
+  handle_client_rename_local(req, ref,
+                             srcpath, srcdiri, srcdn,
+                             destpath.get_path(), destdir, destdn, destname);
+  return;
+}
+
+
+
+
+/*
+ * Final stage of a client rename: xlock the source and destination dentries
+ * (in lexicographic order of their paths), re-check the dir/file
+ * constraints, then ask the renamer to perform the rename and journal it.
+ *
+ * Returns early (without replying) whenever a lock could not be taken yet;
+ * on those paths a retry context was queued or the locker is expected to
+ * re-drive the request.
+ *
+ *  destdn may be null on entry; if the destination is locally authoritative
+ *  a null dentry is created for it here.
+ */
+void Server::handle_client_rename_local(MClientRequest *req,
+                                        CInode *ref,
+                                        string& srcpath,
+                                        CInode *srcdiri,
+                                        CDentry *srcdn,
+                                        string& destpath,
+                                        CDir *destdir,
+                                        CDentry *destdn,
+                                        string& destname)
+{
+  //bool everybody = false;
+  //if (true || srcdn->inode->is_dir()) {
+  /* overkill warning: lock w/ everyone for simplicity.  FIXME someday!  along with the foreign rename crap!
+     i could limit this to cases where something beneath me is exported.
+     could possibly limit the list.    (maybe.)
+     Underlying constraint is that, regardless of the order i do the xlocks, and whatever
+     imports/exports might happen in the process, the destdir _must_ exist on any node
+     importing something beneath me when rename finishes, or else mayhem ensues when
+     their import is dangling in the cache.
+  */
+  /*
+    having made a proper mess of this on the first pass, here is my plan:
+
+    - xlocks of src, dest are done in lex order
+    - xlock is optional.. if you have the dentry, lock it, if not, don't.
+    - if you discover an xlocked dentry, you get the xlock.
+
+    possible trouble:
+    - you have an import beneath the source, and don't have the dest dir.
+    - when the actual rename happens, you discover the dest
+    - actually, do this on any open dir, so we don't detach whole swaths
+    of our cache.
+
+    notes:
+    - xlocks are initiated from authority, as are discover_replies, so replicas are
+    guaranteed to either not have dentry, or to have it xlocked.
+    -
+    - foreign xlocks are eventually unraveled by the initiator on success or failure.
+
+    todo to make this work:
+    - hose bool everybody param crap
+    /- make handle_lock_dn not discover, clean up cases
+    /- put dest path in MRenameNotify
+    /- make rename_notify discover if its a dir
+    /  - this will catch nested imports too, obviously
+    /- notify goes to merged list on local rename
+    /- notify goes to everybody on a foreign rename
+    /- handle_notify needs to gracefully ignore spurious notifies
+  */
+  //dout(7) << "handle_client_rename_local: overkill?  doing xlocks with _all_ nodes" << endl;
+  //everybody = true;
+  //}
+
+  bool srclocal = srcdn->dir->dentry_authority(srcdn->name) == whoami;
+  bool destlocal = destdir->dentry_authority(destname) == whoami;
+
+  dout(7) << "handle_client_rename_local: src local=" << srclocal << " " << *srcdn << endl;
+  if (destdn) {
+    dout(7) << "handle_client_rename_local: dest local=" << destlocal << " " << *destdn << endl;
+  } else {
+    dout(7) << "handle_client_rename_local: dest local=" << destlocal << " dn dne yet" << endl;
+  }
+
+  /* lock source and dest dentries, in lexicographic order.
+   */
+  bool dosrc = srcpath < destpath;
+  for (int i=0; i<2; i++) {
+    if (dosrc) {
+
+      // src
+      if (srclocal) {
+        if (!srcdn->is_xlockedbyme(req) &&
+            !mds->locker->dentry_xlock_start(srcdn, req, ref))
+          return;
+      } else {
+        if (!srcdn || srcdn->xlockedby != req) {
+          mds->locker->dentry_xlock_request(srcdn->dir, srcdn->name, false, req, new C_MDS_RetryRequest(mds, req, ref));
+          return;
+        }
+      }
+      dout(7) << "handle_client_rename_local: srcdn is xlock " << *srcdn << endl;
+
+    } else {
+
+      if (destlocal) {
+        // dest
+        if (!destdn) destdn = destdir->add_dentry(destname);
+        if (!destdn->is_xlockedbyme(req) &&
+            !mds->locker->dentry_xlock_start(destdn, req, ref)) {
+          // drop the placeholder dentry again if nothing else is using it
+          if (destdn->is_clean() && destdn->is_null() && destdn->is_sync()) destdir->remove_dentry(destdn);
+          return;
+        }
+      } else {
+        if (!destdn || destdn->xlockedby != req) {
+          /* NOTE: require that my xlocked item be a leaf/file, NOT a dir.  in case
+           * my traverse and determination of dest vs dest/srcfilename was out of date.
+           */
+          mds->locker->dentry_xlock_request(destdir, destname, true, req, new C_MDS_RetryRequest(mds, req, ref));
+          return;
+        }
+      }
+      dout(7) << "handle_client_rename_local: destdn is xlock " << *destdn << endl;
+
+    }
+
+    dosrc = !dosrc;
+  }
+
+
+  // final check: verify if dest exists that src is a file
+
+  // FIXME: is this necessary?
+
+  if (destdn->inode) {
+    if (destdn->inode->is_dir()) {
+      dout(7) << "handle_client_rename_local failing, dest exists and is a dir: " << *destdn->inode << endl;
+      assert(0);
+      reply_request(req, -EINVAL);
+      return;
+    }
+    if (srcdn->inode->is_dir()) {
+      dout(7) << "handle_client_rename_local failing, dest exists and src is a dir: " << *destdn->inode << endl;
+      assert(0);
+      reply_request(req, -EINVAL);
+      return;
+    }
+  } else {
+    // if destdn->inode is null, then we know it's a non-existent dest,
+    // why?  because if it's local, it dne.  and if it's remote, we xlocked with
+    // REQXLOCKC, which will only allow you to lock a file.
+    // so we know dest is a file, or non-existent
+    if (!destlocal) {
+      if (srcdn->inode->is_dir()) {
+        // help: maybe the dest exists and is a file?   ..... FIXME
+      } else {
+        // we're fine, src is file, dest is file|dne
+      }
+    }
+  }
+
+  mds->balancer->hit_dir(srcdn->dir, META_POP_DWR);
+  mds->balancer->hit_dir(destdn->dir, META_POP_DWR);
+
+  // we're golden.
+  // everything is xlocked by us, we rule, etc.
+  MClientReply *reply = new MClientReply(req, 0);
+  mdcache->renamer->file_rename( srcdn, destdn,
+                                 new C_MDS_CommitRequest(this, req, reply, srcdn->inode,
+                                                         new EInodeUpdate(srcdn->inode)) );  // FIXME WRONG EVENT
+}
+
+
+
+
+
+
+
+// MKDIR
+
+/*
+ * Create a directory: make the dentry+inode via mknod(), turn the inode into
+ * a directory, open an empty (complete, dirty) CDir for it, and journal an
+ * EMkdir.  Returns without replying if mknod() returned null.
+ */
+void Server::handle_client_mkdir(MClientRequest *req, CInode *diri)
+{
+  // make dentry and inode, link.
+  CInode *newi = mknod(req, diri);
+  if (!newi) return;
+
+  // make my new inode a dir.
+  newi->inode.mode = req->get_iarg();
+  newi->inode.mode &= ~INODE_TYPE_MASK;
+  newi->inode.mode |= INODE_MODE_DIR;
+
+  // use dir layout
+  newi->inode.layout = g_OSD_MDDirLayout;
+
+  // init dir to be empty
+  assert(!newi->is_frozen_dir());  // bc mknod worked
+  CDir *newdir = newi->get_or_open_dir(mds);
+  newdir->mark_complete();
+  newdir->mark_dirty();
+
+  mds->balancer->hit_dir(newdir, META_POP_DWR);
+
+  // when the parent dir is replicated, hand the new dir to a randomly
+  // chosen mds (export only happens if that mds is not us)
+  if (
+      diri->dir->is_auth() &&
+      diri->dir->is_rep() &&
+      newdir->is_auth() &&
+      !newdir->is_hashing()) {
+    int dest = rand() % mds->mdsmap->get_num_mds();
+    if (dest != whoami) {
+      dout(10) << "exporting new dir " << *newdir << " in replicated parent " << *diri->dir << endl;
+      mdcache->migrator->export_dir(newdir, dest);
+    }
+  }
+
+  // commit to log
+  commit_request(req, new MClientReply(req, 0), diri,
+                 new EMkdir(newdir));
+  //new EInodeUpdate(newi),//);
+  //new EDirUpdate(newdir));  // FIXME: weird performance regression here w/ double log; somewhat of a mystery!
+  return;
+}
+
+
+
+
+
+// SYMLINK
+
+/*
+ * Create a symlink: make the dentry+inode via mknod(), mark the inode as a
+ * symlink, store the target taken from the request's string argument, and
+ * journal the change.  Returns without replying if mknod() returned null.
+ */
+void Server::handle_client_symlink(MClientRequest *req, CInode *diri)
+{
+  // make dentry and inode, link.
+  CInode *newi = mknod(req, diri);
+  if (!newi) return;
+
+  // make my new inode a symlink
+  newi->inode.mode &= ~INODE_TYPE_MASK;
+  newi->inode.mode |= INODE_MODE_SYMLINK;
+
+  // set target
+  newi->symlink = req->get_sarg();
+
+  mds->balancer->hit_dir(diri->dir, META_POP_DWR);
+
+  // commit
+  commit_request(req, new MClientReply(req, 0), diri,
+                 new EInodeUpdate(newi));  // FIXME should be differnet log entry
+}
+
+
+
+
+
+
+
+// ===================================
+// TRUNCATE, FSYNC
+
+/*
+ * FIXME: this truncate implemention is WRONG WRONG WRONG
+ */
+
+/*
+ * Set the inode size to the request's size argument under the file write
+ * lock, mark the inode dirty, and journal an EInodeUpdate.  Returns without
+ * replying if the write lock could not be started.
+ */
+void Server::handle_client_truncate(MClientRequest *req, CInode *cur)
+{
+  // write
+  if (!mds->locker->inode_file_write_start(cur, req))
+    return;  // fw or (wait for) lock
+
+  // check permissions
+
+  // do update
+  cur->inode.size = req->get_sizearg();
+  cur->mark_dirty();
+
+  mds->locker->inode_file_write_finish(cur);
+
+  mds->balancer->hit_inode(cur, META_POP_IWR);
+
+  // start reply
+  MClientReply *reply = new MClientReply(req, 0);
+
+  // commit
+  commit_request(req, reply, cur,
+                 new EInodeUpdate(cur));
+}
+
+
+
+// ===========================
+// open, openc, close
+
+/*
+ * Open a regular file: reject non-files with -EINVAL, forward non-read,
+ * non-lazy opens to the inode's authority when we are not it, then issue
+ * capabilities and reply with the caps, their sequence number and the file
+ * data version.  Returns without replying if caps cannot be issued yet.
+ */
+void Server::handle_client_open(MClientRequest *req,
+                                CInode *cur)
+{
+  int flags = req->get_iarg();
+  int mode = req->get_iarg2();
+
+  dout(7) << "open " << flags << " on " << *cur << endl;
+  dout(10) << "open flags = " << flags << "  mode = " << mode << endl;
+
+  // is it a file?
+  if (!(cur->inode.mode & INODE_MODE_FILE)) {
+    dout(7) << "not a regular file" << endl;
+    reply_request(req, -EINVAL);                 // FIXME what error do we want?
+    return;
+  }
+
+  // auth for write access
+  if (mode != FILE_MODE_R && mode != FILE_MODE_LAZY &&
+      !cur->is_auth()) {
+    int auth = cur->authority();
+    assert(auth != whoami);
+    dout(9) << "open writeable on replica for " << *cur << " fw to auth " << auth << endl;
+
+    mdcache->request_forward(req, auth);
+    return;
+  }
+
+
+  // hmm, check permissions or something.
+
+
+  // can we issue the caps they want?
+  version_t fdv = mds->locker->issue_file_data_version(cur);
+  Capability *cap = mds->locker->issue_new_caps(cur, mode, req);
+  if (!cap) return; // can't issue (yet), so wait!
+
+  dout(12) << "open gets caps " << cap_string(cap->pending()) << " for " << req->get_source() << " on " << *cur << endl;
+
+  mds->balancer->hit_inode(cur, META_POP_IRD);
+
+  // reply
+  MClientReply *reply = new MClientReply(req, 0);
+  reply->set_file_caps(cap->pending());
+  reply->set_file_caps_seq(cap->get_last_seq());
+  reply->set_file_data_version(fdv);
+  reply_request(req, reply, cur);
+}
+
+
+
+/*
+ * Open with O_CREAT: mknod() with okexist=true, set the mode to a regular
+ * file with 0644 permissions, then continue as a normal open.
+ * Note that the mode is overwritten even when mknod() returned an inode
+ * that already existed.
+ */
+void Server::handle_client_openc(MClientRequest *req, CInode *ref)
+{
+  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << endl;
+
+  CInode *in = mknod(req, ref, true);
+  if (!in) return;
+
+  in->inode.mode = 0644;              // wtf FIXME
+  in->inode.mode |= INODE_MODE_FILE;
+
+  handle_client_open(req, in);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/branches/sage/cephmds2/mds/Server.h b/branches/sage/cephmds2/mds/Server.h
new file mode 100644
index 0000000000000..912af31ca909a
--- /dev/null
+++ b/branches/sage/cephmds2/mds/Server.h
@@ -0,0 +1,144 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __MDS_SERVER_H
+#define __MDS_SERVER_H
+
+#include "MDS.h"
+
+class LogEvent;
+
+// Client request handling for one MDS.  Holds shortcuts to the owning MDS
+// and its cache, log and messenger, and declares one handler per client
+// operation.
+//
+// NOTE(review): template arguments appear to have been stripped from this
+// header by extraction (e.g. "vector& trace", "list& inls"); the bare
+// names below are reproduced as found and need restoring from the original.
+class Server {
+  MDS *mds;
+  MDCache *mdcache;
+  MDLog *mdlog;
+  Messenger *messenger;
+  int whoami;            // this mds's node id (mds->get_nodeid())
+
+  __uint64_t stat_ops;
+
+
+public:
+  // caches mds->mdcache, mds->mdlog and mds->messenger as they are at
+  // construction time
+  Server(MDS *m) :
+    mds(m),
+    mdcache(mds->mdcache), mdlog(mds->mdlog),
+    messenger(mds->messenger), whoami(mds->get_nodeid()),
+    stat_ops(0) {
+  }
+
+  void dispatch(Message *m);
+
+  // generic request helpers
+  void reply_request(MClientRequest *req, int r = 0, CInode *tracei = 0);
+  void reply_request(MClientRequest *req, MClientReply *reply, CInode *tracei);
+  void commit_request(MClientRequest *req,
+                      MClientReply *reply,
+                      CInode *tracei,
+                      LogEvent *event,
+                      LogEvent *event2 = 0);
+
+  bool try_open_dir(CInode *in, MClientRequest *req);
+
+
+  // clients
+  void handle_client_mount(class MClientMount *m);
+  void handle_client_unmount(Message *m);
+
+  void handle_client_request(MClientRequest *m);
+  void handle_client_request_2(MClientRequest *req,
+                               vector& trace,
+                               int r);
+
+  // fs ops
+  void handle_client_fstat(MClientRequest *req);
+
+  // requests
+  void dispatch_request(Message *m, CInode *ref);
+
+  // inode request *req, CInode *ref;
+  void handle_client_stat(MClientRequest *req, CInode *ref);
+  void handle_client_utime(MClientRequest *req, CInode *ref);
+  void handle_client_inode_soft_update_2(MClientRequest *req,
+                                         MClientReply *reply,
+                                         CInode *ref);
+  void handle_client_chmod(MClientRequest *req, CInode *ref);
+  void handle_client_chown(MClientRequest *req, CInode *ref);
+  void handle_client_inode_hard_update_2(MClientRequest *req,
+                                         MClientReply *reply,
+                                         CInode *ref);
+
+  // readdir
+  void handle_client_readdir(MClientRequest *req, CInode *ref);
+  int encode_dir_contents(CDir *dir,
+                          list& inls,
+                          list& dnls);
+  void handle_hash_readdir(MHashReaddir *m);
+  void handle_hash_readdir_reply(MHashReaddirReply *m);
+  void finish_hash_readdir(MClientRequest *req, CDir *dir);
+
+  // namespace changes
+  void handle_client_mknod(MClientRequest *req, CInode *ref);
+  void handle_client_link(MClientRequest *req, CInode *ref);
+  void handle_client_link_2(int r, MClientRequest *req, CInode *ref, vector& trace);
+  void handle_client_link_finish(MClientRequest *req, CInode *ref,
+                                 CDentry *dn, CInode *targeti);
+
+  void handle_client_unlink(MClientRequest *req, CInode *ref);
+  // rename is staged: handle_client_rename starts the destination
+  // traverse, _2 resolves the destination dir/name, _local takes the
+  // dentry xlocks and performs the rename.
+  void handle_client_rename(MClientRequest *req, CInode *ref);
+  void handle_client_rename_2(MClientRequest *req,
+                              CInode *ref,
+                              CInode *srcdiri,
+                              CDir *srcdir,
+                              CDentry *srcdn,
+                              filepath& destpath,
+                              vector& trace,
+                              int r);
+  void handle_client_rename_local(MClientRequest *req, CInode *ref,
+                                  string& srcpath, CInode *srcdiri, CDentry *srcdn,
+                                  string& destpath, CDir *destdir, CDentry *destdn, string& name);
+
+  void handle_client_mkdir(MClientRequest *req, CInode *ref);
+  void handle_client_rmdir(MClientRequest *req, CInode *ref);
+  void handle_client_symlink(MClientRequest *req, CInode *ref);
+
+  // file
+  void handle_client_open(MClientRequest *req, CInode *ref);
+  void handle_client_openc(MClientRequest *req, CInode *ref);
+  void handle_client_release(MClientRequest *req, CInode *in);
+  void handle_client_truncate(MClientRequest *req, CInode *in);
+  void handle_client_fsync(MClientRequest *req, CInode *in);
+
+  CInode *mknod(MClientRequest *req, CInode *ref, bool okexist=false);  // used by mknod, symlink, mkdir, openc
+
+
+};
+
+// Context that, when finished, re-dispatches a request on the given
+// reference inode via Server::dispatch_request().  The result code passed
+// to finish() is ignored.  'ref' must be non-null.
+class C_MDS_RetryRequest : public Context {
+  MDS *mds;
+  Message *req;     // MClientRequest or MLock
+  CInode *ref;
+ public:
+  C_MDS_RetryRequest(MDS *mds, Message *req, CInode *ref) {
+    assert(ref);
+    this->mds = mds;
+    this->req = req;
+    this->ref = ref;
+  }
+  virtual void finish(int r) {
+    mds->server->dispatch_request(req, ref);
+  }
+};
+
+
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EAlloc.h b/branches/sage/cephmds2/mds/events/EAlloc.h
new file mode 100644
index 0000000000000..b3b5f21f84038
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EAlloc.h
@@
-0,0 +1,110 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __MDS_EALLOC_H
+#define __MDS_EALLOC_H
+
+#include <assert.h>   // NOTE(review): header name was lost in extraction; <assert.h> chosen because this class uses assert() - confirm
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "../IdAllocator.h"
+
+#define EALLOC_EV_ALLOC  1
+#define EALLOC_EV_FREE   2
+
+// Journal event recording one id allocation or release in the id
+// allocator table, tagged with the table version the change produced.
+class EAlloc : public LogEvent {
+ protected:
+  int idtype;
+  idno_t id;
+  int what;  // alloc or dealloc
+  version_t table_version;
+
+ public:
+  // default form is filled in by decode_payload(); zero the payload so the
+  // object is never read with indeterminate fields
+  EAlloc() : LogEvent(EVENT_ALLOC),
+    idtype(0), id(0), what(0), table_version(0) { }
+  EAlloc(int idtype, idno_t id, int what, version_t v) :
+    LogEvent(EVENT_ALLOC) {
+    this->idtype = idtype;
+    this->id = id;
+    this->what = what;
+    this->table_version = v;
+  }
+
+  // payload is the four fields as raw bytes, in declaration order
+  void encode_payload(bufferlist& bl) {
+    bl.append((char*)&idtype, sizeof(idtype));
+    bl.append((char*)&id, sizeof(id));
+    bl.append((char*)&what, sizeof(what));
+    bl.append((char*)&table_version, sizeof(table_version));
+  }
+  // reads the fields back in the same order, advancing 'off' past each
+  void decode_payload(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(idtype), (char*)&idtype);
+    off += sizeof(idtype);
+    bl.copy(off, sizeof(id), (char*)&id);
+    off += sizeof(id);
+    bl.copy(off, sizeof(what), (char*)&what);
+    off += sizeof(what);
+    bl.copy(off, sizeof(table_version), (char*)&table_version);
+    off += sizeof(table_version);
+  }
+
+
+  // anything other than EALLOC_EV_ALLOC is printed as "dealloc"
+  void print(ostream& out) {
+    if (what == EALLOC_EV_ALLOC)
+      out << "alloc " << hex << id << dec << " tablev " << table_version;
+    else
+      out << "dealloc " << hex << id << dec << " tablev " << table_version;
+  }
+
+
+  // live journal
+  // expirable once the allocator has committed at least our table version
+  bool can_expire(MDS *mds) {
+    if (mds->idalloc->get_committed_version() < table_version)
+      return false;   // still dirty
+    else
+      return true;    // already flushed
+  }
+
+  void retire(MDS *mds, Context *c) {
+    mds->idalloc->save(c, table_version);
+  }
+
+
+  // recovery
+  // true if the in-memory table is already at or past this event's version
+  bool has_happened(MDS *mds) {
+    if (mds->idalloc->get_version() >= table_version) {
+      cout << " event " << table_version << " <= table " << mds->idalloc->get_version() << endl;
+      return true;
+    } else
+      return false;
+  }
+
+  // re-applies the alloc/free; requires the table to be exactly one version
+  // behind this event, and asserts it ends up at this event's version
+  void replay(MDS *mds) {
+    assert(table_version-1 == mds->idalloc->get_version());
+
+    if (what == EALLOC_EV_ALLOC) {
+      idno_t nid = mds->idalloc->alloc_id(true);
+      assert(nid == id);       // this should match.
+    }
+    else if (what == EALLOC_EV_FREE) {
+      mds->idalloc->reclaim_id(id, true);
+    }
+    else
+      assert(0);
+
+    assert(table_version == mds->idalloc->get_version());
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EDirUpdate.h b/branches/sage/cephmds2/mds/events/EDirUpdate.h
new file mode 100644
index 0000000000000..9c8881d4c91b9
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EDirUpdate.h
@@ -0,0 +1,97 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __EDIRUPDATE_H
+#define __EDIRUPDATE_H
+
+#include <assert.h>   // NOTE(review): header name was lost in extraction; <assert.h> chosen because this class uses assert() - confirm
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+#include "../CDir.h"
+#include "../MDCache.h"
+#include "../MDStore.h"
+
+
+
+// Journal event recording that a directory was updated: the trace to the
+// directory's inode, the directory's ino, and its version at the time.
+class EDirUpdate : public LogEvent {
+ protected:
+  ETrace trace;
+  inodeno_t dirino;
+  version_t version;
+
+ public:
+  EDirUpdate(CDir *dir) : LogEvent(EVENT_DIRUPDATE),
+                          trace(dir->inode) {
+    this->dirino = dir->ino();
+    version = dir->get_version();
+  }
+  // default form is filled in by decode_payload(); zero the scalars so the
+  // object is never read with indeterminate fields
+  EDirUpdate() : LogEvent(EVENT_DIRUPDATE),
+                 dirino(0), version(0) {
+  }
+
+  void print(ostream& out) {
+    out << "up dir " << dirino << " "
+        << trace
+        << "/ v " << version;
+  }
+
+  // payload order: trace, version, dirino
+  virtual void encode_payload(bufferlist& bl) {
+    trace.encode(bl);
+    bl.append((char*)&version, sizeof(version));
+    bl.append((char*)&dirino, sizeof(dirino));
+  }
+  void decode_payload(bufferlist& bl, int& off) {
+    trace.decode(bl, off);
+    bl.copy(off, sizeof(version), (char*)&version);
+    off += sizeof(version);
+    bl.copy(off, sizeof(dirino), (char*)&dirino);
+    off += sizeof(dirino);
+  }
+
+
+  // The event can be dropped when the directory is no longer in cache, is
+  // not ours, is frozen, is clean, or is already committing a newer version.
+  virtual bool can_expire(MDS *mds) {
+    // am i obsolete?
+    CInode *in = mds->mdcache->get_inode(dirino);
+    if (!in) return true;
+    CDir *dir = in->dir;
+    if (!dir) return true;
+
+    dout(10) << "EDirUpdate v " << version << " on dir " << *dir << endl;
+
+    if (!dir->is_auth()) return true;     // not mine!
+    if (dir->is_frozen()) return true;    // frozen -> exporting -> obsolete? FIXME
+
+    if (!dir->is_dirty()) return true;
+
+    if (dir->get_committing_version() > version)
+      return true;
+
+    return false;
+  }
+
+  // Commits the directory; asserts that the inode and its dir are still in
+  // cache (the cases where they are not make can_expire() return true).
+  virtual void retire(MDS *mds, Context *c) {
+    // commit directory
+    CInode *in = mds->mdcache->get_inode(dirino);
+    assert(in);
+    CDir *dir = in->dir;
+    assert(dir);
+
+    dout(10) << "EDirUpdate committing dir " << *dir << endl;
+    mds->mdstore->commit_dir(dir, c);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EInodeUpdate.h b/branches/sage/cephmds2/mds/events/EInodeUpdate.h
new file mode 100644
index 0000000000000..dba233c833883
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EInodeUpdate.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __EINODEUPDATE_H
+#define __EINODEUPDATE_H
+
+#include   // NOTE(review): header name lost in extraction (presumably <assert.h>) - restore from the original
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+
+
+// Journal event recording an inode update; the payload is just the trace
+// leading to (and including) the updated inode.
+class EInodeUpdate : public LogEvent {
+ protected:
+  ETrace trace;
+
+ public:
+  EInodeUpdate(CInode *in) : LogEvent(EVENT_INODEUPDATE),
+                             trace(in) {
+  }
+  EInodeUpdate() : LogEvent(EVENT_INODEUPDATE) { }
+
+  // reads trace.back(), so the trace must not be empty when printing
+  void print(ostream& out) {
+    out << "up inode " << trace.back().inode.ino
+        << " " << trace
+        << " v " << trace.back().inode.version;
+  }
+
+  virtual void encode_payload(bufferlist& bl) {
+    trace.encode(bl);
+  }
+  void decode_payload(bufferlist& bl, int& off) {
+    trace.decode(bl, off);
+  }
+
+  // defined out of line (not in this header)
+  bool can_expire(MDS *mds);
+  void retire(MDS *mds, Context *c);
+  bool has_happened(MDS *mds);
+  void replay(MDS *mds);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EMkdir.h b/branches/sage/cephmds2/mds/events/EMkdir.h
new file mode 100644
index 0000000000000..f7f9c05c2207c
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EMkdir.h
@@ -0,0 +1,62 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __EMKDIR_H
+#define __EMKDIR_H
+
+#include   // NOTE(review): header name lost in extraction (presumably <assert.h>) - restore from the original
+#include "config.h"
+#include "include/types.h"
+
+#include "ETrace.h"
+#include "../MDS.h"
+#include "../MDStore.h"
+
+
+// Journal event recording a mkdir; the payload is the trace to the new
+// directory's inode.
+// NOTE(review): derives from LogEvent but does not include "../LogEvent.h"
+// itself; presumably it arrives via one of the includes above - confirm.
+class EMkdir : public LogEvent {
+ protected:
+  ETrace trace;
+  //version_t pdirv;
+
+ public:
+  // 'dir' is the new directory; the trace is taken from its inode
+  EMkdir(CDir *dir) : LogEvent(EVENT_MKDIR),
+                      trace(dir->inode) {
+    //pdirv = dir->inode->get_parent_dir()->get_version();
+  }
+  EMkdir() : LogEvent(EVENT_MKDIR) { }
+
+  void print(ostream& out) {
+    out << "mkdir ";
+    trace.print(out);
+  }
+
+  virtual void encode_payload(bufferlist& bl) {
+    trace.encode(bl);
+    //bl.append((char*)&pdirv, sizeof(pdirv));
+  }
+  void decode_payload(bufferlist& bl, int& off) {
+    trace.decode(bl, off);
+    //bl.copy(off, sizeof(pdirv), (char*)&pdirv);
+    //off += sizeof(pdirv);
+  }
+
+  // defined out of line (not in this header)
+  bool can_expire(MDS *mds);
+  void retire(MDS *mds, Context *c);
+
+  // recovery
+  bool has_happened(MDS *mds);
+  void replay(MDS *mds);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EMknod.h b/branches/sage/cephmds2/mds/events/EMknod.h
new file mode 100644
index 0000000000000..27ade4671a0c7
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EMknod.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __EMKNOD_H
+#define __EMKNOD_H
+
+#include   // NOTE(review): header name lost in extraction (presumably <assert.h>) - restore from the original
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+#include "../MDS.h"
+#include "../MDStore.h"
+
+
+// Journal event recording a mknod; the payload is the trace to the newly
+// created inode.
+class EMknod : public LogEvent {
+ protected:
+  ETrace trace;
+  //version_t pdirv;
+
+ public:
+  EMknod(CInode *in) : LogEvent(EVENT_MKNOD),
+                       trace(in) {
+    //pdirv = in->get_parent_dir()->get_version();
+  }
+  EMknod() : LogEvent(EVENT_MKNOD) { }
+
+  void print(ostream& out) {
+    out << "mknod " << trace;
+  }
+
+  virtual void encode_payload(bufferlist& bl) {
+    trace.encode(bl);
+    //bl.append((char*)&pdirv, sizeof(pdirv));
+  }
+  void decode_payload(bufferlist& bl, int& off) {
+    trace.decode(bl, off);
+    //bl.copy(off, sizeof(pdirv), (char*)&pdirv);
+    //off += sizeof(pdirv);
+  }
+
+  // defined out of line (not in this header)
+  bool can_expire(MDS *mds);
+  void retire(MDS *mds, Context *c);
+  bool has_happened(MDS *mds);
+  void replay(MDS *mds);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EPurgeFinish.h b/branches/sage/cephmds2/mds/events/EPurgeFinish.h
new file mode 100644
index 0000000000000..bacfa8e93c737
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EPurgeFinish.h
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __EPURGE_H
+#define __EPURGE_H
+
+#include   // NOTE(review): header name lost in extraction (presumably <assert.h>) - restore from the original
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"   // base class; makes this header usable on its own
+
+// Journal event recording that the purge of inode 'ino' has finished.
+// The payload is the raw ino.
+class EPurgeFinish : public LogEvent {
+ protected:
+  inodeno_t ino;
+
+ public:
+  EPurgeFinish(inodeno_t i) :
+    LogEvent(EVENT_PURGEFINISH),
+    ino(i) { }
+  EPurgeFinish() : LogEvent(EVENT_PURGEFINISH) { }
+
+  void print(ostream& out) {
+    out << "purgefinish " << ino;
+  }
+
+  virtual void encode_payload(bufferlist& bl) {
+    bl.append((char*)&ino, sizeof(ino));
+  }
+  // Reads the ino and advances 'off' past it, like every other event's
+  // decode_payload; without the advance, whatever is decoded next from the
+  // same buffer would re-read these bytes.
+  void decode_payload(bufferlist& bl, int& off) {
+    bl.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+  }
+
+  // defined out of line (not in this header)
+  bool can_expire(MDS *mds);
+  void retire(MDS *mds, Context *c);
+  bool has_happened(MDS *mds);
+  void replay(MDS *mds);
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EString.h b/branches/sage/cephmds2/mds/events/EString.h
new file mode 100644
index 0000000000000..6bd10030549ba
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EString.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef __ESTRING_H
+#define __ESTRING_H
+
+// NOTE(review): both header names were lost in extraction; the two below
+// are chosen from what this file uses (ostream, string) - confirm.
+#include <iostream>
+#include <string>
+using namespace std;
+
+#include "../LogEvent.h"
+
+// generic log event
+// The payload is the string's bytes followed by a terminating NUL.
+class EString : public LogEvent {
+ protected:
+  string event;
+
+ public:
+  EString(string e) :
+    LogEvent(EVENT_STRING) {
+    event = e;
+  }
+  EString() :
+    LogEvent(EVENT_STRING) {
+  }
+
+  // reads a NUL-terminated string starting at 'off' and steps past the NUL
+  void decode_payload(bufferlist& bl, int& off) {
+    event = bl.c_str() + off;
+    off += event.length() + 1;
+  }
+
+  void encode_payload(bufferlist& bl) {
+    bl.append(event.c_str(), event.length()+1);
+  }
+
+  void print(ostream& out) {
+    out << '"' << event << '"';
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/ETrace.h b/branches/sage/cephmds2/mds/events/ETrace.h
new file mode 100644
index 0000000000000..a320137512178
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/ETrace.h
@@ -0,0 +1,119 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __MDS_ETRACE_H
+#define __MDS_ETRACE_H
+
+// NOTE(review): both header names were lost in extraction; the two below
+// are chosen from what this file uses (list, string) - confirm.
+#include <list>
+#include <string>
+using namespace std;
+
+#include "../CInode.h"
+#include "../CDir.h"
+#include "../CDentry.h"
+
+
+// path trace for use in journal events
+
+class ETrace {
+
+  // segment.
+  // one step of the path: the containing dir (ino + version), the dentry
+  // name within it, and a copy of the inode that dentry links to.
+  struct bit {
+    inodeno_t dirino;
+    version_t dirv;
+    string dn;
+    inode_t inode;
+
+    bit(bufferlist& bl, int& off) { _decode(bl,off); }
+    bit(inodeno_t di, version_t dv, const string& d, inode_t i) :
+      dirino(di), dirv(dv), dn(d), inode(i) {}
+
+    void _encode(bufferlist& bl) {
+      bl.append((char*)&dirino, sizeof(dirino));
+      bl.append((char*)&dirv, sizeof(dirv));
+      ::_encode(dn, bl);
+      bl.append((char*)&inode, sizeof(inode));
+    }
+    void _decode(bufferlist& bl, int& off) {
+      bl.copy(off, sizeof(dirino), (char*)&dirino);  off += sizeof(dirino);
+      bl.copy(off, sizeof(dirv), (char*)&dirv);  off += sizeof(dirv);
+      ::_decode(dn, bl, off);
+      bl.copy(off, sizeof(inode), (char*)&inode);  off += sizeof(inode);
+    }
+  };
+
+ public:
+  list<bit> trace;   // ordered from the outermost dir down to the target inode
+
+  // Builds the trace by walking up from 'in' through parent dentries,
+  // prepending a segment per step.  Stops when an inode has no parent
+  // dentry, a dentry has no dir, or the dir just recorded is an import.
+  // A null 'in' yields an empty trace.
+  ETrace(CInode *in = 0) {
+    if (in) {
+      CDir *dir;
+      CDentry *dn;
+      do {
+        dn = in->get_parent_dn();
+        if (!dn) break;
+        dir = dn->get_dir();
+        if (!dir) break;
+
+        trace.push_front(bit(dir->ino(),
+                             dir->get_version(),
+                             dn->get_name(),
+                             in->inode));
+
+        in = dir->get_inode();
+      } while (!dir->is_import());
+    }
+  }
+
+  // last segment (the target inode); the trace must not be empty
+  bit& back() {
+    return trace.back();
+  }
+
+  // NOTE(review): the text between "i<" in decode() and "list<bit>" in
+  // encode() was lost in extraction.  The bodies below are reconstructed
+  // from what survived: decode reads a count then that many segments;
+  // encode ends by calling _encode on each segment.  Confirm against the
+  // original.
+  void decode(bufferlist& bl, int& off) {
+    int n;
+    bl.copy(off, sizeof(n), (char*)&n);
+    off += sizeof(n);
+    for (int i=0; i<n; i++)
+      trace.push_back( bit(bl, off) );
+  }
+
+  void encode(bufferlist& bl) {
+    int n = trace.size();
+    bl.append((char*)&n, sizeof(n));
+    for (list<bit>::iterator i = trace.begin();
+         i != trace.end();
+         i++)
+      i->_encode(bl);
+  }
+
+  // prints "[dirino]/name" for the first segment, then "/name" for each
+  // following one
+  void print(ostream& out) const {
+    for (list<bit>::const_iterator p = trace.begin();
+         p != trace.end();
+         p++) {
+      if (p == trace.begin())
+        out << "[" << p->dirino << "]/" << p->dn;
+      else
+        out << "/" << p->dn;
+    }
+  }
+
+  // defined out of line (not in this header)
+  CInode *restore_trace(MDS *mds);
+
+};
+
+inline ostream& operator<<(ostream& out, const ETrace& t) {
+  t.print(out);
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/mds/events/EUnlink.h b/branches/sage/cephmds2/mds/events/EUnlink.h
new file mode 100644
index 0000000000000..9b7484174886a
--- /dev/null
+++ b/branches/sage/cephmds2/mds/events/EUnlink.h
@@ -0,0 +1,64 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef __EUNLINK_H
+#define __EUNLINK_H
+
+#include   // NOTE(review): header name lost in extraction (presumably <assert.h>) - restore from the original
+#include "config.h"
+#include "include/types.h"
+
+#include "../LogEvent.h"
+#include "ETrace.h"
+
+#include "../CInode.h"
+#include "../CDentry.h"
+#include "../CDir.h"
+
+// Journal event recording an unlink: the trace to the containing
+// directory's inode, that directory's version, the removed dentry's name,
+// and the trace to the unlinked inode.
+class EUnlink : public LogEvent {
+ protected:
+  ETrace diritrace;
+  version_t dirv;
+  string dname;
+  ETrace inodetrace;
+
+ public:
+  EUnlink(CDir *dir, CDentry* dn, CInode *in) :
+    LogEvent(EVENT_UNLINK),
+    diritrace(dir->inode),
+    dirv(dir->get_version()),
+    dname(dn->get_name()),
+    inodetrace(in) {}
+  EUnlink() : LogEvent(EVENT_UNLINK) { }
+
+  // payload order: diritrace, dirv, dname, inodetrace
+  virtual void encode_payload(bufferlist& bl) {
+    diritrace.encode(bl);
+    bl.append((char*)&dirv, sizeof(dirv));
+    ::_encode(dname, bl);
+    inodetrace.encode(bl);
+  }
+  void decode_payload(bufferlist& bl, int& off) {
+    diritrace.decode(bl,off);
+    bl.copy(off, sizeof(dirv), (char*)&dirv);
+    off += sizeof(dirv);
+    ::_decode(dname, bl, off);
+    inodetrace.decode(bl, off);
+  }
+
+  // defined out of line (not in this header)
+  bool can_expire(MDS *mds);
+  void retire(MDS *mds, Context *c);
+  bool has_happened(MDS *mds);
+  void replay(MDS *mds);
+};
+
+#endif
diff --git a/branches/sage/cephmds2/mds/journal.cc b/branches/sage/cephmds2/mds/journal.cc
new file mode 100644
index 0000000000000..9ac2406e2cbc2
--- /dev/null
+++ b/branches/sage/cephmds2/mds/journal.cc
@@ -0,0 +1,345 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.
See file COPYING. + * + */ + +#include "events/ETrace.h" +#include "events/EMknod.h" +#include "events/EMkdir.h" +#include "events/EInodeUpdate.h" +#include "events/EPurgeFinish.h" +#include "events/EUnlink.h" + +#include "MDS.h" +#include "MDCache.h" + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " +#define derr(l) if (l<=g_conf.debug || l <= g_conf.debug_mds) cout << g_clock.now() << " mds" << mds->get_nodeid() << ".journal " + + +// ----------------------- +// ETrace + +CInode *ETrace::restore_trace(MDS *mds) +{ + CInode *in = 0; + for (list::iterator p = trace.begin(); + p != trace.end(); + ++p) { + // the dir + CInode *diri = mds->mdcache->get_inode(p->dirino); + if (!diri) { + dout(10) << "ETrace.restore_trace adding dir " << p->dirino << endl; + diri = new CInode(mds->mdcache); + diri->inode.ino = p->dirino; + diri->inode.mode = INODE_MODE_DIR; + mds->mdcache->add_inode(diri); + + CDir *dir = diri->get_or_open_dir(mds); + + // root? import? 
+ if (p == trace.begin()) { + mds->mdcache->add_import(dir); + if (dir->ino() == 1) + mds->mdcache->set_root(diri); + } + } else { + dout(20) << "ETrace.restore_trace had dir " << p->dirino << endl; + diri->get_or_open_dir(mds); + } + assert(diri->dir); + dout(20) << "ETrace.restore_trace dir is " << *diri->dir << endl; + + // the inode + in = mds->mdcache->get_inode(p->inode.ino); + if (!in) { + dout(10) << "ETrace.restore_trace adding dn '" << p->dn << "' inode " << p->inode.ino << endl; + in = new CInode(mds->mdcache); + in->inode = p->inode; + mds->mdcache->add_inode(in); + + // the dentry + CDentry *dn = diri->dir->add_dentry( p->dn, in ); + assert(dn); // validate the new dentry before its first use, not after + dn->mark_dirty(); + } else { + dout(20) << "ETrace.restore_trace had dn '" << p->dn << "' inode " << p->inode.ino << endl; + in->inode = p->inode; + } + dout(20) << "ETrace.restore_trace in is " << *in << endl; + } + return in; +} + + +// ----------------------- +// EMkdir +// - trace goes to new dir's inode. + +bool EMkdir::can_expire(MDS *mds) +{ + // am i obsolete? 
+ CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); + if (!in) return true; + CDir *dir = in->dir; + if (!dir) return true; + CDir *pdir = in->get_parent_dir(); + assert(pdir); + + dout(10) << "EMkdir.can_expire in is " << *in << endl; + dout(10) << "EMkdir.can_expire inv is " << trace.back().inode.version << endl; + dout(10) << "EMkdir.can_expire dir is " << *dir << endl; + bool commitparent = in->get_last_committed_version() < trace.back().inode.version; + bool commitnew = dir->get_last_committed_version() == 0; + + if (commitparent || commitnew) return false; + return true; +} + +void EMkdir::retire(MDS *mds, Context *c) +{ + // commit parent dir AND my dir + CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); + assert(in); + CDir *dir = in->dir; + assert(dir); + CDir *pdir = in->get_parent_dir(); + assert(pdir); + + dout(10) << "EMkdir.retire in is " << *in << endl; + dout(10) << "EMkdir.retire inv is " << trace.back().inode.version << endl; + dout(10) << "EMkdir.retire dir is " << *dir << endl; + bool commitparent = in->get_last_committed_version() < trace.back().inode.version; + bool commitnew = dir->get_last_committed_version() == 0; + + if (commitparent && commitnew) { + // both + dout(10) << "EMkdir.retire committing parent+new dir " << *dir << endl; + C_Gather *gather = new C_Gather(c); + mds->mdstore->commit_dir(pdir, gather->new_sub()); + mds->mdstore->commit_dir(dir, gather->new_sub()); + } else if (commitparent) { + // just parent + dout(10) << "EMkdir.retire committing parent dir " << *pdir << endl; // log the dir that is actually committed below + mds->mdstore->commit_dir(pdir, c); + } else { + // just new dir + dout(10) << "EMkdir.retire committing new dir " << *dir << endl; + mds->mdstore->commit_dir(dir, c); + } +} + +bool EMkdir::has_happened(MDS *mds) +{ + return false; +} + +void EMkdir::replay(MDS *mds) +{ + dout(10) << "EMkdir.replay " << *this << endl; + CInode *in = trace.restore_trace(mds); + + // mark dir inode dirty + in->mark_dirty(); + + // mark parent dir 
dirty, and set version. + // this may end up being below water when dir is fetched from disk. + CDir *pdir = in->get_parent_dir(); + if (!pdir->is_dirty()) pdir->mark_dirty(); + pdir->set_version(trace.back().dirv); + + // mark new dir dirty + complete + CDir *dir = in->get_or_open_dir(mds); + dir->mark_dirty(); + dir->mark_complete(); +} + + + +// ----------------------- +// EMknod + +bool EMknod::can_expire(MDS *mds) +{ + // am i obsolete? + CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); + if (!in) return true; + + if (!in->is_auth()) return true; // not my inode anymore! + if (in->get_version() != trace.back().inode.version) + return true; // i'm obsolete! (another log entry follows) + + if (in->get_last_committed_version() >= trace.back().inode.version) + return true; + + return false; +} + +void EMknod::retire(MDS *mds, Context *c) +{ + // commit parent directory + CInode *diri = mds->mdcache->get_inode( trace.back().dirino ); + assert(diri); + CDir *dir = diri->dir; + assert(dir); + + dout(10) << "EMknod.retire committing parent dir " << *dir << endl; + mds->mdstore->commit_dir(dir, c); +} + +bool EMknod::has_happened(MDS *mds) +{ + return false; +} + +void EMknod::replay(MDS *mds) +{ + dout(10) << "EMknod.replay " << *this << endl; + CInode *in = trace.restore_trace(mds); + in->mark_dirty(); + + // mark parent dir dirty, and set version. + // this may end up being below water when dir is fetched from disk. + CDir *pdir = in->get_parent_dir(); + if (!pdir->is_dirty()) pdir->mark_dirty(); + pdir->set_version(trace.back().dirv); +} + + + +// ----------------------- +// EInodeUpdate + +bool EInodeUpdate::can_expire(MDS *mds) +{ + CInode *in = mds->mdcache->get_inode( trace.back().inode.ino ); + if (!in) return true; + + if (!in->is_auth()) return true; // not my inode anymore! + if (in->get_version() != trace.back().inode.version) + return true; // i'm obsolete! (another log entry follows) + + /* + // frozen -> exporting -> obsolete (FOR NOW?) 
+ if (in->is_frozen()) + return true; + */ + + if (in->get_last_committed_version() >= trace.back().inode.version) + return true; + + return false; +} + +void EInodeUpdate::retire(MDS *mds, Context *c) +{ + // commit parent directory + CInode *diri = mds->mdcache->get_inode( trace.back().dirino ); + assert(diri); + CDir *dir = diri->dir; + assert(dir); + + dout(10) << "EInodeUpdate.retire committing parent dir " << *dir << endl; // was mislabelled "EMknod.retire" (copy-paste) + mds->mdstore->commit_dir(dir, c); +} + +bool EInodeUpdate::has_happened(MDS *mds) +{ + return false; +} + +void EInodeUpdate::replay(MDS *mds) +{ + dout(10) << "EInodeUpdate.replay " << *this << endl; + CInode *in = trace.restore_trace(mds); + in->mark_dirty(); + + // mark parent dir dirty, and set version. + // this may end up being below water when dir is fetched from disk. + CDir *pdir = in->get_parent_dir(); + if (!pdir->is_dirty()) pdir->mark_dirty(); + pdir->set_version(trace.back().dirv); +} + + + +// ----------------------- +// EUnlink + +bool EUnlink::can_expire(MDS *mds) +{ + // dir + CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); + CDir *dir = 0; + if (diri) dir = diri->dir; + + if (dir && dir->get_last_committed_version() < dirv) return false; + + if (!inodetrace.trace.empty()) { + // inode + CInode *in = mds->mdcache->get_inode( inodetrace.back().inode.ino ); + if (in && in->get_last_committed_version() < inodetrace.back().inode.version) + return false; + } + + return true; +} + +void EUnlink::retire(MDS *mds, Context *c) +{ + CInode *diri = mds->mdcache->get_inode( diritrace.back().inode.ino ); + assert(diri); // get_inode() may return null (can_expire checks for it); fail loudly like the other retire() methods + CDir *dir = diri->dir; + assert(dir); + + // okay! 
+ dout(7) << "commiting dirty (from unlink) dir " << *dir << endl; + mds->mdstore->commit_dir(dir, dirv, c); +} + +bool EUnlink::has_happened(MDS *mds) +{ + return true; +} + +void EUnlink::replay(MDS *mds) +{ +} + + + + +// ----------------------- +// EPurgeFinish + + +bool EPurgeFinish::can_expire(MDS *mds) +{ + return true; +} + +void EPurgeFinish::retire(MDS *mds, Context *c) +{ +} + +bool EPurgeFinish::has_happened(MDS *mds) +{ + return true; +} + +void EPurgeFinish::replay(MDS *mds) +{ +} + + + + diff --git a/branches/sage/cephmds2/mds/mdstypes.h b/branches/sage/cephmds2/mds/mdstypes.h new file mode 100644 index 0000000000000..b448123bf929e --- /dev/null +++ b/branches/sage/cephmds2/mds/mdstypes.h @@ -0,0 +1,135 @@ +#ifndef __MDSTYPES_H +#define __MDSTYPES_H + + +#include +#include +using namespace std; + +#include "config.h" +#include "common/DecayCounter.h" + +#include + + +/* meta_load_t + * hierarchical load for an inode/dir and it's children + */ +#define META_POP_IRD 0 +#define META_POP_IWR 1 +#define META_POP_DWR 2 +//#define META_POP_LOG 3 +//#define META_POP_FDIR 4 +//#define META_POP_CDIR 4 +#define META_NPOP 3 + +class meta_load_t { + public: + DecayCounter pop[META_NPOP]; + + double meta_load() { + return pop[META_POP_IRD].get() + 2*pop[META_POP_IWR].get(); + } + + void take(meta_load_t& other) { + for (int i=0; i"; +} + + +inline meta_load_t& operator-=(meta_load_t& l, meta_load_t& r) +{ + for (int i=0; i"; +} + +/* +inline mds_load_t& operator+=( mds_load_t& l, mds_load_t& r ) +{ + l.root_pop += r.root_pop; + l.req_rate += r.req_rate; + l.queue_len += r.queue_len; + return l; +} + +inline mds_load_t operator/( mds_load_t& a, double d ) +{ + mds_load_t r; + r.root_pop = a.root_pop / d; + r.req_rate = a.req_rate / d; + r.queue_len = a.queue_len / d; + return r; +} +*/ + + +#endif diff --git a/branches/sage/cephmds2/mds/oldcachestuff.cc b/branches/sage/cephmds2/mds/oldcachestuff.cc new file mode 100644 index 0000000000000..31bb9eaa81e3d --- 
/dev/null +++ b/branches/sage/cephmds2/mds/oldcachestuff.cc @@ -0,0 +1,944 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +/* + + +OLD LOCK CRAP: + (old): + sync - soft metadata.. no reads/writes can proceed. (eg no stat) + lock - hard(+soft) metadata.. path traversals stop etc. (??) + + + replication consistency modes: + hard+soft - hard and soft are defined on all replicas. + all reads proceed (in absence of sync lock) + writes require sync lock, fw to auth + -> normal behavior. + + hard - hard only, soft is undefined + reads require a sync + writes proceed if field updates are monotonic (e.g. size, m/c/atime) + -> 'softasync' + + types of access by cache users: + + hard soft + R - read_hard_try path traversal + R <= R read_soft_start stat + R <= W write_soft_start touch + W => W write_hard_start chmod + + note on those implications: + read_soft_start() calls read_hard_try() + write_soft_start() calls read_hard_try() + a hard lock implies/subsumes a soft sync (read_soft_start() returns true if a + lock is held) + + + relationship with frozen directories: + + read_hard_try - can proceed, because any hard changes require a lock, which + requires an active authority, which implies things are unfrozen. + write_hard_start - waits (has to; only auth can initiate) + read_soft_start - ???? waits for now. (FIXME: if !softasync & !syncbyauth) + write_soft_start - ???? waits for now. (FIXME: if (softasync & !syncbyauth)) + + if sticky is on, an export_dir will drop any sync or lock so that the freeze will + proceed (otherwise, deadlock!). likewise, a sync will not stick if is_freezing(). + + + +NAMESPACE: + + none right now. 
+ + +*/ + + +/* soft sync locks: mtime, size, etc. + */ + +bool MDCache::read_soft_start(CInode *in, Message *m) +{ + // if (!read_hard_try(in, m)) + // return false; + + // if frozen: i can't proceed (for now, see above) + if (in->is_frozen()) { + dout(7) << "read_soft_start " << *in << " is frozen, waiting" << endl; + in->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryMessage(mds, m)); + return false; + } + + + dout(5) << "read_soft_start " << *in << endl; + + // what soft sync mode? + + if (in->is_softasync()) { + // softasync: hard consistency only + + if (in->is_auth()) { + // i am auth: i need sync + if (in->is_syncbyme()) goto yes; + if (in->is_lockbyme()) goto yes; // lock => sync + if (!in->is_cached_by_anyone() && + !in->is_open_write()) goto yes; // i'm alone + } else { + // i am replica: fw to auth + int auth = in->authority(); + dout(5) << "read_soft_start " << *in << " is softasync, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mds->messenger->send_message(m, + MSG_ADDR_MDS(auth), m->get_dest_port(), + MDS_PORT_CACHE); + return false; + } + } else { + // normal: soft+hard consistency + + if (in->is_syncbyauth()) { + // wait for sync + } else { + // i'm consistent + goto yes; + } + } + + // we need sync + if (in->is_syncbyauth() && !in->is_softasync()) { + dout(5) << "read_soft_start " << *in << " is normal+replica+syncbyauth" << endl; + } else if (in->is_softasync() && in->is_auth()) { + dout(5) << "read_soft_start " << *in << " is softasync+auth, waiting on sync" << endl; + } else + assert(2+2==5); + + if (!in->can_auth_pin()) { + dout(5) << "read_soft_start " << *in << " waiting to auth_pin" << endl; + in->add_waiter(CINODE_WAIT_AUTHPINNABLE, + new C_MDS_RetryMessage(mds,m)); + return false; + } + + if (in->is_auth()) { + // wait for sync + in->add_waiter(CINODE_WAIT_SYNC, + new C_MDS_RetryMessage(mds, m)); + + if (!in->is_presync()) + inode_sync_start(in); + } else { + // wait for unsync + in->add_waiter(CINODE_WAIT_UNSYNC, 
+ new C_MDS_RetryMessage(mds, m)); + + assert(in->is_syncbyauth()); + + if (!in->is_waitonunsync()) + inode_sync_wait(in); + } + + return false; + + yes: + mds->balancer->hit_inode(in, MDS_POP_SOFTRD); + mds->balancer->hit_inode(in, MDS_POP_ANY); + return true; +} + + +int MDCache::read_soft_finish(CInode *in) +{ + dout(5) << "read_soft_finish " << *in << endl; // " soft_sync_count " << in->soft_sync_count << endl; + return 0; // do nothing, actually.. +} + + +bool MDCache::write_soft_start(CInode *in, Message *m) +{ + // if (!read_hard_try(in, m)) + //return false; + + // if frozen: i can't proceed (for now, see above) + if (in->is_frozen()) { + dout(7) << "write_soft_start " << *in << " is frozen, waiting" << endl; // was logged as "read_soft_start" (copy-paste) + in->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryMessage(mds, m)); + return false; + } + + dout(5) << "write_soft_start " << *in << endl; + // what soft sync mode? + + if (in->is_softasync()) { + // softasync: hard consistency only + + if (in->is_syncbyauth()) { + // wait for sync release + } else { + // i'm inconsistent; write away! 
+ goto yes; + } + + } else { + // normal: soft+hard consistency + + if (in->is_auth()) { + // i am auth: i need sync + if (in->is_syncbyme()) goto yes; + if (in->is_lockbyme()) goto yes; // lock => sync + if (!in->is_cached_by_anyone() && + !in->is_open_write()) goto yes; // i'm alone + } else { + // i am replica: fw to auth + int auth = in->authority(); + dout(5) << "write_soft_start " << *in << " is !softasync, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mds->messenger->send_message(m, + MSG_ADDR_MDS(auth), m->get_dest_port(), + MDS_PORT_CACHE); + return false; + } + } + + // we need sync + if (in->is_syncbyauth() && in->is_softasync() && !in->is_auth()) { + dout(5) << "write_soft_start " << *in << " is softasync+replica+syncbyauth" << endl; + } else if (!in->is_softasync() && in->is_auth()) { + dout(5) << "write_soft_start " << *in << " is normal+auth, waiting on sync" << endl; + } else + assert(2+2==5); + + if (!in->can_auth_pin()) { + dout(5) << "write_soft_start " << *in << " waiting to auth_pin" << endl; + in->add_waiter(CINODE_WAIT_AUTHPINNABLE, + new C_MDS_RetryMessage(mds,m)); + return false; + } + + if (in->is_auth()) { + // wait for sync + in->add_waiter(CINODE_WAIT_SYNC, + new C_MDS_RetryMessage(mds, m)); + + if (!in->is_presync()) + inode_sync_start(in); + } else { + // wait for unsync + in->add_waiter(CINODE_WAIT_UNSYNC, + new C_MDS_RetryMessage(mds, m)); + + assert(in->is_syncbyauth()); + assert(in->is_softasync()); + + if (!in->is_waitonunsync()) + inode_sync_wait(in); + } + + return false; + + yes: + mds->balancer->hit_inode(in, MDS_POP_SOFTWR); + mds->balancer->hit_inode(in, MDS_POP_ANY); + return true; +} + + +int MDCache::write_soft_finish(CInode *in) +{ + dout(5) << "write_soft_finish " << *in << endl; //" soft_sync_count " << in->soft_sync_count << endl; + return 0; // do nothing, actually.. 
+} + + + + + + + + +/* hard locks: owner, mode + */ + +/* +bool MDCache::read_hard_try(CInode *in, + Message *m) +{ + //dout(5) << "read_hard_try " << *in << endl; + + if (in->is_auth()) { + // auth + goto yes; // fine + } else { + // replica + if (in->is_lockbyauth()) { + // locked by auth; wait! + dout(7) << "read_hard_try waiting on " << *in << endl; + in->add_waiter(CINODE_WAIT_UNLOCK, new C_MDS_RetryMessage(mds, m)); + if (!in->is_waitonunlock()) + inode_lock_wait(in); + return false; + } else { + // not locked. + goto yes; + } + } + + yes: + mds->balancer->hit_inode(in, MDS_POP_HARDRD); + mds->balancer->hit_inode(in, MDS_POP_ANY); + return true; +} + + +bool MDCache::write_hard_start(CInode *in, + Message *m) +{ + // if frozen: i can't proceed; only auth can initiate lock + if (in->is_frozen()) { + dout(7) << "write_hard_start " << *in << " is frozen, waiting" << endl; + in->add_waiter(CDIR_WAIT_UNFREEZE, + new C_MDS_RetryMessage(mds, m)); + return false; + } + + // NOTE: if freezing, and locked, we must proceed, to avoid deadlock (where + // the freeze is waiting for our lock to be released) + + + if (in->is_auth()) { + // auth + if (in->is_lockbyme()) goto success; + if (!in->is_cached_by_anyone()) goto success; + + // need lock + if (!in->can_auth_pin()) { + dout(5) << "write_hard_start " << *in << " waiting to auth_pin" << endl; + in->add_waiter(CINODE_WAIT_AUTHPINNABLE, new C_MDS_RetryMessage(mds, m)); + return false; + } + + in->add_waiter(CINODE_WAIT_LOCK, new C_MDS_RetryMessage(mds, m)); + + if (!in->is_prelock()) + inode_lock_start(in); + + return false; + } else { + // replica + // fw to auth + int auth = in->authority(); + dout(5) << "write_hard_start " << *in << " on replica, fw to auth " << auth << endl; + assert(auth != mds->get_nodeid()); + mds->messenger->send_message(m, + MSG_ADDR_MDS(auth), m->get_dest_port(), + MDS_PORT_CACHE); + return false; + } + + success: + in->lock_active_count++; + dout(5) << "write_hard_start " << *in << " count now 
" << in->lock_active_count << endl; + assert(in->lock_active_count > 0); + + mds->balancer->hit_inode(in, MDS_POP_HARDWR); + mds->balancer->hit_inode(in, MDS_POP_ANY); + return true; +} + +void MDCache::write_hard_finish(CInode *in) +{ + in->lock_active_count--; + dout(5) << "write_hard_finish " << *in << " count now " << in->lock_active_count << endl; + assert(in->lock_active_count >= 0); + + // release lock? + if (in->lock_active_count == 0 && + in->is_lockbyme() && + !g_conf.mdcache_sticky_lock) { + dout(7) << "write_hard_finish " << *in << " !sticky, releasing lock immediately" << endl; + inode_lock_release(in); + } +} + + +void MDCache::inode_lock_start(CInode *in) +{ + dout(5) << "lock_start on " << *in << ", waiting for " << in->cached_by << endl; + + assert(in->is_auth()); + assert(!in->is_prelock()); + assert(!in->is_lockbyme()); + assert(!in->is_lockbyauth()); + + in->lock_waiting_for_ack = in->cached_by; + in->dist_state |= CINODE_DIST_PRELOCK; + in->get(CINODE_PIN_PRELOCK); + in->auth_pin(); + + // send messages + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + mds->messenger->send_message(new MInodeLockStart(in->inode.ino, mds->get_nodeid()), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } +} + + +void MDCache::inode_lock_release(CInode *in) +{ + dout(5) << "lock_release on " << *in << ", messages to " << in->get_cached_by() << endl; + + assert(in->is_lockbyme()); + assert(in->is_auth()); + + in->dist_state &= ~CINODE_DIST_LOCKBYME; + + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + mds->messenger->send_message(new MInodeLockRelease(in), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + in->auth_unpin(); +} + +void MDCache::inode_lock_wait(CInode *in) +{ + dout(5) << "lock_wait on " << *in << endl; + assert(!in->is_auth()); + assert(in->is_lockbyauth()); + + in->dist_state |= CINODE_DIST_WAITONUNLOCK; + in->get(CINODE_PIN_WAITONUNLOCK); +} + + 
+void MDCache::handle_inode_lock_start(MInodeLockStart *m) +{ + // authority is requesting a lock + CInode *in = get_inode(m->get_ino()); + if (!in) { + // don't have it anymore! + dout(7) << "handle_lock_start " << m->get_ino() << ": don't have it anymore, nak" << endl; + mds->messenger->send_message(new MInodeLockAck(m->get_ino(), false), + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + delete m; // done + return; + } + + // we shouldn't be authoritative... + assert(!in->is_auth()); + + dout(7) << "handle_lock_start " << *in << ", sending ack" << endl; + + // lock it + in->dist_state |= CINODE_DIST_LOCKBYAUTH; + + // sanity check: make sure we know who _is_ authoritative! + assert(m->get_asker() == in->authority()); + + // send ack + mds->messenger->send_message(new MInodeLockAck(in->ino()), + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + + delete m; // done +} + + +void MDCache::handle_inode_lock_ack(MInodeLockAck *m) +{ + CInode *in = get_inode(m->get_ino()); + int from = m->get_source(); + dout(7) << "handle_lock_ack from " << from << " on " << *in << endl; + + assert(in); + assert(in->is_auth()); + assert(in->dist_state & CINODE_DIST_PRELOCK); + + // remove it from waiting list + in->lock_waiting_for_ack.erase(from); + + if (!m->did_have()) { + // erase from cached_by too! + in->cached_by_remove(from); + } + + if (in->lock_waiting_for_ack.size()) { + + // more coming + dout(7) << "handle_lock_ack " << *in << " from " << from << ", still waiting for " << in->lock_waiting_for_ack << endl; + + } else { + + // yay! + dout(7) << "handle_lock_ack " << *in << " from " << from << ", last one" << endl; + + in->dist_state &= ~CINODE_DIST_PRELOCK; + in->dist_state |= CINODE_DIST_LOCKBYME; + in->put(CINODE_PIN_PRELOCK); + + // do waiters! 
+ in->finish_waiting(CINODE_WAIT_LOCK); + } + + delete m; // done +} + + +void MDCache::handle_inode_lock_release(MInodeLockRelease *m) +{ + CInode *in = get_inode(m->get_ino()); + + if (!in) { + dout(7) << "handle_lock_release " << m->get_ino() << ", don't have it, dropping" << endl; + delete m; // done + return; + } + + if (!in->is_lockbyauth()) { + dout(7) << "handle_lock_release " << m->get_ino() << ", not flagged as locked, wtf" << endl; + assert(0); // i should have it, locked, or not have it at all! + delete m; // done + return; + } + + dout(7) << "handle_lock_release " << *in << endl; + assert(!in->is_auth()); + + // release state + in->dist_state &= ~CINODE_DIST_LOCKBYAUTH; + + // waiters? + if (in->is_waitonunlock()) { + in->put(CINODE_PIN_WAITONUNLOCK); + in->dist_state &= ~CINODE_DIST_WAITONUNLOCK; + + // finish + in->finish_waiting(CINODE_WAIT_UNLOCK); + } + + // done + delete m; +} +*/ + + + + + + + + + +// sync interface + +void MDCache::inode_sync_wait(CInode *in) +{ + assert(!in->is_auth()); + + int auth = in->authority(); + dout(5) << "inode_sync_wait on " << *in << ", auth " << auth << endl; + + assert(in->is_syncbyauth()); + assert(!in->is_waitonunsync()); + + in->dist_state |= CINODE_DIST_WAITONUNSYNC; + in->get(CINODE_PIN_WAITONUNSYNC); + + if ((in->is_softasync() && g_conf.mdcache_sticky_sync_softasync) || + (!in->is_softasync() && g_conf.mdcache_sticky_sync_normal)) { + // actually recall; if !sticky, auth will immediately release. 
+ dout(5) << "inode_sync_wait on " << *in << " sticky, recalling from auth" << endl; + mds->messenger->send_message(new MInodeSyncRecall(in->inode.ino), + MSG_ADDR_MDS(auth), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } +} + + +void MDCache::inode_sync_start(CInode *in) +{ + // wait for all replicas + dout(5) << "inode_sync_start on " << *in << ", waiting for " << in->cached_by << " " << in->get_open_write()<< endl; + + assert(in->is_auth()); + assert(!in->is_presync()); + assert(!in->is_sync()); + + in->sync_waiting_for_ack.clear(); + in->dist_state |= CINODE_DIST_PRESYNC; + in->get(CINODE_PIN_PRESYNC); + in->auth_pin(); + + in->sync_replicawantback = false; + + // send messages + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + in->sync_waiting_for_ack.insert(MSG_ADDR_MDS(*it)); + mds->messenger->send_message(new MInodeSyncStart(in->inode.ino, mds->get_nodeid()), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // sync clients + int last = -1; + for (multiset::iterator it = in->get_open_write().begin(); + it != in->get_open_write().end(); + it++) { + if (*it == last) continue; last = *it; // only 1 per client (even if open multiple times) + in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it)); + mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()), + MSG_ADDR_CLIENT(*it), 0, + MDS_PORT_CACHE); + } + +} + +void MDCache::inode_sync_release(CInode *in) +{ + dout(5) << "inode_sync_release on " << *in << ", messages to " << in->get_cached_by() << " " << in->get_open_write() << endl; + + assert(in->is_syncbyme()); + assert(in->is_auth()); + + in->dist_state &= ~CINODE_DIST_SYNCBYME; + + // release replicas + for (set::iterator it = in->cached_by_begin(); + it != in->cached_by_end(); + it++) { + mds->messenger->send_message(new MInodeSyncRelease(in), + MSG_ADDR_MDS(*it), MDS_PORT_CACHE, + MDS_PORT_CACHE); + } + + // release writers + for (multiset::iterator it = in->get_open_write().begin(); + it 
!= in->get_open_write().end(); + it++) { + mds->messenger->send_message(new MInodeSyncRelease(in), + MSG_ADDR_CLIENT(*it), 0, + MDS_PORT_CACHE); + } + + in->auth_unpin(); +} + + + + +// messages +void MDCache::handle_inode_sync_start(MInodeSyncStart *m) +{ + // assume asker == authority for now. + + // authority is requesting a lock + CInode *in = get_inode(m->get_ino()); + if (!in) { + // don't have it anymore! + dout(7) << "handle_sync_start " << m->get_ino() << ": don't have it anymore, nak" << endl; + mds->messenger->send_message(new MInodeSyncAck(m->get_ino(), false), + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + delete m; // done + return; + } + + dout(10) << "handle_sync_start " << *in << endl; + + // we shouldn't be authoritative... + assert(!in->is_auth()); + + // sanity check: make sure we know who _is_ authoritative! + assert(m->get_asker() == in->authority()); + + // lock it + in->dist_state |= CINODE_DIST_SYNCBYAUTH; + + // open for write by clients? + if (in->is_open_write()) { + dout(7) << "handle_sync_start " << *in << " syncing write clients " << in->get_open_write() << endl; + + // sync clients + in->sync_waiting_for_ack.clear(); + for (multiset::iterator it = in->get_open_write().begin(); + it != in->get_open_write().end(); + it++) { + in->sync_waiting_for_ack.insert(MSG_ADDR_CLIENT(*it)); + mds->messenger->send_message(new MInodeSyncStart(in->ino(), mds->get_nodeid()), + MSG_ADDR_CLIENT(*it), 0, + MDS_PORT_CACHE); + } + + in->pending_sync_request = m; + } else { + // no writers, ack. 
+ dout(7) << "handle_sync_start " << *in << ", sending ack" << endl; + + inode_sync_ack(in, m); + } +} + +void MDCache::inode_sync_ack(CInode *in, MInodeSyncStart *m, bool wantback) +{ + dout(7) << "sending inode_sync_ack " << *in << endl; + + // send ack + mds->messenger->send_message(new MInodeSyncAck(in->ino(), true, wantback), + MSG_ADDR_MDS(m->get_asker()), MDS_PORT_CACHE, + MDS_PORT_CACHE); + + delete m; +} + +void MDCache::handle_inode_sync_ack(MInodeSyncAck *m) +{ + CInode *in = get_inode(m->get_ino()); + assert(in); + + dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << endl; + + if (in->is_auth()) { + assert(in->is_presync()); + } else { + assert(in->is_syncbyauth()); + assert(in->pending_sync_request); + } + + // remove it from waiting list + in->sync_waiting_for_ack.erase(m->get_source()); + + if (MSG_ADDR_ISCLIENT(m->get_source()) && !m->did_have()) { + // erase from cached_by too! + in->cached_by_remove(m->get_source()); + } + + if (m->replica_wantsback()) + in->sync_replicawantback = true; + + if (in->sync_waiting_for_ack.size()) { + + // more coming + dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", still waiting for " << in->sync_waiting_for_ack << endl; + + } else { + + // yay! + dout(7) << "handle_sync_ack " << *in << " from " << m->get_source() << ", last one" << endl; + + if (!in->is_auth()) { + // replica, sync ack back to auth + assert(in->pending_sync_request); + inode_sync_ack(in, in->pending_sync_request, true); + in->pending_sync_request = 0; + delete m; + return; + } + + in->dist_state &= ~CINODE_DIST_PRESYNC; + in->dist_state |= CINODE_DIST_SYNCBYME; + in->put(CINODE_PIN_PRESYNC); + + // do waiters! + in->finish_waiting(CINODE_WAIT_SYNC); + + + // release sync right away? 
+ if (in->is_syncbyme()) { + if (in->is_freezing()) { + dout(7) << "handle_sync_ack freezing " << *in << ", dropping sync immediately" << endl; + inode_sync_release(in); + } + else if (in->sync_replicawantback) { + dout(7) << "handle_sync_ack replica wantback, releasing sync immediately" << endl; + inode_sync_release(in); + } + else if ((in->is_softasync() && !g_conf.mdcache_sticky_sync_softasync) || + (!in->is_softasync() && !g_conf.mdcache_sticky_sync_normal)) { + dout(7) << "handle_sync_ack !sticky, releasing sync immediately" << endl; + inode_sync_release(in); + } + else { + dout(7) << "handle_sync_ack sticky sync is on, keeping sync for now" << endl; + } + } else { + dout(7) << "handle_sync_ack don't have sync anymore, something must have just released it?" << endl; + } + } + + delete m; // done +} + + +void MDCache::handle_inode_sync_release(MInodeSyncRelease *m) +{ + CInode *in = get_inode(m->get_ino()); + + if (!in) { + dout(7) << "handle_sync_release " << m->get_ino() << ", don't have it, dropping" << endl; + delete m; // done + return; + } + + if (!in->is_syncbyauth()) { + dout(7) << "handle_sync_release " << *in << ", not flagged as sync" << endl; + assert(0); // this shouldn't happen. + delete m; // done + return; + } + + dout(7) << "handle_sync_release " << *in << endl; + assert(!in->is_auth()); + + // release state + in->dist_state &= ~CINODE_DIST_SYNCBYAUTH; + + // waiters? + if (in->is_waitonunsync()) { + in->put(CINODE_PIN_WAITONUNSYNC); + in->dist_state &= ~CINODE_DIST_WAITONUNSYNC; + + // finish + in->finish_waiting(CINODE_WAIT_UNSYNC); + } + + // client readers? 
+ if (in->is_open_write()) { + dout(7) << "handle_sync_release releasing clients " << in->get_open_write() << endl; + for (multiset::iterator it = in->get_open_write().begin(); + it != in->get_open_write().end(); + it++) { + mds->messenger->send_message(new MInodeSyncRelease(in), + MSG_ADDR_CLIENT(*it), 0, + MDS_PORT_CACHE); + } + } + + + // done + delete m; +} + + +void MDCache::handle_inode_sync_recall(MInodeSyncRecall *m) +{ + CInode *in = get_inode(m->get_ino()); + + if (!in) { + dout(7) << "handle_sync_recall " << m->get_ino() << ", don't have it, wtf" << endl; + assert(0); // shouldn't happen + delete m; // done + return; + } + if(!in->is_auth()) { + do_ino_proxy(in, m); + return; + } + + if (in->is_syncbyme()) { + dout(7) << "handle_sync_recall " << *in << ", releasing" << endl; + inode_sync_release(in); + } + else if (in->is_presync()) { + dout(7) << "handle_sync_recall " << *in << " is presync, flagging" << endl; + in->sync_replicawantback = true; + } + else { + dout(7) << "handle_sync_recall " << m->get_ino() << ", not flagged as sync or presync, dropping" << endl; + } + + // done + delete m; +} + + + + + + + + + + +// DIR SYNC + +/* + + dir sync + + - this are used when a directory is HASHED only. namely, + - to stat the dir inode we need an accurate directory size (????) 
+   - for a readdir
+
+*/
+
+/*
+ * dir_sync_start
+ *
+ * Begin a sync on a hashed directory we are auth for.  The directory must not
+ * already be presync or sync (asserted).  Records the cluster's mds set in
+ * dir->sync_waiting_for_ack, sets CDIR_STATE_PRESYNC, takes an auth pin and
+ * sends an MDirSyncStart to every mds in that set.
+ */
+void MDCache::dir_sync_start(CDir *dir)
+{
+  // wait for all replicas
+  dout(5) << "sync_start on " << *dir << endl;
+
+  assert(dir->is_hashed());
+  assert(dir->is_auth());
+  assert(!dir->is_presync());
+  assert(!dir->is_sync());
+
+  dir->sync_waiting_for_ack = mds->get_cluster()->get_mds_set();
+  dir->state_set(CDIR_STATE_PRESYNC);
+  dir->auth_pin();
+
+  //dir->sync_replicawantback = false;
+
+  // send messages
+  // NOTE(review): the iterator's template argument appears to have been
+  // stripped from this copy ("set::iterator") -- restore before compiling.
+  for (set::iterator it = dir->sync_waiting_for_ack.begin();
+       it != dir->sync_waiting_for_ack.end();
+       it++) {
+    mds->messenger->send_message(new MDirSyncStart(dir->ino(), mds->get_nodeid()),
+                                 MSG_ADDR_MDS(*it), MDS_PORT_CACHE,
+                                 MDS_PORT_CACHE);
+  }
+}
+
+
+// not implemented yet: empty body
+void MDCache::dir_sync_release(CDir *dir)
+{
+
+
+}
+
+// not implemented yet: empty body
+void MDCache::dir_sync_wait(CDir *dir)
+{
+
+}
+
+
+// not implemented yet: empty body.
+// NOTE(review): unlike the functions above this is defined without the
+// MDCache:: qualifier, and m is not deleted here -- confirm both are intended.
+void handle_dir_sync_start(MDirSyncStart *m)
+{
+}
+
+
+
+
diff --git a/branches/sage/cephmds2/messages/MAnchorReply.h b/branches/sage/cephmds2/messages/MAnchorReply.h
new file mode 100644
index 0000000000000..0186118f53260
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MAnchorReply.h
@@ -0,0 +1,74 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MANCHORREPLY_H
+#define __MANCHORREPLY_H
+
+// NOTE(review): this copy appears to have lost all text between '<' and '>'
+// (the include target below, template arguments, and parts of the destructor
+// and of the encode/decode loops).  Restore from the original file before use.
+#include
+
+#include "msg/Message.h"
+#include "mds/AnchorTable.h"
+
+#include "MAnchorRequest.h"
+
+
+/*
+ * MAnchorReply - reply to an MAnchorRequest.
+ *
+ * Carries the op and ino copied from the request, plus a trace vector.
+ * Payload layout as written by encode_payload(): op, ino, an int element
+ * count, then one encoded entry per trace element.
+ */
+class MAnchorReply : public Message {
+  int op;
+  inodeno_t ino;
+  vector trace;
+
+ public:
+  // default constructor leaves op and ino unset; decode_payload() fills them
+  MAnchorReply() {}
+  MAnchorReply(MAnchorRequest *req) : Message(MSG_MDS_ANCHORREPLY) {
+    this->op = req->get_op();
+    this->ino = req->get_ino();
+  }
+  ~MAnchorReply() {
+    // NOTE(review): the line below is truncated in this copy; the end of the
+    // destructor and the start of set_trace() are missing.
+    for (unsigned i=0; i& trace) { this->trace = trace; }
+
+  int get_op() { return op; }
+  inodeno_t get_ino() { return ino; }
+  vector& get_trace() { return trace; }
+
+  // read op, ino and the element count from the start of payload, then the entries
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(op), (char*)&op);
+    off += sizeof(op);
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    int n;
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    // NOTE(review): loop header and element allocation are truncated in this copy
+    for (int i=0; i_decode(payload, off);
+      trace.push_back(a);
+    }
+  }
+
+  // append op, ino, the element count and then each entry to payload
+  virtual void encode_payload() {
+    payload.append((char*)&op, sizeof(op));
+    payload.append((char*)&ino, sizeof(ino));
+    int n = trace.size();
+    payload.append((char*)&n, sizeof(int));
+    // NOTE(review): loop header is truncated in this copy
+    for (int i=0; i_encode(payload);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MAnchorRequest.h b/branches/sage/cephmds2/messages/MAnchorRequest.h
new file mode 100644
index 0000000000000..2a2d0088978b4
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MAnchorRequest.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MANCHORREQUEST_H
+#define __MANCHORREQUEST_H
+
+// NOTE(review): this copy appears to have lost all text between '<' and '>'
+// (the include target below, template arguments, and parts of the destructor
+// and of the encode/decode loops).  Restore from the original file before use.
+#include
+
+#include "msg/Message.h"
+#include "mds/AnchorTable.h"
+
+// operation codes carried in MAnchorRequest::op
+#define ANCHOR_OP_CREATE 1
+#define ANCHOR_OP_DESTROY 2
+#define ANCHOR_OP_LOOKUP 3
+#define ANCHOR_OP_UPDATE 4
+
+/*
+ * MAnchorRequest - anchor table request.
+ *
+ * Carries an ANCHOR_OP_* code, the target ino and a trace vector.
+ * Payload layout as written by encode_payload(): op, ino, an int element
+ * count, then one encoded entry per trace element.
+ */
+class MAnchorRequest : public Message {
+  int op;
+  inodeno_t ino;
+  vector trace;
+
+ public:
+  // default constructor leaves op and ino unset; decode_payload() fills them
+  MAnchorRequest() {}
+  MAnchorRequest(int op, inodeno_t ino) : Message(MSG_MDS_ANCHORREQUEST) {
+    this->op = op;
+    this->ino = ino;
+  }
+  ~MAnchorRequest() {
+    // NOTE(review): the line below is truncated in this copy; the end of the
+    // destructor and the start of set_trace() are missing.
+    for (unsigned i=0; i& trace) { this->trace = trace; }
+
+  int get_op() { return op; }
+  inodeno_t get_ino() { return ino; }
+  vector& get_trace() { return trace; }
+
+  // read op, ino and the element count from the start of payload, then the entries
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(op), (char*)&op);
+    off += sizeof(op);
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    int n;
+    payload.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    // NOTE(review): loop header and element allocation are truncated in this copy
+    for (int i=0; i_decode(payload, off);
+      trace.push_back(a);
+    }
+  }
+
+  // append op, ino, the element count and then each entry to payload
+  virtual void encode_payload() {
+    payload.append((char*)&op, sizeof(op));
+    payload.append((char*)&ino, sizeof(ino));
+    int n = trace.size();
+    payload.append((char*)&n, sizeof(int));
+    // NOTE(review): loop header is truncated in this copy
+    for (int i=0; i_encode(payload);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MCacheExpire.h b/branches/sage/cephmds2/messages/MCacheExpire.h
new file mode 100644
index 0000000000000..11d941f5131d1
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MCacheExpire.h
@@ -0,0 +1,95 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCACHEEXPIRE_H
+#define __MCACHEEXPIRE_H
+
+
+/*
+ * MCacheExpire - a batch of (ino, nonce) pairs for inodes and for dirs,
+ * tagged with the sender id `from`.
+ *
+ * Payload layout as written by encode_payload(): from, then the inode map,
+ * then the dir map; each map is an int count followed by (ino, nonce) pairs.
+ *
+ * NOTE(review): this copy appears to have lost all text between '<' and '>'
+ * (map/pair template arguments and most of the two decode loops).  Restore
+ * from the original file before use.
+ */
+class MCacheExpire : public Message {
+  map inodes;
+  map dirs;
+  int from;
+
+ public:
+  map& get_inodes() { return inodes; }
+  map& get_dirs() { return dirs; }
+  int get_from() { return from; }
+
+  // default constructor leaves `from` unset; decode_payload() fills it
+  MCacheExpire() {}
+  MCacheExpire(int from) : Message(MSG_MDS_CACHEEXPIRE) {
+    this->from = from;
+  }
+  virtual char *get_type_name() { return "CEx";}
+
+  void add_inode(inodeno_t ino, int nonce) {
+    inodes.insert(pair(ino,nonce));
+  }
+  void add_dir(inodeno_t ino, int nonce) {
+    dirs.insert(pair(ino,nonce));
+  }
+
+  // read `from`, then the inode pairs, then the dir pairs, advancing off
+  virtual void decode_payload(crope& s, int& off) {
+    int n;
+
+    s.copy(off, sizeof(from), (char*)&from);
+    off += sizeof(from);
+
+    // inodes
+    s.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    // NOTE(review): loop body is truncated in this copy
+    for (int i=0; i(ino,nonce));
+    }
+
+    // dirs
+    s.copy(off, sizeof(int), (char*)&n);
+    off += sizeof(int);
+    // NOTE(review): loop body is truncated in this copy
+    for (int i=0; i(ino,nonce));
+    }
+  }
+
+  // append one map to s: an int count, then ino and nonce for each entry
+  void rope_map(crope& s, map& mp) {
+    int n = mp.size();
+    s.append((char*)&n, sizeof(int));
+    for (map::iterator it = mp.begin();
+         it != mp.end();
+         it++) {
+      inodeno_t ino = it->first;
+      int nonce = it->second;
+      s.append((char*)&ino, sizeof(ino));
+      s.append((char*)&nonce, sizeof(nonce));
+    }
+  }
+
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&from, sizeof(from));
+    rope_map(s, inodes);
+    rope_map(s, dirs);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MClientFileCaps.h b/branches/sage/cephmds2/messages/MClientFileCaps.h
new file mode 100644
index 0000000000000..7fde047b02655
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MClientFileCaps.h
@@ -0,0 +1,102 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * 
Foundation. See file COPYING. + * + */ + + +#ifndef __MCLIENTFILECAPS_H +#define __MCLIENTFILECAPS_H + +#define CLIENT_FILECAP_RELEASE 1 // mds closed the cap +#define CLIENT_FILECAP_STALE 2 // mds has exported the cap +#define CLIENT_FILECAP_REAP 3 // mds has imported the cap from get_mds() + +class MClientFileCaps : public Message { + public: + static const int FILECAP_RELEASE = 1; + static const int FILECAP_STALE = 2; + static const int FILECAP_REAP = 3; + + + private: + inode_t inode; + int caps; + long seq; + int wanted; + //int client; + + int special; // stale || reap; in conjunction w/ mds value + int mds; + + public: + inodeno_t get_ino() { return inode.ino; } + inode_t& get_inode() { return inode; } + int get_caps() { return caps; } + int get_wanted() { return wanted; } + long get_seq() { return seq; } + //int get_client() { return client; } + + // for cap migration + int get_mds() { return mds; } + int get_special() { return special; } + + //void set_client(int c) { client = c; } + void set_caps(int c) { caps = c; } + void set_wanted(int w) { wanted = w; } + + void set_mds(int m) { mds = m; } + void set_special(int s) { special = s; } + + MClientFileCaps() {} + MClientFileCaps(inode_t& inode, + long seq, + int caps, + int wanted, + int special=0, + int mds=0) : + Message(MSG_CLIENT_FILECAPS) { + this->inode = inode; + this->seq = seq; + this->caps = caps; + this->wanted = wanted; + this->special = special; + this->mds = mds; + } + virtual char *get_type_name() { return "Cfcap";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(seq), (char*)&seq); + off += sizeof(seq); + s.copy(off, sizeof(inode), (char*)&inode); + off += sizeof(inode); + s.copy(off, sizeof(caps), (char*)&caps); + off += sizeof(caps); + s.copy(off, sizeof(wanted), (char*)&wanted); + off += sizeof(wanted); + //s.copy(off, sizeof(client), (char*)&client); + //off += sizeof(client); + s.copy(off, sizeof(mds), (char*)&mds); + off += sizeof(mds); + s.copy(off, 
sizeof(special), (char*)&special); + off += sizeof(special); + } + virtual void encode_payload(crope& s) { + s.append((char*)&seq, sizeof(seq)); + s.append((char*)&inode, sizeof(inode)); + s.append((char*)&caps, sizeof(caps)); + s.append((char*)&wanted, sizeof(wanted)); + //s.append((char*)&client, sizeof(client)); + s.append((char*)&mds,sizeof(mds)); + s.append((char*)&special,sizeof(special)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MClientInodeAuthUpdate.h b/branches/sage/cephmds2/messages/MClientInodeAuthUpdate.h new file mode 100644 index 0000000000000..e9083f6abc575 --- /dev/null +++ b/branches/sage/cephmds2/messages/MClientInodeAuthUpdate.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MCLIENTINODEAUTHUPDATE_H +#define __MCLIENTINODEAUTHUPDATE_H + +class MClientInodeAuthUpdate : public Message { + inodeno_t ino; + int newauth; + + public: + inodeno_t get_ino() { return ino; } + int get_auth() { return newauth; } + + MClientInodeAuthUpdate() {} + MClientInodeAuthUpdate(inodeno_t ino, int newauth) : + Message(MSG_CLIENT_INODEAUTHUPDATE) { + this->ino = ino; + this->newauth = newauth; + } + virtual char *get_type_name() { return "Ciau";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + s.copy(off, sizeof(newauth), (char*)&newauth); + off += sizeof(newauth); + } + virtual void encode_payload(crope& s) { + s.append((char*)&ino,sizeof(ino)); + s.append((char*)&newauth,sizeof(newauth)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MClientMount.h b/branches/sage/cephmds2/messages/MClientMount.h new file mode 100644 index 0000000000000..fd253baed0f24 --- /dev/null +++ b/branches/sage/cephmds2/messages/MClientMount.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MCLIENTMOUNT_H +#define __MCLIENTMOUNT_H + +#include "msg/Message.h" + +class MClientMount : public Message { + long pcid; + int mkfs; + + public: + MClientMount() : Message(MSG_CLIENT_MOUNT) { + pcid = 0; + mkfs = 0; + } + + void set_mkfs(int m) { mkfs = m; } + int get_mkfs() { return mkfs; } + + void set_pcid(long pcid) { this->pcid = pcid; } + long get_pcid() { return pcid; } + + char *get_type_name() { return "Cmnt"; } + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(pcid), (char*)&pcid); + off += sizeof(pcid); + s.copy(off, sizeof(mkfs), (char*)&mkfs); + off += sizeof(mkfs); + } + virtual void encode_payload(crope& s) { + s.append((char*)&pcid, sizeof(pcid)); + s.append((char*)&mkfs, sizeof(mkfs)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MClientMountAck.h b/branches/sage/cephmds2/messages/MClientMountAck.h new file mode 100644 index 0000000000000..6b1b7cb2a901b --- /dev/null +++ b/branches/sage/cephmds2/messages/MClientMountAck.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MCLIENTMOUNTACK_H +#define __MCLIENTMOUNTACK_H + +#include "msg/Message.h" +#include "MClientMount.h" +#include "mds/MDSMap.h" +#include "osd/OSDMap.h" + + +class MClientMountAck : public Message { + long pcid; + bufferlist osd_map_state; + bufferlist mds_map_state; + + public: + MClientMountAck() {} + MClientMountAck(MClientMount *mnt, MDSMap *mdsmap, OSDMap *osdmap) : Message(MSG_CLIENT_MOUNTACK) { + this->pcid = mnt->get_pcid(); + mdsmap->encode( mds_map_state ); + osdmap->encode( osd_map_state ); + } + + bufferlist& get_mds_map_state() { return mds_map_state; } + bufferlist& get_osd_map_state() { return osd_map_state; } + + void set_pcid(long pcid) { this->pcid = pcid; } + long get_pcid() { return pcid; } + + char *get_type_name() { return "CmntA"; } + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(pcid), (char*)&pcid); + off += sizeof(pcid); + ::_decode( mds_map_state, payload, off); + ::_decode( osd_map_state, payload, off); + } + virtual void encode_payload() { + payload.append((char*)&pcid, sizeof(pcid)); + ::_encode( mds_map_state, payload ); + ::_encode( osd_map_state, payload ); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MClientReply.h b/branches/sage/cephmds2/messages/MClientReply.h new file mode 100644 index 0000000000000..6206b909b0c05 --- /dev/null +++ b/branches/sage/cephmds2/messages/MClientReply.h @@ -0,0 +1,302 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+
+#ifndef __MCLIENTREPLY_H
+#define __MCLIENTREPLY_H
+
+#include "include/types.h"
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "mds/CDir.h"
+#include "mds/CDentry.h"
+
+// NOTE(review): this copy appears to have lost all text between '<' and '>'
+// (the include target below, container template arguments, and parts of the
+// decode loops).  Restore from the original file before use.
+#include
+using namespace std;
+
+class CInode;
+
+/***
+ *
+ * MClientReply - container message for MDS reply to a client's MClientRequest
+ *
+ * key fields:
+ *   long tid   - transaction id, so the client can match up with pending request
+ *   int result - error code, or fh if it was open
+ *
+ * for most requests:
+ *   trace is a vector of c_inode_info's tracing from root to the file/dir/whatever
+ *   the operation referred to, so that the client can update its info about what
+ *   metadata lives on what MDS.
+ *
+ * for readdir replies:
+ *   dir_contents is a vector of c_inode_info*'s.
+ *
+ * that's mostly it, i think!
+ *
+ */
+
+/*
+ * InodeStat - a copy of an inode plus the distribution hints sent to clients.
+ */
+class InodeStat {
+
+ public:
+  inode_t inode;
+  string symlink; // symlink content (if symlink)
+
+
+  // mds distribution hints
+  int dir_auth;
+  bool hashed, replicated;
+  bool spec_defined;
+  set dist; // where am i replicated?
+
+ public:
+  // leaves all fields unset; _decode() fills them
+  InodeStat() {}
+  // build from a cached inode; whoami is passed through to get_dist_spec()
+  InodeStat(CInode *in, int whoami) :
+    inode(in->inode)
+  {
+    // inode.mask
+    inode.mask = INODE_MASK_BASE;
+    if (in->filelock.can_read(in->is_auth()))
+      inode.mask |= INODE_MASK_PERM;
+    if (in->hardlock.can_read(in->is_auth()))
+      inode.mask |= INODE_MASK_SIZE | INODE_MASK_MTIME; // fixme when we separate this out.
+
+    // symlink content?
+    if (in->is_symlink())
+      symlink = in->symlink;
+
+    // replicated where?
+    // dist is only filled in when the inode has a dir and we are auth for it
+    if (in->dir && in->dir->is_auth()) {
+      spec_defined = true;
+      in->dir->get_dist_spec(this->dist, whoami);
+    } else
+      spec_defined = false;
+
+    // -1 when the inode has no dir
+    if (in->dir)
+      dir_auth = in->dir->get_dir_auth();
+    else
+      dir_auth = -1;
+
+    // dir info
+    hashed = (in->dir && in->dir->is_hashed()); // FIXME not quite right.
+    replicated = (in->dir && in->dir->is_rep());
+  }
+
+  // wire order: inode, spec_defined, dir_auth, hashed, replicated (raw bytes),
+  // then symlink, then dist
+  void _encode(bufferlist &bl) {
+    bl.append((char*)&inode, sizeof(inode));
+    bl.append((char*)&spec_defined, sizeof(spec_defined));
+    bl.append((char*)&dir_auth, sizeof(dir_auth));
+    bl.append((char*)&hashed, sizeof(hashed));
+    bl.append((char*)&replicated, sizeof(replicated));
+
+    ::_encode(symlink, bl);
+    ::_encode(dist, bl); // distn
+  }
+
+  // inverse of _encode(); advances off past everything read
+  void _decode(bufferlist &bl, int& off) {
+    bl.copy(off, sizeof(inode), (char*)&inode);
+    off += sizeof(inode);
+    bl.copy(off, sizeof(spec_defined), (char*)&spec_defined);
+    off += sizeof(spec_defined);
+    bl.copy(off, sizeof(dir_auth), (char*)&dir_auth);
+    off += sizeof(dir_auth);
+    bl.copy(off, sizeof(hashed), (char*)&hashed);
+    off += sizeof(hashed);
+    bl.copy(off, sizeof(replicated), (char*)&replicated);
+    off += sizeof(replicated);
+
+    ::_decode(symlink, bl, off);
+    ::_decode(dist, bl, off);
+  }
+};
+
+
+// fixed-size part of MClientReply; copied to/from the payload as raw bytes
+typedef struct {
+  long pcid;
+  long tid;
+  int op;
+  int result; // error code
+  unsigned char file_caps; // for open
+  long file_caps_seq;
+  __uint64_t file_data_version; // for client buffercache consistency
+
+  int _num_trace_in;
+  int _dir_size;
+} MClientReply_st;
+
+class MClientReply : public Message {
+  // reply data
+  MClientReply_st st;
+
+  string path;
+  // the destructor deletes every element of trace_in and dir_in
+  list trace_in;
+  list trace_dn;
+
+  list dir_in;
+  list dir_dn;
+
+ public:
+  void set_pcid(long pcid) { this->st.pcid = pcid; }
+  long get_pcid() { return st.pcid; }
+
+  long get_tid() { return st.tid; }
+  int get_op() { return st.op; }
+
+  int get_result() { return st.result; }
+  const string& get_path() { return path; }
+
+  // both read trace_in.back() without checking that trace_in is non-empty
+  inodeno_t get_ino() { return trace_in.back()->inode.ino; }
+  const inode_t& get_inode() { return trace_in.back()->inode; }
+
+  const list& get_trace_in() { return trace_in; }
+  const list& get_trace_dn() { return trace_dn; }
+
+  const list& get_dir_in() { return dir_in; }
+  const list& get_dir_dn() { return dir_dn; }
+
+  unsigned char get_file_caps() { return st.file_caps; }
+  long get_file_caps_seq() { return st.file_caps_seq; }
+  __uint64_t get_file_data_version() { return st.file_data_version; }
+
+  void set_result(int r) { st.result = r; }
+  void set_file_caps(unsigned char c) { st.file_caps = c; }
+  void set_file_caps_seq(long s) { st.file_caps_seq = s; }
+  void set_file_data_version(__uint64_t v) { st.file_data_version = v; }
+
+  // leaves st unset; decode_payload() fills it
+  MClientReply() {};
+  // copies pcid, tid, op and path from the request; all other st fields are zeroed
+  // NOTE(review): get_pcid() is commented out in the MClientRequest class shown
+  // in this patch -- presumably it is provided elsewhere; confirm.
+  MClientReply(MClientRequest *req, int result = 0) :
+    Message(MSG_CLIENT_REPLY) {
+    memset(&st, 0, sizeof(st));
+    this->st.pcid = req->get_pcid(); // match up procedure call id!!!
+    this->st.tid = req->get_tid();
+    this->st.op = req->get_op();
+    this->path = req->get_path();
+
+    this->st.result = result;
+
+    st._dir_size = 0;
+    st._num_trace_in = 0;
+  }
+  virtual ~MClientReply() {
+    list::iterator it;
+
+    for (it = trace_in.begin(); it != trace_in.end(); ++it)
+      delete *it;
+    for (it = dir_in.begin(); it != dir_in.end(); ++it)
+      delete *it;
+  }
+  virtual char *get_type_name() { return "creply"; }
+
+
+  // serialization
+  // payload order: st (raw bytes), path, the trace, then the dir contents
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+
+    _decode(path, payload, off);
+
+    // NOTE(review): both loop headers below are truncated in this copy
+    for (int i=0; i_decode(payload, off);
+      trace_in.push_back(ci);
+    }
+
+    for (int i=0; i_decode(payload, off);
+      dir_in.push_back(ci);
+      string dn;
+      ::_decode(dn, payload, off);
+      dir_dn.push_back(dn);
+    }
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));
+    _encode(path, payload);
+
+    // trace
+    // every inode except the first is preceded by its dentry name
+    list::iterator pdn = trace_dn.begin();
+    list::iterator pin;
+    for (pin = trace_in.begin();
+         pin != trace_in.end();
+         ++pin) {
+      if (pin != trace_in.begin()) {
+        ::_encode(*pdn, payload);
+        ++pdn;
+      }
+      (*pin)->_encode(payload);
+    }
+
+    // dir contents
+    // each inode is followed by its dentry name
+    pdn = dir_dn.begin();
+    for (pin = dir_in.begin();
+         pin != dir_in.end();
+         ++pin, ++pdn) {
+      (*pin)->_encode(payload);
+      ::_encode(*pdn, payload);
+    }
+  }
+
+  // builders
+  /*
+  void add_dir_item(string& dn, InodeStat *in) {
+    dir_dn.push_back(dn);
+    dir_in.push_back(in);
+    ++st._dir_size;
+    }*/
+  // swap the caller's lists into this message (the caller's lists receive the
+  // previous contents) and record num as the dir size
+  void take_dir_items(list& inls,
+                      list& dnls,
+                      int num) {
+    dir_in.swap(inls);
+    dir_dn.swap(dnls);
+    st._dir_size = num;
+  }
+  // append a newly allocated copy of every InodeStat in inls, with the
+  // matching name from dnls; dnls is advanced in step with inls
+  void copy_dir_items(const list& inls,
+                      const list& dnls) {
+    list::const_iterator pdn = dnls.begin();
+    list::const_iterator pin = inls.begin();
+    while (pin != inls.end()) {
+      // copy!
+      InodeStat *i = new InodeStat;
+      *i = **pin;
+      dir_in.push_back(i);
+      dir_dn.push_back(*pdn);
+      ++pin;
+      ++pdn;
+      ++st._dir_size;
+    }
+  }
+
+  // walk from in up through get_parent_inode(), pushing each inode (and its
+  // parent dentry name, when it has one) onto the front of the trace
+  void set_trace_dist(CInode *in, int whoami) {
+    st._num_trace_in = 0;
+    while (in) {
+      // add this inode to trace, along with referring dentry name
+      if (in->get_parent_dn())
+        trace_dn.push_front(in->get_parent_dn()->get_name());
+      trace_in.push_front(new InodeStat(in, whoami));
+      ++st._num_trace_in;
+
+      in = in->get_parent_inode();
+    }
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MClientRequest.h b/branches/sage/cephmds2/messages/MClientRequest.h
new file mode 100644
index 0000000000000..dff2af23deb5f
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MClientRequest.h
@@ -0,0 +1,201 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MCLIENTREQUEST_H
+#define __MCLIENTREQUEST_H
+
+// NOTE(review): the include target below appears to have been stripped from this copy
+#include
+
+#include "msg/Message.h"
+#include "include/filepath.h"
+#include "mds/MDS.h"
+
+/**
+ *
+ * MClientRequest - container for a client METADATA request. created/sent by clients.
+ * can be forwarded around between MDS's.
+ *
+ *   int client - the originating client
+ *   long pcid  - procedure call id, used to match request+response.
+ *                (the pcid accessors in the class below are commented out and
+ *                MClientRequest_st has no pcid field)
+ *   long tid   - transaction id, unique among requests for that client. probably just a counter!
+ *     -> the MDS passes the Request to the Reply constructor, so this always matches.
+ *
+ *   int op - the metadata op code. MDS_OP_RENAME, etc.
+ *   int caller_uid, _gid - guess
+ *
+ * arguments: one or more of these are defined, depending on the metadata op:
+ *   inodeno ino - used by close(), along with fh. not strictly necessary except MDS is currently coded lame.
+ *   filepath path - main file argument (almost everything)
+ *   string sarg - string argument (if a second arg is needed, e.g. rename, symlink)
+ *   int iarg - int arg... file mode for open, fh for close, mode for mkdir, etc.
+ *   int iarg2 - second int arg... gid for chown (iarg is uid)
+ *   time_t targ, targ2 - time args, used by utime
+ *
+ * That's basically it!
+ *
+ */
+
+
+// fixed-size part of MClientRequest; copied to/from the payload as raw bytes
+typedef struct {
+  long tid;
+  int client;
+  int op;
+
+  entity_inst_t client_inst;
+
+  int caller_uid, caller_gid;
+  inodeno_t ino;
+
+  int iarg, iarg2;
+  time_t targ, targ2;
+
+  inodeno_t mds_wants_replica_in_dirino;
+
+  size_t sizearg;
+} MClientRequest_st;
+
+
+class MClientRequest : public Message {
+  MClientRequest_st st;
+  filepath path;
+  string sarg;
+  string sarg2;
+
+
+ public:
+  // leaves st unset; decode_payload() fills it
+  MClientRequest() {}
+  // zeroes st, then records the op code and the client id
+  MClientRequest(int op, int client) : Message(MSG_CLIENT_REQUEST) {
+    memset(&st, 0, sizeof(st));
+    this->st.op = op;
+    this->st.client = client;
+    this->st.iarg = 0;
+  }
+  virtual char *get_type_name() { return "creq"; }
+
+  // keep a pcid (procedure call id) to match up request+reply
+  //void set_pcid(long pcid) { this->st.pcid = pcid; }
+  //long get_pcid() { return st.pcid; }
+
+  // normal fields
+  void set_tid(long t) { st.tid = t; }
+  void set_path(string& p) { path.set_path(p); }
+  void set_path(const char *p) { path.set_path(p); }
+  void set_path(const filepath& fp) { path = fp; }
+  void set_caller_uid(int u) { st.caller_uid = u; }
+  void set_caller_gid(int g) { st.caller_gid = g; }
+  void set_ino(inodeno_t ino) { st.ino = ino; }
+  void set_iarg(int i) { st.iarg = i; }
+  void set_iarg2(int i) { st.iarg2 = i; }
+  void set_targ(time_t& t) { st.targ = t; }
+  void set_targ2(time_t& t) { st.targ2 = t; }
+  void set_sarg(string& arg) { this->sarg = arg; }
+  void set_sarg(const char *arg) { this->sarg = arg; }
+  void set_sarg2(string& arg) { this->sarg2 = arg; }
+  void set_sizearg(size_t s) { st.sizearg = s; }
+  void set_mds_wants_replica_in_dirino(inodeno_t dirino) {
+    st.mds_wants_replica_in_dirino = dirino; }
+
+  void set_client_inst(const entity_inst_t& i) { st.client_inst = i; }
+  const entity_inst_t& get_client_inst() { return st.client_inst; }
+
+  int get_client() { return st.client; }
+  long get_tid() { return st.tid; }
+  int get_op() { return st.op; }
+  int get_caller_uid() { return st.caller_uid; }
+  int get_caller_gid() { return st.caller_gid; }
+  inodeno_t get_ino() { return st.ino; }
+  string& get_path() { return path.get_path(); }
+  filepath& get_filepath() { return path; }
+  int get_iarg() { return st.iarg; }
+  int get_iarg2() { return st.iarg2; }
+  time_t get_targ() { return st.targ; }
+  time_t get_targ2() { return st.targ2; }
+  string& get_sarg() { return sarg; }
+  string& get_sarg2() { return sarg2; }
+  size_t get_sizearg() { return st.sizearg; }
+  inodeno_t get_mds_wants_replica_in_dirino() {
+    return st.mds_wants_replica_in_dirino; }
+
+  // payload order: st (raw bytes), path, sarg, sarg2
+  virtual void decode_payload() {
+    int off = 0;
+    payload.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+    path._decode(payload, off);
+    _decode(sarg, payload, off);
+    _decode(sarg2, payload, off);
+  }
+
+  virtual void encode_payload() {
+    payload.append((char*)&st, sizeof(st));
+    path._encode(payload);
+    _encode(sarg, payload);
+    _encode(sarg2, payload);
+  }
+
+  // write "clientreq(client<id>.<tid>:<opname>[=<path>][ <sarg>])" to out;
+  // op codes not listed below print as "unknown=<op>"
+  void print(ostream& out) {
+    out << "clientreq(client" << get_client()
+        << "." << get_tid()
+      //<< ".pcid=" << get_pcid()
+        << ":";
+    switch(get_op()) {
+    case MDS_OP_STAT:
+      out << "stat"; break;
+    case MDS_OP_LSTAT:
+      out << "lstat"; break;
+    case MDS_OP_UTIME:
+      out << "utime"; break;
+    case MDS_OP_CHMOD:
+      out << "chmod"; break;
+    case MDS_OP_CHOWN:
+      out << "chown"; break;
+
+    case MDS_OP_READDIR:
+      out << "readdir"; break;
+    case MDS_OP_MKNOD:
+      out << "mknod"; break;
+    case MDS_OP_LINK:
+      out << "link"; break;
+    case MDS_OP_UNLINK:
+      out << "unlink"; break;
+    case MDS_OP_RENAME:
+      out << "rename"; break;
+
+    case MDS_OP_MKDIR:
+      out << "mkdir"; break;
+    case MDS_OP_RMDIR:
+      out << "rmdir"; break;
+    case MDS_OP_SYMLINK:
+      out << "symlink"; break;
+
+    case MDS_OP_OPEN:
+      out << "open"; break;
+    case MDS_OP_TRUNCATE:
+      out << "truncate"; break;
+    case MDS_OP_FSYNC:
+      out << "fsync"; break;
+    case MDS_OP_RELEASE:
+      out << "release"; break;
+    default:
+      out << "unknown=" << get_op();
+    }
+    if (get_path().length())
+      out << "=" << get_path();
+    if (get_sarg().length())
+      out << " " << get_sarg();
+    out << ")";
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MDentryUnlink.h b/branches/sage/cephmds2/messages/MDentryUnlink.h
new file mode 100644
index 0000000000000..ec1503eeadf00
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MDentryUnlink.h
@@ -0,0 +1,45 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + + +#ifndef __MDENTRYUNLINK_H +#define __MDENTRYUNLINK_H + +class MDentryUnlink : public Message { + inodeno_t dirino; + string dn; + + public: + inodeno_t get_dirino() { return dirino; } + string& get_dn() { return dn; } + + MDentryUnlink() {} + MDentryUnlink(inodeno_t dirino, string& dn) : + Message(MSG_MDS_DENTRYUNLINK) { + this->dirino = dirino; + this->dn = dn; + } + virtual char *get_type_name() { return "Dun";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(dirino), (char*)&dirino); + off += sizeof(dirino); + _unrope(dn, s, off); + } + virtual void encode_payload(crope& s) { + s.append((char*)&dirino,sizeof(dirino)); + _rope(dn, s); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MDirExpire.h b/branches/sage/cephmds2/messages/MDirExpire.h new file mode 100644 index 0000000000000..a81de3d538365 --- /dev/null +++ b/branches/sage/cephmds2/messages/MDirExpire.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MDIREXPIRE_H +#define __MDIREXPIRE_H + +typedef struct { + inodeno_t ino; + int nonce; + int from; +} MDirExpire_st; + +class MDirExpire : public Message { + MDirExpire_st st; + + public: + inodeno_t get_ino() { return st.ino; } + int get_from() { return st.from; } + int get_nonce() { return st.nonce; } + + MDirExpire() {} + MDirExpire(inodeno_t ino, int from, int nonce) : + Message(MSG_MDS_DIREXPIRE) { + st.ino = ino; + st.from = from; + st.nonce = nonce; + } + virtual char *get_type_name() { return "DirEx";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(st), (char*)&st); + off += sizeof(st); + } + virtual void encode_payload(crope& s) { + s.append((char*)&st,sizeof(st)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MDirExpireReq.h b/branches/sage/cephmds2/messages/MDirExpireReq.h new file mode 100644 index 0000000000000..604a55265c723 --- /dev/null +++ b/branches/sage/cephmds2/messages/MDirExpireReq.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+
+#ifndef __MDIREXPIREREQ_H
+#define __MDIREXPIREREQ_H
+
+// the whole message body; copied to/from the wire as raw bytes
+typedef struct {
+  inodeno_t ino;
+  int nonce;
+  int from;
+} MDirExpireReq_st;
+
+/*
+ * Message of type MSG_MDS_DIREXPIREREQ carrying a directory's ino, a nonce
+ * and the sender id.
+ *
+ * NOTE(review): the class below is named MDirExpire, the same name as the
+ * class in MDirExpire.h, although the header guard, the struct and the
+ * message type all say "DirExpireReq".  Including both headers would define
+ * the class twice -- presumably this should be MDirExpireReq; confirm before
+ * renaming, since that changes the public name.
+ */
+class MDirExpire : public Message {
+  MDirExpireReq_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+  int get_nonce() { return st.nonce; }
+
+  // empty message; st is filled in by decode_payload()
+  MDirExpire() {}
+  MDirExpire(inodeno_t ino, int from, int nonce) :
+    Message(MSG_MDS_DIREXPIREREQ) {
+    st.ino = ino;
+    st.from = from;
+    st.nonce = nonce;
+  }
+  virtual char *get_type_name() { return "DirExR";}
+
+  // NOTE(review): takes only the rope and always reads from offset 0, unlike
+  // the decode_payload(crope&, int& off) of the other rope-based messages in
+  // this patch -- confirm which signature the Message base class declares.
+  virtual void decode_payload(crope& s) {
+    s.copy(0, sizeof(st), (char*)&st);
+  }
+  // append st to s
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MDirUpdate.h b/branches/sage/cephmds2/messages/MDirUpdate.h
new file mode 100644
index 0000000000000..9bac721654c22
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MDirUpdate.h
@@ -0,0 +1,71 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + + +#ifndef __MDIRUPDATE_H +#define __MDIRUPDATE_H + +#include "msg/Message.h" + +typedef struct { + inodeno_t ino; + int dir_rep; + int discover; +} MDirUpdate_st; + +class MDirUpdate : public Message { + MDirUpdate_st st; + set dir_rep_by; + string path; + + public: + inodeno_t get_ino() { return st.ino; } + int get_dir_rep() { return st.dir_rep; } + set& get_dir_rep_by() { return dir_rep_by; } + bool should_discover() { return st.discover > 0; } + string& get_path() { return path; } + + void tried_discover() { + if (st.discover) st.discover--; + } + + MDirUpdate() {} + MDirUpdate(inodeno_t ino, + int dir_rep, + set& dir_rep_by, + string& path, + bool discover = false) : + Message(MSG_MDS_DIRUPDATE) { + this->st.ino = ino; + this->st.dir_rep = dir_rep; + this->dir_rep_by = dir_rep_by; + if (discover) this->st.discover = 5; + this->path = path; + } + virtual char *get_type_name() { return "dup"; } + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(st), (char*)&st); + off += sizeof(st); + _unrope(dir_rep_by, s, off); + _unrope(path, s, off); + } + + virtual void encode_payload(crope& r) { + r.append((char*)&st, sizeof(st)); + _rope(dir_rep_by, r); + _rope(path, r); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MDiscover.h b/branches/sage/cephmds2/messages/MDiscover.h new file mode 100644 index 0000000000000..d207ab28cc143 --- /dev/null +++ b/branches/sage/cephmds2/messages/MDiscover.h @@ -0,0 +1,75 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MDISCOVER_H +#define __MDISCOVER_H + +#include "msg/Message.h" +#include "mds/CDir.h" +#include "include/filepath.h" + +#include +#include +using namespace std; + + +class MDiscover : public Message { + int asker; + inodeno_t base_ino; // 0 -> none, want root + bool want_base_dir; + bool want_root_inode; + + filepath want; // ... [/]need/this/stuff + + public: + int get_asker() { return asker; } + inodeno_t get_base_ino() { return base_ino; } + filepath& get_want() { return want; } + const string& get_dentry(int n) { return want[n]; } + bool wants_base_dir() { return want_base_dir; } + + MDiscover() { } + MDiscover(int asker, + inodeno_t base_ino, + filepath& want, + bool want_base_dir = true, + bool want_root_inode = false) : + Message(MSG_MDS_DISCOVER) { + this->asker = asker; + this->base_ino = base_ino; + this->want = want; + this->want_base_dir = want_base_dir; + this->want_root_inode = want_root_inode; + } + virtual char *get_type_name() { return "Dis"; } + + virtual void decode_payload(crope& r, int& off) { + r.copy(off, sizeof(asker), (char*)&asker); + off += sizeof(asker); + r.copy(off, sizeof(base_ino), (char*)&base_ino); + off += sizeof(base_ino); + r.copy(off, sizeof(bool), (char*)&want_base_dir); + off += sizeof(bool); + want._unrope(r, off); + } + virtual void encode_payload(crope& r) { + r.append((char*)&asker, sizeof(asker)); + r.append((char*)&base_ino, sizeof(base_ino)); + r.append((char*)&want_base_dir, sizeof(want_base_dir)); + want._rope(r); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MDiscoverReply.h b/branches/sage/cephmds2/messages/MDiscoverReply.h new file mode 100644 index 0000000000000..78e5d001086ec --- /dev/null +++ b/branches/sage/cephmds2/messages/MDiscoverReply.h @@ -0,0 +1,266 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MDISCOVERREPLY_H +#define __MDISCOVERREPLY_H + +#include "msg/Message.h" +#include "mds/CDir.h" +#include "mds/CInode.h" +#include "include/filepath.h" + +#include +#include +using namespace std; + +#define max(a,b) ((a)>(b) ? (a):(b)) + + +/** + * MDiscoverReply - return new replicas (of inodes, dirs, dentries) + * + * we group returned items by (dir, dentry, inode). each + * item in each set shares an index (it's "depth"). + * + * we can start and end with any type. + * no_base_dir = true if the first group has an inode but no dir + * no_base_dentry = true if the first group has an inode but no dentry + * they are false if there is no returned data, ie the first group is empty. + * + * we also return errors: + * error_flag_dn(string) - the specified dentry dne + * error_flag_dir - the last item wasn't a dir, so we couldn't continue. + * + * depth() gives us the number of depth units/indices for which we have + * information. this INCLUDES those for which we have errors but no data. + * + * see MDCache::handle_discover, handle_discover_reply. + * + + old crap, maybe not accurate: + + // dir [ + ... ] : discover want_base_dir=true + + // dentry [ + inode [ + ... ] ] : discover want_base_dir=false + // no_base_dir=true + // -> we only exclude inode if dentry is null+xlock + + // inode [ + ... ], base_ino = 0 : discover base_ino=0, start w/ root ino, + // no_base_dir=no_base_dentry=true + + * + */ + +class MDiscoverReply : public Message { + inodeno_t base_ino; + bool no_base_dir; // no base dir (but IS dentry+inode) + bool no_base_dentry; // no base dentry (but IS inode) + bool flag_error_dn; + bool flag_error_dir; + string error_dentry; // dentry that was not found (to trigger waiters on asker) + + + vector dirs; // not inode-aligned if no_base_dir = true. 
+ filepath path; // not inode-aligned if no_base_dentry = true + vector path_xlock; + vector inodes; + + public: + // accessors + inodeno_t get_base_ino() { return base_ino; } + int get_num_inodes() { return inodes.size(); } + int get_num_dentries() { return path.depth(); } + int get_num_dirs() { return dirs.size(); } + + int get_depth() { // return depth of deepest object (in dir/dentry/inode units) + return max( inodes.size(), // at least this many + max( no_base_dentry + path.depth() + flag_error_dn, // inode start + path + possible error + dirs.size() + no_base_dir )); // dn/inode + dirs + } + + bool has_base_dir() { return !no_base_dir && dirs.size(); } + bool has_base_dentry() { return !no_base_dentry && path.depth(); } + bool has_root() { + if (base_ino == 0) { + assert(no_base_dir && no_base_dentry); + return true; + } + return false; + } + const string& get_path() { return path.get_path(); } + bool get_path_xlock(int i) { return path_xlock[i]; } + + // bool is_flag_forward() { return flag_forward; } + bool is_flag_error_dn() { return flag_error_dn; } + bool is_flag_error_dir() { return flag_error_dir; } + string& get_error_dentry() { return error_dentry; } + + // these index _arguments_ are aligned to each ([[dir, ] dentry, ] inode) set. 
+ CDirDiscover& get_dir(int n) { return *(dirs[n - no_base_dir]); } + const string& get_dentry(int n) { return path[n - no_base_dentry]; } + bool get_dentry_xlock(int n) { return path_xlock[n - no_base_dentry]; } + CInodeDiscover& get_inode(int n) { return *(inodes[n]); } + inodeno_t get_ino(int n) { return inodes[n]->get_ino(); } + + // cons + MDiscoverReply() {} + MDiscoverReply(inodeno_t base_ino) : + Message(MSG_MDS_DISCOVERREPLY) { + this->base_ino = base_ino; + flag_error_dn = false; + flag_error_dir = false; + no_base_dir = no_base_dentry = false; + } + ~MDiscoverReply() { + for (vector::iterator it = dirs.begin(); + it != dirs.end(); + it++) + delete *it; + for (vector::iterator it = inodes.begin(); + it != inodes.end(); + it++) + delete *it; + } + virtual char *get_type_name() { return "DisR"; } + + // builders + bool is_empty() { + return dirs.empty() && path.depth() == 0 && + inodes.empty() && + !flag_error_dn && + !flag_error_dir; + } + void set_path(const filepath& dp) { path = dp; } + void add_dentry(const string& dn, bool xlock) { + if (path.depth() == 0 && dirs.empty()) no_base_dir = true; + path.add_dentry(dn); + path_xlock.push_back(xlock); + } + + void add_inode(CInodeDiscover* din) { + if (inodes.empty() && path.depth() == 0) no_base_dir = no_base_dentry = true; + inodes.push_back( din ); + } + + void add_dir(CDirDiscover* dir) { + dirs.push_back( dir ); + } + + // void set_flag_forward() { flag_forward = true; } + void set_flag_error_dn(const string& dn) { + flag_error_dn = true; + error_dentry = dn; + } + void set_flag_error_dir() { + flag_error_dir = true; + } + + + // ... 
+ virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(base_ino), (char*)&base_ino); + off += sizeof(base_ino); + payload.copy(off, sizeof(bool), (char*)&no_base_dir); + off += sizeof(bool); + payload.copy(off, sizeof(bool), (char*)&no_base_dentry); + off += sizeof(bool); + // payload.copy(off, sizeof(bool), (char*)&flag_forward); + //off += sizeof(bool); + payload.copy(off, sizeof(bool), (char*)&flag_error_dn); + off += sizeof(bool); + + _decode(error_dentry, payload, off); + payload.copy(off, sizeof(bool), (char*)&flag_error_dir); + off += sizeof(bool); + + // dirs + int n; + payload.copy(off, sizeof(int), (char*)&n); + off += sizeof(int); + for (int i=0; i_decode(payload, off); + } + //dout(12) << n << " dirs out" << endl; + + // inodes + payload.copy(off, sizeof(int), (char*)&n); + off += sizeof(int); + for (int i=0; i_decode(payload, off); + } + //dout(12) << n << " inodes out" << endl; + + // filepath + path._decode(payload, off); + //dout(12) << path.depth() << " dentries out" << endl; + + // path_xlock + payload.copy(off, sizeof(int), (char*)&n); + off += sizeof(int); + for (int i=0; i::iterator it = dirs.begin(); + it != dirs.end(); + it++) + (*it)->_encode( payload ); + //dout(12) << n << " dirs in" << endl; + + // inodes + n = inodes.size(); + payload.append((char*)&n, sizeof(int)); + for (vector::iterator it = inodes.begin(); + it != inodes.end(); + it++) + (*it)->_encode( payload ); + //dout(12) << n << " inodes in" << endl; + + // path + path._encode( payload ); + //dout(12) << path.depth() << " dentries in" << endl; + + // path_xlock + n = path_xlock.size(); + payload.append((char*)&n, sizeof(int)); + for (vector::iterator it = path_xlock.begin(); + it != path_xlock.end(); + it++) { + bool b = *it; + payload.append((char*)&b, sizeof(bool)); + } + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDir.h b/branches/sage/cephmds2/messages/MExportDir.h new file mode 100644 index 0000000000000..2879579f6929f --- 
/dev/null +++ b/branches/sage/cephmds2/messages/MExportDir.h @@ -0,0 +1,102 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MEXPORTDIR_H +#define __MEXPORTDIR_H + +#include "msg/Message.h" + + +class MExportDir : public Message { + inodeno_t ino; + + int ndirs; + bufferlist state; + + list exports; + + // hashed pre-discovers + //map > hashed_prediscover; + + public: + MExportDir() {} + MExportDir(CInode *in) : + Message(MSG_MDS_EXPORTDIR) { + this->ino = in->inode.ino; + ndirs = 0; + } + virtual char *get_type_name() { return "Ex"; } + + inodeno_t get_ino() { return ino; } + int get_ndirs() { return ndirs; } + bufferlist& get_state() { return state; } + list& get_exports() { return exports; } + + void add_dir(bufferlist& dir) { + state.claim_append( dir ); + ndirs++; + } + void add_export(CDir *dir) { exports.push_back(dir->ino()); } + + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + payload.copy(off, sizeof(ndirs), (char*)&ndirs); + off += sizeof(ndirs); + + // exports + int nex; + payload.copy(off, sizeof(nex), (char*)&nex); + off += sizeof(int); + dout(12) << nex << " nested exports out" << endl; + for (int i=0; i::iterator it = exports.begin(); + it != exports.end(); + it++) { + inodeno_t ino = *it; + payload.append((char*)&ino, sizeof(ino)); + } + + // dir data + size_t len = state.length(); + payload.append((char*)&len, sizeof(len)); + payload.claim_append(state); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDirAck.h b/branches/sage/cephmds2/messages/MExportDirAck.h new file mode 100644 index 
0000000000000..35691bf94e2a7 --- /dev/null +++ b/branches/sage/cephmds2/messages/MExportDirAck.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MEXPORTDIRACK_H +#define __MEXPORTDIRACK_H + +#include "MExportDir.h" + +class MExportDirAck : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MExportDirAck() {} + MExportDirAck(MExportDir *req) : + Message(MSG_MDS_EXPORTDIRACK) { + ino = req->get_ino(); + } + virtual char *get_type_name() { return "ExAck"; } + + virtual void decode_payload(crope& s) { + s.copy(0, sizeof(ino), (char*)&ino); + } + virtual void encode_payload(crope& s) { + s.append((char*)&ino, sizeof(ino)); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDirDiscover.h b/branches/sage/cephmds2/messages/MExportDirDiscover.h new file mode 100644 index 0000000000000..24f77036455f4 --- /dev/null +++ b/branches/sage/cephmds2/messages/MExportDirDiscover.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MEXPORTDIRDISCOVER_H +#define __MEXPORTDIRDISCOVER_H + +#include "msg/Message.h" +#include "mds/CInode.h" +#include "include/types.h" + +class MExportDirDiscover : public Message { + inodeno_t ino; + string path; + + public: + inodeno_t get_ino() { return ino; } + string& get_path() { return path; } + + MExportDirDiscover() {} + MExportDirDiscover(CInode *in) : + Message(MSG_MDS_EXPORTDIRDISCOVER) { + in->make_path(path); + ino = in->ino(); + } + virtual char *get_type_name() { return "ExDis"; } + + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + _unrope(path, s, off); + } + + virtual void encode_payload(crope& s) { + s.append((char*)&ino, sizeof(ino)); + _rope(path, s); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h b/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h new file mode 100644 index 0000000000000..a25e3b46672e3 --- /dev/null +++ b/branches/sage/cephmds2/messages/MExportDirDiscoverAck.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MEXPORTDIRDISCOVERACK_H +#define __MEXPORTDIRDISCOVERACK_H + +#include "msg/Message.h" +#include "mds/CInode.h" +#include "include/types.h" + +class MExportDirDiscoverAck : public Message { + inodeno_t ino; + bool success; + + public: + inodeno_t get_ino() { return ino; } + bool is_success() { return success; } + + MExportDirDiscoverAck() {} + MExportDirDiscoverAck(inodeno_t ino, bool success=true) : + Message(MSG_MDS_EXPORTDIRDISCOVERACK) { + this->ino = ino; + this->success = false; + } + virtual char *get_type_name() { return "ExDisA"; } + + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + s.copy(off, sizeof(success), (char*)&success); + off += sizeof(success); + } + + virtual void encode_payload(crope& s) { + s.append((char*)&ino, sizeof(ino)); + s.append((char*)&success, sizeof(success)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDirFinish.h b/branches/sage/cephmds2/messages/MExportDirFinish.h new file mode 100644 index 0000000000000..89c9e5290c4b2 --- /dev/null +++ b/branches/sage/cephmds2/messages/MExportDirFinish.h @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MEXPORTDIRFINISH_H +#define __MEXPORTDIRFINISH_H + +#include "MExportDir.h" + +class MExportDirFinish : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MExportDirFinish() {} + MExportDirFinish(inodeno_t ino) : + Message(MSG_MDS_EXPORTDIRFINISH) { + this->ino = ino; + } + virtual char *get_type_name() { return "ExFin"; } + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + } + virtual void encode_payload(crope& s) { + s.append((char*)&ino, sizeof(ino)); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDirNotify.h b/branches/sage/cephmds2/messages/MExportDirNotify.h new file mode 100644 index 0000000000000..9d6532cad478c --- /dev/null +++ b/branches/sage/cephmds2/messages/MExportDirNotify.h @@ -0,0 +1,111 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MEXPORTDIRNOTIFY_H +#define __MEXPORTDIRNOTIFY_H + +#include "msg/Message.h" +#include +using namespace std; + +class MExportDirNotify : public Message { + int new_auth; + int old_auth; + inodeno_t ino; + + list exports; // bounds; these dirs are _not_ included (tho the inodes are) + list subdirs; + + public: + inodeno_t get_ino() { return ino; } + int get_new_auth() { return new_auth; } + int get_old_auth() { return old_auth; } + list& get_exports() { return exports; } + list::iterator subdirs_begin() { return subdirs.begin(); } + list::iterator subdirs_end() { return subdirs.end(); } + int num_subdirs() { return subdirs.size(); } + + MExportDirNotify() {} + MExportDirNotify(inodeno_t ino, int old_auth, int new_auth) : + Message(MSG_MDS_EXPORTDIRNOTIFY) { + this->ino = ino; + this->old_auth = old_auth; + this->new_auth = new_auth; + } + virtual char *get_type_name() { return "ExNot"; } + + void copy_subdirs(list& s) { + this->subdirs = s; + } + void copy_exports(list& ex) { + this->exports = ex; + } + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(int), (char*)&new_auth); + off += sizeof(int); + s.copy(off, sizeof(int), (char*)&old_auth); + off += sizeof(int); + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + + // notify + int n; + s.copy(off, sizeof(int), (char*)&n); + off += sizeof(int); + for (int i=0; i::iterator it = exports.begin(); + it != exports.end(); + it++) { + inodeno_t ino = *it; + s.append((char*)&ino, sizeof(ino)); + } + + // subdirs + n = subdirs.size(); + s.append((char*)&n, sizeof(int)); + for (list::iterator it = subdirs.begin(); + it != subdirs.end(); + it++) { + inodeno_t ino = *it; + s.append((char*)&ino, sizeof(ino)); + } + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDirNotifyAck.h b/branches/sage/cephmds2/messages/MExportDirNotifyAck.h new file mode 100644 index 0000000000000..3179fd4f544f1 --- /dev/null +++ 
b/branches/sage/cephmds2/messages/MExportDirNotifyAck.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MEXPORTDIRNOTIFYACK_H +#define __MEXPORTDIRNOTIFYACK_H + +#include "msg/Message.h" +#include +using namespace std; + +class MExportDirNotifyAck : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MExportDirNotifyAck() {} + MExportDirNotifyAck(inodeno_t ino) : + Message(MSG_MDS_EXPORTDIRNOTIFYACK) { + this->ino = ino; + } + virtual char *get_type_name() { return "ExNotA"; } + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + } + + virtual void encode_payload(crope& s) { + s.append((char*)&ino, sizeof(ino)); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDirPrep.h b/branches/sage/cephmds2/messages/MExportDirPrep.h new file mode 100644 index 0000000000000..6e814212ac98b --- /dev/null +++ b/branches/sage/cephmds2/messages/MExportDirPrep.h @@ -0,0 +1,186 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MEXPORTDIRPREP_H +#define __MEXPORTDIRPREP_H + +#include "msg/Message.h" +#include "mds/CInode.h" +#include "include/types.h" + +class MExportDirPrep : public Message { + inodeno_t ino; + + /* nested export discover payload. 
+ not all inodes will have dirs; they may require a separate discover. + dentries are the links to each inode. + dirs map includes base dir (ino) + */ + list exports; + + list inodes; + map inode_dirino; + map inode_dentry; + + map dirs; + + bool b_did_assim; + + public: + inodeno_t get_ino() { return ino; } + list& get_exports() { return exports; } + list& get_inodes() { return inodes; } + inodeno_t get_containing_dirino(inodeno_t ino) { + return inode_dirino[ino]; + } + string& get_dentry(inodeno_t ino) { + return inode_dentry[ino]; + } + bool have_dir(inodeno_t ino) { + return dirs.count(ino); + } + CDirDiscover* get_dir(inodeno_t ino) { + return dirs[ino]; + } + + bool did_assim() { return b_did_assim; } + void mark_assim() { b_did_assim = true; } + + MExportDirPrep() { + b_did_assim = false; + } + MExportDirPrep(CInode *in) : + Message(MSG_MDS_EXPORTDIRPREP) { + ino = in->ino(); + b_did_assim = false; + } + ~MExportDirPrep() { + for (list::iterator iit = inodes.begin(); + iit != inodes.end(); + iit++) + delete *iit; + for (map::iterator dit = dirs.begin(); + dit != dirs.end(); + dit++) + delete dit->second; + } + + + virtual char *get_type_name() { return "ExP"; } + + + + + void add_export(inodeno_t dirino) { + exports.push_back( dirino ); + } + void add_inode(inodeno_t dirino, string& dentry, CInodeDiscover *in) { + inodes.push_back(in); + inode_dirino.insert(pair(in->get_ino(), dirino)); + inode_dentry.insert(pair(in->get_ino(), dentry)); + } + void add_dir(CDirDiscover *dir) { + dirs.insert(pair(dir->get_ino(), dir)); + } + + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + + // exports + int ne; + payload.copy(off, sizeof(int), (char*)&ne); + off += sizeof(int); + for (int i=0; i_decode(payload, off); + inodes.push_back(in); + + // dentry + string d; + _decode(d, payload, off); + inode_dentry[in->get_ino()] = d; + + // dir ino + inodeno_t dino; + payload.copy(off, sizeof(dino), 
(char*)&dino); + off += sizeof(dino); + inode_dirino[in->get_ino()] = dino; + } + + // dirs + int nd; + payload.copy(off, sizeof(int), (char*)&nd); + off += sizeof(int); + for (int i=0; i_decode(payload, off); + dirs[dir->get_ino()] = dir; + } + } + + virtual void encode_payload() { + payload.append((char*)&ino, sizeof(ino)); + + // exports + int ne = exports.size(); + payload.append((char*)&ne, sizeof(int)); + for (list::iterator it = exports.begin(); + it != exports.end(); + it++) { + inodeno_t ino = *it; + payload.append((char*)&ino, sizeof(ino)); + } + + // inodes + int ni = inodes.size(); + payload.append((char*)&ni, sizeof(int)); + for (list::iterator iit = inodes.begin(); + iit != inodes.end(); + iit++) { + (*iit)->_encode(payload); + + // dentry + _encode(inode_dentry[(*iit)->get_ino()], payload); + + // dir ino + inodeno_t ino = inode_dirino[(*iit)->get_ino()]; + payload.append((char*)&ino, sizeof(ino)); + } + + // dirs + int nd = dirs.size(); + payload.append((char*)&nd, sizeof(int)); + for (map::iterator dit = dirs.begin(); + dit != dirs.end(); + dit++) + dit->second->_encode(payload); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDirPrepAck.h b/branches/sage/cephmds2/messages/MExportDirPrepAck.h new file mode 100644 index 0000000000000..c32d7255c5074 --- /dev/null +++ b/branches/sage/cephmds2/messages/MExportDirPrepAck.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MEXPORTDIRPREPACK_H +#define __MEXPORTDIRPREPACK_H + +#include "msg/Message.h" +#include "include/types.h" + +class MExportDirPrepAck : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MExportDirPrepAck() {} + MExportDirPrepAck(inodeno_t ino) : + Message(MSG_MDS_EXPORTDIRPREPACK) { + this->ino = ino; + } + + virtual char *get_type_name() { return "ExPAck"; } + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + } + virtual void encode_payload(crope& s) { + s.append((char*)&ino, sizeof(ino)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MExportDirWarning.h b/branches/sage/cephmds2/messages/MExportDirWarning.h new file mode 100644 index 0000000000000..6f2fdf55dde4f --- /dev/null +++ b/branches/sage/cephmds2/messages/MExportDirWarning.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MEXPORTDIRWARNING_H +#define __MEXPORTDIRWARNING_H + +#include "msg/Message.h" +#include "mds/CInode.h" +#include "include/types.h" + +class MExportDirWarning : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MExportDirWarning() {} + MExportDirWarning(inodeno_t ino) : + Message(MSG_MDS_EXPORTDIRWARNING) { + this->ino = ino; + } + + virtual char *get_type_name() { return "ExW"; } + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + } + virtual void encode_payload(crope& s) { + s.append((char*)&ino, sizeof(ino)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MFailure.h b/branches/sage/cephmds2/messages/MFailure.h new file mode 100644 index 0000000000000..1663565b692dd --- /dev/null +++ b/branches/sage/cephmds2/messages/MFailure.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MFAILURE_H +#define __MFAILURE_H + +#include "msg/Message.h" + + +class MFailure : public Message { + public: + msg_addr_t failed; + entity_inst_t inst; + + MFailure() {} + MFailure(msg_addr_t f, entity_inst_t& i) : + Message(MSG_FAILURE), + failed(f), inst(i) {} + + msg_addr_t get_failed() { return failed; } + entity_inst_t& get_inst() { return inst; } + + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(failed), (char*)&failed); + off += sizeof(failed); + payload.copy(off, sizeof(inst), (char*)&inst); + off += sizeof(inst); + } + void encode_payload() { + payload.append((char*)&failed, sizeof(failed)); + payload.append((char*)&inst, sizeof(inst)); + } + + virtual char *get_type_name() { return "fail"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MFailureAck.h b/branches/sage/cephmds2/messages/MFailureAck.h new file mode 100644 index 0000000000000..ee9a0d04d0fd4 --- /dev/null +++ b/branches/sage/cephmds2/messages/MFailureAck.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MFAILUREACK_H +#define __MFAILUREACK_H + +#include "MFailure.h" + + +class MFailureAck : public Message { + public: + msg_addr_t failed; + MFailureAck(MFailure *m) : Message(MSG_FAILURE_ACK) { + this->failed = m->get_failed(); + } + MFailureAck() {} + + msg_addr_t get_failed() { return failed; } + + virtual void decode_payload(crope& s, int& off) { + s.copy(0, sizeof(failed), (char*)&failed); + off += sizeof(failed); + } + virtual void encode_payload(crope& s) { + s.append((char*)&failed, sizeof(failed)); + } + + virtual char *get_type_name() { return "faila"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MGenericMessage.h b/branches/sage/cephmds2/messages/MGenericMessage.h new file mode 100644 index 0000000000000..b2f39534e6e23 --- /dev/null +++ b/branches/sage/cephmds2/messages/MGenericMessage.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MGENERICMESSAGE_H +#define __MGENERICMESSAGE_H + +#include "msg/Message.h" + +class MGenericMessage : public Message { + char tname[20]; + //long pcid; + + public: + MGenericMessage(int t) : Message(t) { + sprintf(tname, "generic%d", get_type()); + } + + //void set_pcid(long pcid) { this->pcid = pcid; } + //long get_pcid() { return pcid; } + + char *get_type_name() { return tname; } + + virtual void decode_payload() { + //int off = 0; + //payload.copy(off, sizeof(pcid), (char*)&pcid); + //off += sizeof(pcid); + } + virtual void encode_payload() { + //payload.append((char*)&pcid, sizeof(pcid)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MHashDir.h b/branches/sage/cephmds2/messages/MHashDir.h new file mode 100644 index 0000000000000..ddf7e3ac2bbce --- /dev/null +++ b/branches/sage/cephmds2/messages/MHashDir.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+
+#ifndef __MHASHDIR_H
+#define __MHASHDIR_H
+
+#include "msg/Message.h"
+
+class MHashDir : public Message {  // carries an inode number, a dentry count and an opaque state blob
+  inodeno_t ino;
+  bufferlist state;  // opaque bytes; filled by callers through get_state()/get_state_ptr()
+  int nden;  // counter maintained by set_nden()/inc_nden()
+
+ public:
+  MHashDir() {}  // ino and nden stay unset until decode_payload()
+  MHashDir(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIR) {
+    this->ino = ino;
+    nden = 0;
+  }
+  virtual char *get_type_name() { return "Ha"; }
+
+  inodeno_t get_ino() { return ino; }
+  bufferlist& get_state() { return state; }
+  bufferlist* get_state_ptr() { return &state; }
+  int get_nden() { return nden; }
+
+  void set_nden(int n) { nden = n; }
+  void inc_nden() { nden++; }
+
+  void decode_payload() {  // wire layout: ino, nden, size_t length, then 'length' bytes of state
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    payload.copy(off, sizeof(nden), (char*)&nden);
+    off += sizeof(nden);
+
+    size_t len;
+    payload.copy(off, sizeof(len), (char*)&len);
+    off += sizeof(len);
+    state.substr_of(payload, off, len);  // state is taken from payload at [off, off+len)
+  }
+  void encode_payload() {  // mirrors decode_payload(); raw host-layout bytes
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&nden, sizeof(nden));
+    size_t size = state.length();  // length is captured before state is handed to claim_append
+    payload.append((char*)&size, sizeof(size));
+    payload.claim_append(state);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirAck.h b/branches/sage/cephmds2/messages/MHashDirAck.h
new file mode 100644
index 0000000000000..cd6d4da8cf34f
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirAck.h
@@ -0,0 +1,42 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRACK_H
+#define __MHASHDIRACK_H
+
+#include "MHashDir.h"
+
+class MHashDirAck : public Message {  // message of type MSG_MDS_HASHDIRACK carrying only an inode number
+  inodeno_t ino;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+
+  MHashDirAck() {}  // ino stays unset until decode_payload()
+  MHashDirAck(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIRACK) {
+    this->ino = ino;
+  }
+  virtual char *get_type_name() { return "HAck"; }
+
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);  // ino is the only field, so it sits at byte 0
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirDiscover.h b/branches/sage/cephmds2/messages/MHashDirDiscover.h
new file mode 100644
index 0000000000000..0ea1ff8b79990
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirDiscover.h
@@ -0,0 +1,52 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRDISCOVER_H
+#define __MHASHDIRDISCOVER_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+class MHashDirDiscover : public Message {  // carries an inode number together with that inode's path
+  inodeno_t ino;
+  string path;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  string& get_path() { return path; }
+
+  MHashDirDiscover() {}  // ino stays unset until decode_payload()
+  MHashDirDiscover(CInode *in) :
+    Message(MSG_MDS_HASHDIRDISCOVER) {
+    in->make_path(path);  // path is filled in by the inode itself
+    ino = in->ino();
+  }
+  virtual char *get_type_name() { return "HDis"; }
+
+
+  void decode_payload() {  // wire layout: raw ino, then the _encode()d path
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    _decode(path, payload, off);
+  }
+
+  void encode_payload() {  // mirrors decode_payload()
+    payload.append((char*)&ino, sizeof(ino));
+    _encode(path, payload);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirDiscoverAck.h b/branches/sage/cephmds2/messages/MHashDirDiscoverAck.h
new file mode 100644
index 0000000000000..34734af0f97ad
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirDiscoverAck.h
@@ -0,0 +1,53 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRDISCOVERACK_H
+#define __MHASHDIRDISCOVERACK_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+class MHashDirDiscoverAck : public Message {  // reply carrying an inode number and a success flag
+  inodeno_t ino;
+  bool success;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  bool is_success() { return success; }
+
+  MHashDirDiscoverAck() {}  // fields stay unset until decode_payload()
+  MHashDirDiscoverAck(inodeno_t ino, bool success=true) :
+    Message(MSG_MDS_HASHDIRDISCOVERACK) {
+    this->ino = ino;
+    this->success = success;  // was hard-coded to false, so every ack (including the default) reported failure
+  }
+  virtual char *get_type_name() { return "HDisA"; }
+
+
+  void decode_payload() {  // wire layout: raw ino, then raw bool
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    payload.copy(off, sizeof(success), (char*)&success);
+    off += sizeof(success);
+  }
+
+  void encode_payload() {  // mirrors decode_payload()
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&success, sizeof(success));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirNotify.h b/branches/sage/cephmds2/messages/MHashDirNotify.h
new file mode 100644
index 0000000000000..ececc3ec2cc65
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirNotify.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRNOTIFY_H
+#define __MHASHDIRNOTIFY_H
+
+#include "msg/Message.h"
+
+class MHashDirNotify : public Message {  // carries an inode number and a 'from' id
+  inodeno_t ino;
+  int from;  // presumably the sending MDS rank; confirm against callers
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  int get_from() { return from; }
+
+  MHashDirNotify() {}  // fields stay unset until decode_payload()
+  MHashDirNotify(inodeno_t ino, int from) :
+    Message(MSG_MDS_HASHDIRNOTIFY) {
+    this->ino = ino;
+    this->from = from;
+  }
+  virtual char *get_type_name() { return "HN"; }
+
+  virtual void decode_payload() {  // wire layout: raw ino, then raw int from
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    payload.copy(off, sizeof(from), (char*)&from);
+    off += sizeof(from);
+  }
+  virtual void encode_payload() {  // mirrors decode_payload()
+    payload.append((char*)&ino, sizeof(ino));
+    payload.append((char*)&from, sizeof(from));
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirPrep.h b/branches/sage/cephmds2/messages/MHashDirPrep.h
new file mode 100644
index 0000000000000..29a42217d6a4b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirPrep.h
@@ -0,0 +1,93 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRPREP_H
+#define __MHASHDIRPREP_H
+
+#include "msg/Message.h"
+#include "mds/CInode.h"
+#include "include/types.h"
+
+class MHashDirPrep : public Message {  // carries an inode number plus a dentry-name -> CInodeDiscover* table
+  inodeno_t ino;
+  bool assim;  // one-shot flag, see mark_assim()
+
+  // subdir dentry names + inodes
+  map inodes;  // NOTE(review): template arguments were lost in extraction; add_inode() implies string -> CInodeDiscover*
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  map& get_inodes() { return inodes; }
+
+  bool did_assim() { return assim; }
+  void mark_assim() { assert(!assim); assim = true; }  // asserts the flag is set at most once
+
+  MHashDirPrep() : assim(false) { }
+  MHashDirPrep(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIRPREP),
+    assim(false) {
+    this->ino = ino;
+  }
+  ~MHashDirPrep() {  // the message owns every stored pointer and deletes it here
+    for (map::iterator it = inodes.begin();
+         it != inodes.end();
+         it++)
+      delete it->second;
+  }
+
+
+  virtual char *get_type_name() { return "HP"; }
+
+  void add_inode(const string& dentry, CInodeDiscover *in) {  // stores 'in'; the destructor later deletes it
+    inodes[dentry] = in;
+  }
+
+  void decode_payload() {  // wire layout: raw ino, int count, then count (dentry, inode) pairs
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+
+    // inodes
+    int ni;
+    payload.copy(off, sizeof(int), (char*)&ni);
+    off += sizeof(int);
+    for (int i=0; i_decode(payload, off);  // NOTE(review): loop header and body between '<' and '>' were lost in extraction
+
+      inodes[dname] = in;
+    }
+  }
+
+  virtual void encode_payload() {  // mirrors decode_payload()
+    payload.append((char*)&ino, sizeof(ino));
+
+    // inodes
+    int ni = inodes.size();
+    payload.append((char*)&ni, sizeof(int));
+    for (map::iterator iit = inodes.begin();
+         iit != inodes.end();
+         iit++) {
+      _encode(iit->first, payload); // dentry
+      iit->second->_encode(payload); // inode
+    }
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashDirPrepAck.h b/branches/sage/cephmds2/messages/MHashDirPrepAck.h
new file mode 100644
index 0000000000000..1d0db35c10f88
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashDirPrepAck.h
@@ -0,0 +1,43 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU
Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHDIRPREPACK_H
+#define __MHASHDIRPREPACK_H
+
+#include "msg/Message.h"
+#include "include/types.h"
+
+class MHashDirPrepAck : public Message {  // message of type MSG_MDS_HASHDIRPREPACK carrying only an inode number
+  inodeno_t ino;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+
+  MHashDirPrepAck() {}  // ino stays unset until decode_payload()
+  MHashDirPrepAck(inodeno_t ino) :
+    Message(MSG_MDS_HASHDIRPREPACK) {
+    this->ino = ino;
+  }
+
+  virtual char *get_type_name() { return "HPAck"; }
+
+  void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);  // ino is the only field, so it sits at byte 0
+  }
+  void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashReaddir.h b/branches/sage/cephmds2/messages/MHashReaddir.h
new file mode 100644
index 0000000000000..864cb6944aeda
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashReaddir.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHREADDIR_H
+#define __MHASHREADDIR_H
+
+#include "include/types.h"
+#include "msg/Message.h"
+
+class MHashReaddir : public Message {  // message of type MSG_MDS_HASHREADDIR carrying only an inode number
+  inodeno_t ino;
+
+ public:
+  MHashReaddir() { }  // ino stays unset until decode_payload()
+  MHashReaddir(inodeno_t ino) :
+    Message(MSG_MDS_HASHREADDIR) {
+    this->ino = ino;
+  }
+
+  inodeno_t get_ino() { return ino; }
+
+  virtual char *get_type_name() { return "Hls"; }
+
+  virtual void decode_payload() {
+    payload.copy(0, sizeof(ino), (char*)&ino);  // ino is the only field, so it sits at byte 0
+  }
+  virtual void encode_payload() {
+    payload.append((char*)&ino, sizeof(ino));
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHashReaddirReply.h b/branches/sage/cephmds2/messages/MHashReaddirReply.h
new file mode 100644
index 0000000000000..d9d73d8528f00
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHashReaddirReply.h
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHASHREADDIRREPLY_H
+#define __MHASHREADDIRREPLY_H
+
+#include "MClientReply.h"
+
+class MHashReaddirReply : public Message {  // carries an inode number plus parallel lists of inode records and dentry names
+  inodeno_t ino;
+
+  list dir_in;  // owned pointers, deleted in the destructor (template arguments lost in extraction)
+  list dir_dn;  // walked in step with dir_in by encode_payload()
+
+  int num;  // set only by the constructor; not part of the wire format
+
+ public:
+  MHashReaddirReply() { }  // ino and num stay unset until decode_payload()
+  MHashReaddirReply(inodeno_t _ino, list& inls, list& dnls, int n) :
+    Message(MSG_MDS_HASHREADDIRREPLY),
+    ino(_ino),
+    num(n) {
+    dir_in.swap(inls);  // takes the caller's lists and leaves theirs empty
+    dir_dn.swap(dnls);
+  }
+  ~MHashReaddirReply() {
+    for (list::iterator it = dir_in.begin(); it != dir_in.end(); it++)
+      delete *it;
+  }
+
+  inodeno_t get_ino() { return ino; }
+  list& get_in() { return dir_in; }
+  list& get_dn() { return dir_dn; }
+
+  virtual char *get_type_name() { return "Hls"; }
+
+  virtual void decode_payload() {  // wire layout: raw ino, int count, then count (dentry, inode) pairs
+    int off = 0;
+    payload.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    int n;
+    payload.copy(off, sizeof(n), (char*)&n);  // was copy(n, ...): used the uninitialized count as the read offset
+    off += sizeof(n);
+    for (int i=0; i_decode(payload, off);  // NOTE(review): loop header and body between '<' and '>' were lost in extraction
+      dir_in.push_back(ci);
+    }
+  }
+  virtual void encode_payload() {  // mirrors decode_payload()
+    payload.append((char*)&ino, sizeof(ino));
+    int n = dir_in.size(); // FIXME?
+    payload.append((char*)&n, sizeof(n));
+    list::iterator pdn = dir_dn.begin();
+    for (list::iterator pin = dir_in.begin();
+         pin != dir_in.end();
+         ++pin, ++pdn) {
+      ::_encode(*pdn, payload);
+      (*pin)->_encode(payload);
+    }
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MHeartbeat.h b/branches/sage/cephmds2/messages/MHeartbeat.h
new file mode 100644
index 0000000000000..55455f406ef18
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MHeartbeat.h
@@ -0,0 +1,81 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MHEARTBEAT_H
+#define __MHEARTBEAT_H
+
+#include "include/types.h"
+#include "msg/Message.h"
+
+class MHeartbeat : public Message {  // carries a load record, a beat counter and an int -> float import map
+  mds_load_t load;
+  int beat;
+  map import_map;  // NOTE(review): template arguments lost in extraction; the codec below treats it as int -> float
+
+ public:
+  mds_load_t& get_load() { return load; }
+  int get_beat() { return beat; }
+
+  map& get_import_map() {
+    return import_map;
+  }
+
+  MHeartbeat() {}  // beat stays unset until decode_payload()
+  MHeartbeat(mds_load_t& load, int beat) :
+    Message(MSG_MDS_HEARTBEAT) {
+    this->load = load;
+    this->beat = beat;
+  }
+
+  virtual char *get_type_name() { return "HB"; }
+
+  virtual void decode_payload(crope& s, int& off) {  // wire layout: load, beat, int count, then count (int, float) pairs
+    s.copy(off,sizeof(load), (char*)&load);
+    off += sizeof(load);
+    s.copy(off, sizeof(beat), (char*)&beat);
+    off += sizeof(beat);
+
+    int n;
+    s.copy(off, sizeof(n), (char*)&n);
+    off += sizeof(n);
+    while (n--) {  // one iteration per encoded map entry
+      int f;
+      s.copy(off, sizeof(f), (char*)&f);
+      off += sizeof(f);
+      float v;
+      s.copy(off, sizeof(v), (char*)&v);
+      off += sizeof(v);
+      import_map[f] = v;
+    }
+  }
+  virtual void encode_payload(crope& s) {  // mirrors decode_payload(); raw host-layout bytes
+    s.append((char*)&load, sizeof(load));
+    s.append((char*)&beat, sizeof(beat));
+
+    int n = import_map.size();
+    s.append((char*)&n, sizeof(n));
+    for (map::iterator it = import_map.begin();
+         it != import_map.end();
+         it++) {
+      int f = it->first;
+      s.append((char*)&f, sizeof(f));
+      float v = it->second;
+      s.append((char*)&v, sizeof(v));
+    }
+
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeExpire.h b/branches/sage/cephmds2/messages/MInodeExpire.h
new file mode 100644
index 0000000000000..637f378324022
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeExpire.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.
See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEEXPIRE_H
+#define __MINODEEXPIRE_H
+
+typedef struct {  // wire image: the whole struct is copied as raw bytes
+  inodeno_t ino;
+  int nonce;
+  int from;
+} MInodeExpire_st;
+
+class MInodeExpire : public Message {  // carries an (ino, from, nonce) triple
+  MInodeExpire_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+  int get_nonce() { return st.nonce; }
+
+  MInodeExpire() {}  // st stays unset until decode_payload()
+  MInodeExpire(inodeno_t ino, int from, int nonce) :
+    Message(MSG_MDS_INODEEXPIRE) {
+    st.ino = ino;
+    st.from = from;
+    st.nonce = nonce;
+  }
+  virtual char *get_type_name() { return "InEx";}
+
+  virtual void decode_payload(crope& s, int& off) {  // reads sizeof(st) bytes at 'off' and advances it
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {  // appends the struct verbatim, padding included
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeFileCaps.h b/branches/sage/cephmds2/messages/MInodeFileCaps.h
new file mode 100644
index 0000000000000..5bd51be0e347b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeFileCaps.h
@@ -0,0 +1,55 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEFILECAPS_H
+#define __MINODEFILECAPS_H
+
+class MInodeFileCaps : public Message {  // carries an (ino, from, caps) triple
+  inodeno_t ino;
+  int from;
+  int caps;
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  int get_from() { return from; }
+  int get_caps() { return caps; }
+
+  MInodeFileCaps() {}  // fields stay unset until decode_payload()
+  // from auth
+  MInodeFileCaps(inodeno_t ino, int from, int caps) :
+    Message(MSG_MDS_INODEFILECAPS) {
+
+    this->ino = ino;
+    this->from = from;
+    this->caps = caps;
+  }
+
+  virtual char *get_type_name() { return "Icap";}
+
+  virtual void decode_payload(crope& s, int& off) {  // wire order is from, ino, caps (not declaration order)
+    s.copy(off, sizeof(from), (char*)&from);
+    off += sizeof(from);
+    s.copy(off, sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    s.copy(off, sizeof(caps), (char*)&caps);
+    off += sizeof(caps);
+  }
+  virtual void encode_payload(crope& s) {  // same from, ino, caps order as decode_payload()
+    s.append((char*)&from, sizeof(from));
+    s.append((char*)&ino, sizeof(ino));
+    s.append((char*)&caps, sizeof(caps));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeLink.h b/branches/sage/cephmds2/messages/MInodeLink.h
new file mode 100644
index 0000000000000..feefc4ea21c7b
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeLink.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODELINK_H
+#define __MINODELINK_H
+
+typedef struct {  // wire image: the whole struct is copied as raw bytes
+  inodeno_t ino;
+  int from;
+} MInodeLink_st;
+
+class MInodeLink : public Message {  // carries an (ino, from) pair
+  MInodeLink_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+
+  MInodeLink() {}  // st stays unset until decode_payload()
+  MInodeLink(inodeno_t ino, int from) :
+    Message(MSG_MDS_INODELINK) {
+    st.ino = ino;
+    st.from = from;
+  }
+  virtual char *get_type_name() { return "InL";}
+
+  virtual void decode_payload(crope& s, int& off) {  // reads sizeof(st) bytes at 'off' and advances it
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {  // appends the struct verbatim, padding included
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeLinkAck.h b/branches/sage/cephmds2/messages/MInodeLinkAck.h
new file mode 100644
index 0000000000000..987b70741edcb
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeLinkAck.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODELINKACK_H
+#define __MINODELINKACK_H
+
+typedef struct {  // wire image: the whole struct is copied as raw bytes
+  inodeno_t ino;
+  bool success;
+} MInodeLinkAck_st;
+
+class MInodeLinkAck : public Message {  // reply carrying an inode number and a success flag
+  MInodeLinkAck_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  bool is_success() { return st.success; }
+
+  MInodeLinkAck() {}  // st stays unset until decode_payload()
+  MInodeLinkAck(inodeno_t ino, bool success) :
+    Message(MSG_MDS_INODELINKACK) {
+    st.ino = ino;
+    st.success = success;
+  }
+  virtual char *get_type_name() { return "InLA";}
+
+  virtual void decode_payload(crope& s, int& off) {  // reads sizeof(st) bytes at 'off' and advances it
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {  // appends the struct verbatim, padding included
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeUnlink.h b/branches/sage/cephmds2/messages/MInodeUnlink.h
new file mode 100644
index 0000000000000..e1aa463153c26
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeUnlink.h
@@ -0,0 +1,47 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEUNLINK_H
+#define __MINODEUNLINK_H
+
+typedef struct {  // wire image: the whole struct is copied as raw bytes
+  inodeno_t ino;
+  int from;
+} MInodeUnlink_st;
+
+class MInodeUnlink : public Message {  // carries an (ino, from) pair
+  MInodeUnlink_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+  int get_from() { return st.from; }
+
+  MInodeUnlink() {}  // st stays unset until decode_payload()
+  MInodeUnlink(inodeno_t ino, int from) :
+    Message(MSG_MDS_INODEUNLINK) {
+    st.ino = ino;
+    st.from = from;
+  }
+  virtual char *get_type_name() { return "InUl";}
+
+  virtual void decode_payload(crope& s, int& off) {  // reads sizeof(st) bytes at 'off' and advances it
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {  // appends the struct verbatim, padding included
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeUnlinkAck.h b/branches/sage/cephmds2/messages/MInodeUnlinkAck.h
new file mode 100644
index 0000000000000..283c016f2bec9
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeUnlinkAck.h
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEUNLINKACK_H
+#define __MINODEUNLINKACK_H
+
+typedef struct {  // wire image: the whole struct is copied as raw bytes
+  inodeno_t ino;
+} MInodeUnlinkAck_st;
+
+class MInodeUnlinkAck : public Message {  // message of type MSG_MDS_INODEUNLINKACK carrying only an inode number
+  MInodeUnlinkAck_st st;
+
+ public:
+  inodeno_t get_ino() { return st.ino; }
+
+  MInodeUnlinkAck() {}  // st stays unset until decode_payload()
+  MInodeUnlinkAck(inodeno_t ino) :
+    Message(MSG_MDS_INODEUNLINKACK) {
+    st.ino = ino;
+  }
+  virtual char *get_type_name() { return "InUlA";}
+
+  virtual void decode_payload(crope& s, int& off) {  // reads sizeof(st) bytes at 'off' and advances it
+    s.copy(off, sizeof(st), (char*)&st);
+    off += sizeof(st);
+  }
+  virtual void encode_payload(crope& s) {  // appends the struct verbatim
+    s.append((char*)&st,sizeof(st));
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MInodeUpdate.h b/branches/sage/cephmds2/messages/MInodeUpdate.h
new file mode 100644
index 0000000000000..bbab924089aa5
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MInodeUpdate.h
@@ -0,0 +1,61 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MINODEUPDATE_H
+#define __MINODEUPDATE_H
+
+#include "msg/Message.h"
+
+#include
+using namespace std;
+
+class MInodeUpdate : public Message {  // carries a nonce plus an inode's encoded basic state
+  int nonce;
+  crope inode_basic_state;  // opaque blob produced by CInode::encode_basic_state()
+
+ public:
+  inodeno_t get_ino() {  // reads the first sizeof(inodeno_t) bytes of the blob as the inode number
+    inodeno_t ino;
+    inode_basic_state.copy(0, sizeof(inodeno_t), (char*)&ino);
+    return ino;
+  }
+  int get_nonce() { return nonce; }
+
+  MInodeUpdate() {}  // nonce stays unset until decode_payload()
+  MInodeUpdate(CInode *in, int nonce) :
+    Message(MSG_MDS_INODEUPDATE) {
+    inode_basic_state = in->encode_basic_state();
+    this->nonce = nonce;
+  }
+  virtual char *get_type_name() { return "Iup"; }
+
+  virtual void decode_payload(crope& s, int& off) {  // wire layout: int nonce, size_t length, then 'length' bytes
+    s.copy(off, sizeof(int), (char*)&nonce);
+    off += sizeof(int);
+    size_t len;
+    s.copy(off, sizeof(len), (char*)&len);
+    off += sizeof(len);
+    inode_basic_state = s.substr(off, len);
+    off += len;
+  }
+  virtual void encode_payload(crope& s) {  // mirrors decode_payload()
+    s.append((char*)&nonce, sizeof(int));
+    size_t len = inode_basic_state.length();
+    s.append((char*)&len, sizeof(len));
+    s.append(inode_basic_state);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MLock.h b/branches/sage/cephmds2/messages/MLock.h
new file mode 100644
index 0000000000000..1d22d297d79d4
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MLock.h
@@ -0,0 +1,128 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MLOCK_H
+#define __MLOCK_H
+
+#include "msg/Message.h"
+
+#define LOCK_OTYPE_IHARD 1
+#define LOCK_OTYPE_IFILE 2
+#define LOCK_OTYPE_DIR 3
+#define LOCK_OTYPE_DN 4
+
+// for replicas
+#define LOCK_AC_SYNC 0
+#define LOCK_AC_MIXED 1
+#define LOCK_AC_LOCK 2
+
+#define LOCK_AC_REQXLOCKACK 9 // req dentry xlock
+#define LOCK_AC_REQXLOCKNAK 10 // req dentry xlock
+#define LOCK_AC_LOCKNAK 12 // for dentry xlock
+
+
+#define LOCK_AC_FOR_REPLICA(a) ((a) <= 10)
+#define LOCK_AC_FOR_AUTH(a) ((a) >= 11)
+
+// for auth
+
+#define LOCK_AC_SYNCACK 13
+#define LOCK_AC_MIXEDACK 14
+#define LOCK_AC_LOCKACK 15
+
+
+#define LOCK_AC_REQREAD 19
+#define LOCK_AC_REQWRITE 20
+
+#define LOCK_AC_REQXLOCK 21
+#define LOCK_AC_REQXLOCKC 22 // create if necessary
+#define LOCK_AC_UNXLOCK 23
+
+#define lock_ac_name(x)
+
+
+class MLock : public Message {  // lock action on an inode, directory or dentry, with optional data and path
+  int asker; // who is initiating this request
+  int action; // action type
+
+  char otype; // lock object type
+  inodeno_t ino; // ino ref, or possibly
+  string dn; // dentry name
+  bufferlist data; // and possibly some data
+  string path; // possibly a path too (for dentry lock discovers)
+
+ public:
+  inodeno_t get_ino() { return ino; }
+  string& get_dn() { return dn; }
+  bufferlist& get_data() { return data; }
+  int get_asker() { return asker; }
+  int get_action() { return action; }
+  int get_otype() { return otype; }
+  string& get_path() { return path; }
+
+  MLock() {}  // scalar fields stay unset until decode_payload()
+  MLock(int action, int asker) :
+    Message(MSG_MDS_LOCK) {
+    this->action = action;
+    this->asker = asker;
+  }
+  virtual char *get_type_name() { return "ILock"; }
+
+  void set_ino(inodeno_t ino, char ot) {  // caller supplies the object type explicitly
+    otype = ot;
+    this->ino = ino;
+  }
+  void set_dirino(inodeno_t dirino) {  // target is a directory lock
+    otype = LOCK_OTYPE_DIR;
+    this->ino = dirino;  // was 'this->ino = ino': a self-assignment that dropped the argument
+  }
+  void set_dn(inodeno_t dirino, string& dn) {  // target is dentry 'dn' inside directory 'dirino'
+    otype = LOCK_OTYPE_DN;
+    this->ino = dirino;
+    this->dn = dn;
+  }
+  void set_data(bufferlist& data) {
+    this->data.claim( data );  // hands the caller's bufferlist to claim(), not a plain copy
+  }
+  void set_path(const string& p) {
+    path = p;
+  }
+
+  void decode_payload() {  // wire layout: action, asker, otype, ino, then dn, path, data
+    int off = 0;
+    payload.copy(off,sizeof(action), (char*)&action);
+    off += sizeof(action);
+    payload.copy(off,sizeof(asker), (char*)&asker);
+    off += sizeof(asker);
+    payload.copy(off,sizeof(otype), (char*)&otype);
+    off += sizeof(otype);
+    payload.copy(off,sizeof(ino), (char*)&ino);
+    off += sizeof(ino);
+    ::_decode(dn, payload, off);
+    ::_decode(path, payload, off);
+    ::_decode(data, payload, off);
+  }
+  virtual void encode_payload() {  // mirrors decode_payload()
+    payload.append((char*)&action, sizeof(action));
+    payload.append((char*)&asker, sizeof(asker));
+    payload.append((char*)&otype, sizeof(otype));
+    payload.append((char*)&ino, sizeof(inodeno_t));
+    ::_encode(dn, payload);
+    ::_encode(path, payload);
+    ::_encode(data, payload);
+  }
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMDSBoot.h b/branches/sage/cephmds2/messages/MMDSBoot.h
new file mode 100644
index 0000000000000..c0c554152cc87
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMDSBoot.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMDSBOOT_H
+#define __MMDSBOOT_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMDSBoot : public Message {  // payload-less message of type MSG_MDS_BOOT
+ public:
+  MMDSBoot() : Message(MSG_MDS_BOOT) {
+  }
+
+  char *get_type_name() { return "mdsboot"; }
+
+  void encode_payload() {  // nothing to encode: the 'sb' field is commented out
+    //payload.append((char*)&sb, sizeof(sb));
+  }
+  void decode_payload() {  // nothing to decode
+    //int off = 0;
+    //payload.copy(off, sizeof(sb), (char*)&sb);
+    //off += sizeof(sb);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMDSGetMap.h b/branches/sage/cephmds2/messages/MMDSGetMap.h
new file mode 100644
index 0000000000000..6bb6b92c00ccd
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMDSGetMap.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __MMDSGETMAP_H
+#define __MMDSGETMAP_H
+
+#include "msg/Message.h"
+
+#include "include/types.h"
+
+class MMDSGetMap : public Message {  // payload-less message of type MSG_MDS_GETMAP
+ public:
+  MMDSGetMap() : Message(MSG_MDS_GETMAP) {
+  }
+
+  char *get_type_name() { return "mdsgetmap"; }
+
+  void encode_payload() {  // nothing to encode: the 'sb' field is commented out
+    //payload.append((char*)&sb, sizeof(sb));
+  }
+  void decode_payload() {  // nothing to decode
+    //int off = 0;
+    //payload.copy(off, sizeof(sb), (char*)&sb);
+    //off += sizeof(sb);
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMDSMap.h b/branches/sage/cephmds2/messages/MMDSMap.h
new file mode 100644
index 0000000000000..c8dd60abcb331
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMDSMap.h
@@ -0,0 +1,69 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __MMDSMAP_H
+#define __MMDSMAP_H
+
+#include "msg/Message.h"
+#include "mds/MDSMap.h"
+
+
+class MMDSMap : public Message {  // carries encoded MDS maps keyed by epoch, full and incremental
+ public:
+  map maps;  // NOTE(review): template arguments lost in extraction; keyed by epoch per the constructor below
+  map incremental_maps;
+
+  epoch_t get_first() {  // smallest key across both tables; 0 when both are empty
+    epoch_t e = 0;
+    map::iterator i = maps.begin();
+    if (i != maps.end()) e = i->first;
+    i = incremental_maps.begin();
+    if (i != incremental_maps.end() &&
+        (e == 0 || i->first < e)) e = i->first;  // e == 0 means 'maps' contributed no epoch
+    return e;
+  }
+  epoch_t get_last() {  // largest key across both tables; 0 when both are empty
+    epoch_t e = 0;
+    map::reverse_iterator i = maps.rbegin();
+    if (i != maps.rend()) e = i->first;
+    i = incremental_maps.rbegin();
+    if (i != incremental_maps.rend() &&
+        (e == 0 || i->first > e)) e = i->first;
+    return e;
+  }
+
+
+  MMDSMap() :
+    Message(MSG_MDS_MAP) {}
+  MMDSMap(MDSMap *mm) :
+    Message(MSG_MDS_MAP) {
+    mm->encode(maps[mm->get_epoch()]);  // encodes the map straight into the slot for its own epoch
+  }
+
+
+  // marshalling
+  virtual void decode_payload() {  // wire layout: maps, then incremental_maps
+    int off = 0;
+    ::_decode(maps, payload, off);
+    ::_decode(incremental_maps, payload, off);
+  }
+  virtual void encode_payload() {  // mirrors decode_payload()
+    ::_encode(maps, payload);
+    ::_encode(incremental_maps, payload);
+  }
+
+  virtual char *get_type_name() { return "mdsmap"; }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/messages/MMonElectionAck.h b/branches/sage/cephmds2/messages/MMonElectionAck.h
new file mode 100644
index 0000000000000..dbfa30c9cb099
--- /dev/null
+++ b/branches/sage/cephmds2/messages/MMonElectionAck.h
@@ -0,0 +1,46 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ * + */ + + +#ifndef __MMONELECTIONACK_H +#define __MMONELECTIONACK_H + +#include "msg/Message.h" + + +class MMonElectionAck : public Message { + public: + int q; + int refresh_num; + + MMonElectionAck() {} + MMonElectionAck(int _q, int _n) : + Message(MSG_MON_ELECTION_ACK), + q(_q), refresh_num(_n) {} + + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(q), (char*)&q); + off += sizeof(q); + payload.copy(off, sizeof(refresh_num), (char*)&refresh_num); + off += sizeof(refresh_num); + } + void encode_payload() { + payload.append((char*)&q, sizeof(q)); + payload.append((char*)&refresh_num, sizeof(refresh_num)); + } + + virtual char *get_type_name() { return "MonElAck"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MMonElectionCollect.h b/branches/sage/cephmds2/messages/MMonElectionCollect.h new file mode 100644 index 0000000000000..d91870dfce5c6 --- /dev/null +++ b/branches/sage/cephmds2/messages/MMonElectionCollect.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MMONELECTIONCOLLECT_H +#define __MMONELECTIONCOLLECT_H + +#include "msg/Message.h" + + +class MMonElectionCollect : public Message { + public: + int read_num; + + MMonElectionCollect() {} + MMonElectionCollect(int n) : + Message(MSG_MON_ELECTION_COLLECT), + read_num(n) {} + + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(read_num), (char*)&read_num); + off += sizeof(read_num); + } + void encode_payload() { + payload.append((char*)&read_num, sizeof(read_num)); + } + + virtual char *get_type_name() { return "MonElCollect"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MMonElectionRefresh.h b/branches/sage/cephmds2/messages/MMonElectionRefresh.h new file mode 100644 index 0000000000000..497276f06b12f --- /dev/null +++ b/branches/sage/cephmds2/messages/MMonElectionRefresh.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MMONELECTIONREFRESH_H +#define __MMONELECTIONREFRESH_H + +#include "msg/Message.h" + +#include "mon/Elector.h" + +class MMonElectionRefresh : public Message { + public: + int p; + Elector::State state; + int refresh_num; + + MMonElectionRefresh() {} + MMonElectionRefresh(int _p, Elector::State& s, int r) : + Message(MSG_MON_ELECTION_REFRESH), + p(_p), state(s), refresh_num(r) {} + + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(p), (char*)&p); + off += sizeof(p); + payload.copy(off, sizeof(state), (char*)&state); + off += sizeof(state); + payload.copy(off, sizeof(refresh_num), (char*)&refresh_num); + off += sizeof(refresh_num); + } + void encode_payload() { + payload.append((char*)&p, sizeof(p)); + payload.append((char*)&state, sizeof(state)); + payload.append((char*)&refresh_num, sizeof(refresh_num)); + } + + virtual char *get_type_name() { return "MonElRefresh"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MMonElectionStatus.h b/branches/sage/cephmds2/messages/MMonElectionStatus.h new file mode 100644 index 0000000000000..071d0fcc82e0a --- /dev/null +++ b/branches/sage/cephmds2/messages/MMonElectionStatus.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MMONELECTIONSTATUS_H +#define __MMONELECTIONSTATUS_H + +#include "msg/Message.h" + +#include "mon/Elector.h" + +class MMonElectionStatus : public Message { + public: + int q; + int read_num; + map registry; + + MMonElectionStatus() {} + MMonElectionStatus(int _q, int r, map reg) : + Message(MSG_MON_ELECTION_STATUS), + q(_q), read_num(r), registry(reg) {} + + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(q), (char*)&q); + off += sizeof(q); + payload.copy(off, sizeof(read_num), (char*)&read_num); + off += sizeof(read_num); + ::_decode(registry, payload, off); + } + void encode_payload() { + payload.append((char*)&q, sizeof(q)); + payload.append((char*)&read_num, sizeof(read_num)); + ::_encode(registry, payload); + } + + virtual char *get_type_name() { return "MonElStatus"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MMonOSDMapInfo.h b/branches/sage/cephmds2/messages/MMonOSDMapInfo.h new file mode 100644 index 0000000000000..182b36f0a57cf --- /dev/null +++ b/branches/sage/cephmds2/messages/MMonOSDMapInfo.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MMONOSDMAPINFO_H +#define __MMONOSDMAPINFO_H + +#include "msg/Message.h" + +#include "include/types.h" + +class MMonOSDMapInfo : public Message { + public: + epoch_t epoch; + epoch_t mon_epoch; + + epoch_t get_epoch() { return epoch; } + epoch_t get_mon_epoch() { return mon_epoch; } + + MMonOSDMapInfo(epoch_t e, epoch_t me) : + Message(MSG_MON_OSDMAP_UPDATE_PREPARE), + epoch(e), mon_epoch(me) { + } + + char *get_type_name() { return "omap_info"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + payload.append((char*)&mon_epoch, sizeof(mon_epoch)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + payload.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); + off += sizeof(mon_epoch); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MMonOSDMapLease.h b/branches/sage/cephmds2/messages/MMonOSDMapLease.h new file mode 100644 index 0000000000000..c6112bd898cae --- /dev/null +++ b/branches/sage/cephmds2/messages/MMonOSDMapLease.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MMONOSDMAPLEASE_H +#define __MMONOSDMAPLEASE_H + +#include "msg/Message.h" + +#include "include/types.h" + +class MMonOSDMapLease : public Message { + epoch_t epoch; + utime_t lease_expire; + + public: + epoch_t get_epoch() { return epoch; } + const utime_t& get_lease_expire() { return lease_expire; } + + MMonOSDMapLease(epoch_t e, utime_t le) : + Message(MSG_MON_OSDMAP_LEASE), + epoch(e), lease_expire(le) { + } + + char *get_type_name() { return "omap_lease"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + payload.append((char*)&lease_expire, sizeof(lease_expire)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + payload.copy(off, sizeof(lease_expire), (char*)&lease_expire); + off += sizeof(lease_expire); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MMonOSDMapLeaseAck.h b/branches/sage/cephmds2/messages/MMonOSDMapLeaseAck.h new file mode 100644 index 0000000000000..85d5ea7c02809 --- /dev/null +++ b/branches/sage/cephmds2/messages/MMonOSDMapLeaseAck.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MMONOSDMAPLEASEACK_H +#define __MMONOSDMAPLEASEACK_H + +#include "msg/Message.h" + +#include "include/types.h" + +class MMonOSDMapLeaseAck : public Message { + epoch_t epoch; + +public: + epoch_t get_epoch() { return epoch; } + + MMonOSDMapLeaseAck(epoch_t e) : + Message(MSG_MON_OSDMAP_LEASE_ACK), + epoch(e) { + } + + char *get_type_name() { return "omap_lease_ack"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MMonOSDMapUpdateAck.h b/branches/sage/cephmds2/messages/MMonOSDMapUpdateAck.h new file mode 100644 index 0000000000000..8673788f0632f --- /dev/null +++ b/branches/sage/cephmds2/messages/MMonOSDMapUpdateAck.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MMONOSDMAPUPDATEACK_H +#define __MMONOSDMAPUPDATEACK_H + +#include "msg/Message.h" + +#include "include/types.h" + +class MMonOSDMapUpdateAck : public Message { +public: + epoch_t epoch; + + MMonOSDMapUpdateAck(epoch_t e) : + Message(MSG_MON_OSDMAP_UPDATE_ACK), + epoch(e) { + } + + char *get_type_name() { return "omap_update_ack"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MMonOSDMapUpdateCommit.h b/branches/sage/cephmds2/messages/MMonOSDMapUpdateCommit.h new file mode 100644 index 0000000000000..6f12a8e3c784d --- /dev/null +++ b/branches/sage/cephmds2/messages/MMonOSDMapUpdateCommit.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MMONOSDMAPUPDATECOMMIT_H +#define __MMONOSDMAPUPDATECOMMIT_H + +#include "msg/Message.h" + +#include "include/types.h" + +class MMonOSDMapUpdateCommit : public Message { + public: + epoch_t epoch; + + MMonOSDMapUpdateCommit(epoch_t e) : + Message(MSG_MON_OSDMAP_UPDATE_COMMIT), + epoch(e) { + } + + char *get_type_name() { return "omap_update_commit"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MMonOSDMapUpdatePrepare.h b/branches/sage/cephmds2/messages/MMonOSDMapUpdatePrepare.h new file mode 100644 index 0000000000000..bc962ea2b3eb2 --- /dev/null +++ b/branches/sage/cephmds2/messages/MMonOSDMapUpdatePrepare.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MMONOSDMAPUPDATEPREPARE_H +#define __MMONOSDMAPUPDATEPREPARE_H + +#include "msg/Message.h" + +#include "include/types.h" + +class MMonOSDMapUpdatePrepare : public Message { + public: + epoch_t epoch; + bufferlist map_bl; + bufferlist inc_map_bl; + + epoch_t get_epoch() { return epoch; } + + MMonOSDMapUpdatePrepare(epoch_t e, + bufferlist& mbl, bufferlist& incmbl) : + Message(MSG_MON_OSDMAP_UPDATE_PREPARE), + epoch(e), + map_bl(mbl), inc_map_bl(incmbl) { + } + + char *get_type_name() { return "omap_update_prepare"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + ::_encode(map_bl, payload); + ::_encode(inc_map_bl, payload); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + ::_decode(map_bl, payload, off); + ::_decode(inc_map_bl, payload, off); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MNSConnect.h b/branches/sage/cephmds2/messages/MNSConnect.h new file mode 100644 index 0000000000000..28150f79d8476 --- /dev/null +++ b/branches/sage/cephmds2/messages/MNSConnect.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MNSCONNECT_H +#define __MNSCONNECT_H + +#include "msg/Message.h" +#include "msg/tcp.h" + +class MNSConnect : public Message { + tcpaddr_t tcpaddr; + + public: + MNSConnect() {} + MNSConnect(tcpaddr_t t) : + Message(MSG_NS_CONNECT) { + tcpaddr = t; + } + + char *get_type_name() { return "NSCon"; } + + tcpaddr_t& get_addr() { return tcpaddr; } + + void encode_payload() { + payload.append((char*)&tcpaddr, sizeof(tcpaddr)); + } + void decode_payload() { + payload.copy(0, sizeof(tcpaddr), (char*)&tcpaddr); + } +}; + + +#endif + diff --git a/branches/sage/cephmds2/messages/MNSConnectAck.h b/branches/sage/cephmds2/messages/MNSConnectAck.h new file mode 100644 index 0000000000000..696b13f2a41e6 --- /dev/null +++ b/branches/sage/cephmds2/messages/MNSConnectAck.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MNSCONNECTACK_H +#define __MNSCONNECTACK_H + +#include "msg/Message.h" +#include "msg/TCPMessenger.h" + +class MNSConnectAck : public Message { + int rank; + int inst; + + public: + MNSConnectAck() {} + MNSConnectAck(int r, int g=0) : + Message(MSG_NS_CONNECTACK) { + rank = r; + inst = g; + } + + char *get_type_name() { return "NSConA"; } + + int get_rank() { return rank; } + int get_inst() { return inst; } + + void encode_payload() { + payload.append((char*)&rank, sizeof(rank)); + payload.append((char*)&inst, sizeof(inst)); + } + void decode_payload() { + unsigned off = 0; + payload.copy(off, sizeof(rank), (char*)&rank); + off += sizeof(rank); + payload.copy(off, sizeof(inst), (char*)&inst); + off += sizeof(inst); + } +}; + + +#endif + diff --git a/branches/sage/cephmds2/messages/MNSFailure.h b/branches/sage/cephmds2/messages/MNSFailure.h new file mode 100644 index 0000000000000..405bfcfd2dacb --- /dev/null +++ b/branches/sage/cephmds2/messages/MNSFailure.h @@ -0,0 +1,52 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MNSFAILURE_H +#define __MNSFAILURE_H + +#include "msg/Message.h" +#include "msg/tcp.h" + +class MNSFailure : public Message { + //msg_addr_t entity; + entity_inst_t inst; + + public: + MNSFailure() {} + MNSFailure(entity_inst_t& i) : + Message(MSG_NS_FAILURE), + //entity(w), + inst(i) {} + + char *get_type_name() { return "NSFail"; } + + //msg_addr_t &get_entity() { return entity; } + entity_inst_t &get_inst() { return inst; } + + void encode_payload() { + //payload.append((char*)&entity, sizeof(entity)); + payload.append((char*)&inst, sizeof(inst)); + } + void decode_payload() { + unsigned off = 0; + //payload.copy(off, sizeof(entity), (char*)&entity); + //off += sizeof(entity); + payload.copy(off, sizeof(inst), (char*)&inst); + off += sizeof(inst); + } +}; + + +#endif + diff --git a/branches/sage/cephmds2/messages/MNSLookup.h b/branches/sage/cephmds2/messages/MNSLookup.h new file mode 100644 index 0000000000000..cbea43092908a --- /dev/null +++ b/branches/sage/cephmds2/messages/MNSLookup.h @@ -0,0 +1,46 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MNSLOOKUP_H +#define __MNSLOOKUP_H + +#include "msg/Message.h" + +class MNSLookup : public Message { + msg_addr_t entity; + + public: + MNSLookup() {} + MNSLookup(msg_addr_t e) : + Message(MSG_NS_LOOKUP) { + entity = e; + } + + char *get_type_name() { return "NSLook"; } + + msg_addr_t get_entity() { return entity; } + + void encode_payload() { + payload.append((char*)&entity, sizeof(entity)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(entity), (char*)&entity); + off += sizeof(entity); + } +}; + + +#endif + diff --git a/branches/sage/cephmds2/messages/MNSLookupReply.h b/branches/sage/cephmds2/messages/MNSLookupReply.h new file mode 100644 index 0000000000000..e87b48435c92a --- /dev/null +++ b/branches/sage/cephmds2/messages/MNSLookupReply.h @@ -0,0 +1,44 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MNSLOOKUPREPLY_H +#define __MNSLOOKUPREPLY_H + +#include "msg/Message.h" +#include "msg/TCPMessenger.h" + +class MNSLookupReply : public Message { + public: + map entity_map; + + public: + MNSLookupReply() {} + MNSLookupReply(MNSLookup *m) : + Message(MSG_NS_LOOKUPREPLY) { + } + + char *get_type_name() { return "NSLookR"; } + + void encode_payload() { + ::_encode(entity_map, payload); + } + void decode_payload() { + int off = 0; + ::_decode(entity_map, payload, off); + } +}; + + +#endif + diff --git a/branches/sage/cephmds2/messages/MNSRegister.h b/branches/sage/cephmds2/messages/MNSRegister.h new file mode 100644 index 0000000000000..9af0dd15aa1dc --- /dev/null +++ b/branches/sage/cephmds2/messages/MNSRegister.h @@ -0,0 +1,59 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MNSREGISTER_H +#define __MNSREGISTER_H + +#include "msg/Message.h" +#include "msg/TCPMessenger.h" + +class MNSRegister : public Message { + msg_addr_t addr; + int rank; + long tid; + + public: + MNSRegister() {} + MNSRegister(msg_addr_t a, int r, int ti) : + Message(MSG_NS_REGISTER) { + addr = a; + rank = r; + tid = ti; + } + + char *get_type_name() { return "NSReg"; } + + msg_addr_t get_entity() { return addr; } + int get_rank() { return rank; } + long get_tid() { return tid; } + + void encode_payload() { + payload.append((char*)&addr, sizeof(addr)); + payload.append((char*)&rank, sizeof(rank)); + payload.append((char*)&tid, sizeof(tid)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(addr), (char*)&addr); + off += sizeof(addr); + payload.copy(off, sizeof(rank), (char*)&rank); + off += sizeof(rank); + payload.copy(off, sizeof(tid), (char*)&tid); + off += sizeof(tid); + } +}; + + +#endif + diff --git a/branches/sage/cephmds2/messages/MNSRegisterAck.h b/branches/sage/cephmds2/messages/MNSRegisterAck.h new file mode 100644 index 0000000000000..54e4b93db2118 --- /dev/null +++ b/branches/sage/cephmds2/messages/MNSRegisterAck.h @@ -0,0 +1,53 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MNSREGISTERACK_H +#define __MNSREGISTERACK_H + +#include "msg/Message.h" +#include "msg/TCPMessenger.h" + +class MNSRegisterAck : public Message { + msg_addr_t entity; + long tid; + + public: + MNSRegisterAck() {} + MNSRegisterAck(long t, msg_addr_t e) : + Message(MSG_NS_REGISTERACK) { + entity = e; + tid = t; + } + + char *get_type_name() { return "NSRegA"; } + + msg_addr_t get_entity() { return entity; } + long get_tid() { return tid; } + + void encode_payload() { + payload.append((char*)&entity, sizeof(entity)); + payload.append((char*)&tid, sizeof(tid)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(entity), (char*)&entity); + off += sizeof(entity); + payload.copy(off, sizeof(tid), (char*)&tid); + off += sizeof(tid); + } +}; + + +#endif + diff --git a/branches/sage/cephmds2/messages/MOSDBoot.h b/branches/sage/cephmds2/messages/MOSDBoot.h new file mode 100644 index 0000000000000..17604282b0635 --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDBoot.h @@ -0,0 +1,43 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MOSDBOOT_H +#define __MOSDBOOT_H + +#include "msg/Message.h" + +#include "include/types.h" + +class MOSDBoot : public Message { + public: + OSDSuperblock sb; + + MOSDBoot() {} + MOSDBoot(OSDSuperblock& s) : + Message(MSG_OSD_BOOT), + sb(s) { + } + + char *get_type_name() { return "oboot"; } + + void encode_payload() { + payload.append((char*)&sb, sizeof(sb)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(sb), (char*)&sb); + off += sizeof(sb); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDFailure.h b/branches/sage/cephmds2/messages/MOSDFailure.h new file mode 100644 index 0000000000000..7dd75758ff0d6 --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDFailure.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MOSDFAILURE_H +#define __MOSDFAILURE_H + +#include "msg/Message.h" + + +class MOSDFailure : public Message { + public: + msg_addr_t failed; + entity_inst_t inst; + epoch_t epoch; + + MOSDFailure() {} + MOSDFailure(msg_addr_t f, const entity_inst_t& i, epoch_t e) : + Message(MSG_OSD_FAILURE), + failed(f), inst(i), epoch(e) {} + + msg_addr_t get_failed() { return failed; } + entity_inst_t& get_inst() { return inst; } + epoch_t get_epoch() { return epoch; } + + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(failed), (char*)&failed); + off += sizeof(failed); + payload.copy(off, sizeof(inst), (char*)&inst); + off += sizeof(inst); + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + } + void encode_payload() { + payload.append((char*)&failed, sizeof(failed)); + payload.append((char*)&inst, sizeof(inst)); + payload.append((char*)&epoch, sizeof(epoch)); + } + + virtual char *get_type_name() { return "osdfail"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDGetMap.h b/branches/sage/cephmds2/messages/MOSDGetMap.h new file mode 100644 index 0000000000000..58afd527bda93 --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDGetMap.h @@ -0,0 +1,45 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#ifndef __MOSDGETMAP_H +#define __MOSDGETMAP_H + +#include "msg/Message.h" + +#include "include/types.h" + +class MOSDGetMap : public Message { + public: + epoch_t since; + + //MOSDGetMap() : since(0) {} + MOSDGetMap(epoch_t s=0) : + Message(MSG_OSD_GETMAP), + since(s) { + } + + epoch_t get_since() { return since; } + + char *get_type_name() { return "getomap"; } + + void encode_payload() { + payload.append((char*)&since, sizeof(since)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(since), (char*)&since); + off += sizeof(since); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDIn.h b/branches/sage/cephmds2/messages/MOSDIn.h new file mode 100644 index 0000000000000..276a930d2e00b --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDIn.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef __MOSDIN_H +#define __MOSDIN_H + +#include "msg/Message.h" + + +class MOSDIn : public Message { + public: + epoch_t map_epoch; + + MOSDIn(epoch_t e) : Message(MSG_OSD_IN), map_epoch(e) { + } + MOSDIn() {} + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); + off += sizeof(map_epoch); + } + virtual void encode_payload() { + payload.append((char*)&map_epoch, sizeof(map_epoch)); + } + + virtual char *get_type_name() { return "oin"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDMap.h b/branches/sage/cephmds2/messages/MOSDMap.h new file mode 100644 index 0000000000000..dd231a831d63d --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDMap.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MOSDGETMAPACK_H +#define __MOSDGETMAPACK_H + +#include "msg/Message.h" +#include "osd/OSDMap.h" + + +class MOSDMap : public Message { + public: + map maps; + map incremental_maps; + + epoch_t get_first() { + epoch_t e = 0; + map::iterator i = maps.begin(); + if (i != maps.end()) e = i->first; + i = incremental_maps.begin(); + if (i != incremental_maps.end() && + (e == 0 || i->first < e)) e = i->first; + return e; + } + epoch_t get_last() { + epoch_t e = 0; + map::reverse_iterator i = maps.rbegin(); + if (i != maps.rend()) e = i->first; + i = incremental_maps.rbegin(); + if (i != incremental_maps.rend() && + (e == 0 || i->first > e)) e = i->first; + return e; + } + + + MOSDMap() : + Message(MSG_OSD_MAP) {} + MOSDMap(OSDMap *oc) : + Message(MSG_OSD_MAP) { + oc->encode(maps[oc->get_epoch()]); + } + + + // marshalling + virtual void decode_payload() { + int off = 0; + ::_decode(maps, payload, off); + ::_decode(incremental_maps, payload, off); + } + virtual void encode_payload() { + ::_encode(maps, payload); + ::_encode(incremental_maps, payload); + } + + virtual char *get_type_name() { return "omap"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDOp.h b/branches/sage/cephmds2/messages/MOSDOp.h new file mode 100644 index 0000000000000..1297c764402d2 --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDOp.h @@ -0,0 +1,214 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MOSDOP_H +#define __MOSDOP_H + +#include "msg/Message.h" + +/* + * OSD op + * + * oid - object id + * op - OSD_OP_DELETE, etc. 
+ * + */ + +//#define OSD_OP_MKFS 20 + +// client ops +#define OSD_OP_READ 1 +#define OSD_OP_STAT 2 + +#define OSD_OP_WRNOOP 10 +#define OSD_OP_WRITE 11 +#define OSD_OP_DELETE 12 +#define OSD_OP_TRUNCATE 13 +#define OSD_OP_ZERO 14 + +#define OSD_OP_WRLOCK 20 +#define OSD_OP_WRUNLOCK 21 +#define OSD_OP_RDLOCK 22 +#define OSD_OP_RDUNLOCK 23 +#define OSD_OP_UPLOCK 24 +#define OSD_OP_DNLOCK 25 + +#define OSD_OP_PULL 30 +#define OSD_OP_PUSH 31 + + +typedef struct { + long pcid; + + // who's asking? + tid_t tid; + msg_addr_t client; + entity_inst_t client_inst; + + // for replication + tid_t rep_tid; + + object_t oid; + objectrev_t rev; + pg_t pg; + + epoch_t map_epoch; + + eversion_t pg_trim_to; // primary->replica: trim to here + + int op; + size_t length, offset; + eversion_t version; + eversion_t old_version; + + bool want_ack; + bool want_commit; +} MOSDOp_st; + +class MOSDOp : public Message { +public: + static const char* get_opname(int op) { + switch (op) { + case OSD_OP_READ: return "read"; + case OSD_OP_STAT: return "stat"; + + case OSD_OP_WRNOOP: return "wrnoop"; + case OSD_OP_WRITE: return "write"; + case OSD_OP_ZERO: return "zero"; + case OSD_OP_DELETE: return "delete"; + case OSD_OP_TRUNCATE: return "truncate"; + case OSD_OP_WRLOCK: return "wrlock"; + case OSD_OP_WRUNLOCK: return "wrunlock"; + case OSD_OP_RDLOCK: return "rdlock"; + case OSD_OP_RDUNLOCK: return "rdunlock"; + case OSD_OP_UPLOCK: return "uplock"; + case OSD_OP_DNLOCK: return "dnlock"; + + case OSD_OP_PULL: return "pull"; + case OSD_OP_PUSH: return "push"; + default: assert(0); + } + return 0; + } + +private: + MOSDOp_st st; + bufferlist data; + map attrset; + + friend class MOSDOpReply; + + public: + const tid_t get_tid() { return st.tid; } + const msg_addr_t& get_client() { return st.client; } + const entity_inst_t& get_client_inst() { return st.client_inst; } + void set_client_inst(const entity_inst_t& i) { st.client_inst = i; } + + const tid_t get_rep_tid() { return st.rep_tid; } + void 
set_rep_tid(tid_t t) { st.rep_tid = t; } + + const object_t get_oid() { return st.oid; } + const pg_t get_pg() { return st.pg; } + const epoch_t get_map_epoch() { return st.map_epoch; } + + //const int get_pg_role() { return st.pg_role; } // who am i asking for? + const eversion_t get_version() { return st.version; } + //const eversion_t get_old_version() { return st.old_version; } + + void set_rev(objectrev_t r) { st.rev = r; } + objectrev_t get_rev() { return st.rev; } + + const eversion_t get_pg_trim_to() { return st.pg_trim_to; } + void set_pg_trim_to(eversion_t v) { st.pg_trim_to = v; } + + const int get_op() { return st.op; } + void set_op(int o) { st.op = o; } + + const size_t get_length() { return st.length; } + const size_t get_offset() { return st.offset; } + + map& get_attrset() { return attrset; } + void set_attrset(map &as) { attrset = as; } + + const bool wants_ack() { return st.want_ack; } + const bool wants_commit() { return st.want_commit; } + + + void set_data(bufferlist &d) { + data.claim(d); + } + bufferlist& get_data() { + return data; + } + size_t get_data_len() { return data.length(); } + + + // keep a pcid (procedure call id) to match up request+reply + void set_pcid(long pcid) { this->st.pcid = pcid; } + long get_pcid() { return st.pcid; } + + MOSDOp(long tid, msg_addr_t asker, + object_t oid, pg_t pg, epoch_t mapepoch, int op) : + Message(MSG_OSD_OP) { + memset(&st, 0, sizeof(st)); + this->st.client = asker; + this->st.tid = tid; + this->st.rep_tid = 0; + + this->st.oid = oid; + this->st.pg = pg; + this->st.map_epoch = mapepoch; + this->st.op = op; + + this->st.want_ack = true; + this->st.want_commit = true; + } + MOSDOp() {} + + //void set_pg_role(int r) { st.pg_role = r; } + //void set_rg_nrep(int n) { st.rg_nrep = n; } + + void set_length(size_t l) { st.length = l; } + void set_offset(size_t o) { st.offset = o; } + void set_version(eversion_t v) { st.version = v; } + void set_old_version(eversion_t ov) { st.old_version = ov; } + + void 
set_want_ack(bool b) { st.want_ack = b; } + void set_want_commit(bool b) { st.want_commit = b; } + + // marshalling + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(st), (char*)&st); + off += sizeof(st); + ::_decode(attrset, payload, off); + ::_decode(data, payload, off); + } + virtual void encode_payload() { + payload.append((char*)&st, sizeof(st)); + ::_encode(attrset, payload); + ::_encode(data, payload); + } + + virtual char *get_type_name() { return "oop"; } +}; + +inline ostream& operator<<(ostream& out, MOSDOp& op) +{ + return out << "MOSDOp(" << MSG_ADDR_NICE(op.get_client()) << "." << op.get_tid() + << " op " << MOSDOp::get_opname(op.get_op()) + << " oid " << hex << op.get_oid() << dec << " " << &op << ")"; +} + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDOpReply.h b/branches/sage/cephmds2/messages/MOSDOpReply.h new file mode 100644 index 0000000000000..35c6ad5898b0b --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDOpReply.h @@ -0,0 +1,146 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MOSDOPREPLY_H +#define __MOSDOPREPLY_H + +#include "msg/Message.h" + +#include "MOSDOp.h" +#include "osd/ObjectStore.h" + +/* + * OSD op reply + * + * oid - object id + * op - OSD_OP_DELETE, etc. 
+ * + */ + + +typedef struct { + // req + long pcid; + tid_t tid; + tid_t rep_tid; + + object_t oid; + pg_t pg; + + int op; + + // reply + int result; + bool commit; + size_t length, offset; + size_t object_size; + eversion_t version; + + eversion_t pg_complete_thru; + + epoch_t map_epoch; +} MOSDOpReply_st; + + +class MOSDOpReply : public Message { + MOSDOpReply_st st; + bufferlist data; + map attrset; + + public: + long get_tid() { return st.tid; } + long get_rep_tid() { return st.rep_tid; } + object_t get_oid() { return st.oid; } + pg_t get_pg() { return st.pg; } + int get_op() { return st.op; } + bool get_commit() { return st.commit; } + + int get_result() { return st.result; } + size_t get_length() { return st.length; } + size_t get_offset() { return st.offset; } + size_t get_object_size() { return st.object_size; } + eversion_t get_version() { return st.version; } + map& get_attrset() { return attrset; } + + eversion_t get_pg_complete_thru() { return st.pg_complete_thru; } + void set_pg_complete_thru(eversion_t v) { st.pg_complete_thru = v; } + + void set_result(int r) { st.result = r; } + void set_length(size_t s) { st.length = s; } + void set_offset(size_t o) { st.offset = o; } + void set_object_size(size_t s) { st.object_size = s; } + void set_version(eversion_t v) { st.version = v; } + void set_attrset(map &as) { attrset = as; } + + void set_op(int op) { st.op = op; } + void set_tid(tid_t t) { st.tid = t; } + void set_rep_tid(tid_t t) { st.rep_tid = t; } + + // data payload + void set_data(bufferlist &d) { + data.claim(d); + } + bufferlist& get_data() { + return data; + } + + // osdmap + epoch_t get_map_epoch() { return st.map_epoch; } + + // keep a pcid (procedure call id) to match up request+reply + void set_pcid(long pcid) { this->st.pcid = pcid; } + long get_pcid() { return st.pcid; } + +public: + MOSDOpReply(MOSDOp *req, int result, epoch_t e, bool commit) : + Message(MSG_OSD_OPREPLY) { + memset(&st, 0, sizeof(st)); + this->st.pcid = req->st.pcid; + 
+ this->st.op = req->st.op; + this->st.tid = req->st.tid; + this->st.rep_tid = req->st.rep_tid; + + this->st.oid = req->st.oid; + this->st.pg = req->st.pg; + this->st.result = result; + this->st.commit = commit; + + this->st.length = req->st.length; // speculative... OSD should ensure these are correct + this->st.offset = req->st.offset; + this->st.version = req->st.version; + + this->st.map_epoch = e; + } + MOSDOpReply() {} + + + // marshalling + virtual void decode_payload() { + payload.copy(0, sizeof(st), (char*)&st); + payload.splice(0, sizeof(st)); + int off = 0; + ::_decode(attrset, payload, off); + ::_decode(data, payload, off); + } + virtual void encode_payload() { + payload.append((char*)&st, sizeof(st)); + ::_encode(attrset, payload); + ::_encode(data, payload); + } + + virtual char *get_type_name() { return "oopr"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDOut.h b/branches/sage/cephmds2/messages/MOSDOut.h new file mode 100644 index 0000000000000..61a594de3294a --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDOut.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef __MOSDOUT_H +#define __MOSDOUT_H + +#include "msg/Message.h" + + +class MOSDOut : public Message { + public: + epoch_t map_epoch; + + MOSDOut(epoch_t e) : Message(MSG_OSD_OUT), map_epoch(e) { + } + MOSDOut() {} + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); + off += sizeof(map_epoch); + } + virtual void encode_payload() { + payload.append((char*)&map_epoch, sizeof(map_epoch)); + } + + virtual char *get_type_name() { return "oout"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDPGLog.h b/branches/sage/cephmds2/messages/MOSDPGLog.h new file mode 100644 index 0000000000000..e4731c6037107 --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDPGLog.h @@ -0,0 +1,61 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MOSDPGLOG_H +#define __MOSDPGLOG_H + +#include "msg/Message.h" + +class MOSDPGLog : public Message { + epoch_t epoch; + pg_t pgid; + +public: + PG::Info info; + PG::Log log; + PG::Missing missing; + + epoch_t get_epoch() { return epoch; } + pg_t get_pgid() { return pgid; } + + MOSDPGLog() {} + MOSDPGLog(version_t mv, pg_t pgid) : + Message(MSG_OSD_PG_LOG) { + this->epoch = mv; + this->pgid = pgid; + } + + char *get_type_name() { return "PGlog"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + payload.append((char*)&pgid, sizeof(pgid)); + payload.append((char*)&info, sizeof(info)); + log._encode(payload); + missing._encode(payload); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + payload.copy(off, sizeof(pgid), (char*)&pgid); + off += sizeof(pgid); + payload.copy(off, sizeof(info), (char*)&info); + off += sizeof(info); + log._decode(payload, off); + missing._decode(payload, off); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDPGNotify.h b/branches/sage/cephmds2/messages/MOSDPGNotify.h new file mode 100644 index 0000000000000..f6fe8ee88c170 --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDPGNotify.h @@ -0,0 +1,54 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MOSDPGPEERNOTIFY_H +#define __MOSDPGPEERNOTIFY_H + +#include "msg/Message.h" + +#include "osd/PG.h" + +/* + * PGNotify - notify primary of my PGs and versions. 
+ */ + +class MOSDPGNotify : public Message { + epoch_t epoch; + list pg_list; // pgid -> version + + public: + version_t get_epoch() { return epoch; } + list& get_pg_list() { return pg_list; } + + MOSDPGNotify() {} + MOSDPGNotify(epoch_t e, list& l) : + Message(MSG_OSD_PG_NOTIFY) { + this->epoch = e; + pg_list.splice(pg_list.begin(),l); + } + + char *get_type_name() { return "PGnot"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + _encode(pg_list, payload); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + _decode(pg_list, payload, off); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDPGPeer.h b/branches/sage/cephmds2/messages/MOSDPGPeer.h new file mode 100644 index 0000000000000..ebe1cda485c4c --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDPGPeer.h @@ -0,0 +1,57 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MOSDPGPEER_H +#define __MOSDPGPEER_H + +#include "msg/Message.h" + + +class MOSDPGPeer : public Message { + __uint64_t map_version; + list pg_list; + + bool complete; + + public: + __uint64_t get_version() { return map_version; } + list& get_pg_list() { return pg_list; } + bool get_complete() { return complete; } + + MOSDPGPeer() {} + MOSDPGPeer(__uint64_t v, list& l, bool c=false) : + Message(MSG_OSD_PG_PEER) { + this->map_version = v; + this->complete = c; + pg_list.splice(pg_list.begin(), l); + } + + char *get_type_name() { return "PGPeer"; } + + void encode_payload() { + payload.append((char*)&map_version, sizeof(map_version)); + payload.append((char*)&complete, sizeof(complete)); + _encode(pg_list, payload); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(map_version), (char*)&map_version); + off += sizeof(map_version); + payload.copy(off, sizeof(complete), (char*)&complete); + off += sizeof(complete); + _decode(pg_list, payload, off); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDPGPeerAck.h b/branches/sage/cephmds2/messages/MOSDPGPeerAck.h new file mode 100644 index 0000000000000..e21a2607bb573 --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDPGPeerAck.h @@ -0,0 +1,69 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MOSDPGPEERACK_H +#define __MOSDPGPEERACK_H + +#include "msg/Message.h" +#include "osd/OSD.h" + +class MOSDPGPeerAck : public Message { + __uint64_t map_version; + + public: + list pg_dne; // pg dne + map pg_state; // state, lists, etc. 
+ + __uint64_t get_version() { return map_version; } + + MOSDPGPeerAck() {} + MOSDPGPeerAck(__uint64_t v) : + Message(MSG_OSD_PG_PEERACK) { + this->map_version = v; + } + + char *get_type_name() { return "PGPeer"; } + + void encode_payload() { + payload.append((char*)&map_version, sizeof(map_version)); + _encode(pg_dne, payload); + + int n = pg_state.size(); + payload.append((char*)&n, sizeof(n)); + for (map::iterator it = pg_state.begin(); + it != pg_state.end(); + it++) { + payload.append((char*)&it->first, sizeof(it->first)); + it->second._encode(payload); + } + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(map_version), (char*)&map_version); + off += sizeof(map_version); + _decode(pg_dne, payload, off); + + int n; + payload.copy(off, sizeof(n), (char*)&n); + off += sizeof(n); + for (int i=0; i + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MOSDPEERREQUEST_H +#define __MOSDPEERREQUEST_H + +#include "msg/Message.h" + + +class MOSDPGPeerRequest : public Message { + __uint64_t map_version; + list pg_list; + + public: + __uint64_t get_version() { return map_version; } + list& get_pg_list() { return pg_list; } + + MOSDPGPeerRequest() {} + MOSDPGPeerRequest(__uint64_t v, list& l) : + Message(MSG_OSD_PG_PEERREQUEST) { + this->map_version = v; + pg_list.splice(pg_list.begin(), l); + } + + char *get_type_name() { return "PGPR"; } + + void encode_payload() { + payload.append((char*)&map_version, sizeof(map_version)); + _encode(pg_list, payload); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(map_version), (char*)&map_version); + off += sizeof(map_version); + _decode(pg_list, payload, off); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDPGQuery.h b/branches/sage/cephmds2/messages/MOSDPGQuery.h new file mode 100644 index 0000000000000..926acce81349d --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDPGQuery.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MOSDPGQUERY_H +#define __MOSDPGQUERY_H + +#include "msg/Message.h" + +/* + * PGQuery - query another OSD as to the contents of their PGs + */ + +class MOSDPGQuery : public Message { + version_t epoch; + + public: + version_t get_epoch() { return epoch; } + map pg_list; + + MOSDPGQuery() {} + MOSDPGQuery(epoch_t e, map& ls) : + Message(MSG_OSD_PG_QUERY), + epoch(e), pg_list(ls) { + } + + char *get_type_name() { return "PGq"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + ::_encode(pg_list, payload); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + ::_decode(pg_list, payload, off); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDPGRemove.h b/branches/sage/cephmds2/messages/MOSDPGRemove.h new file mode 100644 index 0000000000000..9629a3782764b --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDPGRemove.h @@ -0,0 +1,51 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MOSDPGREMOVE_H +#define __MOSDPGREMOVE_H + +#include "msg/Message.h" + + +class MOSDPGRemove : public Message { + epoch_t epoch; + + public: + set pg_list; + + epoch_t get_epoch() { return epoch; } + + MOSDPGRemove() {} + MOSDPGRemove(epoch_t e, set& l) : + Message(MSG_OSD_PG_REMOVE) { + this->epoch = e; + pg_list = l; + } + + char *get_type_name() { return "PGrm"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + _encode(pg_list, payload); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + _decode(pg_list, payload, off); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDPGSummary.h b/branches/sage/cephmds2/messages/MOSDPGSummary.h new file mode 100644 index 0000000000000..dc4af837209bb --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDPGSummary.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MOSDPGQUERYREPLY_H +#define __MOSDPGQUERYREPLY_H + +#include "msg/Message.h" + +class MOSDPGSummary : public Message { + epoch_t epoch; + pg_t pgid; + +public: + PG::PGInfo info; + bufferlist sumbl; + + epoch_t get_epoch() { return epoch; } + + MOSDPGSummary() {} + MOSDPGSummary(version_t mv, pg_t pgid, PG::PGSummary &summary) : + Message(MSG_OSD_PG_SUMMARY) { + this->epoch = mv; + this->pgid = pgid; + summary._encode(sumbl); + } + + pg_t get_pgid() { return pgid; } + bufferlist& get_summary_bl() { + return sumbl; + } + + char *get_type_name() { return "PGsum"; } + + void encode_payload() { + payload.append((char*)&epoch, sizeof(epoch)); + payload.append((char*)&pgid, sizeof(pgid)); + payload.append((char*)&info, sizeof(info)); + payload.claim_append(sumbl); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + payload.copy(off, sizeof(pgid), (char*)&pgid); + off += sizeof(pgid); + payload.copy(off, sizeof(info), (char*)&info); + off += sizeof(info); + + payload.splice(0, off); + sumbl.claim(payload); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDPGUpdate.h b/branches/sage/cephmds2/messages/MOSDPGUpdate.h new file mode 100644 index 0000000000000..93809d6820d21 --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDPGUpdate.h @@ -0,0 +1,64 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MOSDPGUPDATE_H +#define __MOSDPGUPDATE_H + +#include "msg/Message.h" + +class MOSDPGUpdate : public Message { + version_t map_version; + pg_t pgid; + //pginfo_t info; + bool complete; + version_t last_any_complete; + + public: + version_t get_version() { return map_version; } + pg_t get_pgid() { return pgid; } + //pginfo_t& get_pginfo() { return info; } + bool is_complete() { return complete; } + version_t get_last_any_complete() { return last_any_complete; } + + MOSDPGUpdate() {} + MOSDPGUpdate(version_t mv, pg_t pgid, bool complete, version_t last_any_complete) : + Message(MSG_OSD_PG_UPDATE) { + this->map_version = mv; + this->pgid = pgid; + this->complete = complete; + this->last_any_complete = last_any_complete; + } + + char *get_type_name() { return "PGUp"; } + + void encode_payload() { + payload.append((char*)&map_version, sizeof(map_version)); + payload.append((char*)&pgid, sizeof(pgid)); + payload.append((char*)&complete, sizeof(complete)); + payload.append((char*)&last_any_complete, sizeof(last_any_complete)); + } + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(map_version), (char*)&map_version); + off += sizeof(map_version); + payload.copy(off, sizeof(pgid), (char*)&pgid); + off += sizeof(pgid); + payload.copy(off, sizeof(complete), (char*)&complete); + off += sizeof(complete); + payload.copy(off, sizeof(last_any_complete), (char*)&last_any_complete); + off += sizeof(last_any_complete); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MOSDPing.h b/branches/sage/cephmds2/messages/MOSDPing.h new file mode 100644 index 0000000000000..fae80edd91cfc --- /dev/null +++ b/branches/sage/cephmds2/messages/MOSDPing.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General 
Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MOSDPING_H +#define __MOSDPING_H + +#include "msg/Message.h" + + +class MOSDPing : public Message { + public: + epoch_t map_epoch; + bool ack; + float avg_qlen; + + MOSDPing(epoch_t e, + float aq, + bool a=false) : Message(MSG_OSD_PING), map_epoch(e), ack(a), avg_qlen(aq) { + } + MOSDPing() {} + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(map_epoch), (char*)&map_epoch); + off += sizeof(map_epoch); + payload.copy(off, sizeof(ack), (char*)&ack); + off += sizeof(ack); + payload.copy(off, sizeof(avg_qlen), (char*)&avg_qlen); + off += sizeof(avg_qlen); + } + virtual void encode_payload() { + payload.append((char*)&map_epoch, sizeof(map_epoch)); + payload.append((char*)&ack, sizeof(ack)); + payload.append((char*)&avg_qlen, sizeof(avg_qlen)); + } + + virtual char *get_type_name() { return "oping"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MPing.h b/branches/sage/cephmds2/messages/MPing.h new file mode 100644 index 0000000000000..65b65a738cd66 --- /dev/null +++ b/branches/sage/cephmds2/messages/MPing.h @@ -0,0 +1,41 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef __MPING_H +#define __MPING_H + +#include "msg/Message.h" + + +class MPing : public Message { + public: + int seq; + MPing(int s) : Message(MSG_PING) { + seq = s; + } + MPing() : Message(MSG_PING) {} + + virtual void decode_payload(crope& s, int& off) { + s.copy(0, sizeof(seq), (char*)&seq); + off += sizeof(seq); + } + virtual void encode_payload(crope& s) { + s.append((char*)&seq, sizeof(seq)); + } + + virtual char *get_type_name() { return "ping"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MPingAck.h b/branches/sage/cephmds2/messages/MPingAck.h new file mode 100644 index 0000000000000..0ee385b7a2b80 --- /dev/null +++ b/branches/sage/cephmds2/messages/MPingAck.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MPINGACK_H +#define __MPINGACK_H + +#include "MPing.h" + + +class MPingAck : public Message { + public: + int seq; + MPingAck() {} + MPingAck(MPing *p) : Message(MSG_PING_ACK) { + this->seq = p->seq; + } + + virtual void decode_payload(crope& s, int& off) { + s.copy(0, sizeof(seq), (char*)&seq); + off += sizeof(seq); + } + virtual void encode_payload(crope& s) { + s.append((char*)&seq, sizeof(seq)); + } + + virtual char *get_type_name() { return "pinga"; } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MRename.h b/branches/sage/cephmds2/messages/MRename.h new file mode 100644 index 0000000000000..e648f3e652fc7 --- /dev/null +++ b/branches/sage/cephmds2/messages/MRename.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MRENAME_H +#define __MRENAME_H + +class MRename : public Message { + inodeno_t srcdirino; + string srcname; + inodeno_t destdirino; + string destname; + int initiator; + + bufferlist inode_state; + + public: + int get_initiator() { return initiator; } + inodeno_t get_srcdirino() { return srcdirino; } + string& get_srcname() { return srcname; } + inodeno_t get_destdirino() { return destdirino; } + string& get_destname() { return destname; } + bufferlist& get_inode_state() { return inode_state; } + + MRename() {} + MRename(int initiator, + inodeno_t srcdirino, + const string& srcname, + inodeno_t destdirino, + const string& destname, + bufferlist& inode_state) : + Message(MSG_MDS_RENAME) { + this->initiator = initiator; + this->srcdirino = srcdirino; + this->srcname = srcname; + this->destdirino = destdirino; + this->destname = destname; + this->inode_state.claim( inode_state ); + } + virtual char *get_type_name() { return "Rn";} + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(initiator), (char*)&initiator); + off += sizeof(initiator); + payload.copy(off, sizeof(srcdirino), (char*)&srcdirino); + off += sizeof(srcdirino); + payload.copy(off, sizeof(destdirino), (char*)&destdirino); + off += sizeof(destdirino); + _decode(srcname, payload, off); + _decode(destname, payload, off); + size_t len; + payload.copy(off, sizeof(len), (char*)&len); + off += sizeof(len); + inode_state.substr_of(payload, off, len); + off += len; + } + virtual void encode_payload() { + payload.append((char*)&initiator,sizeof(initiator)); + payload.append((char*)&srcdirino,sizeof(srcdirino)); + payload.append((char*)&destdirino,sizeof(destdirino)); + _encode(srcname, payload); + _encode(destname, payload); + size_t len = inode_state.length(); + payload.append((char*)&len, sizeof(len)); + payload.claim_append(inode_state); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MRenameAck.h b/branches/sage/cephmds2/messages/MRenameAck.h new 
file mode 100644 index 0000000000000..14843cef5f616 --- /dev/null +++ b/branches/sage/cephmds2/messages/MRenameAck.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MRENAMEACK_H +#define __MRENAMEACK_H + +/* FIXME: relateive to dn, not inode */ + +class MRenameAck : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MRenameAck() {} + MRenameAck(inodeno_t ino) : + Message(MSG_MDS_RENAMEACK) { + this->ino = ino; + } + virtual char *get_type_name() { return "RnAck";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + } + virtual void encode_payload(crope& s) { + s.append((char*)&ino,sizeof(ino)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MRenameNotify.h b/branches/sage/cephmds2/messages/MRenameNotify.h new file mode 100644 index 0000000000000..bc32300b82e3a --- /dev/null +++ b/branches/sage/cephmds2/messages/MRenameNotify.h @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MRENAMENOTIFY_H +#define __MRENAMENOTIFY_H + +class MRenameNotify : public Message { + inodeno_t ino; + inodeno_t srcdirino; + string srcname; + inodeno_t destdirino; + string destname; + string destdirpath; + int srcauth; + + public: + inodeno_t get_ino() { return ino; } + inodeno_t get_srcdirino() { return srcdirino; } + string& get_srcname() { return srcname; } + inodeno_t get_destdirino() { return destdirino; } + string& get_destname() { return destname; } + string& get_destdirpath() { return destdirpath; } + int get_srcauth() { return srcauth; } + + MRenameNotify() {} + MRenameNotify(inodeno_t ino, + inodeno_t srcdirino, + const string& srcname, + inodeno_t destdirino, + const string& destdirpath, + const string& destname, + int srcauth + ) : + Message(MSG_MDS_RENAMENOTIFY) { + this->ino = ino; + this->srcdirino = srcdirino; + this->srcname = srcname; + this->destdirino = destdirino; + this->destname = destname; + this->destdirpath = destdirpath; + this->srcauth = srcauth; + } + virtual char *get_type_name() { return "Rnot";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + s.copy(off, sizeof(srcdirino), (char*)&srcdirino); + off += sizeof(srcdirino); + s.copy(off, sizeof(destdirino), (char*)&destdirino); + off += sizeof(destdirino); + _unrope(srcname, s, off); + _unrope(destname, s, off); + _unrope(destdirpath, s, off); + s.copy(off, sizeof(srcauth), (char*)&srcauth); + off += sizeof(srcauth); + } + virtual void encode_payload(crope& s) { + s.append((char*)&ino,sizeof(ino)); + s.append((char*)&srcdirino,sizeof(srcdirino)); + s.append((char*)&destdirino,sizeof(destdirino)); + _rope(srcname, s); + _rope(destname, s); + _rope(destdirpath, s); + s.append((char*)&srcauth, sizeof(srcauth)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MRenameNotifyAck.h b/branches/sage/cephmds2/messages/MRenameNotifyAck.h new file mode 100644 index 
0000000000000..d1a01339cd97a --- /dev/null +++ b/branches/sage/cephmds2/messages/MRenameNotifyAck.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MRENAMENOTIFYACK_H +#define __MRENAMENOTIFYACK_H + +class MRenameNotifyAck : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MRenameNotifyAck() {} + MRenameNotifyAck(inodeno_t ino) : + Message(MSG_MDS_RENAMENOTIFYACK) { + this->ino = ino; + } + virtual char *get_type_name() { return "RnotA";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + } + virtual void encode_payload(crope& s) { + s.append((char*)&ino,sizeof(ino)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MRenamePrep.h b/branches/sage/cephmds2/messages/MRenamePrep.h new file mode 100644 index 0000000000000..1af798c674489 --- /dev/null +++ b/branches/sage/cephmds2/messages/MRenamePrep.h @@ -0,0 +1,85 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+
+#ifndef __MRENAMEPREP_H
+#define __MRENAMEPREP_H
+
+/*
+ * MRenamePrep: message of type MSG_MDS_RENAMEPREP.
+ *
+ * Carries the initiator, the source directory inode / name / path, the
+ * destination directory inode / name / path, and the srcauth value.
+ *
+ * Payload order (encode_payload and decode_payload must agree):
+ *   initiator, srcdirino, destdirino            raw bytes
+ *   srcname, srcpath, destname, destpath        via _rope() / _unrope()
+ *   srcauth                                     raw bytes
+ */
+class MRenamePrep : public Message {
+  inodeno_t srcdirino;
+  string srcname;
+  string srcpath;
+  inodeno_t destdirino;
+  string destname;
+  string destpath;
+  int initiator;
+  int srcauth;
+
+ public:
+  // accessors
+  int get_initiator() { return initiator; }
+  inodeno_t get_srcdirino() { return srcdirino; }
+  string& get_srcname() { return srcname; }
+  string& get_srcpath() { return srcpath; }
+  int get_srcauth() { return srcauth; }
+  inodeno_t get_destdirino() { return destdirino; }
+  string& get_destname() { return destname; }
+  string& get_destpath() { return destpath; }
+
+  // Default constructor; the scalar fields are not initialized here.
+  MRenamePrep() {}
+
+  // Initializers are listed in member declaration order.
+  MRenamePrep(int initiator,
+              inodeno_t srcdirino,
+              const string& srcname,
+              const string& srcpath,
+              inodeno_t destdirino,
+              const string& destname,
+              const string& destpath,
+              int srcauth) :
+    Message(MSG_MDS_RENAMEPREP),
+    srcdirino(srcdirino),
+    srcname(srcname),
+    srcpath(srcpath),
+    destdirino(destdirino),
+    destname(destname),
+    destpath(destpath),
+    initiator(initiator),
+    srcauth(srcauth) {}
+
+  virtual char *get_type_name() { return "RnP";}
+
+  // Read the fields back from s starting at off; off is advanced past
+  // everything consumed.
+  virtual void decode_payload(crope& s, int& off) {
+    // fixed-size fields first
+    s.copy(off, sizeof(initiator), (char*)&initiator);
+    off += sizeof(initiator);
+    s.copy(off, sizeof(srcdirino), (char*)&srcdirino);
+    off += sizeof(srcdirino);
+    s.copy(off, sizeof(destdirino), (char*)&destdirino);
+    off += sizeof(destdirino);
+
+    // then the strings
+    _unrope(srcname, s, off);
+    _unrope(srcpath, s, off);
+    _unrope(destname, s, off);
+    _unrope(destpath, s, off);
+
+    // trailing srcauth
+    s.copy(off, sizeof(srcauth), (char*)&srcauth);
+    off += sizeof(srcauth);
+  }
+
+  // Append the fields to s in the order decode_payload expects.
+  virtual void encode_payload(crope& s) {
+    s.append((char*)&initiator,sizeof(initiator));
+    s.append((char*)&srcdirino,sizeof(srcdirino));
+    s.append((char*)&destdirino,sizeof(destdirino));
+    _rope(srcname, s);
+    _rope(srcpath, s);
+    _rope(destname, s);
+    _rope(destpath, s);
+    s.append((char*)&srcauth, sizeof(srcauth));
+  }
+};
+
+#endif diff --git a/branches/sage/cephmds2/messages/MRenameReq.h b/branches/sage/cephmds2/messages/MRenameReq.h new file mode 100644 index 0000000000000..b70e96a38203b --- /dev/null +++ b/branches/sage/cephmds2/messages/MRenameReq.h @@ -0,0 +1,79 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MRENAMEREQ_H +#define __MRENAMEREQ_H + +class MRenameReq : public Message { + int initiator; + inodeno_t srcdirino; + string srcname; + inodeno_t destdirino; + string destname; + string destpath; + int destauth; + + public: + int get_initiator() { return initiator; } + inodeno_t get_srcdirino() { return srcdirino; } + string& get_srcname() { return srcname; } + inodeno_t get_destdirino() { return destdirino; } + string& get_destname() { return destname; } + string& get_destpath() { return destpath; } + int get_destauth() { return destauth; } + + MRenameReq() {} + MRenameReq(int initiator, + inodeno_t srcdirino, + const string& srcname, + inodeno_t destdirino, + const string& destname, + const string& destpath, + int destauth) : + Message(MSG_MDS_RENAMEREQ) { + this->initiator = initiator; + this->srcdirino = srcdirino; + this->srcname = srcname; + this->destdirino = destdirino; + this->destname = destname; + this->destpath = destpath; + this->destauth = destauth; + } + virtual char *get_type_name() { return "RnReq";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(initiator), (char*)&initiator); + off += sizeof(initiator); + s.copy(off, sizeof(srcdirino), (char*)&srcdirino); + off += sizeof(srcdirino); + s.copy(off, sizeof(destdirino), (char*)&destdirino); + off += sizeof(destdirino); + 
_unrope(srcname, s, off); + _unrope(destname, s, off); + _unrope(destpath, s, off); + s.copy(off, sizeof(destauth), (char*)&destauth); + off += sizeof(destauth); + } + virtual void encode_payload(crope& s) { + s.append((char*)&initiator,sizeof(initiator)); + s.append((char*)&srcdirino,sizeof(srcdirino)); + s.append((char*)&destdirino,sizeof(destdirino)); + _rope(srcname, s); + _rope(destname, s); + _rope(destpath, s); + s.append((char*)&destauth, sizeof(destauth)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MRenameWarning.h b/branches/sage/cephmds2/messages/MRenameWarning.h new file mode 100644 index 0000000000000..85463dfd2c179 --- /dev/null +++ b/branches/sage/cephmds2/messages/MRenameWarning.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MRENAMEWARNING_H +#define __MRENAMEWARNING_H + +class MRenameWarning : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MRenameWarning() {} + MRenameWarning(inodeno_t ino) : + Message(MSG_MDS_RENAMEWARNING) { + this->ino = ino; + } + virtual char *get_type_name() { return "RnW";} + + virtual void decode_payload(crope& s, int& off) { + s.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + } + virtual void encode_payload(crope& s) { + s.append((char*)&ino,sizeof(ino)); + } +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MUnhashDir.h b/branches/sage/cephmds2/messages/MUnhashDir.h new file mode 100644 index 0000000000000..911a14d9c9592 --- /dev/null +++ b/branches/sage/cephmds2/messages/MUnhashDir.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MUNHASHDIR_H +#define __MUNHASHDIR_H + +#include "msg/Message.h" + +class MUnhashDir : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MUnhashDir() {} + MUnhashDir(inodeno_t ino) : + Message(MSG_MDS_UNHASHDIR) { + this->ino = ino; + } + virtual char *get_type_name() { return "UH"; } + + virtual void decode_payload() { + payload.copy(0, sizeof(ino), (char*)&ino); + } + virtual void encode_payload() { + payload.append((char*)&ino, sizeof(ino)); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MUnhashDirAck.h b/branches/sage/cephmds2/messages/MUnhashDirAck.h new file mode 100644 index 0000000000000..e052683e736c3 --- /dev/null +++ b/branches/sage/cephmds2/messages/MUnhashDirAck.h @@ -0,0 +1,65 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MUNHASHDIRACK_H +#define __MUNHASHDIRACK_H + +#include "msg/Message.h" + +class MUnhashDirAck : public Message { + inodeno_t ino; + bufferlist state; + int nden; + + public: + MUnhashDirAck() {} + MUnhashDirAck(inodeno_t ino, bufferlist& bl, int nden) : + Message(MSG_MDS_UNHASHDIRACK) { + this->ino = ino; + state.claim(bl); + this->nden = nden; + } + virtual char *get_type_name() { return "UHaA"; } + + inodeno_t get_ino() { return ino; } + bufferlist& get_state() { return state; } + bufferlist* get_state_ptr() { return &state; } + int get_nden() { return nden; } + + //void set_nden(int n) { nden = n; } + //void inc_nden() { nden++; } + + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + payload.copy(off, sizeof(nden), (char*)&nden); + off += sizeof(nden); + + size_t len; + payload.copy(off, sizeof(len), (char*)&len); + off += sizeof(len); + state.substr_of(payload, off, len); + } + void encode_payload() { + payload.append((char*)&ino, sizeof(ino)); + payload.append((char*)&nden, sizeof(nden)); + size_t size = state.length(); + payload.append((char*)&size, sizeof(size)); + payload.claim_append(state); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MUnhashDirNotify.h b/branches/sage/cephmds2/messages/MUnhashDirNotify.h new file mode 100644 index 0000000000000..a9d6707a3aa25 --- /dev/null +++ b/branches/sage/cephmds2/messages/MUnhashDirNotify.h @@ -0,0 +1,50 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MUNHASHDIRNOTIFY_H +#define __MUNHASHDIRNOTIFY_H + +#include "msg/Message.h" + +class MUnhashDirNotify : public Message { + inodeno_t ino; + //int peer; + + public: + inodeno_t get_ino() { return ino; } + //int get_peer() { return peer; } + + MUnhashDirNotify() {} + MUnhashDirNotify(inodeno_t ino/*, int peer*/) : + Message(MSG_MDS_UNHASHDIRNOTIFY) { + this->ino = ino; + //this->peer = peer; + } + virtual char *get_type_name() { return "UHN"; } + + virtual void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + //payload.copy(off, sizeof(peer), (char*)&peer); + //off += sizeof(peer); + } + virtual void encode_payload() { + payload.append((char*)&ino, sizeof(ino)); + //payload.append((char*)&peer, sizeof(peer)); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MUnhashDirNotifyAck.h b/branches/sage/cephmds2/messages/MUnhashDirNotifyAck.h new file mode 100644 index 0000000000000..ad4843676f0fb --- /dev/null +++ b/branches/sage/cephmds2/messages/MUnhashDirNotifyAck.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MUNHASHDIRNOTIFYACK_H +#define __MUNHASHDIRNOTIFYACK_H + +#include "msg/Message.h" + +class MUnhashDirNotifyAck : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MUnhashDirNotifyAck() {} + MUnhashDirNotifyAck(inodeno_t ino) : + Message(MSG_MDS_UNHASHDIRNOTIFYACK) { + this->ino = ino; + } + virtual char *get_type_name() { return "UHNa"; } + + virtual void decode_payload() { + payload.copy(0, sizeof(ino), (char*)&ino); + } + virtual void encode_payload() { + payload.append((char*)&ino, sizeof(ino)); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MUnhashDirPrep.h b/branches/sage/cephmds2/messages/MUnhashDirPrep.h new file mode 100644 index 0000000000000..c4dc2ea422cd9 --- /dev/null +++ b/branches/sage/cephmds2/messages/MUnhashDirPrep.h @@ -0,0 +1,42 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MUNHASHDIRPREP_H +#define __MUNHASHDIRPREP_H + +#include "msg/Message.h" + +class MUnhashDirPrep : public Message { + inodeno_t ino; + + public: + inodeno_t get_ino() { return ino; } + + MUnhashDirPrep() {} + MUnhashDirPrep(inodeno_t ino) : + Message(MSG_MDS_UNHASHDIRPREP) { + this->ino = ino; + } + virtual char *get_type_name() { return "UHP"; } + + virtual void decode_payload() { + payload.copy(0, sizeof(ino), (char*)&ino); + } + virtual void encode_payload() { + payload.append((char*)&ino, sizeof(ino)); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/messages/MUnhashDirPrepAck.h b/branches/sage/cephmds2/messages/MUnhashDirPrepAck.h new file mode 100644 index 0000000000000..bd7e93981964b --- /dev/null +++ b/branches/sage/cephmds2/messages/MUnhashDirPrepAck.h @@ -0,0 +1,93 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __MUNHASHDIRPREPACK_H +#define __MUNHASHDIRPREPACK_H + +#include "msg/Message.h" +#include "mds/CInode.h" +#include "include/types.h" + +class MUnhashDirPrepAck : public Message { + inodeno_t ino; + bool assim; + + // subdir dentry names + inodes + map inodes; + + public: + inodeno_t get_ino() { return ino; } + map& get_inodes() { return inodes; } + + bool did_assim() { return assim; } + void mark_assim() { assert(!assim); assim = true; } + + MUnhashDirPrepAck() : assim(false) { } + MUnhashDirPrepAck(inodeno_t ino) : + Message(MSG_MDS_UNHASHDIRPREPACK), + assim(false) { + this->ino = ino; + } + ~MUnhashDirPrepAck() { + for (map::iterator it = inodes.begin(); + it != inodes.end(); + it++) + delete it->second; + } + + + virtual char *get_type_name() { return "HP"; } + + void add_inode(const string& dentry, CInodeDiscover *in) { + inodes[dentry] = in; + } + + void decode_payload() { + int off = 0; + payload.copy(off, sizeof(ino), (char*)&ino); + off += sizeof(ino); + + // inodes + int ni; + payload.copy(off, sizeof(int), (char*)&ni); + off += sizeof(int); + for (int i=0; i_decode(payload, off); + + inodes[dname] = in; + } + } + + virtual void encode_payload() { + payload.append((char*)&ino, sizeof(ino)); + + // inodes + int ni = inodes.size(); + payload.append((char*)&ni, sizeof(int)); + for (map::iterator iit = inodes.begin(); + iit != inodes.end(); + iit++) { + _encode(iit->first, payload); // dentry + iit->second->_encode(payload); // inode + } + } +}; + +#endif diff --git a/branches/sage/cephmds2/mon/Elector.cc b/branches/sage/cephmds2/mon/Elector.cc new file mode 100644 index 0000000000000..a08d0bd7f87df --- /dev/null +++ b/branches/sage/cephmds2/mon/Elector.cc @@ -0,0 +1,227 @@ + +#include "Elector.h" +#include "Monitor.h" + +#include "common/Timer.h" + +#include "messages/MMonElectionRefresh.h" +#include "messages/MMonElectionStatus.h" +#include "messages/MMonElectionAck.h" +#include "messages/MMonElectionCollect.h" + +#include "config.h" 
+#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << "mon" << whoami << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << "mon" << whoami << " " + + + +class C_Elect_ReadTimer : public Context { + Elector *mon; +public: + C_Elect_ReadTimer(Elector *m) : mon(m){} + void finish(int r) { + mon->read_timer(); + } +}; + +void Elector::read_timer() +{ + lock.Lock(); + { + read_num++; + status_msg_count = 0; + old_views = views; // TODO deep copy + for (unsigned i=0; imessenger->send_message(new MMonElectionCollect(read_num), + MSG_ADDR_MON(processes[i])); + } + } + lock.Unlock(); +}; + +class C_Elect_TripTimer : public Context { + Elector *mon; +public: + C_Elect_TripTimer(Elector *m) : mon(m){} + void finish(int r) { + mon->trip_timer(); + } +}; + +void Elector::trip_timer() +{ + lock.Lock(); + { + views[whoami].expired = true; + registry[whoami].epoch.s_num++; + dout(1) << "Process " << whoami + << " timed out (" << ack_msg_count << "/" << (f + 1) + << ") ... increasing epoch. 
Now epoch is " + << registry[whoami].epoch.s_num + << endl; + } + lock.Unlock(); +}; + + + +class C_Elect_RefreshTimer : public Context { + Elector *mon; +public: + C_Elect_RefreshTimer(Elector *m) : mon(m) {} + void finish(int r) { + mon->refresh_timer(); + } +}; + +void Elector::refresh_timer() +{ + lock.Lock(); + { + ack_msg_count = 0; + refresh_num++; + MMonElectionRefresh *msg = new MMonElectionRefresh(whoami, registry[whoami], refresh_num); + for (unsigned i=0; imessenger->send_message(msg, MSG_ADDR_MON(processes[i])); + } + + // Start the trip timer + //round_trip_timer = new C_Elect_TripTimer(this); + g_timer.add_event_after(trip_delta, new C_Elect_TripTimer(this)); + } + lock.Unlock(); +}; + + + +////////////////////////// + + +Elector::Epoch Elector::get_min_epoch() +{ + assert(!views.empty()); + Epoch min = views[0].state.epoch; + for (unsigned i=1; iget_type()) { + case MSG_MON_ELECTION_ACK: + handle_ack((MMonElectionAck*)m); + break; + + case MSG_MON_ELECTION_STATUS: + handle_status((MMonElectionStatus*)m); + break; + + case MSG_MON_ELECTION_COLLECT: + handle_collect((MMonElectionCollect*)m); + break; + + case MSG_MON_ELECTION_REFRESH: + handle_refresh((MMonElectionRefresh*)m); + break; + + default: + assert(0); + } + } + lock.Unlock(); +} + +void Elector::handle_ack(MMonElectionAck* msg) +{ + assert(refresh_num >= msg->refresh_num); + + if (refresh_num > msg->refresh_num) { + // we got the message too late... 
discard it + return; + } + ack_msg_count++; + if (ack_msg_count >= f + 1) { + dout(5) << "Received _f+1 acks, increase freshness" << endl; + //g_timer.cancel_event(round_trip_task); + //round_trip_timer->cancel(); + registry[whoami].freshness++; + } + + delete msg; +} + +void Elector::handle_collect(MMonElectionCollect* msg) +{ + mon->messenger->send_message(new MMonElectionStatus(msg->get_source().num(), + msg->read_num, + registry), + msg->get_source()); + delete msg; +} + +void Elector::handle_refresh(MMonElectionRefresh* msg) +{ + if (registry[msg->p] < msg->state) { + // update local data + registry[msg->p] = msg->state; + + // reply to msg + mon->messenger->send_message(new MMonElectionAck(msg->p, + msg->refresh_num), + msg->get_source()); + } + + delete msg; +} + + +void Elector::handle_status(MMonElectionStatus* msg) +{ + if (read_num != msg->read_num) { + dout(1) << "handle_status " + << ":DISCARDED B/C OF READNUM(" << read_num << ":" + << msg->read_num << ")" + << endl; + return; + } + for (unsigned i=0; iregistry[r] > views[r].state ) { + views[r].state = msg->registry[r]; + } + } + + status_msg_count++; + if (status_msg_count >= (int)processes.size() - f) { // Responses from quorum collected + for (unsigned i=0; i old_views[r].state )) { + dout(5) << ":Other process (" << r << ") has expired" << endl; + views[r].expired = true; + } + if (views[r].state.epoch > old_views[r].state.epoch) { + views[r].expired = false; + } + } + Epoch leader_epoch = get_min_epoch(); + leader_id = leader_epoch.p_id; + dout(1) << " thinks leader has ID: " << leader_id << endl; + + // Restarts the timer for the next iteration + g_timer.add_event_after(main_delta + trip_delta, new C_Elect_ReadTimer(this)); + } +} + + + + diff --git a/branches/sage/cephmds2/mon/Elector.h b/branches/sage/cephmds2/mon/Elector.h new file mode 100644 index 0000000000000..7ec3a40a59130 --- /dev/null +++ b/branches/sage/cephmds2/mon/Elector.h @@ -0,0 +1,163 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MON_ELECTOR_H +#define __MON_ELECTOR_H + +#include +using namespace std; + +#include "include/types.h" +#include "msg/Message.h" + + +class Monitor; + + +class Elector { + public: + + //// sub-classes + + // Epoch + class Epoch { + public: + int p_id; + int s_num; + + Epoch(int p_id=0, int s_num=0) { + this->p_id = p_id; + this->s_num = s_num; + } + }; + + + // State + class State { + public: + Epoch epoch; + int freshness; + + State() : freshness(0) {}; + State(Epoch& e, int f) : + epoch(e), freshness(f) {} + }; + + + class View { + public: + State state; + bool expired; + View() : expired(false) {} + View(State& s, bool e) : state(s), expired(e) {} + }; + + + /////////////// + private: + Monitor *mon; + int whoami; + Mutex lock; + + // used during refresh phase + int ack_msg_count; + int refresh_num; + + // used during read phase + int read_num; + int status_msg_count; + + // the leader process id + int leader_id; + // f-accessible + int f; + + // the processes that compose the group + vector processes; + // parameters for the process + int main_delta; + int trip_delta; + + // state variables + map registry; + map views; + map old_views; + + // get the minimum epoch in the view map + Epoch get_min_epoch(); + + // handlers for election messages + void handle_ack(class MMonElectionAck *m); + void handle_collect(class MMonElectionCollect *m); + void handle_refresh(class MMonElectionRefresh *m); + void handle_status(class MMonElectionStatus *m); + + public: + Elector(Monitor *m, int w) : mon(m), whoami(w) { + // initialize all those values! + // ... 
+ } + + // timer methods + void read_timer(); + void trip_timer(); + void refresh_timer(); + + void dispatch(Message *m); + +}; + + +inline bool operator>(const Elector::Epoch& l, const Elector::Epoch& r) { + if (l.s_num == r.s_num) + return (l.p_id > r.p_id); + else + return (l.s_num > r.s_num); +} + +inline bool operator<(const Elector::Epoch& l, const Elector::Epoch& r) { + if (l.s_num == r.s_num) + return (l.p_id < r.p_id); + else + return (l.s_num < r.s_num); +} + +inline bool operator==(const Elector::Epoch& l, const Elector::Epoch& r) { + return ((l.s_num == r.s_num) && (l.p_id > r.p_id)); +} + + +inline bool operator>(const Elector::State& l, const Elector::State& r) +{ + if (l.epoch == r.epoch) + return (l.freshness > r.freshness); + else + return l.epoch > r.epoch; +} + +inline bool operator<(const Elector::State& l, const Elector::State& r) +{ + if (l.epoch == r.epoch) + return (l.freshness < r.freshness); + else + return l.epoch < r.epoch; +} + +inline bool operator==(const Elector::State& l, const Elector::State& r) +{ + return ( (l.epoch == r.epoch) && (l.freshness == r.freshness) ); +} + + +#endif diff --git a/branches/sage/cephmds2/mon/MDSMonitor.cc b/branches/sage/cephmds2/mon/MDSMonitor.cc new file mode 100644 index 0000000000000..e2e2553670fe7 --- /dev/null +++ b/branches/sage/cephmds2/mon/MDSMonitor.cc @@ -0,0 +1,158 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#include "MDSMonitor.h" +#include "Monitor.h" + +#include "messages/MMDSBoot.h" +#include "messages/MMDSMap.h" +#include "messages/MMDSGetMap.h" +//#include "messages/MMDSFailure.h" + +#include "common/Timer.h" + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".mds e" << mdsmap.get_epoch() << " " + + + +/********* MDS map **************/ + +void MDSMonitor::create_initial() +{ + mdsmap.epoch = 0; // until everyone boots + mdsmap.ctime = g_clock.now(); + for (int i=0; iget_type()) { + + case MSG_MDS_BOOT: + handle_mds_boot((MMDSBoot*)m); + break; + + case MSG_MDS_GETMAP: + handle_mds_getmap((MMDSGetMap*)m); + break; + + /* + case MSG_MDS_FAILURE: + handle_mds_failure((MMDSFailure*)m); + break; + */ + + case MSG_SHUTDOWN: + handle_mds_shutdown(m); + break; + + default: + assert(0); + } +} + +void MDSMonitor::handle_mds_boot(MMDSBoot *m) +{ + dout(7) << "mds_boot from " << m->get_source() << " at " << m->get_source_inst() << endl; + assert(m->get_source().is_mds()); + int from = m->get_source().num(); + + if (mdsmap.get_epoch() == 0) { + // waiting for boot! + mdsmap.mds_inst[from] = m->get_source_inst(); + mdsmap.down_mds.erase(from); + + if ((int)mdsmap.mds_inst.size() == mdsmap.get_num_mds()) { + mdsmap.inc_epoch(); + dout(-7) << "mds_boot all MDSs booted." 
<< endl; + mdsmap.encode(maps[mdsmap.get_epoch()]); // 1 + + bcast_latest_mds(); + send_current(); + } else { + dout(7) << "mds_boot waiting for " + << (mdsmap.get_num_mds() - mdsmap.mds_inst.size()) + << " mdss to boot" << endl; + } + return; + } else { + dout(0) << "mds_boot everyone already booted, so who is this? write me." << endl; + assert(0); + } +} + +void MDSMonitor::handle_mds_shutdown(Message *m) +{ + assert(m->get_source().is_mds()); + int from = m->get_source().num(); + + mdsmap.mds_inst.erase(from); + mdsmap.all_mds.erase(from); + + dout(7) << "mds_shutdown from " << m->get_source() + << ", still have " << mdsmap.all_mds + << endl; + + // tell someone? + // fixme + + delete m; +} + + +void MDSMonitor::handle_mds_getmap(MMDSGetMap *m) +{ + dout(7) << "mds_getmap from " << m->get_source() << " " << m->get_source_inst() << endl; + if (mdsmap.get_epoch() > 0) + send_full(m->get_source(), m->get_source_inst()); + else + awaiting_map[m->get_source()] = m->get_source_inst(); +} + + +void MDSMonitor::bcast_latest_mds() +{ + dout(10) << "bcast_latest_mds " << mdsmap.get_epoch() << endl; + + // tell mds + for (set::iterator p = mdsmap.get_mds().begin(); + p != mdsmap.get_mds().end(); + p++) { + if (mdsmap.is_down(*p)) continue; + send_full(MSG_ADDR_MDS(*p), mdsmap.get_inst(*p)); + } +} + +void MDSMonitor::send_full(msg_addr_t dest, const entity_inst_t& inst) +{ + dout(11) << "send_full to " << dest << " inst " << inst << endl; + messenger->send_message(new MMDSMap(&mdsmap), dest, inst); +} + +void MDSMonitor::send_current() +{ + dout(10) << "mds_send_current " << mdsmap.get_epoch() << endl; + for (map::iterator i = awaiting_map.begin(); + i != awaiting_map.end(); + i++) + send_full(i->first, i->second); + awaiting_map.clear(); +} + diff --git a/branches/sage/cephmds2/mon/MDSMonitor.h b/branches/sage/cephmds2/mon/MDSMonitor.h new file mode 100644 index 0000000000000..66e28451e1de4 --- /dev/null +++ b/branches/sage/cephmds2/mon/MDSMonitor.h @@ -0,0 +1,69 @@ +// 
-*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MDSMONITOR_H +#define __MDSMONITOR_H + +#include +#include +using namespace std; + +#include "include/types.h" +#include "msg/Messenger.h" + +#include "mds/MDSMap.h" + +class Monitor; + +class MDSMonitor : public Dispatcher { + Monitor *mon; + Messenger *messenger; + Mutex &lock; + + // mds maps + public: + MDSMap mdsmap; + + private: + map maps; + + //map inc_maps; + //MDSMap::Incremental pending_inc; + + map awaiting_map; + + + // maps + void create_initial(); + void send_current(); // send current map to waiters. + void send_full(msg_addr_t dest, const entity_inst_t& inst); + void bcast_latest_mds(); + + //void accept_pending(); // accept pending, new map. 
+ //void send_incremental(epoch_t since, msg_addr_t dest); + + void handle_mds_boot(class MMDSBoot *m); + void handle_mds_failure(class MMDSFailure *m); + void handle_mds_getmap(class MMDSGetMap *m); + void handle_mds_shutdown(Message *m); + + public: + MDSMonitor(Monitor *mn, Messenger *m, Mutex& l) : mon(mn), messenger(m), lock(l) { + create_initial(); + } + + void dispatch(Message *m); + void tick(); // check state, take actions +}; + +#endif diff --git a/branches/sage/cephmds2/mon/MonMap.h b/branches/sage/cephmds2/mon/MonMap.h new file mode 100644 index 0000000000000..e72946d76cf06 --- /dev/null +++ b/branches/sage/cephmds2/mon/MonMap.h @@ -0,0 +1,63 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __MONMAP_H +#define __MONMAP_H + +#include "msg/Message.h" +#include "include/types.h" + +class MonMap { + public: + epoch_t epoch; // what epoch of the osd cluster descriptor is this + int num_mon; + vector mon_inst; + + int last_mon; // last mon i talked to + + MonMap(int s=0) : epoch(0), num_mon(s), mon_inst(s), last_mon(-1) {} + + // pick a mon. + // choice should be stable, unless we explicitly ask for a new one. 
+ int pick_mon(bool newmon=false) { + if (newmon || (last_mon < 0)) { + last_mon = 0; //last_mon = rand() % num_mon; + } + return last_mon; + } + + const entity_inst_t get_inst(int m) { + assert(m < num_mon); + return mon_inst[m]; + } + + void encode(bufferlist& blist) { + blist.append((char*)&epoch, sizeof(epoch)); + blist.append((char*)&num_mon, sizeof(num_mon)); + + _encode(mon_inst, blist); + } + + void decode(bufferlist& blist) { + int off = 0; + blist.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + blist.copy(off, sizeof(num_mon), (char*)&num_mon); + off += sizeof(num_mon); + + _decode(mon_inst, blist, off); + } + +}; + +#endif diff --git a/branches/sage/cephmds2/mon/Monitor.cc b/branches/sage/cephmds2/mon/Monitor.cc new file mode 100644 index 0000000000000..e0462534553d6 --- /dev/null +++ b/branches/sage/cephmds2/mon/Monitor.cc @@ -0,0 +1,260 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +// TODO: missing run() method, which creates the two main timers, refreshTimer and readTimer + +#include "Monitor.h" + +#include "osd/OSDMap.h" + +#include "ebofs/Ebofs.h" + +#include "msg/Message.h" +#include "msg/Messenger.h" + +#include "messages/MPing.h" +#include "messages/MPingAck.h" +#include "messages/MGenericMessage.h" + +#include "common/Timer.h" +#include "common/Clock.h" + +#include "OSDMonitor.h" +#include "MDSMonitor.h" + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? 
(const char*)"(peon)":(const char*)"(?\?)"))) << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << whoami << (is_starting() ? (const char*)"(starting)":(is_leader() ? (const char*)"(leader)":(is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << " " + + + +void Monitor::init() +{ + dout(1) << "init" << endl; + + // store + char s[80]; + sprintf(s, "dev/mon%d", whoami); + store = new Ebofs(s); + + if (g_conf.mkfs) + store->mkfs(); + int r = store->mount(); + assert(r >= 0); + + // create + osdmon = new OSDMonitor(this, messenger, lock); + mdsmon = new MDSMonitor(this, messenger, lock); + + // i'm ready! + messenger->set_dispatcher(this); + + // start ticker + reset_tick(); +} + +void Monitor::shutdown() +{ + dout(1) << "shutdown" << endl; + + cancel_tick(); + + if (store) { + store->umount(); + delete store; + } + + // stop osds. + for (set::iterator it = osdmon->osdmap.get_osds().begin(); + it != osdmon->osdmap.get_osds().end(); + it++) { + if (osdmon->osdmap.is_down(*it)) continue; + dout(10) << "sending shutdown to osd" << *it << endl; + messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), + MSG_ADDR_OSD(*it), osdmon->osdmap.get_inst(*it)); + } + + // monitors too. + for (int i=0; inum_mon; i++) + if (i != whoami) + messenger->send_message(new MGenericMessage(MSG_SHUTDOWN), + MSG_ADDR_MON(i), monmap->get_inst(i)); + + // clean up + if (monmap) delete monmap; + if (osdmon) delete osdmon; + if (mdsmon) delete mdsmon; + + // die. 
+ messenger->shutdown(); + delete messenger; +} + + +void Monitor::call_election() +{ + dout(10) << "call_election" << endl; + state = STATE_STARTING; + + osdmon->election_starting(); + //mdsmon->election_starting(); +} + + + + + +void Monitor::dispatch(Message *m) +{ + lock.Lock(); + { + switch (m->get_type()) { + + // misc + case MSG_PING_ACK: + handle_ping_ack((MPingAck*)m); + break; + + case MSG_SHUTDOWN: + if (m->get_source().is_mds()) { + mdsmon->dispatch(m); + if (mdsmon->mdsmap.get_num_mds() == 0) + shutdown(); + } + else if (m->get_source().is_osd()) { + osdmon->dispatch(m); + } + break; + + + // OSDs + case MSG_OSD_GETMAP: + case MSG_OSD_FAILURE: + case MSG_OSD_BOOT: + case MSG_OSD_IN: + case MSG_OSD_OUT: + osdmon->dispatch(m); + break; + + + // MDSs + case MSG_MDS_BOOT: + case MSG_MDS_GETMAP: + mdsmon->dispatch(m); + break; + + + // elector messages + case MSG_MON_ELECTION_ACK: + case MSG_MON_ELECTION_STATUS: + case MSG_MON_ELECTION_COLLECT: + case MSG_MON_ELECTION_REFRESH: + elector.dispatch(m); + break; + + + default: + dout(0) << "unknown message " << *m << endl; + assert(0); + } + } + lock.Unlock(); +} + + +void Monitor::handle_shutdown(Message *m) +{ + dout(1) << "shutdown from " << m->get_source() << endl; + + shutdown(); + delete m; +} + +void Monitor::handle_ping_ack(MPingAck *m) +{ + // ... + + delete m; +} + + + + +/************ TIMER ***************/ + +class C_Mon_Tick : public Context { + Monitor *mon; +public: + C_Mon_Tick(Monitor *m) : mon(m) {} + void finish(int r) { + mon->tick(this); + } +}; + + +void Monitor::cancel_tick() +{ + if (!tick_timer) return; + + if (g_timer.cancel_event(tick_timer)) { + dout(10) << "cancel_tick canceled" << endl; + } else { + // already dispatched! + dout(10) << "cancel_tick timer dispatched, waiting to cancel" << endl; + tick_timer = (Context*)1; // hackish. 
+ while (tick_timer) + tick_timer_cond.Wait(lock); + } +} + +void Monitor::reset_tick() +{ + if (tick_timer) + cancel_tick(); + tick_timer = new C_Mon_Tick(this); + g_timer.add_event_after(g_conf.mon_tick_interval, tick_timer); +} + + +void Monitor::tick(Context *timer) +{ + lock.Lock(); + { + if (tick_timer != timer) { + dout(10) << "tick - canceled" << endl; + tick_timer = 0; + tick_timer_cond.Signal(); + lock.Unlock(); + return; + } + + tick_timer = 0; + + // ok go. + dout(10) << "tick" << endl; + + osdmon->tick(); + + // next tick! + reset_tick(); + } + lock.Unlock(); +} + + + + + + + diff --git a/branches/sage/cephmds2/mon/Monitor.h b/branches/sage/cephmds2/mon/Monitor.h new file mode 100644 index 0000000000000..0b8890fcbae3b --- /dev/null +++ b/branches/sage/cephmds2/mon/Monitor.h @@ -0,0 +1,114 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MONITOR_H +#define __MONITOR_H + +#include "include/types.h" +#include "msg/Messenger.h" + +#include "MonMap.h" +#include "Elector.h" + +class ObjectStore; +class OSDMonitor; +class MDSMonitor; + +class Monitor : public Dispatcher { +protected: + // me + int whoami; + Messenger *messenger; + Mutex lock; + + MonMap *monmap; + + // timer. 
+ Context *tick_timer; + Cond tick_timer_cond; + void cancel_tick(); + void reset_tick(); + friend class C_Mon_Tick; + + // my local store + ObjectStore *store; + + const static int INO_ELECTOR = 1; + const static int INO_MON_MAP = 2; + const static int INO_OSD_MAP = 10; + const static int INO_OSD_INC_MAP = 11; + const static int INO_MDS_MAP = 20; + + // elector + Elector elector; + friend class Elector; + + epoch_t mon_epoch; // monitor epoch (election instance) + set quorum; // current active set of monitors (if !starting) + + void call_election(); + + // monitor state + const static int STATE_STARTING = 0; + const static int STATE_LEADER = 1; + const static int STATE_PEON = 2; + int state; + + int leader; // current leader (to best of knowledge) + utime_t last_called_election; // [starting] last time i called an election + + bool is_starting() { return state == STATE_STARTING; } + bool is_leader() { return state == STATE_LEADER; } + bool is_peon() { return state == STATE_PEON; } + + // my public services + OSDMonitor *osdmon; + MDSMonitor *mdsmon; + + // messages + void handle_shutdown(Message *m); + void handle_ping_ack(class MPingAck *m); + + friend class OSDMonitor; + friend class MDSMonitor; + + public: + Monitor(int w, Messenger *m, MonMap *mm) : + whoami(w), + messenger(m), + monmap(mm), + tick_timer(0), + store(0), + elector(this, w), + mon_epoch(0), + state(STATE_STARTING), + leader(0), + osdmon(0), + mdsmon(0) + { + // hack leader, until election works. 
+ if (whoami == 0) + state = STATE_LEADER; + else + state = STATE_PEON; + } + + void init(); + void shutdown(); + void dispatch(Message *m); + void tick(Context *timer); + +}; + +#endif diff --git a/branches/sage/cephmds2/mon/OSDMonitor.cc b/branches/sage/cephmds2/mon/OSDMonitor.cc new file mode 100644 index 0000000000000..7fafbff48b2f1 --- /dev/null +++ b/branches/sage/cephmds2/mon/OSDMonitor.cc @@ -0,0 +1,869 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "OSDMonitor.h" +#include "Monitor.h" +#include "MDSMonitor.h" + +#include "osd/ObjectStore.h" + +#include "messages/MOSDFailure.h" +#include "messages/MOSDMap.h" +#include "messages/MOSDGetMap.h" +#include "messages/MOSDBoot.h" +#include "messages/MOSDIn.h" +#include "messages/MOSDOut.h" + +#include "messages/MMonOSDMapInfo.h" +#include "messages/MMonOSDMapLease.h" +#include "messages/MMonOSDMapLeaseAck.h" +#include "messages/MMonOSDMapUpdatePrepare.h" +#include "messages/MMonOSDMapUpdateAck.h" +#include "messages/MMonOSDMapUpdateCommit.h" + +#include "common/Timer.h" + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cout << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? 
(const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_mon) cerr << g_clock.now() << " mon" << mon->whoami << (mon->is_starting() ? (const char*)"(starting)":(mon->is_leader() ? (const char*)"(leader)":(mon->is_peon() ? (const char*)"(peon)":(const char*)"(?\?)"))) << ".osd(" << (state == STATE_INIT ? (const char*)"init":(state == STATE_SYNC ? (const char*)"sync":(state == STATE_LOCK ? (const char*)"lock":(state == STATE_UPDATING ? (const char*)"updating":(const char*)"?\?")))) << ") e" << osdmap.get_epoch() << " " + + +class C_Mon_FakeOSDFailure : public Context { + OSDMonitor *mon; + int osd; + bool down; +public: + C_Mon_FakeOSDFailure(OSDMonitor *m, int o, bool d) : mon(m), osd(o), down(d) {} + void finish(int r) { + mon->fake_osd_failure(osd,down); + } +}; + + +void OSDMonitor::fake_osdmap_update() +{ + dout(1) << "fake_osdmap_update" << endl; + accept_pending(); + + // tell a random osd + int osd = rand() % g_conf.num_osd; + send_incremental(osdmap.get_epoch()-1, // ick! FIXME + MSG_ADDR_OSD(osd), osdmap.get_inst(osd)); +} + + +void OSDMonitor::fake_reorg() +{ + int r = rand() % g_conf.num_osd; + + if (osdmap.is_out(r)) { + dout(1) << "fake_reorg marking osd" << r << " in" << endl; + pending_inc.new_in.push_back(r); + } else { + dout(1) << "fake_reorg marking osd" << r << " out" << endl; + pending_inc.new_out.push_back(r); + } + + accept_pending(); + + // tell him! + send_incremental(osdmap.get_epoch()-1, MSG_ADDR_OSD(r), osdmap.get_inst(r)); + + // do it again? + /* + if (g_conf.num_osd - d > 4 && + g_conf.num_osd - d > g_conf.num_osd/2) + g_timer.add_event_after(g_conf.fake_osdmap_expand, + new C_Mon_Faker(this)); + */ +} + + + +void OSDMonitor::init() +{ + // start with blank map + + // load my last state from the store + bufferlist bl; + if (get_map_bl(0, bl)) { // FIXME + // yay! 
+ osdmap.decode(bl); + dout(1) << "init got epoch " << osdmap.get_epoch() << " from store" << endl; + + // set up pending_inc + pending_inc.epoch = osdmap.get_epoch()+1; + + } else { + // FIXME. when elections work! + if (mon->is_leader()) { + create_initial(); + issue_leases(); + } + } +} + + + + +/************ MAPS ****************/ + + +void OSDMonitor::create_initial() +{ + dout(1) << "create_initial generating osdmap from g_conf" << endl; + + // + osdmap.mon_epoch = mon->mon_epoch; + osdmap.ctime = g_clock.now(); + + if (g_conf.osd_pg_bits) { + osdmap.set_pg_bits(g_conf.osd_pg_bits); + } else { + int osdbits = 1; + int n = g_conf.num_osd; + while (n) { + n = n >> 1; + osdbits++; + } + + // 2 bits per osd. + osdmap.set_pg_bits(osdbits + 2); + } + + // start at epoch 0 until all osds boot + //osdmap.inc_epoch(); // = 1 + //assert(osdmap.get_epoch() == 1); + + if (g_conf.num_osd >= 12) { + int ndom = g_conf.osd_max_rep; + UniformBucket *domain[ndom]; + int domid[ndom]; + for (int i=0; iadd_item(i, 1.0); + //cerr << "osd" << i << " in domain " << dom << endl; + i++; + if (i == g_conf.num_osd) break; + } + } + + // root + Bucket *root = new ListBucket(2); + for (int i=0; iget_weight() << endl; + root->add_item(domid[i], domain[i]->get_weight()); + } + int nroot = osdmap.crush.add_bucket(root); + + // rules + for (int i=1; i<=ndom; i++) { + osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, nroot)); + osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 1)); + osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); + osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + } + + // test + //vector out; + //osdmap.pg_to_osds(0x40200000110ULL, out); + + } else { + // one bucket + Bucket *b = new UniformBucket(1, 0); + int root = osdmap.crush.add_bucket(b); + for (int i=0; iadd_item(i, 1.0); + } + + for (int i=1; i<=g_conf.osd_max_rep; i++) { + osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_TAKE, 
root)); + osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, i, 0)); + osdmap.crush.rules[i].steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + } + } + + if (g_conf.mds_local_osd) { + // add mds osds, but don't put them in the crush mapping func + for (int i=0; i + + // fake osd failures + for (map::iterator i = g_fake_osd_down.begin(); + i != g_fake_osd_down.end(); + i++) { + dout(0) << "will fake osd" << i->first << " DOWN after " << i->second << endl; + g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 1)); + } + for (map::iterator i = g_fake_osd_out.begin(); + i != g_fake_osd_out.end(); + i++) { + dout(0) << "will fake osd" << i->first << " OUT after " << i->second << endl; + g_timer.add_event_after(i->second, new C_Mon_FakeOSDFailure(this, i->first, 0)); + } +} + + +bool OSDMonitor::get_map_bl(epoch_t epoch, bufferlist& bl) +{ + object_t oid(Monitor::INO_OSD_MAP, epoch); + if (!mon->store->exists(oid)) + return false; + int r = mon->store->read(oid, 0, 0, bl); + assert(r > 0); + return true; +} + +bool OSDMonitor::get_inc_map_bl(epoch_t epoch, bufferlist& bl) +{ + object_t oid(Monitor::INO_OSD_INC_MAP, epoch); + if (!mon->store->exists(oid)) + return false; + int r = mon->store->read(oid, 0, 0, bl); + assert(r > 0); + return true; +} + + +void OSDMonitor::save_map() +{ + bufferlist bl; + osdmap.encode(bl); + + ObjectStore::Transaction t; + t.write(object_t(Monitor::INO_OSD_MAP,0), 0, bl.length(), bl); + t.write(object_t(Monitor::INO_OSD_MAP,osdmap.get_epoch()), 0, bl.length(), bl); + mon->store->apply_transaction(t); + mon->store->sync(); +} + +void OSDMonitor::save_inc_map(OSDMap::Incremental &inc) +{ + bufferlist bl; + osdmap.encode(bl); + + bufferlist incbl; + inc.encode(incbl); + + ObjectStore::Transaction t; + t.write(object_t(Monitor::INO_OSD_MAP,0), 0, bl.length(), bl); + t.write(object_t(Monitor::INO_OSD_MAP,osdmap.get_epoch()), 0, bl.length(), bl); // not strictly needed?? 
+ t.write(object_t(Monitor::INO_OSD_INC_MAP,osdmap.get_epoch()), 0, incbl.length(), incbl); + mon->store->apply_transaction(t); + mon->store->sync(); +} + + + +void OSDMonitor::dispatch(Message *m) +{ + switch (m->get_type()) { + + // services + case MSG_OSD_GETMAP: + handle_osd_getmap((MOSDGetMap*)m); + break; + case MSG_OSD_FAILURE: + handle_osd_failure((MOSDFailure*)m); + break; + case MSG_OSD_BOOT: + handle_osd_boot((MOSDBoot*)m); + break; + case MSG_OSD_IN: + handle_osd_in((MOSDIn*)m); + break; + case MSG_OSD_OUT: + handle_osd_out((MOSDOut*)m); + break; + + // replication + case MSG_MON_OSDMAP_INFO: + handle_info((MMonOSDMapInfo*)m); + break; + case MSG_MON_OSDMAP_LEASE: + handle_lease((MMonOSDMapLease*)m); + break; + case MSG_MON_OSDMAP_LEASE_ACK: + handle_lease_ack((MMonOSDMapLeaseAck*)m); + break; + case MSG_MON_OSDMAP_UPDATE_PREPARE: + handle_update_prepare((MMonOSDMapUpdatePrepare*)m); + break; + case MSG_MON_OSDMAP_UPDATE_ACK: + handle_update_ack((MMonOSDMapUpdateAck*)m); + break; + case MSG_MON_OSDMAP_UPDATE_COMMIT: + handle_update_commit((MMonOSDMapUpdateCommit*)m); + break; + + default: + assert(0); + } +} + + + +void OSDMonitor::handle_osd_failure(MOSDFailure *m) +{ + dout(1) << "osd failure: " << m->get_failed() << " from " << m->get_source() << endl; + + // FIXME + // take their word for it + int from = m->get_failed().num(); + if (osdmap.is_up(from) && + (osdmap.osd_inst.count(from) == 0 || + osdmap.osd_inst[from] == m->get_inst())) { + pending_inc.new_down[from] = m->get_inst(); + + if (osdmap.is_in(from)) + down_pending_out[from] = g_clock.now(); + + //awaiting_maps[pending_inc.epoch][m->get_source()] = + + accept_pending(); + + send_incremental(m->get_epoch(), m->get_source(), m->get_source_inst()); + + send_waiting(); + bcast_latest_mds(); + } + + delete m; +} + + +void OSDMonitor::fake_osd_failure(int osd, bool down) +{ + lock.Lock(); + { + if (down) { + dout(1) << "fake_osd_failure DOWN osd" << osd << endl; + pending_inc.new_down[osd] = 
osdmap.osd_inst[osd]; + } else { + dout(1) << "fake_osd_failure OUT osd" << osd << endl; + pending_inc.new_out.push_back(osd); + } + accept_pending(); + bcast_latest_osd(); + bcast_latest_mds(); + } + lock.Unlock(); +} + + +void OSDMonitor::handle_osd_boot(MOSDBoot *m) +{ + dout(7) << "osd_boot from " << m->get_source() << endl; + assert(m->get_source().is_osd()); + int from = m->get_source().num(); + + if (osdmap.get_epoch() == 0) { + // waiting for boot! + osdmap.osd_inst[from] = m->get_source_inst(); + + if (osdmap.osd_inst.size() == osdmap.osds.size()) { + dout(-7) << "osd_boot all osds booted." << endl; + osdmap.inc_epoch(); + + save_map(); + + pending_inc.epoch = osdmap.get_epoch()+1; // 2 + + bcast_latest_osd(); + bcast_latest_mds(); + } else { + dout(7) << "osd_boot waiting for " + << (osdmap.osds.size() - osdmap.osd_inst.size()) + << " osds to boot" << endl; + } + return; + } + + // already up? mark down first? + if (osdmap.is_up(from)) { + pending_inc.new_down[from] = osdmap.osd_inst[from]; + accept_pending(); + } + + // mark up. + down_pending_out.erase(from); + assert(osdmap.is_down(from)); + pending_inc.new_up[from] = m->get_source_inst(); + + // mark in? 
+ if (osdmap.out_osds.count(from)) + pending_inc.new_in.push_back(from); + + accept_pending(); + + // the booting osd will spread word + send_incremental(m->sb.current_epoch, m->get_source(), m->get_source_inst()); + delete m; + + // tell mds + bcast_latest_mds(); +} + +void OSDMonitor::handle_osd_in(MOSDIn *m) +{ + dout(7) << "osd_in from " << m->get_source() << endl; + int from = m->get_source().num(); + + if (osdmap.is_out(from)) + pending_inc.new_in.push_back(from); + accept_pending(); + send_incremental(m->map_epoch, m->get_source(), m->get_source_inst()); +} + +void OSDMonitor::handle_osd_out(MOSDOut *m) +{ + dout(7) << "osd_out from " << m->get_source() << endl; + int from = m->get_source().num(); + if (osdmap.is_in(from)) { + pending_inc.new_out.push_back(from); + accept_pending(); + send_incremental(m->map_epoch, m->get_source(), m->get_source_inst()); + } +} + +void OSDMonitor::handle_osd_getmap(MOSDGetMap *m) +{ + dout(7) << "osd_getmap from " << m->get_source() << " since " << m->get_since() << endl; + + if (osdmap.get_epoch() == 0) { + awaiting_map[m->get_source()].first = m->get_source_inst(); + awaiting_map[m->get_source()].second = m->get_since(); + } else { + //if (m->get_since()) + send_incremental(m->get_since(), m->get_source(), m->get_source_inst()); + //else + //send_full(m->get_source(), m->get_source_inst()); + } + delete m; +} + + + +void OSDMonitor::accept_pending() +{ + dout(-10) << "accept_pending " << osdmap.get_epoch() << " -> " << pending_inc.epoch << endl; + + // accept pending into a new map! + pending_inc.ctime = g_clock.now(); + pending_inc.mon_epoch = mon->mon_epoch; + + // advance! + osdmap.apply_incremental(pending_inc); + + // save it. 
+ save_inc_map( pending_inc ); + + // tell me about it + for (map::iterator i = pending_inc.new_up.begin(); + i != pending_inc.new_up.end(); + i++) { + dout(0) << "osd" << i->first << " UP " << i->second << endl; + derr(0) << "osd" << i->first << " UP " << i->second << endl; + messenger->mark_up(MSG_ADDR_OSD(i->first), i->second); + } + for (map::iterator i = pending_inc.new_down.begin(); + i != pending_inc.new_down.end(); + i++) { + dout(0) << "osd" << i->first << " DOWN " << i->second << endl; + derr(0) << "osd" << i->first << " DOWN " << i->second << endl; + messenger->mark_down(MSG_ADDR_OSD(i->first), i->second); + } + for (list::iterator i = pending_inc.new_in.begin(); + i != pending_inc.new_in.end(); + i++) { + dout(0) << "osd" << *i << " IN" << endl; + derr(0) << "osd" << *i << " IN" << endl; + } + for (list::iterator i = pending_inc.new_out.begin(); + i != pending_inc.new_out.end(); + i++) { + dout(0) << "osd" << *i << " OUT" << endl; + derr(0) << "osd" << *i << " OUT" << endl; + } + + // clear new pending + OSDMap::Incremental next(osdmap.get_epoch() + 1); + pending_inc = next; +} + +void OSDMonitor::send_waiting() +{ + dout(10) << "send_waiting " << osdmap.get_epoch() << endl; + + for (map >::iterator i = awaiting_map.begin(); + i != awaiting_map.end(); + i++) + send_incremental(i->second.second, i->first, i->second.first); +} + + +void OSDMonitor::send_full(msg_addr_t who, const entity_inst_t& inst) +{ + messenger->send_message(new MOSDMap(&osdmap), who, inst); +} + +void OSDMonitor::send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst) +{ + dout(5) << "osd_send_incremental " << since << " -> " << osdmap.get_epoch() + << " to " << dest << endl; + + MOSDMap *m = new MOSDMap; + + for (epoch_t e = osdmap.get_epoch(); + e > since; + e--) { + bufferlist bl; + if (get_inc_map_bl(e, bl)) { + dout(10) << "osd_send_incremental inc " << e << endl; + m->incremental_maps[e] = bl; + } + else if (get_map_bl(e, bl)) { + dout(10) << 
"osd_send_incremental full " << e << endl; + m->maps[e] = bl; + } + else { + assert(0); // we should have all maps. + } + } + + messenger->send_message(m, dest, inst); +} + + + +void OSDMonitor::bcast_latest_mds() +{ + epoch_t e = osdmap.get_epoch(); + dout(1) << "bcast_latest_mds epoch " << e << endl; + + // tell mds + for (set::iterator i = mon->mdsmon->mdsmap.get_mds().begin(); + i != mon->mdsmon->mdsmap.get_mds().end(); + i++) { + if (mon->mdsmon->mdsmap.is_down(*i)) continue; + send_incremental(osdmap.get_epoch()-1, MSG_ADDR_MDS(*i), mon->mdsmon->mdsmap.get_inst(*i)); + } +} + +void OSDMonitor::bcast_latest_osd() +{ + epoch_t e = osdmap.get_epoch(); + dout(1) << "bcast_latest_osd epoch " << e << endl; + + // tell osds + set osds; + osdmap.get_all_osds(osds); + for (set::iterator it = osds.begin(); + it != osds.end(); + it++) { + if (osdmap.is_down(*it)) continue; + + send_incremental(osdmap.get_epoch()-1, MSG_ADDR_OSD(*it), osdmap.get_inst(*it)); + } +} + + + +void OSDMonitor::tick() +{ + // mark down osds out? + utime_t now = g_clock.now(); + list mark_out; + for (map::iterator i = down_pending_out.begin(); + i != down_pending_out.end(); + i++) { + utime_t down = now; + down -= i->second; + + if (down.sec() >= g_conf.mon_osd_down_out_interval) { + dout(10) << "tick marking osd" << i->first << " OUT after " << down << " sec" << endl; + mark_out.push_back(i->first); + } + } + for (list::iterator i = mark_out.begin(); + i != mark_out.end(); + i++) { + down_pending_out.erase(*i); + pending_inc.new_out.push_back( *i ); + } + if (!mark_out.empty()) { + accept_pending(); + + // hrmpf. bcast map for now. FIXME FIXME. + bcast_latest_osd(); + } +} + +void OSDMonitor::election_starting() +{ + dout(10) << "election_starting" << endl; +} + +void OSDMonitor::election_finished() +{ + dout(10) << "election_starting" << endl; + + state = STATE_INIT; + + if (mon->is_leader()) { + // leader. + if (mon->monmap->num_mon == 1) { + // hmm, it's just me! 
+ state = STATE_SYNC; + } + } + else if (mon->is_peon()) { + // peon. send info + messenger->send_message(new MMonOSDMapInfo(osdmap.epoch, osdmap.mon_epoch), + MSG_ADDR_MON(mon->leader), mon->monmap->get_inst(mon->leader)); + } + +} + + + +void OSDMonitor::handle_info(MMonOSDMapInfo *m) +{ + dout(10) << "handle_info from " << m->get_source() + << " epoch " << m->get_epoch() << " in mon_epoch " << m->get_mon_epoch() + << endl; + + epoch_t epoch = m->get_epoch(); + + // did they have anything? + if (epoch > 0) { + // make sure it's current. + if (epoch == osdmap.get_epoch()) { + if (osdmap.mon_epoch != m->get_mon_epoch()) { + dout(10) << "handle_info had divergent epoch " << m->get_epoch() + << ", mon_epoch " << m->get_mon_epoch() << " != " << osdmap.mon_epoch << endl; + epoch--; + } + } else { + bufferlist bl; + get_map_bl(epoch, bl); + + OSDMap old; + old.decode(bl); + + if (old.mon_epoch != m->get_mon_epoch()) { + dout(10) << "handle_info had divergent epoch " << m->get_epoch() + << ", mon_epoch " << m->get_mon_epoch() << " != " << old.mon_epoch << endl; + epoch--; + } + } + } + + // bring up to date + if (epoch < osdmap.get_epoch()) + send_incremental(epoch, m->get_source(), m->get_source_inst()); + + delete m; +} + + +void OSDMonitor::issue_leases() +{ + dout(10) << "issue_leases" << endl; + assert(mon->is_leader()); + + // set lease endpoint + lease_expire = g_clock.now(); + lease_expire += g_conf.mon_lease; + + pending_ack.clear(); + + for (set::iterator i = mon->quorum.begin(); + i != mon->quorum.end(); + i++) { + if (*i == mon->whoami) continue; + messenger->send_message(new MMonOSDMapLease(osdmap.get_epoch(), lease_expire), + MSG_ADDR_MON(*i), mon->monmap->get_inst(*i)); + pending_ack.insert(*i); + } +} + +void OSDMonitor::handle_lease(MMonOSDMapLease *m) +{ + if (m->get_epoch() != osdmap.get_epoch() + 1) { + dout(10) << "map_lease from " << m->get_source() + << " on epoch " << m->get_epoch() << ", but i am " << osdmap.get_epoch() << endl; + assert(0); + 
delete m; + return; + } + + dout(10) << "map_lease from " << m->get_source() << " expires " << lease_expire << endl; + lease_expire = m->get_lease_expire(); + + delete m; +} + +void OSDMonitor::handle_lease_ack(MMonOSDMapLeaseAck *m) +{ + // right epoch? + if (m->get_epoch() != osdmap.get_epoch()) { + dout(10) << "map_lease_ack from " << m->get_source() + << " on old epoch " << m->get_epoch() << ", dropping" << endl; + delete m; + return; + } + + // within time limit? + if (g_clock.now() >= lease_expire) { + dout(10) << "map_lease_ack from " << m->get_source() + << ", but lease expired, calling election" << endl; + mon->call_election(); + delete m; + return; + } + + assert(m->get_source().is_mon()); + int from = m->get_source().num(); + + assert(pending_ack.count(from)); + pending_ack.erase(from); + + if (pending_ack.empty()) { + dout(10) << "map_lease_ack from " << m->get_source() + << ", last one" << endl; + } else { + dout(10) << "map_lease_ack from " << m->get_source() + << ", still waiting on " << pending_ack << endl; + } + + delete m; +} + + +void OSDMonitor::update_map() +{ + // lock map + state = STATE_UPDATING; + pending_ack.clear(); + + // set lease endpoint + lease_expire += g_conf.mon_lease; + + // send prepare + epoch_t epoch = osdmap.get_epoch(); + bufferlist map_bl, inc_map_bl; + if (!get_inc_map_bl(epoch, inc_map_bl)) + get_map_bl(epoch, map_bl); + + for (set::iterator i = mon->quorum.begin(); + i != mon->quorum.end(); + i++) { + if (*i == mon->whoami) continue; + messenger->send_message(new MMonOSDMapUpdatePrepare(epoch, + map_bl, inc_map_bl), + MSG_ADDR_MON(*i), mon->monmap->get_inst(*i)); + pending_ack.insert(*i); + } +} + + + +void OSDMonitor::handle_update_prepare(MMonOSDMapUpdatePrepare *m) +{ + dout(10) << "map_update_prepare from " << m->get_source() << " epoch " << m->get_epoch() << endl; + // accept map + assert(m->get_epoch() == osdmap.get_epoch() + 1); + + if (m->inc_map_bl.length()) { + int off = 0; + pending_inc.decode(m->inc_map_bl, 
off); + accept_pending(); + } else { + osdmap.decode(m->map_bl); + } + + // state + state = STATE_LOCK; + //lease_expire = m->lease_expire; + + // ack + messenger->send_message(new MMonOSDMapUpdateAck(osdmap.get_epoch()), + m->get_source(), m->get_source_inst()); + delete m; +} + +void OSDMonitor::handle_update_ack(MMonOSDMapUpdateAck *m) +{ + /* + // right epoch? + if (m->get_epoch() != osdmap.get_epoch()) { + dout(10) << "map_update_ack from " << m->get_source() + << " on old epoch " << m->get_epoch() << ", dropping" << endl; + delete m; + return; + } + + // within time limit? + if (g_clock.now() >= lease_expire) { + dout(10) << "map_update_ack from " << m->get_source() + << ", but lease expired, calling election" << endl; + state = STATE_SYNC; + mon->call_election(); + return; + } + + assert(m->get_source().is_mon()); + int from = m->get_source().num(); + + assert(pending_lease_ack.count(from)); + pending_lease_ack.erase(from); + + if (pending_lease_ack.empty()) { + dout(10) << "map_update_ack from " << m->get_source() + << ", last one" << endl; + state = STATE_SYNC; + + // send lease commit + for (map::iterator i = mon->quorum.begin(); + i != mon->quorum.end(); + i++) { + if (i == mon->whoami) continue; + messenger->send_message(new MMonOSDMapLeaseCommit(osdmap), + MSG_ADDR_MON(*i), mon->monmap->get_inst(*i)); + } + } else { + dout(10) << "map_update_ack from " << m->get_source() + << ", still waiting on " << pending_lease_ack << endl; + } +*/ +} + +void OSDMonitor::handle_update_commit(MMonOSDMapUpdateCommit *m) +{ +} diff --git a/branches/sage/cephmds2/mon/OSDMonitor.h b/branches/sage/cephmds2/mon/OSDMonitor.h new file mode 100644 index 0000000000000..9936ecc1ff70e --- /dev/null +++ b/branches/sage/cephmds2/mon/OSDMonitor.h @@ -0,0 +1,108 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it 
and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __OSDMONITOR_H +#define __OSDMONITOR_H + +#include +#include +using namespace std; + +#include "include/types.h" +#include "msg/Messenger.h" + +#include "osd/OSDMap.h" + +class Monitor; + +class OSDMonitor : public Dispatcher { + Monitor *mon; + Messenger *messenger; + Mutex &lock; + + // osd maps +public: + OSDMap osdmap; + +private: + map > awaiting_map; + + void create_initial(); + bool get_map_bl(epoch_t epoch, bufferlist &bl); + bool get_inc_map_bl(epoch_t epoch, bufferlist &bl); + + void save_map(); + void save_inc_map(OSDMap::Incremental &inc); + + // [leader] + OSDMap::Incremental pending_inc; + map down_pending_out; // osd down -> out + + set pending_ack; + + // we are distributed + const static int STATE_INIT = 0; // startup + const static int STATE_SYNC = 1; // sync map copy (readonly) + const static int STATE_LOCK = 2; // [peon] map locked + const static int STATE_UPDATING = 3; // [leader] map locked, waiting for peon ack + + int state; + utime_t lease_expire; // when lease expires + + void init(); + + // maps + void accept_pending(); // accept pending, new map. + void send_waiting(); // send current map to waiters. 
+ void send_full(msg_addr_t dest, const entity_inst_t& inst); + void send_incremental(epoch_t since, msg_addr_t dest, const entity_inst_t& inst); + void bcast_latest_mds(); + void bcast_latest_osd(); + + void update_map(); + + void handle_osd_boot(class MOSDBoot *m); + void handle_osd_in(class MOSDIn *m); + void handle_osd_out(class MOSDOut *m); + void handle_osd_failure(class MOSDFailure *m); + void handle_osd_getmap(class MOSDGetMap *m); + + void handle_info(class MMonOSDMapInfo*); + void handle_lease(class MMonOSDMapLease*); + void handle_lease_ack(class MMonOSDMapLeaseAck*); + void handle_update_prepare(class MMonOSDMapUpdatePrepare*); + void handle_update_ack(class MMonOSDMapUpdateAck*); + void handle_update_commit(class MMonOSDMapUpdateCommit*); + + public: + OSDMonitor(Monitor *mn, Messenger *m, Mutex& l) : + mon(mn), messenger(m), lock(l), + state(STATE_SYNC) { + init(); + } + + void dispatch(Message *m); + void tick(); // check state, take actions + + void election_starting(); // abort whatever. + void election_finished(); // reinitialize whatever. + + void issue_leases(); + + void fake_osd_failure(int osd, bool down); + void fake_osdmap_update(); + void fake_reorg(); +}; + +#endif diff --git a/branches/sage/cephmds2/msg/Dispatcher.cc b/branches/sage/cephmds2/msg/Dispatcher.cc new file mode 100644 index 0000000000000..edee54a2c631f --- /dev/null +++ b/branches/sage/cephmds2/msg/Dispatcher.cc @@ -0,0 +1,27 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include "Dispatcher.h" +#include "Messenger.h" + +#include "mds/MDS.h" + +/* +int Dispatcher::send_message(Message *m, msg_addr_t dest, int dest_port) +{ + assert(0); + //return dis_messenger->send_message(m, dest, dest_port, MDS_PORT_SERVER); // on my port! +} +*/ diff --git a/branches/sage/cephmds2/msg/Dispatcher.h b/branches/sage/cephmds2/msg/Dispatcher.h new file mode 100644 index 0000000000000..e6fe8d8da47ce --- /dev/null +++ b/branches/sage/cephmds2/msg/Dispatcher.h @@ -0,0 +1,40 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __DISPATCHER_H +#define __DISPATCHER_H + +#include "Message.h" + +class Messenger; + +class Dispatcher { + public: + virtual ~Dispatcher() { } + + // how i receive messages + virtual void dispatch(Message *m) = 0; + + + // how i deal with transmission failures. 
+ virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) { delete m; } + + // lookups + virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst) { assert(0); return 0; } + + // this is how i send messages + //int send_message(Message *m, msg_addr_t dest, int dest_port); +}; + +#endif diff --git a/branches/sage/cephmds2/msg/FakeMessenger.cc b/branches/sage/cephmds2/msg/FakeMessenger.cc new file mode 100644 index 0000000000000..01f6301c2618e --- /dev/null +++ b/branches/sage/cephmds2/msg/FakeMessenger.cc @@ -0,0 +1,379 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "Message.h" +#include "FakeMessenger.h" +#include "mds/MDS.h" + +#include "common/Timer.h" + +#include "common/LogType.h" +#include "common/Logger.h" + +#include "config.h" + +#undef dout +#define dout(x) if ((x) <= g_conf.debug_ms) cout << g_clock.now() << " " + + + +#include +#include +#include +#include +#include + +using namespace std; + +#include +using namespace __gnu_cxx; + + +#include "common/Cond.h" +#include "common/Mutex.h" +#include + + +// global queue. 
+ +map directory; +hash_map loggers; +LogType fakemsg_logtype; + +set shutdown_set; + +Mutex lock; +Cond cond; + +bool pending_timer = false; + +bool awake = false; +bool fm_shutdown = false; +pthread_t thread_id; + + +class C_FakeKicker : public Context { + void finish(int r) { + dout(18) << "timer kick" << endl; + pending_timer = true; + lock.Lock(); + cond.Signal(); // why not + lock.Unlock(); + } +}; + +void FakeMessenger::callback_kick() +{ + pending_timer = true; + lock.Lock(); + cond.Signal(); // why not + lock.Unlock(); +} + +void *fakemessenger_thread(void *ptr) +{ + //dout(1) << "thread start, setting timer kicker" << endl; + //g_timer.set_messenger_kicker(new C_FakeKicker()); + //msgr_callback_kicker = new C_FakeKicker(); + + lock.Lock(); + while (1) { + dout(20) << "thread waiting" << endl; + if (fm_shutdown) break; + awake = false; + cond.Wait(lock); + awake = true; + dout(20) << "thread woke up" << endl; + if (fm_shutdown) break; + + fakemessenger_do_loop_2(); + + if (directory.empty()) break; + } + lock.Unlock(); + + //cout << "unsetting messenger" << endl; + //g_timer.unset_messenger_kicker(); + //g_timer.unset_messenger(); + //msgr_callback_kicker = 0; + + dout(1) << "thread finish (i woke up but no messages, bye)" << endl; + return 0; +} + + +void fakemessenger_startthread() { + pthread_create(&thread_id, NULL, fakemessenger_thread, 0); +} + +void fakemessenger_stopthread() { + cout << "fakemessenger_stopthread setting stop flag" << endl; + lock.Lock(); + fm_shutdown = true; + lock.Unlock(); + cond.Signal(); + + fakemessenger_wait(); +} + +void fakemessenger_wait() +{ + cout << "fakemessenger_wait waiting" << endl; + void *ptr; + pthread_join(thread_id, &ptr); +} + + + + +// lame main looper + +int fakemessenger_do_loop() +{ + lock.Lock(); + fakemessenger_do_loop_2(); + lock.Unlock(); + + g_timer.shutdown(); + return 0; +} + + +int fakemessenger_do_loop_2() +{ + //lock.Lock(); + dout(18) << "do_loop begin." 
<< endl; + + while (1) { + bool didone = false; + + dout(18) << "do_loop top" << endl; + + /*// timer? + if (pending_timer) { + pending_timer = false; + dout(5) << "pending timer" << endl; + g_timer.execute_pending(); + } + */ + + // callbacks + lock.Unlock(); + Messenger::do_callbacks(); + lock.Lock(); + + // messages + map::iterator it = directory.begin(); + while (it != directory.end()) { + + dout(18) << "messenger " << it->second << " at " << MSG_ADDR_NICE(it->first) << " has " << it->second->num_incoming() << " queued" << endl; + + FakeMessenger *mgr = it->second; + + if (!mgr->is_ready()) { + dout(18) << "messenger " << it->second << " at " << MSG_ADDR_NICE(it->first) << " has no dispatcher, skipping" << endl; + it++; + continue; + } + + Message *m = mgr->get_message(); + it++; + + if (m) { + //dout(18) << "got " << m << endl; + dout(1) << "---- '" << m->get_type_name() + << "' from " << MSG_ADDR_NICE(m->get_source()) // << ':' << m->get_source_port() + << " to " << MSG_ADDR_NICE(m->get_dest()) //<< ':' << m->get_dest_port() + << " ---- " << m + << endl; + + if (g_conf.fakemessenger_serialize) { + // encode + if (m->empty_payload()) + m->encode_payload(); + msg_envelope_t env = m->get_envelope(); + bufferlist bl; + bl.claim( m->get_payload() ); + //bl.c_str(); // condense into 1 buffer + + delete m; + + // decode + m = decode_message(env, bl); + assert(m); + } + + didone = true; + + lock.Unlock(); + mgr->dispatch(m); + lock.Lock(); + } + } + + // deal with shutdowns.. 
dleayed to avoid concurrent directory modification + if (!shutdown_set.empty()) { + for (set::iterator it = shutdown_set.begin(); + it != shutdown_set.end(); + it++) { + dout(7) << "fakemessenger: removing " << *it << " from directory" << endl; + assert(directory.count(*it)); + directory.erase(*it); + if (directory.empty()) { + dout(1) << "fakemessenger: last shutdown" << endl; + ::fm_shutdown = true; + } + } + shutdown_set.clear(); + } + + if (!didone) + break; + } + + + dout(18) << "do_loop end (no more messages)." << endl; + //lock.Unlock(); + return 0; +} + + +FakeMessenger::FakeMessenger(msg_addr_t me) : Messenger(me) +{ + myaddr = me; + lock.Lock(); + directory[ myaddr ] = this; + lock.Unlock(); + + cout << "fakemessenger " << myaddr << " messenger is " << this << endl; + + //g_timer.set_messenger(this); + + qlen = 0; + + /* + string name; + name = "m."; + name += MSG_ADDR_TYPE(myaddr); + int w = MSG_ADDR_NUM(myaddr); + if (w >= 1000) name += ('0' + ((w/1000)%10)); + if (w >= 100) name += ('0' + ((w/100)%10)); + if (w >= 10) name += ('0' + ((w/10)%10)); + name += ('0' + ((w/1)%10)); + + loggers[ myaddr ] = new Logger(name, (LogType*)&fakemsg_logtype); + */ +} + +FakeMessenger::~FakeMessenger() +{ + +} + + +int FakeMessenger::shutdown() +{ + //cout << "shutdown on messenger " << this << " has " << num_incoming() << " queued" << endl; + lock.Lock(); + assert(directory.count(myaddr) == 1); + shutdown_set.insert(myaddr); + + /* + directory.erase(myaddr); + if (directory.empty()) { + dout(1) << "fakemessenger: last shutdown" << endl; + ::fm_shutdown = true; + cond.Signal(); // why not + } + */ + + /* + if (loggers[myaddr]) { + delete loggers[myaddr]; + loggers.erase(myaddr); + } + */ + + lock.Unlock(); + return 0; +} + +/* +void FakeMessenger::trigger_timer(Timer *t) +{ + // note timer to call + pending_timer = t; + + // wake up thread? 
+ cond.Signal(); // why not +} +*/ + +int FakeMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport) +{ + m->set_source(myaddr, fromport); + m->set_dest(dest, port); + //m->set_lamport_send_stamp( get_lamport() ); + + entity_inst_t blank; + m->set_source_inst(blank); + + lock.Lock(); + + // deliver + try { +#ifdef LOG_MESSAGES + // stats + loggers[myaddr]->inc("+send",1); + loggers[dest]->inc("-recv",1); + + char s[20]; + sprintf(s,"+%s", m->get_type_name()); + loggers[myaddr]->inc(s); + sprintf(s,"-%s", m->get_type_name()); + loggers[dest]->inc(s); +#endif + + // queue + FakeMessenger *dm = directory[dest]; + if (!dm) { + dout(1) << "** destination " << MSG_ADDR_NICE(dest) << " (" << dest << ") dne" << endl; + assert(dm); + } + dm->queue_incoming(m); + + dout(1) << "--> " << myaddr << " sending " << m << " '" << m->get_type_name() << "'" + << " to " << MSG_ADDR_NICE(dest) + << endl;//" m " << dm << " has " << dm->num_incoming() << " queued" << endl; + + } + catch (...) { + cout << "no destination " << dest << endl; + assert(0); + } + + + // wake up loop? + if (!awake) { + dout(10) << "waking up fakemessenger thread" << endl; + cond.Signal(); + lock.Unlock(); + } else + lock.Unlock(); + + return 0; +} + + diff --git a/branches/sage/cephmds2/msg/FakeMessenger.h b/branches/sage/cephmds2/msg/FakeMessenger.h new file mode 100644 index 0000000000000..51bec779c4366 --- /dev/null +++ b/branches/sage/cephmds2/msg/FakeMessenger.h @@ -0,0 +1,81 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#ifndef __FAKEMESSENGER_H +#define __FAKEMESSENGER_H + +#include "Messenger.h" +#include "Dispatcher.h" + +#include +#include + +class Timer; + +class FakeMessenger : public Messenger { + protected: + msg_addr_t myaddr; + + class Logger *logger; + + int qlen; + list incoming; // incoming queue + + public: + FakeMessenger(msg_addr_t me); + ~FakeMessenger(); + + virtual int shutdown(); + + // msg interface + virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0); + + // events + //virtual void trigger_timer(Timer *t); + + int get_dispatch_queue_len() { return qlen; } + + void callback_kick(); + + // -- incoming queue -- + // (that nothing uses) + Message *get_message() { + if (!incoming.empty()) { + Message *m = incoming.front(); + incoming.pop_front(); + qlen--; + return m; + } + return NULL; + } + bool queue_incoming(Message *m) { + incoming.push_back(m); + qlen++; + return true; + } + int num_incoming() { + //return incoming.size(); + return qlen; + } + +}; + +int fakemessenger_do_loop(); +int fakemessenger_do_loop_2(); +void fakemessenger_startthread(); +void fakemessenger_stopthread(); +void fakemessenger_wait(); + +#endif diff --git a/branches/sage/cephmds2/msg/HostMonitor.cc b/branches/sage/cephmds2/msg/HostMonitor.cc new file mode 100644 index 0000000000000..33bef09565df2 --- /dev/null +++ b/branches/sage/cephmds2/msg/HostMonitor.cc @@ -0,0 +1,235 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include "HostMonitor.h" + +#include "msg/Message.h" +#include "msg/Messenger.h" + +#include "messages/MPing.h" +#include "messages/MPingAck.h" +#include "messages/MFailure.h" +#include "messages/MFailureAck.h" + +#include "common/Timer.h" +#include "common/Clock.h" + +#define DBL 10 + + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug) cout << whoami << " hostmon: " + + +// timer contexts + +class C_HM_InitiateHeartbeat : public Context { + HostMonitor *hm; +public: + C_HM_InitiateHeartbeat(HostMonitor *hm) { + this->hm = hm; + } + void finish(int r) { + //cout << "HEARTBEAT" << endl; + hm->pending_events.erase(this); + hm->initiate_heartbeat(); + } +}; + +class C_HM_CheckHeartbeat : public Context { + HostMonitor *hm; +public: + C_HM_CheckHeartbeat(HostMonitor *hm) { + this->hm = hm; + } + void finish(int r) { + //cout << "CHECK" << endl; + hm->pending_events.erase(this); + hm->check_heartbeat(); + } +}; + + + +// startup/shutdown + +void HostMonitor::init() +{ + dout(DBL) << "init" << endl; + + // hack params for now + heartbeat_interval = 10; + max_ping_time = 2; + max_heartbeat_misses = 3; + notify_retry_interval = 10; + + // schedule first hb + schedule_heartbeat(); +} + + +void HostMonitor::shutdown() +{ + // cancel any events + for (set::iterator it = pending_events.begin(); + it != pending_events.end(); + it++) { + g_timer.cancel_event(*it); + delete *it; + } + pending_events.clear(); +} + + +// schedule next heartbeat + +void HostMonitor::schedule_heartbeat() +{ + dout(DBL) << "schedule_heartbeat" << endl; + Context *e = new C_HM_InitiateHeartbeat(this); + pending_events.insert(e); + g_timer.add_event_after(heartbeat_interval, e); +} + + +// take note of a live host + +void HostMonitor::host_is_alive(msg_addr_t host) +{ + if (hosts.count(host)) + status[host].last_heard_from = g_clock.gettime(); +} + + +// do heartbeat + +void HostMonitor::initiate_heartbeat() +{ + time_t now = g_clock.gettime(); + + // send out pings 
+ inflight_pings.clear(); + for (set::iterator it = hosts.begin(); + it != hosts.end(); + it++) { + // have i heard from them recently? + if (now - status[*it].last_heard_from < heartbeat_interval) { + dout(DBL) << "skipping " << *it << ", i heard from them recently" << endl; + } else { + dout(DBL) << "pinging " << *it << endl; + status[*it].last_pinged = now; + inflight_pings.insert(*it); + + messenger->send_message(new MPing(1), *it, 0); + } + } + + // set timer to check results + Context *e = new C_HM_CheckHeartbeat(this); + pending_events.insert(e); + g_timer.add_event_after(max_ping_time, e); + dout(10) << "scheduled check " << e << endl; + + schedule_heartbeat(); // schedule next heartbeat +} + + +// check results + +void HostMonitor::check_heartbeat() +{ + dout(DBL) << "check_heartbeat()" << endl; + + // check inflight pings + for (set::iterator it = inflight_pings.begin(); + it != inflight_pings.end(); + it++) { + status[*it].num_heartbeats_missed++; + + dout(DBL) << "no response from " << *it << " for " << status[*it].num_heartbeats_missed << " beats" << endl; + + if (status[*it].num_heartbeats_missed >= max_heartbeat_misses) { + if (acked_failures.count(*it)) { + dout(DBL) << *it << " is already failed" << endl; + } else { + if (unacked_failures.count(*it)) { + dout(DBL) << *it << " is already failed, but unacked, sending another failure message" << endl; + } else { + dout(DBL) << "failing " << *it << endl; + unacked_failures.insert(*it); + } + + /*if (false) // do this in NewMessenger for now! FIXME + for (set::iterator nit = notify.begin(); + nit != notify.end(); + nit++) { + messenger->send_message(new MFailure(*it, messenger->get_inst(*it)), + *nit, notify_port, 0); + } + */ + } + } + } + + // forget about the pings. 
+ inflight_pings.clear(); +} + + +// incoming messages + +void HostMonitor::proc_message(Message *m) +{ + switch (m->get_type()) { + + case MSG_PING_ACK: + handle_ping_ack((MPingAck*)m); + break; + + case MSG_FAILURE_ACK: + handle_failure_ack((MFailureAck*)m); + break; + + } +} + +void HostMonitor::handle_ping_ack(MPingAck *m) +{ + msg_addr_t from = m->get_source(); + + dout(DBL) << "ping ack from " << from << endl; + status[from].last_pinged = g_clock.gettime(); + status[from].num_heartbeats_missed = 0; + inflight_pings.erase(from); + + delete m; +} + +void HostMonitor::handle_failure_ack(MFailureAck *m) +{ + + // FIXME: this doesn't handle failed -> alive transitions gracefully at all.. + + // the higher-up's acknowledged our failure notification, we can stop resending it. + msg_addr_t failed = m->get_failed(); + dout(DBL) << "handle_failure_ack " << failed << endl; + unacked_failures.erase(failed); + acked_failures.insert(failed); + + delete m; +} + + diff --git a/branches/sage/cephmds2/msg/HostMonitor.h b/branches/sage/cephmds2/msg/HostMonitor.h new file mode 100644 index 0000000000000..20ef24eff8daf --- /dev/null +++ b/branches/sage/cephmds2/msg/HostMonitor.h @@ -0,0 +1,97 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __HOSTMONITOR_H +#define __HOSTMONITOR_H + +#include + +#include +#include +using namespace std; + +#include "include/Context.h" +#include "msg/Message.h" + +class Message; +class Messenger; + +typedef struct { + time_t last_heard_from; + time_t last_pinged; + int num_heartbeats_missed; +} monitor_rec_t; + +class HostMonitor { + Messenger *messenger; + string whoami; + + // hosts i monitor + set hosts; + + // who i tell when they fail + set notify; + int notify_port; + + // their status + map status; + + set inflight_pings; // pings we sent that haven't replied yet + + set unacked_failures; // failed hosts that haven't been acked yet. + set acked_failures; // these failures have been acked. + + float heartbeat_interval; // how often to do a heartbeat + float max_ping_time; // how long before it's a miss + int max_heartbeat_misses; // how many misses before i tell + float notify_retry_interval; // how often to retry failure notification + + public: + set pending_events; + + private: + void schedule_heartbeat(); + + public: + HostMonitor(Messenger *m, string& whoami) { + this->messenger = m; + this->whoami = whoami; + notify_port = 0; + } + set& get_hosts() { return hosts; } + set& get_notify() { return notify; } + void set_notify_port(int p) { notify_port = p; } + + void remove_host(msg_addr_t h) { + hosts.erase(h); + status.erase(h); + unacked_failures.erase(h); + acked_failures.erase(h); + } + + void init(); + void shutdown(); + + void host_is_alive(msg_addr_t who); + + void proc_message(Message *m); + void handle_ping_ack(class MPingAck *m); + void handle_failure_ack(class MFailureAck *m); + + void initiate_heartbeat(); + void check_heartbeat(); + +}; + +#endif diff --git a/branches/sage/cephmds2/msg/MPIMessenger.cc b/branches/sage/cephmds2/msg/MPIMessenger.cc new file mode 100644 index 0000000000000..3dfcd3224a4b9 --- /dev/null +++ b/branches/sage/cephmds2/msg/MPIMessenger.cc @@ -0,0 +1,608 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "config.h" +#include "include/error.h" + +#include "common/Timer.h" +#include "common/Mutex.h" + +#include "MPIMessenger.h" +#include "Message.h" + +#include +#include +using namespace std; +#include +using namespace __gnu_cxx; + +#include +#include + +/* + * We make a directory, so that we can have multiple Messengers in the + * same process (rank). This is useful for benchmarking and creating lots of + * simulated clients, e.g. + */ + +hash_map directory; +list outgoing, incoming; +list unfinished_sends; +map unfinished_send_message; + +/* this process */ +int mpi_world; +int mpi_rank; +bool mpi_done = false; // set this flag to stop the event loop + + +#define FUNNEL_MPI // if we want to funnel mpi through a single thread +#define TAG_UNSOLICITED 0 +#define DBLVL 18 + +// the key used to fetch the tag for the current thread. +pthread_key_t tag_key; +pthread_t thread_id = 0; // thread id of the event loop. init value == nobody + +Mutex sender_lock; +Mutex out_queue_lock; + +bool pending_timer; + + +// our lock for any common data; it's okay to have only the one global mutex +// because our common data isn't a whole lot. +//static pthread_mutex_t mutex; + +// the number of distinct threads we've seen so far; used to generate +// a unique tag for each thread. +//static int nthreads = 10; + +//#define TAG_UNSOLICITED 0 + +// debug +#undef dout +#define dout(l) if (l<=g_conf.debug) cout << "[MPI " << mpi_rank << "/" << mpi_world << " " << getpid() << "." << pthread_self() << "] " + + + +/***** + * MPI global methods for process-wide startup, shutdown. 
+ */ + +int mpimessenger_init(int& argc, char**& argv) +{ + MPI_Init(&argc, &argv); + + MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + char hostname[100]; + gethostname(hostname,100); + int pid = getpid(); + + dout(12) << "init: i am " << hostname << " pid " << pid << endl; + + assert(mpi_world > g_conf.num_osd+g_conf.num_mds); + + return mpi_rank; +} + +int mpimessenger_shutdown() +{ + dout(5) << "mpimessenger_shutdown barrier waiting for all to finish" << endl; + MPI_Barrier (MPI_COMM_WORLD); + dout(1) << "mpimessenger_shutdown all done, MPI_Finalize()" << endl; + MPI_Finalize(); + return 0; +} + +int mpimessenger_world() +{ + return mpi_world; +} + + + +/*** + * internal send/recv + */ + + +/* + * get fresh MPI_Request* (on heap) for a new async MPI_Isend + */ + +MPI_Request *mpi_prep_send_req() { + MPI_Request *req = new MPI_Request; + unfinished_sends.push_back(req); + dout(DBLVL) << "prep_send_req " << req << endl; + return req; +} + + +/* + * clean up MPI_Request*'s for Isends that have completed. + * also, hose any associated Message*'s for Messages that are completely sent. + * + * if wait=true, block and wait for sends to finish. 
+ */ + +void mpi_reap_sends(bool wait=false) { + sender_lock.Lock(); + + list::iterator it = unfinished_sends.begin(); + while (it != unfinished_sends.end()) { + MPI_Status status; + int flag; + + if (wait) { + MPI_Wait(*it, &status); + } else { + MPI_Test(*it, &flag, &status); + if (!flag) break; // not finished yet + } + + dout(DBLVL) << "send " << *it << " completed" << endl; + + if (unfinished_send_message.count(*it)) { + dout(DBLVL) << "send message " << unfinished_send_message[*it] << " completed" << endl; + delete unfinished_send_message[*it]; + unfinished_send_message.erase(*it); + } + + delete *it; + it++; + unfinished_sends.pop_front(); + } + + dout(DBLVL) << "reap has " << unfinished_sends.size() << " Isends outstanding, " << unfinished_send_message.size() << " messages" << endl; + + sender_lock.Unlock(); +} + + +void mpi_finish_sends() { + mpi_reap_sends(true); +} + + +/* + * recv a Message* + */ +Message *mpi_recv(int tag) +{ + // envelope + dout(DBLVL) << "mpi_recv waiting for message tag " << tag << endl; + + MPI_Status status; + msg_envelope_t env; + + ASSERT(MPI_Recv((void*)&env, + sizeof(env), + MPI_CHAR, + MPI_ANY_SOURCE,// status.MPI_SOURCE,//MPI_ANY_SOURCE, + tag, + MPI_COMM_WORLD, + &status/*, + &recv_env_req*/) == MPI_SUCCESS); + assert(status.count == MSG_ENVELOPE_LEN); + + if (env.type == 0) { + dout(DBLVL) << "mpi_recv got type 0 message, kicked!" << endl; + return 0; + } + + dout(DBLVL) << "mpi_recv got envelope " << status.count << ", type=" << env.type << " src " << env.source << " dst " << env.dest << " nchunks=" << env.nchunks << " from " << status.MPI_SOURCE << endl; + + // payload + bufferlist blist; + for (int i=0; iget_dest(), mpi_world); + + // local? 
+ if (rank == mpi_rank) { + dout(DBLVL) << "queuing local delivery" << endl; + incoming.push_back(m); + return 0; + } + + // marshall + if (m->empty_payload()) + m->encode_payload(); + msg_envelope_t *env = &m->get_envelope(); + env->nchunks = m->get_payload().buffers().size(); + + dout(7) << "sending " << *m << " to " << MSG_ADDR_NICE(env->dest) << " (rank " << rank << ")" << endl; + +#ifndef FUNNEL_MPI + sender_lock.Lock(); +#endif + + // send envelope + ASSERT(MPI_Isend((void*)env, + sizeof(*env), + MPI_CHAR, + rank, + tag, + MPI_COMM_WORLD, + mpi_prep_send_req()) == MPI_SUCCESS); + + // payload + int i = 0; + for (list::iterator it = m->get_payload().buffers().begin(); + it != m->get_payload().buffers().end(); + it++) { + dout(DBLVL) << "mpi_sending frag " << i << " len " << (*it).length() << endl; + //MPI_Request *req = new MPI_Request; + ASSERT(MPI_Isend((void*)(*it).c_str(), + (*it).length(), + MPI_CHAR, + rank, + tag, + MPI_COMM_WORLD, + mpi_prep_send_req()) == MPI_SUCCESS); + i++; + } + + // attach message to last send, so we can free it later + MPI_Request *req = unfinished_sends.back(); + unfinished_send_message[req] = m; + + dout(DBLVL) << "mpi_send done, attached message to Isend " << req << endl; + +#ifndef FUNNEL_MPI + sender_lock.Unlock(); +#endif + return 0; +} + + + +// get the tag for this thread + +#ifndef FUNNEL_MPI +static int get_thread_tag() +{ + int tag = (int)pthread_getspecific(tag_key); + + if (tag == 0) { + // first time this thread has performed MPI messaging + + if (pthread_mutex_lock(&mutex) < 0) + SYSERROR(); + + tag = ++nthreads; + + if (pthread_mutex_unlock(&mutex) < 0) + SYSERROR(); + + if (pthread_setspecific(tag_key, (void*)tag) < 0) + SYSERROR(); + } + + return tag; +} +#endif + + + +// recv event loop, for unsolicited messages. 
+ +void* mpimessenger_loop(void*) +{ + dout(5) << "mpimessenger_loop start pid " << getpid() << endl; + + while (1) { + + // outgoing + mpi_reap_sends(); + +#ifdef FUNNEL_MPI + // check outgoing queue + out_queue_lock.Lock(); + if (outgoing.size()) { + dout(10) << outgoing.size() << " outgoing messages" << endl; + for (list::iterator it = outgoing.begin(); + it != outgoing.end(); + it++) { + mpi_send(*it, TAG_UNSOLICITED); + } + } + outgoing.clear(); + out_queue_lock.Unlock(); +#endif + + + // timer events? + if (pending_timer) { + dout(DBLVL) << "pending timer" << endl; + g_timer.execute_pending(); + } + + // done? + if (mpi_done && + incoming.empty() && + outgoing.empty() && + !pending_timer) break; + + + // incoming + Message *m = 0; + + if (incoming.size()) { + dout(12) << "loop pulling message off incoming" << endl; + m = incoming.front(); + incoming.pop_front(); + } + else { + // check mpi + dout(12) << "loop waiting for incoming messages" << endl; + + // get message + m = mpi_recv(TAG_UNSOLICITED); + } + + // dispatch? + if (m) { + int dest = m->get_dest(); + if (directory.count(dest)) { + Messenger *who = directory[ dest ]; + + dout(4) << "---- '" << m->get_type_name() << + "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << + " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " + << m + << endl; + + who->dispatch(m); + } else { + dout (1) << "---- i don't know who " << dest << " is." 
<< endl; + assert(0); + break; + } + } + + } + + dout(5) << "finishing async sends" << endl; + mpi_finish_sends(); + + g_timer.shutdown(); + + dout(5) << "mpimessenger_loop exiting loop" << endl; + return 0; +} + + +// start/stop mpi receiver thread (for unsolicited messages) +int mpimessenger_start() +{ + dout(5) << "starting thread" << endl; + + // start a thread + pthread_create(&thread_id, + NULL, + mpimessenger_loop, + 0); + return 0; +} + + +/* + * kick and wake up _loop (to pick up new outgoing message, or quit) + */ + +MPI_Request kick_req; +msg_envelope_t kick_env; + +void mpimessenger_kick_loop() +{ + // if we're same thread as the loop, no kicking necessary + if (pthread_self() == thread_id) return; + + kick_env.type = 0; + + sender_lock.Lock(); + ASSERT(MPI_Isend(&kick_env, // kick sync for now, but ONLY because it makes me feel safer. + sizeof(kick_env), + MPI_CHAR, + mpi_rank, + TAG_UNSOLICITED, + MPI_COMM_WORLD, + mpi_prep_send_req()) == MPI_SUCCESS); + sender_lock.Unlock(); +} + + +// stop thread + +void mpimessenger_stop() +{ + dout(5) << "mpimessenger_stop stopping thread" << endl; + + if (mpi_done) { + dout(1) << "mpimessenger_stop called, but already done!" << endl; + assert(!mpi_done); + } + + // set finish flag + mpi_done = true; + mpimessenger_kick_loop(); + + // wait for thread to stop + mpimessenger_wait(); +} + + +// wait for thread to finish + +void mpimessenger_wait() +{ + void *returnval; + dout(10) << "mpimessenger_wait waiting for thread to finished." << endl; + pthread_join(thread_id, &returnval); + dout(10) << "mpimessenger_wait thread finished." 
<< endl; +} + + + + +/*********** + * MPIMessenger class implementation + */ + +class C_MPIKicker : public Context { + void finish(int r) { + dout(DBLVL) << "timer kick" << endl; + mpimessenger_kick_loop(); + } +}; + +MPIMessenger::MPIMessenger(msg_addr_t myaddr) : Messenger(myaddr) +{ + // my address + this->myaddr = myaddr; + + // register myself in the messenger directory + directory[myaddr] = this; + + // register to execute timer events + g_timer.set_messenger_kicker(new C_MPIKicker()); + + // logger + /* + string name; + name = "m."; + name += MSG_ADDR_TYPE(whoami); + int w = MSG_ADDR_NUM(whoami); + if (w >= 1000) name += ('0' + ((w/1000)%10)); + if (w >= 100) name += ('0' + ((w/100)%10)); + if (w >= 10) name += ('0' + ((w/10)%10)); + name += ('0' + ((w/1)%10)); + + logger = new Logger(name, (LogType*)&mpimsg_logtype); + loggers[ whoami ] = logger; + */ +} + +MPIMessenger::~MPIMessenger() +{ + //delete logger; +} + + +int MPIMessenger::shutdown() +{ + // remove me from the directory + directory.erase(myaddr); + + // no more timer events + g_timer.unset_messenger_kicker(); + + // last one? + if (directory.empty()) { + dout(10) << "shutdown last mpimessenger on rank " << mpi_rank << " shut down" << endl; + pthread_t whoami = pthread_self(); + + dout(15) << "whoami = " << whoami << ", thread = " << thread_id << endl; + if (whoami == thread_id) { + // i am the event loop thread, just set flag! + dout(15) << " set mpi_done=true" << endl; + mpi_done = true; + } else { + // i am a different thread, tell the event loop to stop. 
+ dout(15) << " calling mpimessenger_stop()" << endl; + mpimessenger_stop(); + } + } else { + dout(10) << "shutdown still " << directory.size() << " other messengers on rank " << mpi_rank << endl; + } + return 0; +} + + + + +/*** + * public messaging interface + */ + + +/* note: send_message _MUST_ be non-blocking */ +int MPIMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport) +{ + // set envelope + m->set_source(myaddr, fromport); + m->set_dest(dest, port); + +#ifdef FUNNEL_MPI + + // queue up + out_queue_lock.Lock(); + dout(DBLVL) << "queuing outgoing message " << *m << endl; + outgoing.push_back(m); + out_queue_lock.Unlock(); + + mpimessenger_kick_loop(); + +#else + + // send in this thread + mpi_send(m, m->get_pcid()); + +#endif + return 0; +} + + + + + + diff --git a/branches/sage/cephmds2/msg/MPIMessenger.h b/branches/sage/cephmds2/msg/MPIMessenger.h new file mode 100644 index 0000000000000..d050f5bf49470 --- /dev/null +++ b/branches/sage/cephmds2/msg/MPIMessenger.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __MPIMESSENGER_H +#define __MPIMESSENGER_H + +#include "Messenger.h" +#include "Dispatcher.h" + +#define NUMMDS g_conf.num_mds +#define NUMOSD g_conf.num_osd +#define MPI_DEST_TO_RANK(dest,world) ((dest)<(NUMMDS+NUMOSD) ? 
\ + (dest) : \ + ((NUMMDS+NUMOSD)+(((dest)-NUMMDS-NUMOSD) % ((world)-NUMMDS-NUMOSD)))) + +class Timer; + +class MPIMessenger : public Messenger { + protected: + msg_addr_t myaddr; // my address + //class Logger *logger; // for logging + + public: + MPIMessenger(msg_addr_t myaddr); + ~MPIMessenger(); + + // init, shutdown MPI and associated event loop thread. + virtual int shutdown(); + + // message interface + virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0); +}; + +/** + * these are all ONE per process. + */ +extern int mpimessenger_world(); // get world size +extern int mpimessenger_init(int& argc, char**& argv); // init mpi +extern int mpimessenger_start(); // start thread +extern void mpimessenger_stop(); // stop thread. +extern void mpimessenger_wait(); // wait for thread to finish. +extern int mpimessenger_shutdown(); // finalize MPI + + +#endif diff --git a/branches/sage/cephmds2/msg/MTMessenger.cc b/branches/sage/cephmds2/msg/MTMessenger.cc new file mode 100644 index 0000000000000..301915a336ea5 --- /dev/null +++ b/branches/sage/cephmds2/msg/MTMessenger.cc @@ -0,0 +1,197 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include +#include "mpi.h" + +#include "include/config.h" +#include "include/error.h" +#include "Messenger.h" +#include "MTMessenger.h" + +// This module uses MPI to implement a blocking sendrecv function that +// feels more like a procedure call and less like event processesing. +// +// Threads are not independently addressable in MPI, only processes +// are. 
However, MPI does include a user defined tag in the message
+// envelope, and a reader may selectively read only messages with a
+// matching tag.  The modules assign an integer to each thread to use
+// as the tag.
+//
+
+// our lock for any common data; it's okay to have only the one global mutex
+// because our common data isn't a whole lot.
+static pthread_mutex_t mutex;
+
+// the key used to fetch the tag for the current thread.
+pthread_key_t tag_key;
+
+// the number of distinct threads we've seen so far; used to generate
+// a unique tag for each thread.
+static int nthreads;
+
+// the MPI identity of this process
+static int mpi_rank;
+
+
+// get the tag for this thread, assigning the next unused one (starting
+// at 1) the first time a thread asks.  0 means "no tag assigned yet".
+static int get_tag()
+{
+  // the tag is stored directly in the thread-specific pointer slot; go
+  // through long so the pointer <-> int conversion does not truncate
+  // (or fail to compile) where pointers are wider than int.
+  int tag = (int)(long)pthread_getspecific(tag_key);
+
+  if (tag == 0) {
+    // first time this thread has performed MPI messaging
+
+    // pthread calls report failure by returning a non-zero error
+    // number; they never return a negative value.
+    if (pthread_mutex_lock(&mutex) != 0)
+      SYSERROR();
+
+    tag = ++nthreads;
+
+    if (pthread_mutex_unlock(&mutex) != 0)
+      SYSERROR();
+
+    if (pthread_setspecific(tag_key, (void*)(long)tag) != 0)
+      SYSERROR();
+  }
+
+  return tag;
+}
+
+
+// marshall a message and send it over MPI to (rank, tag)
+static void send(Message *m, int rank, int tag)
+{
+  // marshall the message
+  crope r;
+  m->encode(r);
+  int size = r.length();
+
+  char *buf = (char*)r.c_str();
+  ASSERT(MPI_Send(buf,
+                  size,
+                  MPI_CHAR,
+                  rank,
+                  tag,
+                  MPI_COMM_WORLD) == MPI_SUCCESS);
+}
+
+// read a message with the given tag from MPI and unmarshall it
+static Message *receive(int tag)
+{
+  MPI_Status status;
+  int count = 0;
+
+  // wait for a matching message and learn its size.  The element count
+  // is not a portable member of MPI_Status; it has to be queried with
+  // MPI_Get_count().
+  ASSERT(MPI_Probe(MPI_ANY_SOURCE,
+                   tag,
+                   MPI_COMM_WORLD,
+                   &status) == MPI_SUCCESS);
+  ASSERT(MPI_Get_count(&status, MPI_CHAR, &count) == MPI_SUCCESS);
+
+  // get message; there may be multiple messages on the queue, we
+  // need to be sure to read the one which corresponds to size
+  // obtained above, so receive from the exact source/tag just probed.
+  char *buf = new char[count];
+  ASSERT(MPI_Recv(buf,
+                  count,
+                  MPI_CHAR,
+                  status.MPI_SOURCE,
+                  status.MPI_TAG,
+                  MPI_COMM_WORLD,
+                  &status) == MPI_SUCCESS);
+
+  // unmarshall message
+  crope r(buf, count);
+  delete[] buf;
+  Message *m = decode_message(r);
+
+  return m;
+}
+
+MTMessenger::MTMessenger(int& argc, char**& argv)
+{
+  // setup MPI; MPI errors will probably invoke the default MPI error
+  // handler, which aborts the program with a friendly message rather
+  // than returning from a function; just in case, we abort the
+  // program if we get an MPI error.
+
+  int provided;
+  ASSERT(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided)
+         == MPI_SUCCESS);
+
+  ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank) == MPI_SUCCESS);
+
+  // pthread calls return a non-zero error number on failure
+  if (pthread_mutex_init(&mutex, NULL) != 0)
+    SYSERROR();
+
+  if (pthread_key_create(&tag_key, NULL) != 0)
+    SYSERROR();
+
+  nthreads = 0;
+}
+
+MTMessenger::~MTMessenger()
+{
+  // ignore shutdown errors
+
+  pthread_key_delete(tag_key);
+
+  pthread_mutex_destroy(&mutex);
+
+  MPI_Finalize();
+}
+
+// send a request and wait for the response
+Message *MTMessenger::sendrecv(Message *m, msg_addr_t dest)
+{
+  int dest_tag = 0; // servers listen for any tag
+  int my_tag = get_tag();
+
+  // set our envelope (not to be confused with the MPI envelope)
+  m->set_source(mpi_rank, my_tag);
+  m->set_dest(dest, dest_tag);
+
+  send(m, dest, dest_tag);
+
+  return receive(my_tag);
+}
+
+// receive a request from anyone
+Message *MTMessenger::recvreq()
+{
+  return receive(MPI_ANY_TAG);
+}
+
+// forward request, masquerading as original source
+void MTMessenger::fwdreq(Message *req, int dest)
+{
+  int dest_tag = 0; // servers listen for any tag
+
+  // set our envelope (not to be confused with the MPI envelope)
+  req->set_dest(dest, dest_tag);
+
+  send(req, dest, dest_tag);
+}
+
+// send a response to the originator of the request
+void MTMessenger::sendresp(Message *req, Message *resp)
+{
+  int req_rank = req->get_source();
+  int req_tag = req->get_source_port();
+  int my_tag = get_tag();
+
+  // set our envelope (not to be confused with the MPI envelope)
+  resp->set_source(mpi_rank, my_tag);
+  resp->set_dest(req_rank, req_tag);
+
+  send(resp, req_rank, req_tag);
+}
diff --git a/branches/sage/cephmds2/msg/MTMessenger.h b/branches/sage/cephmds2/msg/MTMessenger.h
new file mode 100644
index 0000000000000..6489de407ba2f
--- /dev/null
+++ b/branches/sage/cephmds2/msg/MTMessenger.h
@@ -0,0 +1,50 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef __MTMESSENGER_H
+#define __MTMESSENGER_H
+
+#include "Message.h"
+#include "SerialMessenger.h"
+
+// Marshall and unmarshall OBFS messages, send and receive them over
+// MPI.
+
+class MTMessenger
+{
+public:
+  // sets up the queues and internal thread; the MPI initialization
+  // will scan argc/argv for MPI specific flags and remove them from
+  // argc/argv.
+  MTMessenger(int &argc, char **&argv);
+
+  // tears it all down; virtual because the class has virtual methods
+  virtual ~MTMessenger();
+
+  // send a request to a server and wait (block) for the response;
+  virtual Message *sendrecv(Message *m, msg_addr_t dest);
+
+  // wait (block) for a request from anyone
+  Message *recvreq();
+
+  // forward request, masquerading as original source
+  void fwdreq(Message *req, int dest);
+
+  // send the response to the originator of the request
+  virtual void sendresp(Message *req, Message *resp);
+
+
+}; // class MTMessenger
+
+#endif // __MTMESSENGER_H
diff --git a/branches/sage/cephmds2/msg/Message.cc b/branches/sage/cephmds2/msg/Message.cc
new file mode 100644
index 0000000000000..b37c4d2cb421d
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Message.cc
@@ -0,0 +1,442 @@
+
+#include
+#include
+using namespace std;
+
+#include "include/types.h"
+
+#include "Message.h"
+
+#include "messages/MGenericMessage.h"
+
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+#include "messages/MNSFailure.h"
+
+#include "messages/MMonElectionAck.h"
+#include "messages/MMonElectionCollect.h"
+#include "messages/MMonElectionRefresh.h"
+#include "messages/MMonElectionStatus.h"
+
+#include "messages/MPing.h"
+#include "messages/MPingAck.h"
+#include "messages/MFailure.h"
+#include "messages/MFailureAck.h"
+
+#include "messages/MOSDBoot.h"
+#include "messages/MOSDIn.h"
+#include "messages/MOSDOut.h"
+#include "messages/MOSDFailure.h"
+#include "messages/MOSDPing.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDMap.h"
+#include "messages/MOSDGetMap.h"
+#include "messages/MOSDPGNotify.h"
+#include "messages/MOSDPGQuery.h"
+#include "messages/MOSDPGLog.h"
+#include "messages/MOSDPGRemove.h"
+
+#include "messages/MClientMount.h"
+#include "messages/MClientMountAck.h"
+#include
"messages/MClientRequest.h" +#include "messages/MClientReply.h" +#include "messages/MClientFileCaps.h" + +#include "messages/MMDSGetMap.h" +#include "messages/MMDSMap.h" +#include "messages/MMDSBoot.h" + +#include "messages/MDirUpdate.h" +#include "messages/MDiscover.h" +#include "messages/MDiscoverReply.h" + +#include "messages/MExportDirDiscover.h" +#include "messages/MExportDirDiscoverAck.h" +#include "messages/MExportDirPrep.h" +#include "messages/MExportDirPrepAck.h" +#include "messages/MExportDirWarning.h" +#include "messages/MExportDir.h" +#include "messages/MExportDirNotify.h" +#include "messages/MExportDirNotifyAck.h" +#include "messages/MExportDirFinish.h" + +#include "messages/MHashReaddir.h" +#include "messages/MHashReaddirReply.h" + +#include "messages/MHashDirDiscover.h" +#include "messages/MHashDirDiscoverAck.h" +#include "messages/MHashDirPrep.h" +#include "messages/MHashDirPrepAck.h" +#include "messages/MHashDir.h" +#include "messages/MHashDirAck.h" +#include "messages/MHashDirNotify.h" + +#include "messages/MUnhashDirPrep.h" +#include "messages/MUnhashDirPrepAck.h" +#include "messages/MUnhashDir.h" +#include "messages/MUnhashDirAck.h" +#include "messages/MUnhashDirNotify.h" +#include "messages/MUnhashDirNotifyAck.h" + +#include "messages/MRenameWarning.h" +#include "messages/MRenameNotify.h" +#include "messages/MRenameNotifyAck.h" +#include "messages/MRename.h" +#include "messages/MRenamePrep.h" +#include "messages/MRenameReq.h" +#include "messages/MRenameAck.h" +#include "messages/MDentryUnlink.h" + +#include "messages/MHeartbeat.h" + +#include "messages/MAnchorRequest.h" +#include "messages/MAnchorReply.h" +#include "messages/MInodeLink.h" +#include "messages/MInodeLinkAck.h" + +//#include "messages/MInodeUpdate.h" +#include "messages/MInodeExpire.h" +#include "messages/MDirExpire.h" +#include "messages/MCacheExpire.h" +#include "messages/MInodeFileCaps.h" + +#include "messages/MLock.h" + +#include "config.h" +#undef dout +#define dout(l) if 
(l<=g_conf.debug) cout << "messenger: "
+#define DEBUGLVL  10    // debug level of output
+
+
+
+
+
+
+
+// Build the concrete Message subclass selected by env.type, attach the
+// envelope, hand it the payload and run its decode_payload().
+//
+// env      envelope that was received; copied into the new message
+// payload  encoded body; passed to Message::set_payload()
+//
+// Returns the new message (caller owns it).  An unknown type trips
+// assert(0); if asserts are compiled out, 0 is returned instead of
+// dereferencing a null message.
+Message *
+decode_message(msg_envelope_t& env, bufferlist& payload)
+{
+  // make message
+  Message *m = 0;
+  switch(env.type) {
+
+    // -- with payload --
+
+  case MSG_NS_CONNECT:               m = new MNSConnect(); break;
+  case MSG_NS_CONNECTACK:            m = new MNSConnectAck(); break;
+  case MSG_NS_REGISTER:              m = new MNSRegister(); break;
+  case MSG_NS_REGISTERACK:           m = new MNSRegisterAck(); break;
+  case MSG_NS_LOOKUP:                m = new MNSLookup(); break;
+  case MSG_NS_LOOKUPREPLY:           m = new MNSLookupReply(); break;
+  case MSG_NS_FAILURE:               m = new MNSFailure(); break;
+
+  case MSG_MON_ELECTION_ACK:         m = new MMonElectionAck(); break;
+  case MSG_MON_ELECTION_COLLECT:     m = new MMonElectionCollect(); break;
+  case MSG_MON_ELECTION_REFRESH:     m = new MMonElectionRefresh(); break;
+  case MSG_MON_ELECTION_STATUS:      m = new MMonElectionStatus(); break;
+
+  case MSG_PING:                     m = new MPing(); break;
+  case MSG_PING_ACK:                 m = new MPingAck(); break;
+  case MSG_FAILURE:                  m = new MFailure(); break;
+  case MSG_FAILURE_ACK:              m = new MFailureAck(); break;
+
+  case MSG_OSD_BOOT:                 m = new MOSDBoot(); break;
+  case MSG_OSD_IN:                   m = new MOSDIn(); break;
+  case MSG_OSD_OUT:                  m = new MOSDOut(); break;
+  case MSG_OSD_FAILURE:              m = new MOSDFailure(); break;
+  case MSG_OSD_PING:                 m = new MOSDPing(); break;
+  case MSG_OSD_OP:                   m = new MOSDOp(); break;
+  case MSG_OSD_OPREPLY:              m = new MOSDOpReply(); break;
+
+  case MSG_OSD_MAP:                  m = new MOSDMap(); break;
+  case MSG_OSD_GETMAP:               m = new MOSDGetMap(); break;
+
+  case MSG_OSD_PG_NOTIFY:            m = new MOSDPGNotify(); break;
+  case MSG_OSD_PG_QUERY:             m = new MOSDPGQuery(); break;
+  case MSG_OSD_PG_LOG:               m = new MOSDPGLog(); break;
+  case MSG_OSD_PG_REMOVE:            m = new MOSDPGRemove(); break;
+
+    // clients
+  case MSG_CLIENT_MOUNT:             m = new MClientMount(); break;
+  case MSG_CLIENT_MOUNTACK:          m = new MClientMountAck(); break;
+  case MSG_CLIENT_REQUEST:           m = new MClientRequest(); break;
+  case MSG_CLIENT_REPLY:             m = new MClientReply(); break;
+  case MSG_CLIENT_FILECAPS:          m = new MClientFileCaps(); break;
+
+    // mds
+  case MSG_MDS_GETMAP:               m = new MMDSGetMap(); break;
+  case MSG_MDS_MAP:                  m = new MMDSMap(); break;
+  case MSG_MDS_BOOT:                 m = new MMDSBoot(); break;
+
+  case MSG_MDS_DIRUPDATE:            m = new MDirUpdate(); break;
+
+  case MSG_MDS_DISCOVER:             m = new MDiscover(); break;
+  case MSG_MDS_DISCOVERREPLY:        m = new MDiscoverReply(); break;
+
+  case MSG_MDS_EXPORTDIRDISCOVER:    m = new MExportDirDiscover(); break;
+  case MSG_MDS_EXPORTDIRDISCOVERACK: m = new MExportDirDiscoverAck(); break;
+  case MSG_MDS_EXPORTDIR:            m = new MExportDir(); break;
+  case MSG_MDS_EXPORTDIRFINISH:      m = new MExportDirFinish(); break;
+  case MSG_MDS_EXPORTDIRNOTIFY:      m = new MExportDirNotify(); break;
+  case MSG_MDS_EXPORTDIRNOTIFYACK:   m = new MExportDirNotifyAck(); break;
+  case MSG_MDS_EXPORTDIRPREP:        m = new MExportDirPrep(); break;
+  case MSG_MDS_EXPORTDIRPREPACK:     m = new MExportDirPrepAck(); break;
+  case MSG_MDS_EXPORTDIRWARNING:     m = new MExportDirWarning(); break;
+
+  case MSG_MDS_HASHREADDIR:          m = new MHashReaddir(); break;
+  case MSG_MDS_HASHREADDIRREPLY:     m = new MHashReaddirReply(); break;
+
+  case MSG_MDS_HASHDIRDISCOVER:      m = new MHashDirDiscover(); break;
+  case MSG_MDS_HASHDIRDISCOVERACK:   m = new MHashDirDiscoverAck(); break;
+  case MSG_MDS_HASHDIRPREP:          m = new MHashDirPrep(); break;
+  case MSG_MDS_HASHDIRPREPACK:       m = new MHashDirPrepAck(); break;
+  case MSG_MDS_HASHDIR:              m = new MHashDir(); break;
+  case MSG_MDS_HASHDIRACK:           m = new MHashDirAck(); break;
+  case MSG_MDS_HASHDIRNOTIFY:        m = new MHashDirNotify(); break;
+
+  case MSG_MDS_UNHASHDIRPREP:        m = new MUnhashDirPrep(); break;
+  case MSG_MDS_UNHASHDIRPREPACK:     m = new MUnhashDirPrepAck(); break;
+  case MSG_MDS_UNHASHDIR:            m = new MUnhashDir(); break;
+  case MSG_MDS_UNHASHDIRACK:         m = new MUnhashDirAck(); break;
+  case MSG_MDS_UNHASHDIRNOTIFY:      m = new MUnhashDirNotify(); break;
+  case MSG_MDS_UNHASHDIRNOTIFYACK:   m = new MUnhashDirNotifyAck(); break;
+
+  case MSG_MDS_RENAMEWARNING:        m = new MRenameWarning(); break;
+  case MSG_MDS_RENAMENOTIFY:         m = new MRenameNotify(); break;
+  case MSG_MDS_RENAMENOTIFYACK:      m = new MRenameNotifyAck(); break;
+  case MSG_MDS_RENAME:               m = new MRename(); break;
+  case MSG_MDS_RENAMEPREP:           m = new MRenamePrep(); break;
+  case MSG_MDS_RENAMEREQ:            m = new MRenameReq(); break;
+  case MSG_MDS_RENAMEACK:            m = new MRenameAck(); break;
+
+  case MSG_MDS_DENTRYUNLINK:         m = new MDentryUnlink(); break;
+
+  case MSG_MDS_HEARTBEAT:            m = new MHeartbeat(); break;
+
+  case MSG_MDS_CACHEEXPIRE:          m = new MCacheExpire(); break;
+
+  case MSG_MDS_ANCHORREQUEST:        m = new MAnchorRequest(); break;
+  case MSG_MDS_ANCHORREPLY:          m = new MAnchorReply(); break;
+
+  case MSG_MDS_INODELINK:            m = new MInodeLink(); break;
+  case MSG_MDS_INODELINKACK:         m = new MInodeLinkAck(); break;
+
+    /*  case MSG_MDS_INODEUPDATE:
+    m = new MInodeUpdate();
+    break;
+    */
+
+  case MSG_MDS_INODEEXPIRE:          m = new MInodeExpire(); break;
+
+  case MSG_MDS_INODEFILECAPS:        m = new MInodeFileCaps(); break;
+
+  case MSG_MDS_DIREXPIRE:            m = new MDirExpire(); break;
+
+  case MSG_MDS_LOCK:                 m = new MLock(); break;
+
+
+    // -- simple messages without payload --
+
+  case MSG_CLOSE:
+  case MSG_NS_STARTED:
+  case MSG_NS_UNREGISTER:
+  case MSG_SHUTDOWN:
+  case MSG_MDS_SHUTDOWNSTART:
+  case MSG_MDS_SHUTDOWNFINISH:
+  case MSG_CLIENT_UNMOUNT:
+  case MSG_OSD_MKFS_ACK:
+    m = new MGenericMessage(env.type);
+    break;
+
+  default:
+    dout(1) << "can't decode unknown message type " << env.type << endl;
+    assert(0);
+    // with asserts compiled out we must not fall through and
+    // dereference the null message below
+    return 0;
+  }
+
+  // env
+  m->set_envelope(env);
+
+  // decode
+  m->set_payload(payload);
+  m->decode_payload();
+
+  // done!
+  return m;
+}
+
+
diff --git a/branches/sage/cephmds2/msg/Message.h b/branches/sage/cephmds2/msg/Message.h
new file mode 100644
index 0000000000000..afe1ae6941844
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Message.h
@@ -0,0 +1,463 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+
+#ifndef __MESSAGE_H
+#define __MESSAGE_H
+
+#define MSG_CLOSE 0
+
+#define MSG_NS_CONNECT 1
+#define MSG_NS_CONNECTACK 2
+#define MSG_NS_REGISTER 3
+#define MSG_NS_REGISTERACK 4
+#define MSG_NS_STARTED 5
+#define MSG_NS_UNREGISTER 6
+#define MSG_NS_LOOKUP 7
+#define MSG_NS_LOOKUPREPLY 8
+#define MSG_NS_FAILURE 9
+
+
+#define MSG_PING 10
+#define MSG_PING_ACK 11
+
+#define MSG_FAILURE 12
+#define MSG_FAILURE_ACK 13
+
+#define MSG_SHUTDOWN 99999
+
+
+#define MSG_MON_ELECTION_ACK 15
+#define MSG_MON_ELECTION_COLLECT 16
+#define MSG_MON_ELECTION_REFRESH 17
+#define MSG_MON_ELECTION_STATUS 18
+
+#define MSG_MON_OSDMAP_INFO 20
+#define MSG_MON_OSDMAP_LEASE 21
+#define MSG_MON_OSDMAP_LEASE_ACK 22
+#define MSG_MON_OSDMAP_UPDATE_PREPARE 23
+#define MSG_MON_OSDMAP_UPDATE_ACK 24
+#define MSG_MON_OSDMAP_UPDATE_COMMIT 25
+
+#define MSG_OSD_OP 40 // delete, etc.
+#define MSG_OSD_OPREPLY 41 // delete, etc.
+#define MSG_OSD_PING 42 + +#define MSG_OSD_GETMAP 43 +#define MSG_OSD_MAP 44 + +#define MSG_OSD_BOOT 45 +#define MSG_OSD_MKFS_ACK 46 + +#define MSG_OSD_FAILURE 47 + +#define MSG_OSD_IN 48 +#define MSG_OSD_OUT 49 + + + +#define MSG_OSD_PG_NOTIFY 50 +#define MSG_OSD_PG_QUERY 51 +#define MSG_OSD_PG_SUMMARY 52 +#define MSG_OSD_PG_LOG 53 +#define MSG_OSD_PG_REMOVE 54 + +#define MSG_CLIENT_REQUEST 60 +#define MSG_CLIENT_REPLY 61 +//#define MSG_CLIENT_DONE 62 +#define MSG_CLIENT_FILECAPS 63 +#define MSG_CLIENT_INODEAUTHUPDATE 64 + +#define MSG_CLIENT_MOUNT 70 +#define MSG_CLIENT_MOUNTACK 71 +#define MSG_CLIENT_UNMOUNT 72 + + +// *** MDS *** + +#define MSG_MDS_BOOT 100 +#define MSG_MDS_GETMAP 101 +#define MSG_MDS_MAP 102 +#define MSG_MDS_HEARTBEAT 103 + +#define MSG_MDS_DISCOVER 110 +#define MSG_MDS_DISCOVERREPLY 111 + +#define MSG_MDS_INODEGETREPLICA 112 +#define MSG_MDS_INODEGETREPLICAACK 113 + +#define MSG_MDS_INODEFILECAPS 115 + +#define MSG_MDS_INODEUPDATE 120 +#define MSG_MDS_DIRUPDATE 121 +#define MSG_MDS_INODEEXPIRE 122 +#define MSG_MDS_DIREXPIRE 123 + +#define MSG_MDS_DIREXPIREREQ 124 + +#define MSG_MDS_CACHEEXPIRE 125 + +#define MSG_MDS_ANCHORREQUEST 130 +#define MSG_MDS_ANCHORREPLY 131 + +#define MSG_MDS_INODELINK 140 +#define MSG_MDS_INODELINKACK 141 +#define MSG_MDS_INODEUNLINK 142 +#define MSG_MDS_INODEUNLINKACK 143 + +#define MSG_MDS_EXPORTDIRDISCOVER 150 +#define MSG_MDS_EXPORTDIRDISCOVERACK 151 +#define MSG_MDS_EXPORTDIRPREP 152 +#define MSG_MDS_EXPORTDIRPREPACK 153 +#define MSG_MDS_EXPORTDIRWARNING 154 +#define MSG_MDS_EXPORTDIR 155 +#define MSG_MDS_EXPORTDIRNOTIFY 156 +#define MSG_MDS_EXPORTDIRNOTIFYACK 157 +#define MSG_MDS_EXPORTDIRFINISH 158 + + +#define MSG_MDS_HASHDIRDISCOVER 160 +#define MSG_MDS_HASHDIRDISCOVERACK 161 +#define MSG_MDS_HASHDIRPREP 162 +#define MSG_MDS_HASHDIRPREPACK 163 +#define MSG_MDS_HASHDIR 164 +#define MSG_MDS_HASHDIRACK 165 +#define MSG_MDS_HASHDIRNOTIFY 166 + +#define MSG_MDS_HASHREADDIR 168 +#define MSG_MDS_HASHREADDIRREPLY 
169 + +#define MSG_MDS_UNHASHDIRPREP 170 +#define MSG_MDS_UNHASHDIRPREPACK 171 +#define MSG_MDS_UNHASHDIR 172 +#define MSG_MDS_UNHASHDIRACK 173 +#define MSG_MDS_UNHASHDIRNOTIFY 174 +#define MSG_MDS_UNHASHDIRNOTIFYACK 175 + +#define MSG_MDS_DENTRYUNLINK 200 + +#define MSG_MDS_RENAMEWARNING 300 // sent from src to bystanders +#define MSG_MDS_RENAMENOTIFY 301 // sent from dest to bystanders +#define MSG_MDS_RENAMENOTIFYACK 302 // sent back to src +#define MSG_MDS_RENAMEACK 303 // sent from src to initiator, to xlock_finish + +#define MSG_MDS_RENAMEPREP 304 // sent from initiator to dest auth (if dir) +#define MSG_MDS_RENAMEREQ 305 // sent from initiator (or dest if dir) to src auth +#define MSG_MDS_RENAME 306 // sent from src to dest, includes inode + +#define MSG_MDS_LOCK 500 + +#define MSG_MDS_SHUTDOWNSTART 900 +#define MSG_MDS_SHUTDOWNFINISH 901 + + +#include +#include + +#include +#include +using std::list; + +#include +#include + +using __gnu_cxx::crope; + +#include "include/buffer.h" + +#include "tcp.h" + + + + +// use fixed offsets and static entity -> logical addr mapping! +#define MSG_ADDR_NAMER_BASE 0 +#define MSG_ADDR_RANK_BASE 0x10000000 // per-rank messenger services +#define MSG_ADDR_MDS_BASE 0x20000000 +#define MSG_ADDR_OSD_BASE 0x30000000 +#define MSG_ADDR_MON_BASE 0x40000000 +#define MSG_ADDR_CLIENT_BASE 0x50000000 + +#define MSG_ADDR_TYPE_MASK 0xf0000000 +#define MSG_ADDR_NUM_MASK 0x0fffffff + +#define MSG_ADDR_NEW 0x0fffffff +#define MSG_ADDR_UNDEF_BASE 0xffffffff + + +/* old int way, which lacked type safety... 
+typedef int msg_addr_t; + +#define MSG_ADDR_RANK(x) (MSG_ADDR_RANK_BASE + (x)) +#define MSG_ADDR_MDS(x) (MSG_ADDR_MDS_BASE + (x)) +#define MSG_ADDR_OSD(x) (MSG_ADDR_OSD_BASE + (x)) +#define MSG_ADDR_CLIENT(x) (MSG_ADDR_CLIENT_BASE + (x)) + +#define MSG_ADDR_DIRECTORY 0 +#define MSG_ADDR_RANK_NEW MSG_ADDR_RANK(MSG_ADDR_NEW) +#define MSG_ADDR_MDS_NEW MSG_ADDR_MDS(MSG_ADDR_NEW) +#define MSG_ADDR_OSD_NEW MSG_ADDR_OSD(MSG_ADDR_NEW) +#define MSG_ADDR_CLIENT_NEW MSG_ADDR_CLIENT(MSG_ADDR_NEW) + +#define MSG_ADDR_ISCLIENT(x) ((x) >= MSG_ADDR_CLIENT_BASE) +#define MSG_ADDR_TYPE(x) (((x) & MSG_ADDR_TYPE_MASK) == MSG_ADDR_RANK_BASE ? "rank": \ + (((x) & MSG_ADDR_TYPE_MASK) == MSG_ADDR_CLIENT_BASE ? "client": \ + (((x) & MSG_ADDR_TYPE_MASK) == MSG_ADDR_OSD_BASE ? "osd": \ + (((x) & MSG_ADDR_TYPE_MASK) == MSG_ADDR_MDS_BASE ? "mds": \ + ((x) == MSG_ADDR_DIRECTORY ? "namer":"unknown"))))) +#define MSG_ADDR_NUM(x) ((x) & MSG_ADDR_NUM_MASK) +#define MSG_ADDR_NICE(x) MSG_ADDR_TYPE(x) << MSG_ADDR_NUM(x) +*/ + +// new typed msg_addr_t way! 
+// Logical address of a messaging entity: one of the MSG_ADDR_*_BASE
+// type values OR'ed with an entity number, packed into a single int.
+class msg_addr_t {
+public:
+  int _addr;
+
+  msg_addr_t() : _addr(MSG_ADDR_UNDEF_BASE) {}
+  msg_addr_t(int t, int n) : _addr(t | n) {}
+
+  // entity number (low bits) and entity type (high bits)
+  int num() const { return _addr & MSG_ADDR_NUM_MASK; }
+  int type() const { return _addr & MSG_ADDR_TYPE_MASK; }
+
+  // printable name of the entity type, "unknown" if unrecognized
+  const char *type_str() const {
+    switch (type()) {
+    case MSG_ADDR_RANK_BASE: return "rank";
+    case MSG_ADDR_MDS_BASE: return "mds";
+    case MSG_ADDR_OSD_BASE: return "osd";
+    case MSG_ADDR_MON_BASE: return "mon";
+    case MSG_ADDR_CLIENT_BASE: return "client";
+    case MSG_ADDR_NAMER_BASE: return "namer";
+    }
+    return "unknown";
+  }
+
+  // true for the placeholder number used before a real one is assigned
+  bool is_new() const { return num() == MSG_ADDR_NEW; }
+
+  bool is_client() const { return type() == MSG_ADDR_CLIENT_BASE; }
+  bool is_mds() const { return type() == MSG_ADDR_MDS_BASE; }
+  bool is_osd() const { return type() == MSG_ADDR_OSD_BASE; }
+  bool is_mon() const { return type() == MSG_ADDR_MON_BASE; }
+  bool is_namer() const { return type() == MSG_ADDR_NAMER_BASE; }
+};
+
+inline bool operator== (const msg_addr_t& l, const msg_addr_t& r) { return l._addr == r._addr; }
+inline bool operator!= (const msg_addr_t& l, const msg_addr_t& r) { return l._addr != r._addr; }
+inline bool operator< (const msg_addr_t& l, const msg_addr_t& r) { return l._addr < r._addr; }
+
+//typedef struct msg_addr msg_addr_t;
+
+// prints the type name immediately followed by the number, e.g. "osd3"
+inline std::ostream& operator<<(std::ostream& out, const msg_addr_t& addr) {
+  //if (addr.is_namer()) return out << "namer";
+  return out << addr.type_str() << addr.num();
+}
+
+
+// let msg_addr_t be used as a hash_map/hash_set key: hash the packed int
+namespace __gnu_cxx {
+  template<> struct hash< msg_addr_t >
+  {
+    size_t operator()( const msg_addr_t m ) const
+    {
+      static hash<int> H;
+      return H(m._addr);
+    }
+  };
+}
+
+#define MSG_ADDR_RANK(x)    msg_addr_t(MSG_ADDR_RANK_BASE,x)
+#define MSG_ADDR_MDS(x)     msg_addr_t(MSG_ADDR_MDS_BASE,x)
+#define MSG_ADDR_OSD(x)     msg_addr_t(MSG_ADDR_OSD_BASE,x)
+#define MSG_ADDR_MON(x)     msg_addr_t(MSG_ADDR_MON_BASE,x)
+#define MSG_ADDR_CLIENT(x)  msg_addr_t(MSG_ADDR_CLIENT_BASE,x)
+#define MSG_ADDR_NAMER(x)   msg_addr_t(MSG_ADDR_NAMER_BASE,x)
+
+#define MSG_ADDR_UNDEF      msg_addr_t()
+#define MSG_ADDR_DIRECTORY  MSG_ADDR_NAMER(0)
+
+#define MSG_ADDR_RANK_NEW    MSG_ADDR_RANK(MSG_ADDR_NEW)
+#define MSG_ADDR_MDS_NEW     MSG_ADDR_MDS(MSG_ADDR_NEW)
+#define MSG_ADDR_OSD_NEW     MSG_ADDR_OSD(MSG_ADDR_NEW)
+#define MSG_ADDR_CLIENT_NEW  MSG_ADDR_CLIENT(MSG_ADDR_NEW)
+#define MSG_ADDR_NAMER_NEW   MSG_ADDR_NAMER(MSG_ADDR_NEW)
+
+// the argument is parenthesized so that any expression may be passed
+#define MSG_ADDR_ISCLIENT(x) (x).is_client()
+#define MSG_ADDR_TYPE(x)     (x).type_str()
+#define MSG_ADDR_NUM(x)      (x).num()
+#define MSG_ADDR_NICE(x)     (x).type_str() << (x).num()
+
+
+
+
+// A concrete messenger instance: a tcp address plus a rank number.
+class entity_inst_t {
+ public:
+  tcpaddr_t addr;
+  int rank;
+
+  // "no instance": rank -1 and an all-zero address
+  entity_inst_t() : rank(-1) {
+    memset(&addr, 0, sizeof(addr));
+  }
+  // keep the address we were given; it must not be zeroed again here
+  entity_inst_t(tcpaddr_t& a, int r) : addr(a), rank(r) { }
+};
+
+// equality looks at rank and address; ordering looks at rank only
+inline bool operator==(const entity_inst_t& a, const entity_inst_t& b) { return a.rank == b.rank && a.addr == b.addr; }
+inline bool operator!=(const entity_inst_t& a, const entity_inst_t& b) { return !(a == b); }
+inline bool operator>(const entity_inst_t& a, const entity_inst_t& b) { return a.rank > b.rank; }
+inline bool operator>=(const entity_inst_t& a, const entity_inst_t& b) { return a.rank >= b.rank; }
+inline bool operator<(const entity_inst_t& a, const entity_inst_t& b) { return a.rank < b.rank; }
+inline bool operator<=(const entity_inst_t& a, const entity_inst_t& b) { return a.rank <= b.rank; }
+
+inline ostream& operator<<(ostream& out, const entity_inst_t &i)
+{
+  return out << "rank" << i.rank << "_" << i.addr;
+}
+
+
+// abstract Message class
+
+
+
+// fixed-size header that travels with every message
+typedef struct {
+  int type;
+  msg_addr_t source, dest;
+  entity_inst_t source_inst;
+  int source_port, dest_port;
+  int nchunks;
+  __uint64_t lamport_send_stamp;
+  __uint64_t lamport_recv_stamp;
+} msg_envelope_t;
+
+#define MSG_ENVELOPE_LEN  sizeof(msg_envelope_t)
+
+
+// Base class of all messages: an envelope plus an encoded payload.
+// Subclasses implement get_type_name() and one pair of the
+// encode_payload/decode_payload overloads.
+class Message {
+ protected:
+  msg_envelope_t  env;      // envelope
+  bufferlist      payload;  // payload
+
+  friend class Messenger;
+
+ public:
+  Message() {
+    env.type = 0;   // was left indeterminate; give it a defined value
+    env.source_port = env.dest_port = -1;
+    env.source = env.dest = MSG_ADDR_UNDEF;
+    env.nchunks = 0;
+    env.lamport_send_stamp = 0;
+    env.lamport_recv_stamp = 0;
+  };
+  Message(int t) {
+    env.source_port = env.dest_port = -1;
+    env.source = env.dest = MSG_ADDR_UNDEF;
+    env.nchunks = 0;
+    env.type = t;
+    env.lamport_send_stamp = 0;
+    env.lamport_recv_stamp = 0;
+  }
+  virtual ~Message() {
+  }
+
+  void set_lamport_send_stamp(__uint64_t t) { env.lamport_send_stamp = t; }
+  void set_lamport_recv_stamp(__uint64_t t) { env.lamport_recv_stamp = t; }
+  __uint64_t get_lamport_send_stamp() { return env.lamport_send_stamp; }
+  __uint64_t get_lamport_recv_stamp() { return env.lamport_recv_stamp; }
+
+
+  // for rpc-type procedural messages (pcid = procedure call id)
+  virtual long get_pcid() { return 0; }
+  virtual void set_pcid(long t) { assert(0); }  // overload me
+
+  void clear_payload() { payload.clear(); }
+  bool empty_payload() { return payload.length() == 0; }
+  bufferlist& get_payload() {
+    return payload;
+  }
+  // takes over the buffers of bl via bufferlist::claim()
+  void set_payload(bufferlist& bl) {
+    payload.claim(bl);
+  }
+  msg_envelope_t& get_envelope() {
+    return env;
+  }
+  void set_envelope(msg_envelope_t& env) {
+    this->env = env;
+  }
+
+
+  // ENVELOPE ----
+
+  // type
+  int get_type() { return env.type; }
+  void set_type(int t) { env.type = t; }
+  virtual char *get_type_name() = 0;
+
+  // source/dest
+  msg_addr_t& get_dest() { return env.dest; }
+  void set_dest(msg_addr_t a, int p) { env.dest = a; env.dest_port = p; }
+  int get_dest_port() { return env.dest_port; }
+
+  msg_addr_t& get_source() { return env.source; }
+  void set_source(msg_addr_t a, int p) { env.source = a; env.source_port = p; }
+  int get_source_port() { return env.source_port; }
+
+  entity_inst_t& get_source_inst() { return env.source_inst; }
+  void set_source_inst(entity_inst_t &i) { env.source_inst = i; }
+
+  // PAYLOAD ----
+  void reset_payload() {
+    payload.clear();
+  }
+
+  // overload either the rope version (easier!)
+  virtual void encode_payload(crope& s) { assert(0); }
+  virtual void decode_payload(crope& s, int& off) { assert(0); }
+
+  // or the bufferlist versions (faster!)
+  virtual void decode_payload() {
+    // use a crope for convenience, small messages, etc.  FIXME someday.
+    // flatten every buffer of the payload into one rope, then let the
+    // rope-based decoder consume it; it must consume all of it.
+    crope ser;
+    for (list<bufferptr>::const_iterator it = payload.buffers().begin();
+         it != payload.buffers().end();
+         it++)
+      ser.append((*it).c_str(), (*it).length());
+
+    int off = 0;
+    decode_payload(ser, off);
+    assert((unsigned)off == payload.length());
+  }
+  virtual void encode_payload() {
+    assert(payload.length() == 0);  // caller should reset payload
+
+    // use crope for convenience, small messages.  FIXME someday.
+    crope r;
+    encode_payload(r);
+
+    // copy payload
+    payload.push_back( buffer::copy(r.c_str(), r.length()) );
+  }
+
+  virtual void print(ostream& out) {
+    out << "message(type=" << get_type() << ")";
+  }
+
+};
+
+extern Message *decode_message(msg_envelope_t &env, bufferlist& bl);
+inline ostream& operator<<(ostream& out, Message& m) {
+  m.print(out);
+  return out;
+}
+
+#endif
diff --git a/branches/sage/cephmds2/msg/Messenger.cc b/branches/sage/cephmds2/msg/Messenger.cc
new file mode 100644
index 0000000000000..b033bbfc08638
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Messenger.cc
@@ -0,0 +1,84 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+
+#include
+#include "include/types.h"
+
+#include "Message.h"
+#include "Messenger.h"
+#include "messages/MGenericMessage.h"
+
+#include
+#include
+using namespace std;
+
+
+#include "config.h"
+#undef dout
+#define dout(l)    if (l<=g_conf.debug) cout << "messenger: "
+#define DEBUGLVL  10    // debug level of output
+
+
+
+// --------
+// callbacks
+
+// one process-wide queue of pending callbacks, guarded by its own lock
+Mutex msgr_callback_lock;
+list<Context*> msgr_callback_queue;
+//Context*       msgr_callback_kicker = 0;
+
+// append one callback to the shared queue, then kick this messenger
+void Messenger::queue_callback(Context *c) {
+  msgr_callback_lock.Lock();
+  msgr_callback_queue.push_back(c);
+  msgr_callback_lock.Unlock();
+
+  callback_kick();
+}
+// move every callback in ls onto the shared queue (ls is left empty),
+// then kick this messenger
+void Messenger::queue_callbacks(list<Context*>& ls) {
+  msgr_callback_lock.Lock();
+  msgr_callback_queue.splice(msgr_callback_queue.end(), ls);
+  msgr_callback_lock.Unlock();
+
+  callback_kick();
+}
+
+// run and delete everything currently queued
+void Messenger::do_callbacks() {
+  // take list; only the splice happens under the lock, the callbacks
+  // themselves run without it
+  msgr_callback_lock.Lock();
+  list<Context*> ls;
+  ls.splice(ls.begin(), msgr_callback_queue);
+  msgr_callback_lock.Unlock();
+
+  // do them
+  for (list<Context*>::iterator it = ls.begin();
+       it != ls.end();
+       ++it) {
+    dout(10) << "--- doing callback " << *it << endl;
+    (*it)->finish(0);
+    delete *it;
+  }
+}
+
+// ---------
+// incoming messages
+
+// hand an incoming message to the registered dispatcher
+void Messenger::dispatch(Message *m)
+{
+  assert(dispatcher);
+  dispatcher->dispatch(m);
+}
+
+
+
diff --git a/branches/sage/cephmds2/msg/Messenger.h b/branches/sage/cephmds2/msg/Messenger.h
new file mode 100644
index 0000000000000..4ec3349a2a096
--- /dev/null
+++ b/branches/sage/cephmds2/msg/Messenger.h
@@ -0,0 +1,92 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ * + */ + + + +#ifndef __MESSENGER_H +#define __MESSENGER_H + +#include +using namespace std; + +#include "Message.h" +#include "Dispatcher.h" +#include "common/Mutex.h" +#include "common/Cond.h" +#include "include/Context.h" + + +typedef __uint64_t lamport_t; + + +class MDS; +class Timer; + +class Messenger { + private: + Dispatcher *dispatcher; + msg_addr_t _myaddr; + + + public: + Messenger(msg_addr_t w) : dispatcher(0), _myaddr(w) { } + virtual ~Messenger() { } + + void set_myaddr(msg_addr_t m) { _myaddr = m; } + msg_addr_t get_myaddr() { return _myaddr; } + + + virtual int shutdown() = 0; + + // callbacks + static void do_callbacks(); + + void queue_callback(Context *c); + void queue_callbacks(list& ls); + virtual void callback_kick() = 0; + + virtual int get_dispatch_queue_len() { return 0; }; + + // setup + void set_dispatcher(Dispatcher *d) { dispatcher = d; ready(); } + Dispatcher *get_dispatcher() { return dispatcher; } + virtual void ready() { } + bool is_ready() { return dispatcher != 0; } + + // dispatch incoming messages + virtual void dispatch(Message *m); + + // send message + virtual void prepare_dest(const entity_inst_t& inst) {} + virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0; + virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst, + int port=0, int fromport=0) { + return send_message(m, dest, port, fromport); // overload me! 
+ } + + + // make a procedure call + //virtual Message* sendrecv(Message *m, msg_addr_t dest, int port=0); + + + virtual void mark_down(msg_addr_t a, entity_inst_t& i) {} + virtual void mark_up(msg_addr_t a, entity_inst_t& i) {} + //virtual void reset(msg_addr_t a) { mark_down(a); mark_up(a); } + +}; + + + + + +#endif diff --git a/branches/sage/cephmds2/msg/NewMessenger.cc b/branches/sage/cephmds2/msg/NewMessenger.cc new file mode 100644 index 0000000000000..6cd5d291b60c3 --- /dev/null +++ b/branches/sage/cephmds2/msg/NewMessenger.cc @@ -0,0 +1,1714 @@ + +#include "NewMessenger.h" + +#include +#include +#include +#include + +#include "config.h" + +#include "messages/MGenericMessage.h" +#include "messages/MNSConnect.h" +#include "messages/MNSConnectAck.h" +#include "messages/MNSRegister.h" +#include "messages/MNSRegisterAck.h" +#include "messages/MNSLookup.h" +#include "messages/MNSLookupReply.h" +#include "messages/MNSFailure.h" + +//#include "messages/MFailure.h" + +#include + + +#undef dout +#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " " +#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " " + + + +#include "tcp.cc" + + +Rank rank; + + +/******************************************** + * Namer + */ + +Rank::Namer::Namer(EntityMessenger *msgr) : + messenger(msgr), + nrank(0), nclient(0), nmds(0), nosd(0), nmon(0) +{ + assert(rank.my_rank == 0); + nrank = g_conf.num_mon; + + // announce myself + /* + cerr << "ceph ns is " << rank.accepter.listen_addr << endl; + cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl; + int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); + ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t)); + ::fchmod(fd, 0755); + ::close(fd); + */ + + // ok + messenger->set_dispatcher(this); +} + +Rank::Namer::~Namer() +{ + //::unlink(".ceph_ns"); +} + + +void Rank::Namer::dispatch(Message *m) +{ + rank.lock.Lock(); + int type = 
m->get_type(); + switch (type) { + case MSG_NS_CONNECT: + handle_connect((class MNSConnect*)m); + break; + case MSG_NS_REGISTER: + handle_register((class MNSRegister*)m); + break; + case MSG_NS_STARTED: + handle_started(m); + break; + case MSG_NS_UNREGISTER: + handle_unregister(m); + break; + case MSG_NS_LOOKUP: + handle_lookup((class MNSLookup*)m); + break; + case MSG_NS_FAILURE: + handle_failure((class MNSFailure*)m); + break; + + case MSG_FAILURE_ACK: + delete m; + break; + + default: + assert(0); + } + rank.lock.Unlock(); +} + +void Rank::Namer::handle_connect(MNSConnect *m) +{ + int newrank = nrank++; + dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl; + + rank.entity_map[MSG_ADDR_RANK(newrank)].addr = m->get_addr(); + rank.entity_map[MSG_ADDR_RANK(newrank)].rank = newrank; + rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank)); + + messenger->send_message(new MNSConnectAck(newrank), + MSG_ADDR_RANK(newrank), rank.entity_map[MSG_ADDR_RANK(newrank)]); + delete m; +} + +void Rank::Namer::manual_insert_inst(const entity_inst_t &inst) +{ + rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst; +} + +void Rank::Namer::handle_register(MNSRegister *m) +{ + dout(10) << "namer.handle_register from rank " << m->get_rank() + << " addr " << m->get_entity() << endl; + + // pick id + msg_addr_t entity = m->get_entity(); + + if (entity.is_new()) { + // make up a new address! + switch (entity.type()) { + case MSG_ADDR_MDS_BASE: + entity = MSG_ADDR_MDS(nmds++); + break; + + case MSG_ADDR_OSD_BASE: + entity = MSG_ADDR_OSD(nosd++); + break; + + case MSG_ADDR_CLIENT_BASE: + entity = MSG_ADDR_CLIENT(nclient++); + break; + + default: + assert(0); + } + } else { + // specific address! 
+ } + + + // register + if (rank.entity_map.count(entity)) { + dout(1) << "namer.handle_register re-registering " << entity + << " inst " << m->get_source_inst() + << " (was " << rank.entity_map[entity] << ")" + << endl; + } else { + dout(1) << "namer.handle_register registering " << entity + << " inst " << m->get_source_inst() + << endl; + } + rank.entity_map[entity] = m->get_source_inst(); + rank.entity_unstarted.insert(entity); + + // reply w/ new id + messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), + m->get_source(), rank.entity_map[entity]); + + delete m; +} + +/* + * handle_started + * the sender of m announces that it is up: drop it from + * rank.entity_unstarted, re-dispatch every message that was parked in + * 'waiting' for it, then free the notification itself. + */ +void Rank::Namer::handle_started(Message *m) +{ + msg_addr_t who = m->get_source(); + dout(10) << "namer.handle_started from entity " << who << endl; + + assert(rank.entity_unstarted.count(who)); + rank.entity_unstarted.erase(who); + + // anybody waiting? + if (waiting.count(who)) { + // detach the waiter list first; dispatch() may park new waiters + list<Message*> ls; + ls.swap(waiting[who]); + waiting.erase(who); + + dout(10) << "doing waiters on " << who << endl; + for (list<Message*>::iterator it = ls.begin(); + it != ls.end(); + it++) + dispatch(*it); + } + + // like the other namer handlers, we own m; it was leaked before + delete m; +} + +void Rank::Namer::handle_unregister(Message *m) +{ + msg_addr_t who = m->get_source(); + dout(1) << "namer.handle_unregister entity " << who << endl; + + rank.show_dir(); + + assert(rank.entity_map.count(who)); + rank.entity_map.erase(who); + + rank.show_dir(); + + // shut myself down? kick watcher. + if (rank.entity_map.size() == 2) { + dout(10) << "namer.handle_unregister stopping namer" << endl; + rank.lock.Unlock(); + messenger->shutdown(); + delete messenger; + rank.lock.Lock(); + } + + delete m; +} + + +void Rank::Namer::handle_lookup(MNSLookup *m) +{ + // have it? 
+ if (rank.entity_map.count(m->get_entity()) == 0) { + dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl; + waiting[m->get_entity()].push_back(m); + return; + } + + if (rank.entity_unstarted.count(m->get_entity())) { + dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl; + waiting[m->get_entity()].push_back(m); + return; + } + + // look it up! + MNSLookupReply *reply = new MNSLookupReply(m); + + reply->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()]; + + dout(10) << "namer " << m->get_source() + << " lookup '" << m->get_entity() + << "' -> " << rank.entity_map[m->get_entity()] << endl; + + messenger->send_message(reply, m->get_source(), m->get_source_inst()); + delete m; +} + +void Rank::Namer::handle_failure(MNSFailure *m) +{ + dout(10) << "namer.handle_failure inst " << m->get_inst() + << endl; + + // search for entities on this instance + list rm; + for (hash_map::iterator i = rank.entity_map.begin(); + i != rank.entity_map.end(); + i++) { + if (i->second != m->get_inst()) continue; + rm.push_back(i->first); + } + for (list::iterator i = rm.begin(); + i != rm.end(); + i++) { + dout(10) << "namer.handle_failure inst " << m->get_inst() + << ", removing " << *i << endl; + + rank.entity_map.erase(*i); + rank.entity_unstarted.erase(*i); + + /* + if ((*i).is_osd()) { + // tell the monitor + messenger->send_message(new MFailure(*i, m->get_inst()), MSG_ADDR_MON(0)); + } + */ + } + + delete m; +} + + + +/******************************************** + * Accepter + */ + +/* + * start + * create the listening socket on a kernel-chosen port (sin_port = 0), + * build the address we advertise from the first address of our own + * hostname plus the bound port, and call create() to start the thread. + * returns 0 on success, -1 if our hostname does not resolve to an + * IPv4 address. + */ +int Rank::Accepter::start() +{ + // bind to a socket + dout(10) << "accepter.start binding to listen " << endl; + + /* socket creation */ + listen_sd = socket(AF_INET,SOCK_STREAM,0); + assert(listen_sd >= 0); // socket() reports failure as -1; 0 is a valid descriptor + + /* bind to port */ + memset((char*)&listen_addr, 0, sizeof(listen_addr)); + listen_addr.sin_family = AF_INET; + listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); + listen_addr.sin_port = 0; 
+ + int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); + assert(rc >= 0); + + // read back the port the kernel picked for us + socklen_t llen = sizeof(listen_addr); + getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); + + int myport = listen_addr.sin_port; + + // listen! + rc = ::listen(listen_sd, 1000); + assert(rc >= 0); + + //dout(10) << "accepter.start listening on " << myport << endl; + + // my address is... + char host[100]; + bzero(host, sizeof(host)); + gethostname(host, sizeof(host)-1); // keep the final NUL even if the name is truncated + //dout(10) << "accepter.start my hostname is " << host << endl; + + struct hostent *myhostname = gethostbyname( host ); + if (!myhostname || + myhostname->h_addrtype != AF_INET || + !myhostname->h_addr_list[0]) { + // used to be dereferenced blindly; a non-IPv4 answer would also + // overrun the 4-byte s_addr in the memcpy below + derr(0) << "accepter.start can't resolve my hostname '" << host << "' to an IPv4 address" << endl; + ::close(listen_sd); + return -1; + } + + struct sockaddr_in my_addr; + memset(&my_addr, 0, sizeof(my_addr)); + + my_addr.sin_family = myhostname->h_addrtype; + memcpy((char *) &my_addr.sin_addr.s_addr, + myhostname->h_addr_list[0], + myhostname->h_length); + my_addr.sin_port = myport; + + listen_addr = my_addr; + + dout(10) << "accepter.start listen addr is " << listen_addr << endl; + + // start thread + create(); + + return 0; +} + +void *Rank::Accepter::entry() +{ + dout(10) << "accepter starting" << endl; + + while (!done) { + // accept + struct sockaddr_in addr; + socklen_t slen = sizeof(addr); + int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); + if (sd >= 0) { + dout(10) << "accepted incoming on sd " << sd << endl; + + Receiver *r = new Receiver(sd); + r->create(); + + rank.lock.Lock(); + rank.receivers.insert(r); + rank.lock.Unlock(); + } else { + dout(10) << "no incoming connection?" 
<< endl; + break; + } + } + + return 0; +} + + +/************************************** + * Receiver + */ + +/* + * entry + * loop reading messages off sd. messages for a down destination or + * from a stale source instance are dropped; everything else is routed + * under rank.lock. the loop ends when read_message() returns nothing. + */ +void *Rank::Receiver::entry() +{ + while (!done) { + Message *m = read_message(); + if (!m) { + ::close(sd); + break; + } + + dout(10) << "receiver.entry got message for " << m->get_dest() << endl; + + EntityMessenger *entity = 0; + + rank.lock.Lock(); + { + if (rank.down.count(m->get_dest())) { + // copy the address out first; m is freed before the lookup below + const msg_addr_t dest = m->get_dest(); + dout(0) << "receiver.entry dest " << dest << " down, dropping " << *m << endl; + delete m; + + if (rank.looking_up.count(dest) == 0) + rank.lookup(dest); + } + else if (rank.entity_map.count(m->get_source()) && + rank.entity_map[m->get_source()] > m->get_source_inst()) { + // sender is an older instance than the one we know about + derr(0) << "receiver.entry source " << m->get_source() + << " inst " << m->get_source_inst() + << " < " << rank.entity_map[m->get_source()] + << ", dropping " << *m << endl; + delete m; + } + else { + // sender is a newer instance than the one we know: adopt it. + // (this repeated the '>' test above and so could never fire.) + if (rank.entity_map.count(m->get_source()) && + rank.entity_map[m->get_source()] < m->get_source_inst()) { + derr(0) << "receiver.entry source " << m->get_source() + << " inst " << m->get_source_inst() + << " > " << rank.entity_map[m->get_source()] + << ", WATCH OUT " << *m << endl; + rank.entity_map[m->get_source()] = m->get_source_inst(); + } + + if (m->get_dest().type() == MSG_ADDR_RANK_BASE) { + // ours. 
+ rank.dispatch(m); + } else { + if (g_conf.ms_single_dispatch) { + // submit to single dispatch queue + rank._submit_single_dispatch(m); + } else { + if (rank.local.count(m->get_dest())) { + // find entity + entity = rank.local[m->get_dest()]; + } else { + derr(0) << "got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; + rank.waiting_for_lookup[m->get_dest()].push_back(m); + } + } + } + } + } + rank.lock.Unlock(); + + if (entity) + entity->queue_message(m); // queue + } + + // add to reap queue + rank.lock.Lock(); + rank.receiver_reap_queue.push_back(this); + rank.wait_cond.Signal(); + rank.lock.Unlock(); + + return 0; +} + +Message *Rank::Receiver::read_message() +{ + // envelope + //dout(10) << "receiver.read_message from sd " << sd << endl; + + msg_envelope_t env; + if (!tcp_read( sd, (char*)&env, sizeof(env) )) + return 0; + + if (env.type == 0) { + dout(10) << "receiver got dummy env, bailing" << endl; + return 0; + } + + dout(20) << "receiver got envelope type=" << env.type + << " src " << env.source << " dst " << env.dest + << " nchunks=" << env.nchunks + << endl; + + // payload + bufferlist blist; + for (int i=0; iget_source() << endl; + + return m; +} + + +/************************************** + * Sender + */ + +int Rank::Sender::connect() +{ + dout(10) << "sender(" << inst << ").connect" << endl; + + // create socket? + sd = socket(AF_INET,SOCK_STREAM,0); + assert(sd > 0); + + // bind any port + struct sockaddr_in myAddr; + myAddr.sin_family = AF_INET; + myAddr.sin_addr.s_addr = htonl(INADDR_ANY); + myAddr.sin_port = htons( 0 ); + + int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); + assert(rc>=0); + + // connect! + int r = ::connect(sd, (sockaddr*)&inst.addr, sizeof(myAddr)); + if (r < 0) return r; + + // identify myself + // FIXME + + return 0; +} + + +void Rank::Sender::finish() +{ + dout(10) << "sender(" << inst << ").finish" << endl; + + // make sure i get reaped. 
+ rank.lock.Lock(); + rank.sender_reap_queue.push_back(this); + rank.wait_cond.Signal(); + rank.lock.Unlock(); +} + +void Rank::Sender::fail_and_requeue(list& out) +{ + dout(10) << "sender(" << inst << ").fail" << endl;// and requeue" << endl; + + // tell namer + if (!rank.messenger) { + derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl; + assert(0); + } + + // old and unnecessary? + if (0) + rank.messenger->send_message(new MNSFailure(inst), + MSG_ADDR_NAMER(0)); + + + // FIXME: possible race before i reclaim lock here? + + Dispatcher *dis = 0; + msg_addr_t dis_dest; + + list lost; + + // requeue my messages + rank.lock.Lock(); + lock.Lock(); + { + // include out at front of queue + q.splice(q.begin(), out); + dout(10) << "sender(" << inst << ").fail " + << q.size() << " messages" << endl; + + if (0) { + lost.swap(q); + } else { + + while (!q.empty()) { + // don't keep reconnecting.. + if (rank.entity_map.count(q.front()->get_dest()) && + rank.entity_map[q.front()->get_dest()] == inst) + rank.down.insert(q.front()->get_dest()); + //rank.entity_map.erase(q.front()->get_dest()); + + if (!dis && + rank.local.count(q.front()->get_source())) { + dis_dest = q.front()->get_dest(); + dis = rank.local[q.front()->get_source()]->get_dispatcher(); + } + + if (g_conf.ms_requeue_on_sender_fail) + rank.submit_message( q.front() ); + else + lost.push_back( q.front() ); + q.pop_front(); + } + } + + // deactivate myself + if (rank.rank_sender.count(inst.rank) && + rank.rank_sender[inst.rank] == this) + rank.rank_sender.erase(inst.rank); + + // stop sender loop + done = true; + } + lock.Unlock(); + + + // send special failure msg? 
+ if (dis) { + for (list::iterator p = lost.begin(); + p != lost.end(); + p++) + dis->ms_handle_failure(*p, dis_dest, inst); + } + + rank.lock.Unlock(); +} + +void *Rank::Sender::entry() +{ + // connect + if (sd == 0) { + int rc = connect(); + if (rc < 0) { + list out; + derr(0) << "error connecting to " << inst << endl; + fail_and_requeue(out); + finish(); + return 0; + } + } + + lock.Lock(); + while (!q.empty() || !done) { + + if (!q.empty()) { + dout(20) << "sender(" << inst << ") grabbing message(s)" << endl; + + // grab outgoing list + list out; + out.swap(q); + + // drop lock while i send these + lock.Unlock(); + + while (!out.empty()) { + Message *m = out.front(); + out.pop_front(); + + dout(20) << "sender(" << inst << ") sending " << *m << endl; + + // stamp. + m->set_source_inst(rank.my_inst); + + // marshall + if (m->empty_payload()) + m->encode_payload(); + + if (write_message(m) < 0) { + // failed! + derr(0) << "error sending to " << m->get_dest() << " on " << inst << endl; + out.push_front(m); + fail_and_requeue(out); + break; + } + } + + lock.Lock(); + continue; + } + + // wait + dout(20) << "sender(" << inst << ") sleeping" << endl; + cond.Wait(lock); + } + lock.Unlock(); + + finish(); + return 0; +} + + +int Rank::Sender::write_message(Message *m) +{ + // get envelope, buffers + msg_envelope_t *env = &m->get_envelope(); + bufferlist blist; + blist.claim( m->get_payload() ); + +#ifdef TCP_KEEP_CHUNKS + env->nchunks = blist.buffers().size(); +#else + env->nchunks = 1; +#endif + + dout(20)// << g_clock.now() + << " sending " << m << " " << *m + << " to " << m->get_dest() + << endl; + + // send envelope + int r = tcp_write( sd, (char*)env, sizeof(*env) ); + if (r < 0) { + derr(20) << "error sending envelope for " << *m + << " to " << m->get_dest() << endl; + return -1; + } + + // payload +#ifdef TCP_KEEP_CHUNKS + // send chunk-wise + int i = 0; + for (list::iterator it = blist.buffers().begin(); + it != blist.buffers().end(); + it++) { + dout(10) << 
"tcp_sending frag " << i << " len " << (*it).length() << endl; + int size = (*it).length(); + r = tcp_write( sd, (char*)&size, sizeof(size) ); + if (r < 0) { + derr(20) << "error sending chunk len for " << *m << " to " << m->get_dest() << endl; + return -1; + } + r = tcp_write( sd, (*it).c_str(), size ); + if (r < 0) { + derr(20) << "error sending data chunk for " << *m << " to " << m->get_dest() << endl; + return -1; + } + i++; + } +#else + // one big chunk + int size = blist.length(); + r = tcp_write( sd, (char*)&size, sizeof(size) ); + if (r < 0) { + derr(20) << "error sending data len for " << *m << " to " << m->get_dest() << endl; + return -1; + } + for (list::iterator it = blist.buffers().begin(); + it != blist.buffers().end(); + it++) { + r = tcp_write( sd, (*it).c_str(), (*it).length() ); + if (r < 0) { + derr(20) << "error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; + return -1; + } + } +#endif + + // delete message + delete m; + return 0; +} + + + +/******************************************** + * Rank + */ + +Rank::Rank(int r) : + single_dispatcher(this), + my_rank(r), + namer(0) { +} +Rank::~Rank() +{ + //FIXME + if (namer) delete namer; +} + + +void Rank::_submit_single_dispatch(Message *m) +{ + assert(lock.is_locked()); + + if (local.count(m->get_dest()) && + local[m->get_dest()]->is_ready()) { + rank.single_dispatch_queue.push_back(m); + rank.single_dispatch_cond.Signal(); + } else { + waiting_for_ready[m->get_dest()].push_back(m); + } +} + + +void Rank::single_dispatcher_entry() +{ + lock.Lock(); + while (!single_dispatch_stop || !single_dispatch_queue.empty()) { + if (!single_dispatch_queue.empty()) { + list ls; + ls.swap(single_dispatch_queue); + + lock.Unlock(); + { + while (!ls.empty()) { + Message *m = ls.front(); + ls.pop_front(); + + dout(1) //<< g_clock.now() + << "---- " + << m->get_source() << ':' << m->get_source_port() + << " to " << m->get_dest() << ':' << m->get_dest_port() + << 
" ---- " << m->get_type_name() + << " ---- " << m + << endl; + + if (m->get_dest().type() == MSG_ADDR_RANK_BASE) + rank.dispatch(m); + else { + assert(local.count(m->get_dest())); + local[m->get_dest()]->dispatch(m); + } + } + } + lock.Lock(); + continue; + } + single_dispatch_cond.Wait(lock); + } + lock.Unlock(); +} + + +/* + * note: assumes lock is held + */ +void Rank::reaper() +{ + assert(lock.is_locked()); + + while (!receiver_reap_queue.empty()) { + Receiver *r = receiver_reap_queue.front(); + receiver_reap_queue.pop_front(); + //dout(10) << "reaper reaping receiver sd " << r->sd << endl; + receivers.erase(r); + r->join(); + dout(10) << "reaper reaped receiver sd " << r->sd << endl; + delete r; + } + + while (!sender_reap_queue.empty()) { + Sender *s = sender_reap_queue.front(); + sender_reap_queue.pop_front(); + //dout(10) << "reaper reaping sender rank " << s->dest_rank << " at " << s->tcpaddr << endl; + if (rank_sender.count(s->inst.rank) && + rank_sender[s->inst.rank] == s) + rank_sender.erase(s->inst.rank); + s->join(); + dout(10) << "reaper reaped sender " << s->inst << endl; + delete s; + } +} + + +int Rank::start_rank() +{ + dout(10) << "start_rank" << endl; + + // bind to a socket + if (accepter.start() < 0) + return -1; + + // start single thread dispatcher? 
+ if (g_conf.ms_single_dispatch) { + single_dispatch_stop = false; + single_dispatcher.create(); + } + + lock.Lock(); + + if (my_rank < 0) { + dout(10) << "start_rank connecting to namer0" << endl; + + // connect to namer + assert(entity_map.count(MSG_ADDR_NAMER(0))); + Sender *sender = connect_rank(entity_map[MSG_ADDR_NAMER(0)]); + + // send + Message *m = new MNSConnect(accepter.listen_addr); + m->set_dest(MSG_ADDR_NAMER(0), 0); + sender->send(m); + + // wait + while (my_rank < 0) + waiting_for_rank.Wait(lock); + assert(my_rank >= 0); + + dout(10) << "start_rank got rank " << my_rank << endl; + + // create rank entity + entity_map[MSG_ADDR_RANK(my_rank)] = my_inst; + local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank)); + messenger->set_dispatcher(this); + } else { + // my_inst + my_inst.addr = accepter.listen_addr; + my_inst.rank = my_rank; + + // create my rank + msg_addr_t raddr = MSG_ADDR_RANK(my_rank); + entity_map[raddr] = my_inst; + entity_unstarted.insert(raddr); + local[raddr] = messenger = new EntityMessenger(raddr); + messenger->set_dispatcher(this); + + dout(1) << "start_rank " << my_rank << " at " << my_inst << endl; + } + + lock.Unlock(); + return 0; +} + +void Rank::start_namer() +{ + // create namer0 + msg_addr_t naddr = MSG_ADDR_NAMER(0); + entity_map[naddr] = my_inst; + local[naddr] = new EntityMessenger(naddr); + namer = new Namer(local[naddr]); +} + +void Rank::set_namer(const tcpaddr_t& ns) +{ + entity_map[MSG_ADDR_NAMER(0)].addr = ns; + entity_map[MSG_ADDR_NAMER(0)].rank = 0; +} + +/* connect_rank + * NOTE: assumes rank.lock held. + */ +Rank::Sender *Rank::connect_rank(const entity_inst_t& inst) +{ + assert(rank.lock.is_locked()); + assert(inst != rank.my_inst); + + dout(10) << "connect_rank to " << inst << endl; + + // create sender + Sender *sender = new Sender(inst); + //int rc = sender->connect(); + //assert(rc >= 0); + + // start thread. + sender->create(); + + // old sender? 
+ assert(rank.rank_sender.count(inst.rank) == 0); + //if (rank.rank_sender.count(r)) + //rank.rank_sender[r]->stop(); + + // ok! + rank.rank_sender[inst.rank] = sender; + return sender; +} + + + + + +void Rank::show_dir() +{ + dout(10) << "show_dir ---" << endl; + + for (hash_map::iterator i = entity_map.begin(); + i != entity_map.end(); + i++) { + if (local.count(i->first)) { + dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl; + } else { + dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl; + } + } +} + + +/* lookup + * NOTE: assumes directory.lock held + */ +void Rank::lookup(msg_addr_t addr) +{ + dout(10) << "lookup " << addr << endl; + assert(lock.is_locked()); + + assert(looking_up.count(addr) == 0); + looking_up.insert(addr); + + MNSLookup *r = new MNSLookup(addr); + messenger->send_message(r, MSG_ADDR_DIRECTORY); +} + + + +/* register_entity + */ +Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr) +{ + dout(10) << "register_entity " << addr << endl; + lock.Lock(); + + // register with namer + static long reg_attempt = 0; + long id = ++reg_attempt; + + Message *reg = new MNSRegister(addr, my_rank, id); + reg->set_source(MSG_ADDR_RANK(my_rank), 0); + reg->set_source_inst(my_inst); + reg->set_dest(MSG_ADDR_DIRECTORY, 0); + + // prepare cond + Cond cond; + waiting_for_register_cond[id] = &cond; + + // send request + lock.Unlock(); + submit_message(reg); + lock.Lock(); + + // wait + while (!waiting_for_register_result.count(id)) + cond.Wait(lock); + + // grab result + addr = waiting_for_register_result[id]; + dout(10) << "register_entity got " << addr << endl; + + // clean up + waiting_for_register_cond.erase(id); + waiting_for_register_result.erase(id); + + // create messenger + EntityMessenger *msgr = new EntityMessenger(addr); + + // add to directory + entity_map[addr] = my_inst; + local[addr] = msgr; + + // was anyone waiting? 
+ if (waiting_for_lookup.count(addr)) { + submit_messages(waiting_for_lookup[addr]); + waiting_for_lookup.erase(addr); + } + + lock.Unlock(); + return msgr; +} + +void Rank::unregister_entity(EntityMessenger *msgr) +{ + lock.Lock(); + dout(10) << "unregister_entity " << msgr->get_myaddr() << endl; + + // remove from local directory. + assert(local.count(msgr->get_myaddr())); + local.erase(msgr->get_myaddr()); + + if (my_rank > 0) { + assert(entity_map.count(msgr->get_myaddr())); + entity_map.erase(msgr->get_myaddr()); + } // else namer will do it. + + // tell namer. + if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) && + msgr->get_myaddr() != MSG_ADDR_RANK(0)) + msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER), + MSG_ADDR_NAMER(0)); + + // kick wait()? + if (local.size() <= 2) + wait_cond.Signal(); + + lock.Unlock(); +} + + +void Rank::submit_messages(list& ls) +{ + for (list::iterator i = ls.begin(); i != ls.end(); i++) + submit_message(*i); + ls.clear(); +} + + +void Rank::prepare_dest(msg_addr_t dest) +{ + lock.Lock(); + + if (entity_map.count( dest )) { + // remote, known rank addr. + entity_inst_t inst = entity_map[dest]; + + if (inst == my_inst) { + //dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; + //waiting_for_lookup[dest].push_back(m); + } + else if (rank_sender.count( inst.rank ) && + rank_sender[inst.rank]->inst == inst) { + //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; + // connected. + //sender = rank_sender[ inst.rank ]; + } else { + //dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; + // not connected. + connect_rank( inst ); + } + } else { + // unknown dest rank or rank addr. 
+ if (looking_up.count(dest) == 0) { + //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; + lookup(dest); + } else { + //dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; + } + } + + lock.Unlock(); +} + +void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) +{ + const msg_addr_t dest = m->get_dest(); + + // lookup + EntityMessenger *entity = 0; + Sender *sender = 0; + + lock.Lock(); + { + // local? + if (dest_inst.rank == my_inst.rank) { + if (local.count(dest)) { + // local + dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; + if (g_conf.ms_single_dispatch) { + _submit_single_dispatch(m); + } else { + entity = local[dest]; + } + } else { + // mid-register + dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; + assert(0); + waiting_for_lookup[dest].push_back(m); + } + } + else { + // remote. + if (rank_sender.count( dest_inst.rank )) { + //&& + //rank_sender[dest_inst.rank]->inst == dest_inst) { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connected." << endl; + // connected. + sender = rank_sender[ dest_inst.rank ]; + } else { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl; + // not connected. + sender = connect_rank( dest_inst ); + } + } + } + lock.Unlock(); + + // do it + if (entity) { + // local! + dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; + entity->queue_message(m); + } + else if (sender) { + // remote! 
+ dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; + sender->send(m); + } +} + + +void Rank::submit_message(Message *m) +{ + const msg_addr_t dest = m->get_dest(); + + // lookup + EntityMessenger *entity = 0; + Sender *sender = 0; + + lock.Lock(); + { + if (down.count(dest)) { + // black hole. + dout(0) << "submit_message " << *m << " dest " << dest << " down, dropping" << endl; + delete m; + + if (looking_up.count(dest) == 0) + lookup(dest); + + } else if (local.count(dest)) { + dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; + + // local + if (g_conf.ms_single_dispatch) { + _submit_single_dispatch(m); + } else { + entity = local[dest]; + } + } else if (entity_map.count( dest )) { + // remote, known rank addr. + entity_inst_t inst = entity_map[dest]; + + if (inst == my_inst) { + dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." << endl; + waiting_for_lookup[dest].push_back(m); + } + else if (rank_sender.count( inst.rank ) && + rank_sender[inst.rank]->inst == inst) { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; + // connected. + sender = rank_sender[ inst.rank ]; + } else { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; + // not connected. + sender = connect_rank( inst ); + } + } else { + // unknown dest rank or rank addr. + if (looking_up.count(dest) == 0) { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; + lookup(dest); + } else { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; + } + waiting_for_lookup[dest].push_back(m); + } + } + lock.Unlock(); + + // do it + if (entity) { + // local! 
+ dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; + entity->queue_message(m); + } + else if (sender) { + // remote! + dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; + sender->send(m); + } +} + + + + +void Rank::dispatch(Message *m) +{ + lock.Lock(); + + dout(10) << "dispatching " << *m << endl; + + switch (m->get_type()) { + case MSG_NS_CONNECTACK: + handle_connect_ack((MNSConnectAck*)m); + break; + + case MSG_NS_REGISTERACK: + handle_register_ack((MNSRegisterAck*)m); + break; + + case MSG_NS_LOOKUPREPLY: + handle_lookup_reply((MNSLookupReply*)m); + break; + + default: + assert(0); + } + + lock.Unlock(); +} + +void Rank::handle_connect_ack(MNSConnectAck *m) +{ + dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl; + my_rank = m->get_rank(); + + my_inst.addr = accepter.listen_addr; + my_inst.rank = my_rank; + + waiting_for_rank.SignalAll(); + delete m; + + // logger! + /*dout(10) << "logger" << endl; + char names[100]; + sprintf(names, "rank%d", my_rank); + string name = names; + + if (g_conf.tcp_log) { + logger = new Logger(name, (LogType*)&rank_logtype); + rank_logtype.add_set("num"); + rank_logtype.add_inc("in"); + rank_logtype.add_inc("inb"); + rank_logtype.add_inc("dis"); + rank_logtype.add_set("inq"); + rank_logtype.add_set("inqb"); + rank_logtype.add_set("outq"); + rank_logtype.add_set("outqb"); + } + */ +} + + +void Rank::handle_register_ack(MNSRegisterAck *m) +{ + dout(10) << "handle_register_ack " << m->get_entity() << endl; + + long tid = m->get_tid(); + waiting_for_register_result[tid] = m->get_entity(); + waiting_for_register_cond[tid]->Signal(); + delete m; +} + +/* + * handle_lookup_reply + * for each (addr, inst) pair in the reply: skip it if addr is marked + * down or we already hold a newer inst; otherwise record it in + * entity_map and flush the messages parked in waiting_for_lookup[addr], + * either to the local entity or to the sender for inst.rank. + * called from Rank::dispatch(), which takes 'lock' first. + */ +void Rank::handle_lookup_reply(MNSLookupReply *m) +{ + dout(10) << "got lookup reply" << endl; + + for (map<msg_addr_t, entity_inst_t>::iterator it = m->entity_map.begin(); + it != m->entity_map.end(); + it++) { + dout(10) << "lookup got " << it->first << " at " << it->second << endl; + msg_addr_t addr = 
it->first; + entity_inst_t inst = it->second; + + if (down.count(addr)) { + // ignore + dout(10) << "ignoring lookup results for " << addr << ", who is down" << endl; + //assert(entity_map.count(addr) == 0); + continue; + } + + if (entity_map.count(addr) && + entity_map[addr] > inst) { + dout(10) << "ignoring lookup results for " << addr << ", " \ + << entity_map[addr] << " > " << inst << endl; + continue; + } + + // update map. + entity_map[addr] = inst; + + // this lookup is answered. it was never cleared before, so the + // retry below tripped the assert in lookup(). + looking_up.erase(addr); + + if (inst.rank == my_rank) { + // local + dout(10) << "delivering lookup results locally" << endl; + if (local.count(addr)) { + if (g_conf.ms_single_dispatch) { + single_dispatch_queue.splice(single_dispatch_queue.end(), + waiting_for_lookup[addr]); + // wake the dispatcher, as ready() and _submit_single_dispatch() do + single_dispatch_cond.Signal(); + } else { + local[addr]->queue_messages(waiting_for_lookup[addr]); + } + waiting_for_lookup.erase(addr); + } else + lookup(addr); // try again! + + } else { + // remote + if (rank_sender.count(inst.rank) == 0) + connect_rank(inst); + else if (rank_sender[inst.rank]->inst != inst) { + dout(0) << "lookup got rank addr change, WATCH OUT" << endl; + // FIXME BUG possible message loss weirdness? + rank_sender[inst.rank]->stop(); + rank_sender.erase(inst.rank); + connect_rank(inst); + } + + // take waiters + Sender *sender = rank_sender[inst.rank]; + assert(sender); + + if (waiting_for_lookup.count(addr)) { + sender->send(waiting_for_lookup[addr]); + waiting_for_lookup.erase(addr); + } + } + } + + delete m; +} + + +void Rank::wait() +{ + lock.Lock(); + while (1) { + // reap dead senders, receivers. + reaper(); + + if (local.size() == 0) { + dout(10) << "wait: everything stopped" << endl; + break; // everything stopped. + } + + if (local.size() == 1 && + !messenger->is_stopped()) { + dout(10) << "wait: stopping rank" << endl; + lock.Unlock(); + messenger->shutdown(); + delete messenger; + lock.Lock(); + continue; + } + + wait_cond.Wait(lock); + } + lock.Unlock(); + + // done! clean up. 
+ + // stop dispatch thread + if (g_conf.ms_single_dispatch) { + dout(10) << "wait: stopping dispatch thread" << endl; + lock.Lock(); + single_dispatch_stop = true; + single_dispatch_cond.Signal(); + lock.Unlock(); + single_dispatcher.join(); + } + + // reap senders and receivers + lock.Lock(); + { + dout(10) << "wait: stopping senders" << endl; + for (hash_map::iterator i = rank_sender.begin(); + i != rank_sender.end(); + i++) + i->second->stop(); + while (!rank_sender.empty()) { + wait_cond.Wait(lock); + reaper(); + } + + if (0) { // stop() no worky on receivers! we leak, but who cares. + dout(10) << "wait: stopping receivers" << endl; + for (set::iterator i = receivers.begin(); + i != receivers.end(); + i++) + (*i)->stop(); + while (!receivers.empty()) { + wait_cond.Wait(lock); + reaper(); + } + } + + } + lock.Unlock(); + + dout(10) << "wait: done." << endl; +} + + + +/* + * find_ns_addr + * locate the name server address: first a raw tcpaddr_t stored in the + * file .ceph_ns, then the CEPH_NAMESERVER environment variable, which + * is resolved with tcp_hostlookup(). + * returns 0 with nsa filled in, or -1 if neither source yields one. + */ +int Rank::find_ns_addr(tcpaddr_t &nsa) +{ + // file? + int fd = ::open(".ceph_ns",O_RDONLY); + if (fd >= 0) { + ssize_t got = ::read(fd, (void*)&nsa, sizeof(nsa)); + ::close(fd); + if (got == (ssize_t)sizeof(nsa)) { + cout << "ceph ns is " << nsa << endl; + return 0; + } + // a short or failed read used to be reported as success with a + // garbage address; fall through to the environment instead. + cerr << ".ceph_ns is unreadable or truncated, trying CEPH_NAMESERVER" << endl; + } + + // env var? 
+ char *nsaddr = getenv("CEPH_NAMESERVER"); + if (nsaddr) { + // getenv() hands back only the value. still accept a "NAME=value" + // style string, but never scan past the terminator looking for '='. + char *eq = strchr(nsaddr, '='); + if (eq) + nsaddr = eq + 1; + + if (tcp_hostlookup(nsaddr, nsa) < 0) { + cout << "can't resolve " << nsaddr << endl; + return -1; + } + + cout << "ceph ns is " << nsa << endl; + return 0; + } + + cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl; + return -1; +} + + + +/********************************** + * EntityMessenger + */ + +Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) : + Messenger(myaddr), + stop(false), + dispatch_thread(this) +{ +} +Rank::EntityMessenger::~EntityMessenger() +{ +} + +void Rank::EntityMessenger::dispatch_entry() +{ + lock.Lock(); + while (!stop) { + if (!dispatch_queue.empty()) { + list ls; + ls.swap(dispatch_queue); + + lock.Unlock(); + { + // deliver + while (!ls.empty()) { + Message *m = ls.front(); + ls.pop_front(); + dout(1) //<< g_clock.now() + << "---- " + << m->get_source() << ':' << m->get_source_port() + << " to " << m->get_dest() << ':' << m->get_dest_port() + << " ---- " << m->get_type_name() + << " ---- " << m->get_source_inst() + << " ---- " << m + << endl; + dispatch(m); + } + } + lock.Lock(); + continue; + } + cond.Wait(lock); + } + lock.Unlock(); +} + +void Rank::EntityMessenger::ready() +{ + dout(10) << "ready " << get_myaddr() << endl; + + if (g_conf.ms_single_dispatch) { + rank.lock.Lock(); + if (rank.waiting_for_ready.count(get_myaddr())) { + rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(), + rank.waiting_for_ready[get_myaddr()]); + rank.waiting_for_ready.erase(get_myaddr()); + rank.single_dispatch_cond.Signal(); + } + rank.lock.Unlock(); + } else { + // start my dispatch thread + dispatch_thread.create(); + } + + // tell namer + if (get_myaddr() != MSG_ADDR_NAMER(0)) + send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0)); +} + + +int Rank::EntityMessenger::shutdown() +{ + dout(10) << "shutdown " << get_myaddr() << endl; 
+ + // deregister + rank.unregister_entity(this); + + // stop my dispatch thread + if (dispatch_thread.am_self()) { + dout(1) << "shutdown i am dispatch, setting stop flag" << endl; + stop = true; + } else { + dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; + lock.Lock(); + stop = true; + cond.Signal(); + lock.Unlock(); + dispatch_thread.join(); + } + + return 0; +} + + +void Rank::EntityMessenger::prepare_send_message(msg_addr_t dest) +{ + rank.prepare_dest(dest); +} + +int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst) +{ + // set envelope + m->set_source(get_myaddr(), 0); + m->set_dest(dest, 0); + + m->set_source_inst(rank.my_inst); + + dout(1) << "--> " + << m->get_source() //<< ':' << m->get_source_port() + << " to " << m->get_dest() //<< ':' << m->get_dest_port() + << " ---- " << m->get_type_name() + << " ---- " << rank.my_inst << " --> " << inst + << " ---- " << m + << endl; + + rank.submit_message(m, inst); + + return 0; +} + + +int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport) +{ + // set envelope + m->set_source(get_myaddr(), fromport); + m->set_dest(dest, port); + + m->set_source_inst(rank.my_inst); + + dout(1) << "--> " + << m->get_source() //<< ':' << m->get_source_port() + << " to " << m->get_dest() //<< ':' << m->get_dest_port() + << " ---- " << m->get_type_name() + << " ---- " << rank.my_inst << " --> ?" + << " ---- " << m + << endl; + + rank.submit_message(m); + + return 0; +} + + +void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i) +{ + assert(a != get_myaddr()); + rank.mark_down(a,i); +} + +void Rank::mark_down(msg_addr_t a, entity_inst_t& inst) +{ + if (my_rank == 0) return; // ugh.. 
rank0 already handles this stuff in the namer + lock.Lock(); + if (down.count(a) == 0) { + if (entity_map.count(a) && + entity_map[a] > inst) { + dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; + derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; + // do nothing! + } else { + down.insert(a); + + if (entity_map.count(a) == 0) { + // don't know it + dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; + derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; + + waiting_for_lookup.erase(a); + looking_up.erase(a); + } else { + // know it + assert(entity_map[a] <= inst); + dout(10) << "mark_down " << a << " inst " << inst << endl; + derr(10) << "mark_down " << a << " inst " << inst << endl; + + entity_map.erase(a); + + if (rank_sender.count(inst.rank)) { + rank_sender[inst.rank]->stop(); + rank_sender.erase(inst.rank); + } + } + } + } + lock.Unlock(); +} + +void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i) +{ + assert(a != get_myaddr()); + rank.mark_up(a, i); +} + +void Rank::mark_up(msg_addr_t a, entity_inst_t& i) +{ + if (my_rank == 0) return; + lock.Lock(); + { + dout(10) << "mark_up " << a << " inst " << i << endl; + derr(10) << "mark_up " << a << " inst " << i << endl; + + down.erase(a); + + assert(i.rank != my_rank); // hrm? + + if (entity_map.count(a) == 0 || + entity_map[a] < i) { + entity_map[a] = i; + connect_rank(i); + } else if (entity_map[a] == i) { + dout(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl; + derr(10) << "mark_up " << a << " inst " << i << " ... 
knew it" << endl; + } else { + dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; + derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; + } + + //if (waiting_for_lookup.count(a)) + //lookup(a); + } + lock.Unlock(); +} + diff --git a/branches/sage/cephmds2/msg/NewMessenger.h b/branches/sage/cephmds2/msg/NewMessenger.h new file mode 100644 index 0000000000000..a1c7af6e5c83b --- /dev/null +++ b/branches/sage/cephmds2/msg/NewMessenger.h @@ -0,0 +1,305 @@ +#ifndef __NEWMESSENGER_H +#define __NEWMESSENGER_H + + +#include +#include +using namespace std; +#include +#include +using namespace __gnu_cxx; + + +#include "include/types.h" + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" + +#include "Messenger.h" +#include "Message.h" +#include "tcp.h" + + + + +/* Rank - per-process + */ +class Rank : public Dispatcher { + + class EntityMessenger; + class Sender; + class Receiver; + + // namer + class Namer : public Dispatcher { + public: + EntityMessenger *messenger; // namerN + + int nrank; + int nclient, nmds, nosd, nmon; + + map > waiting; + + Namer(EntityMessenger *msgr); + ~Namer(); + + void handle_connect(class MNSConnect*); + void handle_register(class MNSRegister *m); + void handle_started(Message *m); + void handle_lookup(class MNSLookup *m); + void handle_unregister(Message *m); + void handle_failure(class MNSFailure *m); + + void dispatch(Message *m); + + void manual_insert_inst(const entity_inst_t &inst); + + }; + + // incoming + class Accepter : public Thread { + public: + bool done; + + tcpaddr_t listen_addr; + int listen_sd; + + Accepter() : done(false) {} + + void *entry(); + void stop() { + done = true; + ::close(listen_sd); + join(); + } + int start(); + } accepter; + + + class Receiver : public Thread { + public: + int sd; + bool done; + + Receiver(int _sd) : sd(_sd), done(false) {} + + void *entry(); + void stop() { + done = true; + ::close(sd); + //join(); + } + 
Message *read_message(); + }; + + + // outgoing + class Sender : public Thread { + public: + entity_inst_t inst; + bool done; + int sd; + + set entities; + list q; + + Mutex lock; + Cond cond; + + Sender(const entity_inst_t& i, int s=0) : inst(i), done(false), sd(s) {} + virtual ~Sender() {} + + void *entry(); + + int connect(); + void fail_and_requeue(list& ls); + void finish(); + + void stop() { + lock.Lock(); + done = true; + cond.Signal(); + lock.Unlock(); + } + + void send(Message *m) { + lock.Lock(); + q.push_back(m); + cond.Signal(); + lock.Unlock(); + } + void send(list& ls) { + lock.Lock(); + q.splice(q.end(), ls); + cond.Signal(); + lock.Unlock(); + } + + int write_message(Message *m); + }; + + + + // messenger interface + class EntityMessenger : public Messenger { + Mutex lock; + Cond cond; + list dispatch_queue; + bool stop; + + class DispatchThread : public Thread { + EntityMessenger *m; + public: + DispatchThread(EntityMessenger *_m) : m(_m) {} + void *entry() { + m->dispatch_entry(); + return 0; + } + } dispatch_thread; + void dispatch_entry(); + + public: + void queue_message(Message *m) { + lock.Lock(); + dispatch_queue.push_back(m); + cond.Signal(); + lock.Unlock(); + } + void queue_messages(list ls) { + lock.Lock(); + dispatch_queue.splice(dispatch_queue.end(), ls); + cond.Signal(); + lock.Unlock(); + } + + public: + EntityMessenger(msg_addr_t myaddr); + ~EntityMessenger(); + + void ready(); + bool is_stopped() { return stop; } + + void wait() { + dispatch_thread.join(); + } + + virtual void callback_kick() {} + virtual int shutdown(); + virtual void prepare_send_message(msg_addr_t dest); + virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0); + virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst); + + virtual void mark_down(msg_addr_t a, entity_inst_t& i); + virtual void mark_up(msg_addr_t a, entity_inst_t& i); + //virtual void reset(msg_addr_t a); + }; + + + class SingleDispatcher : 
public Thread { + Rank *rank; + public: + SingleDispatcher(Rank *r) : rank(r) {} + void *entry() { + rank->single_dispatcher_entry(); + return 0; + } + } single_dispatcher; + + Cond single_dispatch_cond; + bool single_dispatch_stop; + list single_dispatch_queue; + + map > waiting_for_ready; + + void single_dispatcher_entry(); + void _submit_single_dispatch(Message *m); + + + // Rank stuff + public: + Mutex lock; + Cond wait_cond; // for wait() + + // my rank + int my_rank; + Cond waiting_for_rank; + + // my instance + entity_inst_t my_inst; + + // lookup + hash_map entity_map; + hash_set entity_unstarted; + + map > waiting_for_lookup; + set looking_up; + + hash_set down; + + // register + map waiting_for_register_cond; + map waiting_for_register_result; + + // local + map local; + + // remote + hash_map rank_sender; + + set receivers; + + list sender_reap_queue; + list receiver_reap_queue; + + EntityMessenger *messenger; // rankN + Namer *namer; + + + void show_dir(); + + void lookup(msg_addr_t addr); + + void dispatch(Message *m); + void handle_connect_ack(class MNSConnectAck *m); + void handle_register_ack(class MNSRegisterAck *m); + void handle_lookup_reply(class MNSLookupReply *m); + + Sender *connect_rank(const entity_inst_t& inst); + + void mark_down(msg_addr_t addr, entity_inst_t& i); + void mark_up(msg_addr_t addr, entity_inst_t& i); + + tcpaddr_t get_listen_addr() { return accepter.listen_addr; } + + void reaper(); + + +public: + Rank(int r=-1); + ~Rank(); + + int find_ns_addr(tcpaddr_t &tcpaddr); + + void set_namer(const tcpaddr_t& ns); + void start_namer(); + + int start_rank(); + void wait(); + + EntityMessenger *register_entity(msg_addr_t addr); + void unregister_entity(EntityMessenger *ms); + + void submit_message(Message *m, const entity_inst_t& inst); + void prepare_dest(msg_addr_t dest); + void submit_message(Message *m); + void submit_messages(list& ls); + + // create a new messenger + EntityMessenger *new_entity(msg_addr_t addr); + +} ; + +extern 
Rank rank; + +#endif diff --git a/branches/sage/cephmds2/msg/NewerMessenger.cc b/branches/sage/cephmds2/msg/NewerMessenger.cc new file mode 100644 index 0000000000000..d1ed3fb00fdb3 --- /dev/null +++ b/branches/sage/cephmds2/msg/NewerMessenger.cc @@ -0,0 +1,1791 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include "NewerMessenger.h" + +#include +#include +#include +#include + +#include "config.h" + +#include "messages/MGenericMessage.h" +#include "messages/MNSConnect.h" +#include "messages/MNSConnectAck.h" +#include "messages/MNSRegister.h" +#include "messages/MNSRegisterAck.h" +#include "messages/MNSLookup.h" +#include "messages/MNSLookupReply.h" +#include "messages/MNSFailure.h" + +//#include "messages/MFailure.h" + +#include + + +#undef dout +#define dout(l) if (l<=g_conf.debug_ms) cout << g_clock.now() << " -- rank" << rank.my_rank << " " +#define derr(l) if (l<=g_conf.debug_ms) cerr << g_clock.now() << " -- rank" << rank.my_rank << " " + + + +#include "tcp.cc" + + +Rank rank; + + +/******************************************** + * Namer + */ + +Rank::Namer::Namer(EntityMessenger *msgr) : + messenger(msgr), + nrank(0), nclient(0), nmds(0), nosd(0), nmon(0) +{ + assert(rank.my_rank == 0); + nrank = g_conf.num_mon; + + // announce myself + /* + cerr << "ceph ns is " << rank.accepter.listen_addr << endl; + cout << "export CEPH_NAMESERVER=" << rank.accepter.listen_addr << endl; + int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT); + ::write(fd, (void*)&rank.accepter.listen_addr, sizeof(tcpaddr_t)); + ::fchmod(fd, 0755); + ::close(fd); + */ + + // ok + messenger->set_dispatcher(this); +} + +Rank::Namer::~Namer() +{ + 
//::unlink(".ceph_ns"); +} + + +void Rank::Namer::dispatch(Message *m) +{ + rank.lock.Lock(); + int type = m->get_type(); + switch (type) { + case MSG_NS_CONNECT: + handle_connect((class MNSConnect*)m); + break; + case MSG_NS_REGISTER: + handle_register((class MNSRegister*)m); + break; + case MSG_NS_STARTED: + handle_started(m); + break; + case MSG_NS_UNREGISTER: + handle_unregister(m); + break; + case MSG_NS_LOOKUP: + handle_lookup((class MNSLookup*)m); + break; + case MSG_NS_FAILURE: + handle_failure((class MNSFailure*)m); + break; + + case MSG_FAILURE_ACK: + delete m; + break; + + default: + assert(0); + } + rank.lock.Unlock(); +} + +void Rank::Namer::handle_connect(MNSConnect *m) +{ + int newrank = nrank++; + dout(2) << "namer.handle_connect from new rank" << newrank << " " << m->get_addr() << endl; + + rank.entity_map[MSG_ADDR_RANK(newrank)].addr = m->get_addr(); + rank.entity_map[MSG_ADDR_RANK(newrank)].rank = newrank; + rank.entity_unstarted.insert(MSG_ADDR_RANK(newrank)); + + messenger->send_message(new MNSConnectAck(newrank), + MSG_ADDR_RANK(newrank), rank.entity_map[MSG_ADDR_RANK(newrank)]); + delete m; +} + +void Rank::Namer::manual_insert_inst(const entity_inst_t &inst) +{ + rank.entity_map[MSG_ADDR_RANK(inst.rank)] = inst; +} + +void Rank::Namer::handle_register(MNSRegister *m) +{ + dout(10) << "namer.handle_register from rank " << m->get_rank() + << " addr " << m->get_entity() << endl; + + // pick id + msg_addr_t entity = m->get_entity(); + + if (entity.is_new()) { + // make up a new address! + switch (entity.type()) { + case MSG_ADDR_MDS_BASE: + entity = MSG_ADDR_MDS(nmds++); + break; + + case MSG_ADDR_OSD_BASE: + entity = MSG_ADDR_OSD(nosd++); + break; + + case MSG_ADDR_CLIENT_BASE: + entity = MSG_ADDR_CLIENT(nclient++); + break; + + default: + assert(0); + } + } else { + // specific address! 
+ } + + + // register + if (rank.entity_map.count(entity)) { + dout(1) << "namer.handle_register re-registering " << entity + << " inst " << m->get_source_inst() + << " (was " << rank.entity_map[entity] << ")" + << endl; + } else { + dout(1) << "namer.handle_register registering " << entity + << " inst " << m->get_source_inst() + << endl; + } + rank.entity_map[entity] = m->get_source_inst(); + rank.entity_unstarted.insert(entity); + + // reply w/ new id + messenger->send_message(new MNSRegisterAck(m->get_tid(), entity), + m->get_source(), rank.entity_map[entity]); + + delete m; +} + +void Rank::Namer::handle_started(Message *m) +{ + msg_addr_t who = m->get_source(); + dout(10) << "namer.handle_started from entity " << who << endl; + + assert(rank.entity_unstarted.count(who)); + rank.entity_unstarted.erase(who); + + // anybody waiting? + if (waiting.count(who)) { + list ls; + ls.swap(waiting[who]); + waiting.erase(who); + + dout(10) << "doing waiters on " << who << endl; + for (list::iterator it = ls.begin(); + it != ls.end(); + it++) + dispatch(*it); + } + +} + +void Rank::Namer::handle_unregister(Message *m) +{ + msg_addr_t who = m->get_source(); + dout(1) << "namer.handle_unregister entity " << who << endl; + + rank.show_dir(); + + assert(rank.entity_map.count(who)); + rank.entity_map.erase(who); + + rank.show_dir(); + + // shut myself down? kick watcher. + if (rank.entity_map.size() == 2) { + dout(10) << "namer.handle_unregister stopping namer" << endl; + rank.lock.Unlock(); + messenger->shutdown(); + delete messenger; + rank.lock.Lock(); + } + + delete m; +} + + +void Rank::Namer::handle_lookup(MNSLookup *m) +{ + // have it? 
+ if (rank.entity_map.count(m->get_entity()) == 0) { + dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> dne" << endl; + waiting[m->get_entity()].push_back(m); + return; + } + + if (rank.entity_unstarted.count(m->get_entity())) { + dout(10) << "namer " << m->get_source() << " lookup '" << m->get_entity() << "' -> unstarted" << endl; + waiting[m->get_entity()].push_back(m); + return; + } + + // look it up! + MNSLookupReply *reply = new MNSLookupReply(m); + + reply->entity_map[m->get_entity()] = rank.entity_map[m->get_entity()]; + + dout(10) << "namer " << m->get_source() + << " lookup '" << m->get_entity() + << "' -> " << rank.entity_map[m->get_entity()] << endl; + + messenger->send_message(reply, m->get_source(), m->get_source_inst()); + delete m; +} + +void Rank::Namer::handle_failure(MNSFailure *m) +{ + dout(10) << "namer.handle_failure inst " << m->get_inst() + << endl; + + // search for entities on this instance + list rm; + for (hash_map::iterator i = rank.entity_map.begin(); + i != rank.entity_map.end(); + i++) { + if (i->second != m->get_inst()) continue; + rm.push_back(i->first); + } + for (list::iterator i = rm.begin(); + i != rm.end(); + i++) { + dout(10) << "namer.handle_failure inst " << m->get_inst() + << ", removing " << *i << endl; + + rank.entity_map.erase(*i); + rank.entity_unstarted.erase(*i); + + /* + if ((*i).is_osd()) { + // tell the monitor + messenger->send_message(new MFailure(*i, m->get_inst()), MSG_ADDR_MON(0)); + } + */ + } + + delete m; +} + + + +/******************************************** + * Accepter + */ + +int Rank::Accepter::start() +{ + // bind to a socket + dout(10) << "accepter.start binding to listen " << endl; + + /* socket creation */ + listen_sd = socket(AF_INET,SOCK_STREAM,0); + assert(listen_sd > 0); + + /* bind to port */ + memset((char*)&listen_addr, 0, sizeof(listen_addr)); + listen_addr.sin_family = AF_INET; + listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); + listen_addr.sin_port = 0; 
+ + int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); + assert(rc >= 0); + + socklen_t llen = sizeof(listen_addr); + getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); + + int myport = listen_addr.sin_port; + + // listen! + rc = ::listen(listen_sd, 1000); + assert(rc >= 0); + + //dout(10) << "accepter.start listening on " << myport << endl; + + // my address is... + char host[100]; + bzero(host, 100); + gethostname(host, 100); + //dout(10) << "accepter.start my hostname is " << host << endl; + + struct hostent *myhostname = gethostbyname( host ); + + struct sockaddr_in my_addr; + memset(&my_addr, 0, sizeof(my_addr)); + + my_addr.sin_family = myhostname->h_addrtype; + memcpy((char *) &my_addr.sin_addr.s_addr, + myhostname->h_addr_list[0], + myhostname->h_length); + my_addr.sin_port = myport; + + listen_addr = my_addr; + + dout(10) << "accepter.start listen addr is " << listen_addr << endl; + + // start thread + create(); + + return 0; +} + +void *Rank::Accepter::entry() +{ + dout(10) << "accepter starting" << endl; + + while (!done) { + // accept + struct sockaddr_in addr; + socklen_t slen = sizeof(addr); + int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); + if (sd > 0) { + dout(10) << "accepted incoming on sd " << sd << endl; + + rank.lock.Lock(); + Pipe *p = new Pipe(sd); + rank.pipes.insert(p); + rank.lock.Unlock(); + } else { + dout(10) << "no incoming connection?" << endl; + break; + } + } + + return 0; +} + + + +/************************************** + * Pipe + */ + +int Rank::Pipe::accept() +{ + // my creater gave me sd via accept() + + // announce myself. + int rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); + if (rc < 0) { + ::close(sd); + done = true; + return -1; + } + + // identify peer + rc = tcp_read(sd, (char*)&peer_inst, sizeof(peer_inst)); + if (rc < 0) { + dout(10) << "pipe(? 
" << this << ").accept couldn't read peer inst" << endl; + ::close(sd); + done = true; + return -1; + } + + // create writer thread. + writer_running = true; + writer_thread.create(); + + // register pipe. + if (peer_inst.rank >= 0) { + rank.lock.Lock(); + { + if (rank.rank_pipe.count(peer_inst.rank) == 0) { + // install a pipe! + dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst << endl; + rank.rank_pipe[peer_inst.rank] = this; + } else { + // low ranks' Pipes "win" + if (peer_inst.rank < rank.my_inst.rank || + rank.my_inst.rank < 0) { + dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst + << ", already had pipe, but switching to this new one" << endl; + // switch to this new Pipe + rank.rank_pipe[peer_inst.rank]->close(); // close old one + rank.rank_pipe[peer_inst.rank] = this; + } else { + dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is " << peer_inst + << ", already had pipe, sticking with it" << endl; + } + } + } + rank.lock.Unlock(); + } else { + dout(10) << "pipe(" << peer_inst << ' ' << this << ").accept peer is unranked " << peer_inst << endl; + } + + return 0; // success. +} + +int Rank::Pipe::connect() +{ + dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect" << endl; + + // create socket? + sd = socket(AF_INET,SOCK_STREAM,0); + assert(sd > 0); + + // bind any port + struct sockaddr_in myAddr; + myAddr.sin_family = AF_INET; + myAddr.sin_addr.s_addr = htonl(INADDR_ANY); + myAddr.sin_port = htons( 0 ); + + int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); + assert(rc>=0); + + // connect! + rc = ::connect(sd, (sockaddr*)&peer_inst.addr, sizeof(myAddr)); + if (rc < 0) return rc; + + // identify peer + entity_inst_t inst; + rc = tcp_read(sd, (char*)&inst, sizeof(inst)); + if (inst.rank < 0) + inst = peer_inst; // i know better than they do. 
+ if (peer_inst != inst && inst.rank > 0) { + derr(0) << "pipe(" << peer_inst << ' ' << this << ").connect peer is " << inst << ", wtf" << endl; + assert(0); + return -1; + } + + // identify myself + rc = tcp_write(sd, (char*)&rank.my_inst, sizeof(rank.my_inst)); + if (rc < 0) + return -1; + + // register pipe + rank.lock.Lock(); + { + if (rank.rank_pipe.count(peer_inst.rank) == 0) { + dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect registering pipe" << endl; + rank.rank_pipe[peer_inst.rank] = this; + } else { + // this is normal. + dout(10) << "pipe(" << peer_inst << ' ' << this << ").connect pipe already registered." << endl; + } + } + rank.lock.Unlock(); + + // start reader + reader_running = true; + reader_thread.create(); + + return 0; +} + + +void Rank::Pipe::close() +{ + if (sent_close) { + dout(10) << "pipe(" << peer_inst << ' ' << this << ").close already closing" << endl; + return; + } + dout(10) << "pipe(" << peer_inst << ' ' << this << ").close" << endl; + + // unreg ourselves + rank.lock.Lock(); + { + if (rank.rank_pipe.count(peer_inst.rank) && + rank.rank_pipe[peer_inst.rank] == this) { + dout(10) << "pipe(" << peer_inst << ' ' << this << ").close unregistering pipe" << endl; + rank.rank_pipe.erase(peer_inst.rank); + } + } + rank.lock.Unlock(); + + // queue close message. + dout(10) << "pipe(" << peer_inst << ' ' << this << ").close queueing MSG_CLOSE" << endl; + lock.Lock(); + q.push_back(new MGenericMessage(MSG_CLOSE)); + cond.Signal(); + sent_close = true; + lock.Unlock(); +} + + +/* read msgs from socket. + * also, server. + * + */ +void Rank::Pipe::reader() +{ + if (server) + accept(); + + // loop. 
+ while (!done) { + Message *m = read_message(); + if (!m || m->get_type() == 0) { + if (m) { + delete m; + dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader read MSG_CLOSE message" << endl; + } else { + derr(10) << "pipe(" << peer_inst << ' ' << this << ").reader read null message" << endl; + } + + if (!sent_close) + close(); + + done = true; + cond.Signal(); // wake up writer too. + break; + } + + dout(10) << "pipe(" << peer_inst << ' ' << this << ").reader got message for " << m->get_dest() << endl; + + EntityMessenger *entity = 0; + + rank.lock.Lock(); + { + if (rank.entity_map.count(m->get_source()) && + rank.entity_map[m->get_source()] > m->get_source_inst()) { + derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader source " << m->get_source() + << " inst " << m->get_source_inst() + << " > " << rank.entity_map[m->get_source()] + << ", WATCH OUT " << *m << endl; + assert(0); + } + + if (m->get_dest().type() == MSG_ADDR_RANK_BASE) { + // ours. + rank.dispatch(m); + } else { + if (g_conf.ms_single_dispatch) { + // submit to single dispatch queue + rank._submit_single_dispatch(m); + } else { + if (rank.local.count(m->get_dest())) { + // find entity + entity = rank.local[m->get_dest()]; + } else { + derr(0) << "pipe(" << peer_inst << ' ' << this << ").reader got message " << *m << " for " << m->get_dest() << ", which isn't local" << endl; + assert(0); // FIXME do this differently + //rank.waiting_for_lookup[m->get_dest()].push_back(m); + } + } + } + } + rank.lock.Unlock(); + + if (entity) + entity->queue_message(m); // queue + } + + + // reap? + bool reap = false; + lock.Lock(); + { + reader_running = false; + if (!writer_running) reap = true; + } + lock.Unlock(); + + if (reap) { + dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader queueing for reap" << endl; + ::close(sd); + rank.lock.Lock(); + { + rank.pipe_reap_queue.push_back(this); + rank.wait_cond.Signal(); + } + rank.lock.Unlock(); + } +} + + +/* write msgs to socket. 
+ * also, client. + */ +void Rank::Pipe::writer() +{ + if (!server) { + int rc = connect(); + if (rc < 0) { + derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error connecting" << endl; + done = true; + list out; + fail(out); + } + } + + // loop. + lock.Lock(); + while (!q.empty() || !done) { + + if (!q.empty()) { + dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer grabbing message(s)" << endl; + + // grab outgoing list + list out; + out.swap(q); + + // drop lock while i send these + lock.Unlock(); + + while (!out.empty()) { + Message *m = out.front(); + out.pop_front(); + + dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sending " << *m << endl; + + // stamp. + m->set_source_inst(rank.my_inst); + + // marshall + if (m->empty_payload()) + m->encode_payload(); + + if (write_message(m) < 0) { + // failed! + derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending " << *m << " to " << m->get_dest() << endl; + out.push_front(m); + fail(out); + done = true; + break; + } + + // did i just send a close? + if (m->get_type() == MSG_CLOSE) + done = true; + + // clean up + delete m; + } + + lock.Lock(); + continue; + } + + // wait + dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer sleeping" << endl; + cond.Wait(lock); + } + lock.Unlock(); + + dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer finishing" << endl; + + // reap? 
+ bool reap = false; + lock.Lock(); + { + writer_running = false; + if (!reader_running) reap = true; + } + lock.Unlock(); + + if (reap) { + dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer queueing for reap" << endl; + ::close(sd); + rank.lock.Lock(); + { + rank.pipe_reap_queue.push_back(this); + rank.wait_cond.Signal(); + } + rank.lock.Unlock(); + } +} + + +Message *Rank::Pipe::read_message() +{ + // envelope + //dout(10) << "receiver.read_message from sd " << sd << endl; + + msg_envelope_t env; + if (!tcp_read( sd, (char*)&env, sizeof(env) )) + return 0; + + dout(20) << "pipe(" << peer_inst << ' ' << this << ").reader got envelope type=" << env.type + << " src " << env.source << " dst " << env.dest + << " nchunks=" << env.nchunks + << endl; + + // payload + bufferlist blist; + for (int i=0; iget_source() << endl; + + return m; +} + + + +int Rank::Pipe::write_message(Message *m) +{ + // get envelope, buffers + msg_envelope_t *env = &m->get_envelope(); + bufferlist blist; + blist.claim( m->get_payload() ); + +#ifdef TCP_KEEP_CHUNKS + env->nchunks = blist.buffers().size(); +#else + env->nchunks = 1; +#endif + + dout(20)// << g_clock.now() + << "pipe(" << peer_inst << ' ' << this << ").writer sending " << m << " " << *m + << " to " << m->get_dest() + << endl; + + // send envelope + int r = tcp_write( sd, (char*)env, sizeof(*env) ); + if (r < 0) { + derr(1) << "pipe(" << peer_inst << ' ' << this << ").writer error sending envelope for " << *m + << " to " << m->get_dest() << endl; + return -1; + } + + // payload +#ifdef TCP_KEEP_CHUNKS + // send chunk-wise + int i = 0; + for (list::const_iterator it = blist.buffers().begin(); + it != blist.buffers().end(); + it++) { + dout(10) << "pipe(" << peer_inst << ' ' << this << ").writer tcp_sending frag " << i << " len " << (*it).length() << endl; + int size = (*it).length(); + r = tcp_write( sd, (char*)&size, sizeof(size) ); + if (r < 0) { + derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error 
sending chunk len for " << *m << " to " << m->get_dest() << endl; + return -1; + } + r = tcp_write( sd, (*it).c_str(), size ); + if (r < 0) { + derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data chunk for " << *m << " to " << m->get_dest() << endl; + return -1; + } + i++; + } +#else + // one big chunk + int size = blist.length(); + r = tcp_write( sd, (char*)&size, sizeof(size) ); + if (r < 0) { + derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data len for " << *m << " to " << m->get_dest() << endl; + return -1; + } + dout(20) << "pipe(" << peer_inst << ' ' << this << ").writer data len is " << size << " in " << blist.buffers().size() << " buffers" << endl; + + for (list::const_iterator it = blist.buffers().begin(); + it != blist.buffers().end(); + it++) { + if ((*it).length() == 0) continue; // blank buffer. + r = tcp_write( sd, (char*)(*it).c_str(), (*it).length() ); + if (r < 0) { + derr(10) << "pipe(" << peer_inst << ' ' << this << ").writer error sending data megachunk for " << *m << " to " << m->get_dest() << " : len " << (*it).length() << endl; + return -1; + } + } +#endif + + return 0; +} + + +void Rank::Pipe::fail(list& out) +{ + derr(10) << "pipe(" << peer_inst << ' ' << this << ").fail" << endl; + + // tell namer + if (!rank.messenger) { + derr(0) << "FATAL error: can't send failure to namer0, not connected yet" << endl; + assert(0); + } + + // FIXME: possible race before i reclaim lock here? + + // deactivate myself + rank.lock.Lock(); + { + if (rank.rank_pipe.count(peer_inst.rank) && + rank.rank_pipe[peer_inst.rank] == this) + rank.rank_pipe.erase(peer_inst.rank); + } + rank.lock.Unlock(); + + // what do i do about reader()? FIXME + + // sort my messages by (source) dispatcher, dest. 
+ map > > by_dis; + lock.Lock(); + { + // include out at front of queue + q.splice(q.begin(), out); + + // sort + while (!q.empty()) { + if (q.front()->get_type() == MSG_CLOSE) { + delete q.front(); + } + else if (rank.local.count(q.front()->get_source())) { + Dispatcher *dis = rank.local[q.front()->get_source()]->get_dispatcher(); + by_dis[dis][q.front()->get_dest()].push_back(q.front()); + } + else { + // oh well. sending entity musta just shut down? + assert(0); + delete q.front(); + } + q.pop_front(); + } + } + lock.Unlock(); + + // report failure(s) to dispatcher(s) + for (map > >::iterator i = by_dis.begin(); + i != by_dis.end(); + ++i) + for (map >::iterator j = i->second.begin(); + j != i->second.end(); + ++j) + for (list::iterator k = j->second.begin(); + k != j->second.end(); + ++k) { + derr(1) << "pipe(" << peer_inst << ' ' << this << ").fail on " << **k << " to " << j->first << " inst " << peer_inst << endl; + i->first->ms_handle_failure(*k, j->first, peer_inst); + } +} + + + + + + +/******************************************** + * Rank + */ + +Rank::Rank(int r) : + single_dispatcher(this), + my_rank(r), + namer(0) { +} +Rank::~Rank() +{ + //FIXME + if (namer) delete namer; +} + + +void Rank::_submit_single_dispatch(Message *m) +{ + assert(lock.is_locked()); + + if (local.count(m->get_dest()) && + local[m->get_dest()]->is_ready()) { + rank.single_dispatch_queue.push_back(m); + rank.single_dispatch_cond.Signal(); + } else { + waiting_for_ready[m->get_dest()].push_back(m); + } +} + + +void Rank::single_dispatcher_entry() +{ + lock.Lock(); + while (!single_dispatch_stop || !single_dispatch_queue.empty()) { + if (!single_dispatch_queue.empty()) { + list ls; + ls.swap(single_dispatch_queue); + + lock.Unlock(); + { + while (!ls.empty()) { + Message *m = ls.front(); + ls.pop_front(); + + dout(1) //<< g_clock.now() + << "---- " + << m->get_source()// << ':' << m->get_source_port() + << " to " << m->get_dest()// << ':' << m->get_dest_port() + << " ---- " << 
m->get_type_name() + << " ---- " << m + << endl; + + if (m->get_dest().type() == MSG_ADDR_RANK_BASE) + rank.dispatch(m); + else { + assert(local.count(m->get_dest())); + local[m->get_dest()]->dispatch(m); + } + } + } + lock.Lock(); + continue; + } + single_dispatch_cond.Wait(lock); + } + lock.Unlock(); +} + + +/* + * note: assumes lock is held + */ +void Rank::reaper() +{ + dout(10) << "reaper" << endl; + assert(lock.is_locked()); + + while (!pipe_reap_queue.empty()) { + Pipe *p = pipe_reap_queue.front(); + dout(10) << "reaper reaping pipe " << p->get_peer_inst() << endl; + pipe_reap_queue.pop_front(); + assert(pipes.count(p)); + pipes.erase(p); + p->join(); + dout(10) << "reaper reaped pipe " << p->get_peer_inst() << endl; + delete p; + } +} + + +int Rank::start_rank() +{ + dout(10) << "start_rank" << endl; + + // bind to a socket + if (accepter.start() < 0) + return -1; + + // start single thread dispatcher? + if (g_conf.ms_single_dispatch) { + single_dispatch_stop = false; + single_dispatcher.create(); + } + + lock.Lock(); + + // my_inst + my_inst.addr = accepter.listen_addr; + my_inst.rank = my_rank; + + if (my_rank < 0) { + dout(10) << "start_rank connecting to namer0" << endl; + + // connect to namer + assert(entity_map.count(MSG_ADDR_NAMER(0))); + Pipe *pipe = connect_rank(entity_map[MSG_ADDR_NAMER(0)]); + + // send + Message *m = new MNSConnect(accepter.listen_addr); + m->set_dest(MSG_ADDR_NAMER(0), 0); + pipe->send(m); + + // wait + while (my_rank < 0) + waiting_for_rank.Wait(lock); + assert(my_rank >= 0); + + dout(10) << "start_rank got rank " << my_rank << endl; + + // create rank entity + entity_map[MSG_ADDR_RANK(my_rank)] = my_inst; + local[MSG_ADDR_RANK(my_rank)] = messenger = new EntityMessenger(MSG_ADDR_RANK(my_rank)); + messenger->set_dispatcher(this); + } else { + // create my rank + msg_addr_t raddr = MSG_ADDR_RANK(my_rank); + entity_map[raddr] = my_inst; + entity_unstarted.insert(raddr); + local[raddr] = messenger = new EntityMessenger(raddr); + 
messenger->set_dispatcher(this); + + dout(1) << "start_rank " << my_rank << " at " << my_inst << endl; + } + + lock.Unlock(); + return 0; +} + +void Rank::start_namer() +{ + // create namer0 + msg_addr_t naddr = MSG_ADDR_NAMER(0); + entity_map[naddr] = my_inst; + local[naddr] = new EntityMessenger(naddr); + namer = new Namer(local[naddr]); + namer_inst = my_inst; +} + +void Rank::set_namer(const tcpaddr_t& ns) +{ + namer_inst.addr = entity_map[MSG_ADDR_NAMER(0)].addr = ns; + namer_inst.rank = entity_map[MSG_ADDR_NAMER(0)].rank = 0; +} + +/* connect_rank + * NOTE: assumes rank.lock held. + */ +Rank::Pipe *Rank::connect_rank(const entity_inst_t& inst) +{ + assert(rank.lock.is_locked()); + assert(inst != rank.my_inst); + + dout(10) << "connect_rank to " << inst << endl; + + // create pipe + Pipe *pipe = new Pipe(inst); + rank.rank_pipe[inst.rank] = pipe; + pipes.insert(pipe); + + return pipe; +} + + + + + +void Rank::show_dir() +{ + dout(10) << "show_dir ---" << endl; + + for (hash_map::iterator i = entity_map.begin(); + i != entity_map.end(); + i++) { + if (local.count(i->first)) { + dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << " local " << endl; + } else { + dout(10) << "show_dir entity_map " << i->first << " -> " << i->second << endl; + } + } +} + + +/* lookup + * NOTE: assumes directory.lock held + */ +void Rank::lookup(msg_addr_t addr) +{ + dout(10) << "lookup " << addr << endl; + assert(lock.is_locked()); + + assert(looking_up.count(addr) == 0); + looking_up.insert(addr); + + MNSLookup *r = new MNSLookup(addr); + messenger->send_message(r, MSG_ADDR_NAMER(0), namer_inst); +} + + + +/* register_entity + */ +Rank::EntityMessenger *Rank::register_entity(msg_addr_t addr) +{ + dout(10) << "register_entity " << addr << endl; + lock.Lock(); + + // register with namer + static long reg_attempt = 0; + long id = ++reg_attempt; + + Message *reg = new MNSRegister(addr, my_rank, id); + reg->set_source(MSG_ADDR_RANK(my_rank), 0); + 
reg->set_source_inst(my_inst); + reg->set_dest(MSG_ADDR_DIRECTORY, 0); + + // prepare cond + Cond cond; + waiting_for_register_cond[id] = &cond; + + // send request + lock.Unlock(); + submit_message(reg); + lock.Lock(); + + // wait + while (!waiting_for_register_result.count(id)) + cond.Wait(lock); + + // grab result + addr = waiting_for_register_result[id]; + dout(10) << "register_entity got " << addr << endl; + + // clean up + waiting_for_register_cond.erase(id); + waiting_for_register_result.erase(id); + + // create messenger + EntityMessenger *msgr = new EntityMessenger(addr); + + // add to directory + entity_map[addr] = my_inst; + local[addr] = msgr; + + // was anyone waiting? + if (waiting_for_lookup.count(addr)) { + submit_messages(waiting_for_lookup[addr]); + waiting_for_lookup.erase(addr); + } + + lock.Unlock(); + return msgr; +} + +void Rank::unregister_entity(EntityMessenger *msgr) +{ + lock.Lock(); + dout(10) << "unregister_entity " << msgr->get_myaddr() << endl; + + // remove from local directory. + assert(local.count(msgr->get_myaddr())); + local.erase(msgr->get_myaddr()); + + if (my_rank > 0) { + assert(entity_map.count(msgr->get_myaddr())); + entity_map.erase(msgr->get_myaddr()); + } // else namer will do it. + + // tell namer. + if (msgr->get_myaddr() != MSG_ADDR_NAMER(0) && + msgr->get_myaddr() != MSG_ADDR_RANK(0)) + msgr->send_message(new MGenericMessage(MSG_NS_UNREGISTER), + MSG_ADDR_NAMER(0), namer_inst); + + // kick wait()? + if (local.size() <= 2) + wait_cond.Signal(); + + lock.Unlock(); +} + + +void Rank::submit_messages(list& ls) +{ + for (list::iterator i = ls.begin(); i != ls.end(); i++) + submit_message(*i); + ls.clear(); +} + + + +void Rank::submit_message(Message *m, const entity_inst_t& dest_inst) +{ + const msg_addr_t dest = m->get_dest(); + + // lookup + EntityMessenger *entity = 0; + Pipe *pipe = 0; + + lock.Lock(); + { + // local? 
+ if (dest_inst.rank == my_inst.rank) { + if (local.count(dest)) { + // local + dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; + if (g_conf.ms_single_dispatch) { + _submit_single_dispatch(m); + } else { + entity = local[dest]; + } + } else { + // mid-register + dout(20) << "submit_message " << *m << " dest " << dest << " " << dest_inst << " local but mid-register, waiting." << endl; + assert(0); // hmpf + waiting_for_lookup[dest].push_back(m); + } + } + else { + // remote. + if (rank_pipe.count( dest_inst.rank )) { + //&& + //rank_pipe[dest_inst.rank]->inst == dest_inst) { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", already connected." << endl; + // connected. + pipe = rank_pipe[ dest_inst.rank ]; + } else { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << dest_inst << ", connecting." << endl; + // not connected. + pipe = connect_rank( dest_inst ); + } + } + } + lock.Unlock(); + + // do it + if (entity) { + // local! + dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; + entity->queue_message(m); + } + else if (pipe) { + // remote! + dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; + pipe->send(m); + } +} + + +void Rank::submit_message(Message *m) +{ + const msg_addr_t dest = m->get_dest(); + + // lookup + EntityMessenger *entity = 0; + Pipe *pipe = 0; + + lock.Lock(); + { + if (local.count(dest)) { + dout(20) << "submit_message " << *m << " dest " << dest << " local" << endl; + + // local + if (g_conf.ms_single_dispatch) { + _submit_single_dispatch(m); + } else { + entity = local[dest]; + } + } else if (entity_map.count( dest )) { + // remote, known rank addr. + entity_inst_t inst = entity_map[dest]; + + if (inst == my_inst) { + dout(20) << "submit_message " << *m << " dest " << dest << " local but mid-register, waiting." 
<< endl; + waiting_for_lookup[dest].push_back(m); + } + else if (rank_pipe.count( inst.rank ) && + rank_pipe[inst.rank]->get_peer_inst() == inst) { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connected." << endl; + // connected. + pipe = rank_pipe[ inst.rank ]; + } else { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, " << inst << ", connecting." << endl; + // not connected. + pipe = connect_rank( inst ); + } + } else { + // unknown dest rank or rank addr. + if (looking_up.count(dest) == 0) { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, looking up" << endl; + lookup(dest); + } else { + dout(20) << "submit_message " << *m << " dest " << dest << " remote, unknown addr, already looking up" << endl; + } + waiting_for_lookup[dest].push_back(m); + } + } + lock.Unlock(); + + // do it + if (entity) { + // local! + dout(20) << "submit_message " << *m << " dest " << dest << " local, queueing" << endl; + entity->queue_message(m); + } + else if (pipe) { + // remote! + dout(20) << "submit_message " << *m << " dest " << dest << " remote, sending" << endl; + pipe->send(m); + } +} + + + + +void Rank::dispatch(Message *m) +{ + lock.Lock(); + + dout(10) << "dispatching " << *m << endl; + + switch (m->get_type()) { + case MSG_NS_CONNECTACK: + handle_connect_ack((MNSConnectAck*)m); + break; + + case MSG_NS_REGISTERACK: + handle_register_ack((MNSRegisterAck*)m); + break; + + case MSG_NS_LOOKUPREPLY: + handle_lookup_reply((MNSLookupReply*)m); + break; + + default: + assert(0); + } + + lock.Unlock(); +} + +void Rank::handle_connect_ack(MNSConnectAck *m) +{ + dout(10) << "handle_connect_ack, my rank is " << m->get_rank() << endl; + my_rank = m->get_rank(); + + my_inst.addr = accepter.listen_addr; + my_inst.rank = my_rank; + + waiting_for_rank.SignalAll(); + delete m; + + // logger! 
+ /*dout(10) << "logger" << endl; + char names[100]; + sprintf(names, "rank%d", my_rank); + string name = names; + + if (g_conf.tcp_log) { + logger = new Logger(name, (LogType*)&rank_logtype); + rank_logtype.add_set("num"); + rank_logtype.add_inc("in"); + rank_logtype.add_inc("inb"); + rank_logtype.add_inc("dis"); + rank_logtype.add_set("inq"); + rank_logtype.add_set("inqb"); + rank_logtype.add_set("outq"); + rank_logtype.add_set("outqb"); + } + */ +} + + +void Rank::handle_register_ack(MNSRegisterAck *m) +{ + dout(10) << "handle_register_ack " << m->get_entity() << endl; + + long tid = m->get_tid(); + waiting_for_register_result[tid] = m->get_entity(); + waiting_for_register_cond[tid]->Signal(); + delete m; +} + +void Rank::handle_lookup_reply(MNSLookupReply *m) +{ + list waiting; + dout(10) << "got lookup reply" << endl; + + for (map::iterator it = m->entity_map.begin(); + it != m->entity_map.end(); + it++) { + dout(10) << "lookup got " << it->first << " at " << it->second << endl; + msg_addr_t addr = it->first; + entity_inst_t inst = it->second; + + if (entity_map.count(addr) && + entity_map[addr] > inst) { + dout(10) << "ignoring lookup results for " << addr << ", " \ + << entity_map[addr] << " > " << inst << endl; + continue; + } + + // update map. + entity_map[addr] = inst; + + if (inst.rank == my_rank) { + // local + dout(10) << "delivering lookup results locally" << endl; + if (local.count(addr)) { + if (g_conf.ms_single_dispatch) { + single_dispatch_queue.splice(single_dispatch_queue.end(), + waiting_for_lookup[addr]); + } else { + local[addr]->queue_messages(waiting_for_lookup[addr]); + } + waiting_for_lookup.erase(addr); + } else + lookup(addr); // try again! + + } else { + // remote + if (rank_pipe.count(inst.rank) == 0) + connect_rank(inst); + else if (rank_pipe[inst.rank]->get_peer_inst() != inst) { + dout(0) << "lookup got rank addr change, WATCH OUT" << endl; + // FIXME BUG possible message loss weirdness? 
+        rank_pipe[inst.rank]->close();
+        rank_pipe.erase(inst.rank);
+        connect_rank(inst);
+      }
+
+      // take waiters
+      Pipe *pipe = rank_pipe[inst.rank];
+      assert(pipe);
+
+      if (waiting_for_lookup.count(addr)) {
+        pipe->send(waiting_for_lookup[addr]);
+        waiting_for_lookup.erase(addr);
+      }
+    }
+  }
+
+  delete m;
+}
+
+// block until every local entity is gone, then stop the dispatcher and drain the pipes.
+void Rank::wait()
+{
+  lock.Lock();
+  while (1) {
+    // reap dead pipes
+    reaper();
+
+    if (local.size() == 0) {
+      dout(10) << "wait: everything stopped" << endl;
+      break;   // everything stopped.
+    }
+
+    if (local.size() == 1 &&
+        !messenger->is_stopped()) {
+      dout(10) << "wait: stopping rank" << endl;
+      lock.Unlock();
+      messenger->shutdown();
+      delete messenger;
+      lock.Lock();
+      continue;
+    }
+
+    wait_cond.Wait(lock);
+  }
+  lock.Unlock();
+
+  // done!  clean up.
+
+  // stop dispatch thread
+  if (g_conf.ms_single_dispatch) {
+    dout(10) << "wait: stopping dispatch thread" << endl;
+    lock.Lock();
+    single_dispatch_stop = true;
+    single_dispatch_cond.Signal();
+    lock.Unlock();
+    single_dispatcher.join();
+  }
+
+  // reap pipes
+  lock.Lock();
+  {
+    dout(10) << "wait: closing pipes" << endl;
+    list<Pipe*> toclose;
+    for (hash_map<int,Pipe*>::iterator i = rank_pipe.begin();
+         i != rank_pipe.end();
+         i++)
+      toclose.push_back(i->second);
+    for (list<Pipe*>::iterator i = toclose.begin();
+         i != toclose.end();
+         i++)
+      (*i)->close();
+
+    dout(10) << "wait: waiting for pipes " << pipes << " to close" << endl;
+    while (!pipes.empty()) {
+      wait_cond.Wait(lock);
+      reaper();
+    }
+  }
+  lock.Unlock();
+
+  dout(10) << "wait: done." << endl;
+}
+
+// find the nameserver address: try the .ceph_ns file first, then $CEPH_NAMESERVER.
+// fills in nsa and returns 0, or returns -1 if neither source yields an address.
+int Rank::find_ns_addr(tcpaddr_t &nsa)
+{
+  // file?
+  int fd = ::open(".ceph_ns",O_RDONLY);
+  if (fd >= 0) {                       // 0 is a valid descriptor too
+    bool whole = ::read(fd, (void*)&nsa, sizeof(nsa)) == (ssize_t)sizeof(nsa);
+    ::close(fd);
+    if (whole) {                       // short or failed read: fall through to the env var
+      cout << "ceph ns is " << nsa << endl;
+      return 0;
+    }
+  }
+
+  // env var?
+  char *nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER");
+  if (nsaddr) {
+    char *eq = nsaddr;
+    while (*eq && *eq != '=') eq++;    // stop at the terminator; getenv may hand back the bare value
+    if (*eq == '=') nsaddr = eq + 1;   // skip a "NAME=" prefix only when there is one
+    if (tcp_hostlookup(nsaddr, nsa) < 0) {
+      cout << "can't resolve " << nsaddr << endl;
+      return -1;
+    }
+
+    cout << "ceph ns is " << nsa << endl;
+    return 0;
+  }
+
+  cerr << "i can't find ceph ns addr in .ceph_ns or CEPH_NAMESERVER" << endl;
+  return -1;
+}
+
+/**********************************
+ * EntityMessenger
+ */
+
+Rank::EntityMessenger::EntityMessenger(msg_addr_t myaddr) :
+  Messenger(myaddr),
+  stop(false),
+  dispatch_thread(this)
+{
+}
+Rank::EntityMessenger::~EntityMessenger()
+{
+}
+// dispatch thread body: drain dispatch_queue (lock dropped while delivering) until stop is set.
+void Rank::EntityMessenger::dispatch_entry()
+{
+  lock.Lock();
+  while (!stop) {
+    if (!dispatch_queue.empty()) {
+      list<Message*> ls;
+      ls.swap(dispatch_queue);
+
+      lock.Unlock();
+      {
+        // deliver
+        while (!ls.empty()) {
+          Message *m = ls.front();
+          ls.pop_front();
+          dout(1) //<< g_clock.now()
+                  << "---- "
+                  << m->get_source()// << ':' << m->get_source_port()
+                  << " to " << m->get_dest()// << ':' << m->get_dest_port()
+                  << " ---- " << m->get_type_name()
+                  << " ---- " << m->get_source_inst()
+                  << " ---- " << m
+                  << endl;
+          dispatch(m);
+        }
+      }
+      lock.Lock();
+      continue;
+    }
+    cond.Wait(lock);
+  }
+  lock.Unlock();
+}
+// entity is ready for messages: hook up dispatching, then tell the namer we started.
+void Rank::EntityMessenger::ready()
+{
+  dout(10) << "ready " << get_myaddr() << endl;
+
+  if (g_conf.ms_single_dispatch) {
+    rank.lock.Lock();
+    if (rank.waiting_for_ready.count(get_myaddr())) {
+      rank.single_dispatch_queue.splice(rank.single_dispatch_queue.end(),
+                                        rank.waiting_for_ready[get_myaddr()]);
+      rank.waiting_for_ready.erase(get_myaddr());
+      rank.single_dispatch_cond.Signal();
+    }
+    rank.lock.Unlock();
+  } else {
+    // start my dispatch thread
+    dispatch_thread.create();
+  }
+
+  // tell namer
+  if (get_myaddr() != MSG_ADDR_NAMER(0) &&
+      get_myaddr() != MSG_ADDR_RANK(0))
+    send_message(new MGenericMessage(MSG_NS_STARTED), MSG_ADDR_NAMER(0), rank.namer_inst);
+}
+
+
+int
Rank::EntityMessenger::shutdown() +{ + dout(10) << "shutdown " << get_myaddr() << endl; + + // deregister + rank.unregister_entity(this); + + // stop my dispatch thread + if (dispatch_thread.am_self()) { + dout(1) << "shutdown i am dispatch, setting stop flag" << endl; + stop = true; + } else { + dout(1) << "shutdown i am not dispatch, setting stop flag and joining thread." << endl; + lock.Lock(); + stop = true; + cond.Signal(); + lock.Unlock(); + dispatch_thread.join(); + } + + return 0; +} + + +void Rank::EntityMessenger::prepare_dest(const entity_inst_t& inst) +{ + rank.lock.Lock(); + { + if (rank.rank_pipe.count(inst.rank) == 0) + rank.connect_rank(inst); + } + rank.lock.Unlock(); +} + +int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst, + int port, int fromport) +{ + // set envelope + m->set_source(get_myaddr(), fromport); + m->set_dest(dest, port); + + m->set_source_inst(rank.my_inst); + + dout(1) << "--> " + << m->get_source() //<< ':' << m->get_source_port() + << " to " << m->get_dest() //<< ':' << m->get_dest_port() + << " ---- " << m->get_type_name() + << " ---- " << rank.my_inst << " --> " << inst + << " ---- " << m + << endl; + + rank.submit_message(m, inst); + + return 0; +} + + +int Rank::EntityMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport) +{ + // set envelope + m->set_source(get_myaddr(), fromport); + m->set_dest(dest, port); + + m->set_source_inst(rank.my_inst); + + dout(1) << "--> " + << m->get_source() //<< ':' << m->get_source_port() + << " to " << m->get_dest() //<< ':' << m->get_dest_port() + << " ---- " << m->get_type_name() + << " ---- " << rank.my_inst << " --> ? 
(DEPRECATED)" + << " ---- " << m + << endl; + + rank.submit_message(m); + + return 0; +} + + +void Rank::EntityMessenger::mark_down(msg_addr_t a, entity_inst_t& i) +{ + assert(a != get_myaddr()); + rank.mark_down(a,i); +} + +void Rank::mark_down(msg_addr_t a, entity_inst_t& inst) +{ + //if (my_rank == 0) return; // ugh.. rank0 already handles this stuff in the namer + lock.Lock(); + if (entity_map.count(a) && + entity_map[a] > inst) { + dout(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; + derr(10) << "mark_down " << a << " inst " << inst << " < " << entity_map[a] << endl; + // do nothing! + } else { + if (entity_map.count(a) == 0) { + // don't know it + dout(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; + derr(10) << "mark_down " << a << " inst " << inst << " ... unknown by me" << endl; + + waiting_for_lookup.erase(a); + looking_up.erase(a); + } else { + // know it + assert(entity_map[a] <= inst); + dout(10) << "mark_down " << a << " inst " << inst << endl; + derr(10) << "mark_down " << a << " inst " << inst << endl; + + entity_map.erase(a); + + if (rank_pipe.count(inst.rank)) { + rank_pipe[inst.rank]->close(); + rank_pipe.erase(inst.rank); + } + + // kill rank# too? only if i'm the namer. + if (my_rank == 0) { + entity_map.erase(MSG_ADDR_RANK(inst.rank)); + } + } + } + lock.Unlock(); +} + +void Rank::EntityMessenger::mark_up(msg_addr_t a, entity_inst_t& i) +{ + assert(a != get_myaddr()); + rank.mark_up(a, i); +} + +void Rank::mark_up(msg_addr_t a, entity_inst_t& i) +{ + if (my_rank == 0) return; + lock.Lock(); + { + dout(10) << "mark_up " << a << " inst " << i << endl; + derr(10) << "mark_up " << a << " inst " << i << endl; + + assert(i.rank != my_rank); // hrm? + + if (entity_map.count(a) == 0 || + entity_map[a] < i) { + entity_map[a] = i; + connect_rank(i); + } else if (entity_map[a] == i) { + dout(10) << "mark_up " << a << " inst " << i << " ... 
knew it" << endl; + derr(10) << "mark_up " << a << " inst " << i << " ... knew it" << endl; + } else { + dout(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; + derr(-10) << "mark_up " << a << " inst " << i << " < " << entity_map[a] << endl; + } + + //if (waiting_for_lookup.count(a)) + //lookup(a); + } + lock.Unlock(); +} + diff --git a/branches/sage/cephmds2/msg/NewerMessenger.h b/branches/sage/cephmds2/msg/NewerMessenger.h new file mode 100644 index 0000000000000..6a4e003352aa8 --- /dev/null +++ b/branches/sage/cephmds2/msg/NewerMessenger.h @@ -0,0 +1,343 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __NEWMESSENGER_H +#define __NEWMESSENGER_H + + +#include +#include +using namespace std; +#include +#include +using namespace __gnu_cxx; + + +#include "include/types.h" + +#include "common/Mutex.h" +#include "common/Cond.h" +#include "common/Thread.h" + +#include "Messenger.h" +#include "Message.h" +#include "tcp.h" + + + + +/* Rank - per-process + */ +class Rank : public Dispatcher { + + class EntityMessenger; + class Pipe; + + // namer + class Namer : public Dispatcher { + public: + EntityMessenger *messenger; // namerN + + int nrank; + int nclient, nmds, nosd, nmon; + + map > waiting; + + Namer(EntityMessenger *msgr); + ~Namer(); + + void handle_connect(class MNSConnect*); + void handle_register(class MNSRegister *m); + void handle_started(Message *m); + void handle_lookup(class MNSLookup *m); + void handle_unregister(Message *m); + void handle_failure(class MNSFailure *m); + + void dispatch(Message *m); + + void manual_insert_inst(const entity_inst_t &inst); + + }; + + // incoming + 
class Accepter : public Thread { + public: + bool done; + + tcpaddr_t listen_addr; + int listen_sd; + + Accepter() : done(false) {} + + void *entry(); + void stop() { + done = true; + ::close(listen_sd); + join(); + } + int start(); + } accepter; + + + + class Pipe { + protected: + int sd; + bool done; + entity_inst_t peer_inst; + bool server; + bool sent_close; + + bool reader_running; + bool writer_running; + + list q; + Mutex lock; + Cond cond; + + int accept(); // server handshake + int connect(); // client handshake + void reader(); + void writer(); + + Message *read_message(); + int write_message(Message *m); + void fail(list& ls); + + // threads + class Reader : public Thread { + Pipe *pipe; + public: + Reader(Pipe *p) : pipe(p) {} + void *entry() { pipe->reader(); return 0; } + } reader_thread; + friend class Reader; + + class Writer : public Thread { + Pipe *pipe; + public: + Writer(Pipe *p) : pipe(p) {} + void *entry() { pipe->writer(); return 0; } + } writer_thread; + friend class Writer; + + public: + Pipe(int s) : sd(s), + done(false), server(true), + sent_close(false), + reader_running(false), writer_running(false), + reader_thread(this), writer_thread(this) { + // server + reader_running = true; + reader_thread.create(); + } + Pipe(const entity_inst_t &pi) : sd(0), + done(false), peer_inst(pi), server(false), + sent_close(false), + reader_running(false), writer_running(false), + reader_thread(this), writer_thread(this) { + // client + writer_running = true; + writer_thread.create(); + } + + // public constructors + static const Pipe& Server(int s); + static const Pipe& Client(const entity_inst_t& pi); + + entity_inst_t& get_peer_inst() { return peer_inst; } + + void close(); + void join() { + writer_thread.join(); + reader_thread.join(); + } + + void send(Message *m) { + lock.Lock(); + q.push_back(m); + cond.Signal(); + lock.Unlock(); + } + void send(list& ls) { + lock.Lock(); + q.splice(q.end(), ls); + cond.Signal(); + lock.Unlock(); + } + }; + + + 
+ // messenger interface + class EntityMessenger : public Messenger { + Mutex lock; + Cond cond; + list dispatch_queue; + bool stop; + + class DispatchThread : public Thread { + EntityMessenger *m; + public: + DispatchThread(EntityMessenger *_m) : m(_m) {} + void *entry() { + m->dispatch_entry(); + return 0; + } + } dispatch_thread; + void dispatch_entry(); + + public: + void queue_message(Message *m) { + lock.Lock(); + dispatch_queue.push_back(m); + cond.Signal(); + lock.Unlock(); + } + void queue_messages(list ls) { + lock.Lock(); + dispatch_queue.splice(dispatch_queue.end(), ls); + cond.Signal(); + lock.Unlock(); + } + + public: + EntityMessenger(msg_addr_t myaddr); + ~EntityMessenger(); + + void ready(); + bool is_stopped() { return stop; } + + void wait() { + dispatch_thread.join(); + } + + virtual void callback_kick() {} + virtual int shutdown(); + virtual void prepare_dest(const entity_inst_t& inst); + virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0); + virtual int send_message(Message *m, msg_addr_t dest, const entity_inst_t& inst, + int port=0, int fromport=0); + + virtual void mark_down(msg_addr_t a, entity_inst_t& i); + virtual void mark_up(msg_addr_t a, entity_inst_t& i); + //virtual void reset(msg_addr_t a); + }; + + + class SingleDispatcher : public Thread { + Rank *rank; + public: + SingleDispatcher(Rank *r) : rank(r) {} + void *entry() { + rank->single_dispatcher_entry(); + return 0; + } + } single_dispatcher; + + Cond single_dispatch_cond; + bool single_dispatch_stop; + list single_dispatch_queue; + + map > waiting_for_ready; + + void single_dispatcher_entry(); + void _submit_single_dispatch(Message *m); + + + // Rank stuff + public: + Mutex lock; + Cond wait_cond; // for wait() + + // my rank + int my_rank; + Cond waiting_for_rank; + + // my instance + entity_inst_t my_inst; + + // lookup + hash_map entity_map; + hash_set entity_unstarted; + + map > waiting_for_lookup; + set looking_up; + + // register + map 
waiting_for_register_cond; + map waiting_for_register_result; + + // local + map local; + + // remote + hash_map rank_pipe; + + set pipes; + list pipe_reap_queue; + + EntityMessenger *messenger; // rankN + Namer *namer; + + entity_inst_t namer_inst; + + void show_dir(); + + void lookup(msg_addr_t addr); + + void dispatch(Message *m); + void handle_connect_ack(class MNSConnectAck *m); + void handle_register_ack(class MNSRegisterAck *m); + void handle_lookup_reply(class MNSLookupReply *m); + + Pipe *connect_rank(const entity_inst_t& inst); + + void mark_down(msg_addr_t addr, entity_inst_t& i); + void mark_up(msg_addr_t addr, entity_inst_t& i); + + tcpaddr_t get_listen_addr() { return accepter.listen_addr; } + + void reaper(); + + +public: + Rank(int r=-1); + ~Rank(); + + int find_ns_addr(tcpaddr_t &tcpaddr); + + void set_namer(const tcpaddr_t& ns); + void start_namer(); + + int start_rank(); + void wait(); + + EntityMessenger *register_entity(msg_addr_t addr); + void unregister_entity(EntityMessenger *ms); + + void submit_message(Message *m, const entity_inst_t& inst); + void prepare_dest(const entity_inst_t& inst); + void submit_message(Message *m); + void submit_messages(list& ls); + + // create a new messenger + EntityMessenger *new_entity(msg_addr_t addr); + +} ; + + + +extern Rank rank; + +#endif diff --git a/branches/sage/cephmds2/msg/RWLock.h b/branches/sage/cephmds2/msg/RWLock.h new file mode 100644 index 0000000000000..83b84c6faf370 --- /dev/null +++ b/branches/sage/cephmds2/msg/RWLock.h @@ -0,0 +1,49 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+
+
+#ifndef _RWLock_Posix_
+#define _RWLock_Posix_
+
+#include <pthread.h>
+// Thin wrapper around a POSIX read/write lock: get_*/put_* pairs around pthread_rwlock_t.
+class RWLock
+{
+  mutable pthread_rwlock_t L;
+
+ public:
+
+  RWLock() {
+    pthread_rwlock_init(&L, NULL);
+  }
+
+  virtual ~RWLock() {
+    // no unlock here: releasing an rwlock this thread does not hold is undefined.
+    pthread_rwlock_destroy(&L);   // caller must have released the lock already
+  }
+
+  void unlock() {                 // releases whichever hold (read or write) the caller has
+    pthread_rwlock_unlock(&L);
+  }
+  void get_read() {               // blocks until a shared (read) hold is granted
+    pthread_rwlock_rdlock(&L);
+  }
+  void put_read() { unlock(); }
+  void get_write() {              // blocks until the exclusive (write) hold is granted
+    pthread_rwlock_wrlock(&L);
+  }
+  void put_write() { unlock(); }
+};
+
+#endif // !_RWLock_Posix_
diff --git a/branches/sage/cephmds2/msg/SerialMessenger.h b/branches/sage/cephmds2/msg/SerialMessenger.h
new file mode 100644
index 0000000000000..d03e7377d2826
--- /dev/null
+++ b/branches/sage/cephmds2/msg/SerialMessenger.h
@@ -0,0 +1,28 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef __SERIAL_MESSENGER_H
+#define __SERIAL_MESSENGER_H
+
+#include "Dispatcher.h"
+#include "Message.h"
+// Abstract messenger interface on top of Dispatcher; every member is pure virtual.
+class SerialMessenger : public Dispatcher {
+ public:
+  virtual void dispatch(Message *m) = 0;      // incoming messages are delivered to the implementer here
+  virtual void send(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0;     // send m to dest; doesn't block
+  virtual Message *sendrecv(Message *m, msg_addr_t dest, int port=0, int fromport=0) = 0;     // send m to dest and block for the matching reply
+};
+
+#endif
diff --git a/branches/sage/cephmds2/msg/TCPDirectory.cc b/branches/sage/cephmds2/msg/TCPDirectory.cc
new file mode 100644
index 0000000000000..111f6ee69f2f3
--- /dev/null
+++ b/branches/sage/cephmds2/msg/TCPDirectory.cc
@@ -0,0 +1,178 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+
+#include "TCPDirectory.h"
+
+#include "messages/MNSConnect.h"
+#include "messages/MNSConnectAck.h"
+#include "messages/MNSRegister.h"
+#include "messages/MNSRegisterAck.h"
+#include "messages/MNSLookup.h"
+#include "messages/MNSLookupReply.h"
+//#include "messages/MNSUnregister.h"
+
+#include "config.h"
+#undef dout
+#define dout(x)  if (x <= g_conf.debug || x <= g_conf.debug_ns) cout << "nameserver: "
+
+void tcp_open(int rank);
+
+// a new process connects: give it the next rank number, record its address, and ack.
+void TCPDirectory::handle_connect(MNSConnect *m)
+{
+  int rank = nrank++;
+  dout(2) << "connect from new rank " << rank << " " << m->get_addr() << endl;
+
+  dir[MSG_ADDR_RANK(rank)] = rank;
+  messenger->map_entity_rank(MSG_ADDR_RANK(rank), rank);
+
+  rank_addr[rank] = m->get_addr();
+  messenger->map_rank_addr(rank, m->get_addr());
+
+  messenger->send_message(new MNSConnectAck(rank),
+                          MSG_ADDR_RANK(rank));
+  delete m;
+}
+
+
+// register an entity on a rank; a "new" address gets the next free id of its type.
+void TCPDirectory::handle_register(MNSRegister *m)
+{
+  dout(10) << "register from rank " << m->get_rank() << " addr " << MSG_ADDR_NICE(m->get_entity()) << endl;
+
+  // pick id
+  int rank = m->get_rank();
+  msg_addr_t entity = m->get_entity();
+
+  if (entity.is_new()) {
+    // make up a new address!
+    switch (entity.type()) {
+
+    case MSG_ADDR_RANK_BASE:         // stupid client should be able to figure this out
+      entity = MSG_ADDR_RANK(rank);
+      break;
+
+    case MSG_ADDR_MDS_BASE:
+      entity = MSG_ADDR_MDS(nmds++);
+      break;
+
+    case MSG_ADDR_OSD_BASE:
+      entity = MSG_ADDR_OSD(nosd++);
+      break;
+
+    case MSG_ADDR_CLIENT_BASE:
+      entity = MSG_ADDR_CLIENT(nclient++);
+      break;
+
+    default:
+      assert(0);
+    }
+  } else {
+    // specific address!
+    assert(dir.count(entity) == 0);  // make sure it doesn't exist yet.
+  }
+
+  dout(2) << "registered " << MSG_ADDR_NICE(entity) << endl;
+
+  // register
+  dir[entity] = rank;
+
+  if (entity == MSG_ADDR_RANK(rank))   // map this locally now so we can reply
+    messenger->map_entity_rank(entity, rank);  // otherwise wait until they send STARTED msg
+
+  hold.insert(entity);   // lookups for it are parked until STARTED arrives
+
+  ++version;
+  update_log[version] = entity;
+
+  // reply w/ new id
+  messenger->send_message(new MNSRegisterAck(m->get_tid(), entity),
+                          MSG_ADDR_RANK(rank));
+  delete m;
+}
+// entity is up: take it off hold and replay any lookups that were waiting on it.
+void TCPDirectory::handle_started(Message *m)
+{
+  msg_addr_t entity = m->get_source();
+  delete m;   // only the source was needed; the other handlers free their message too
+  dout(3) << "start signal from " << MSG_ADDR_NICE(entity) << endl;
+  hold.erase(entity);
+  messenger->map_entity_rank(entity, dir[entity]);
+
+  // waiters?
+  if (waiting.count(entity)) {
+    list<Message*> ls;
+    ls.splice(ls.begin(), waiting[entity]);
+    waiting.erase(entity);
+
+    dout(10) << "doing waiter on " << MSG_ADDR_NICE(entity) << endl;
+    for (list<Message*>::iterator it = ls.begin();
+         it != ls.end();
+         it++) {
+      dispatch(*it);
+    }
+  }
+}
+// entity goes away: drop it from the directory; stop the nameserver once nearly empty.
+void TCPDirectory::handle_unregister(Message *m)
+{
+  msg_addr_t who = m->get_source();
+  dout(2) << "unregister from entity " << MSG_ADDR_NICE(who) << endl;
+
+  assert(dir.count(who));
+  dir.erase(who);
+
+  // shutdown?
+  if (dir.size() <= 2) {
+    dout(2) << "dir is empty except for me, shutting down" << endl;
+    tcpmessenger_stop_nameserver();
+  }
+  else {
+    if (0) {
+      dout(10) << "dir size now " << dir.size() << endl;
+      for (hash_map<msg_addr_t, int>::iterator it = dir.begin();
+           it != dir.end();
+           it++) {
+        dout(10) << " dir: " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl;
+      }
+    }
+  }
+  delete m;   // we own the message, as in the other handlers
+}
+
+// answer a lookup, or park the request until the entity is registered and started.
+void TCPDirectory::handle_lookup(MNSLookup *m)
+{
+  // have it?
+  if (dir.count(m->get_entity()) == 0 ||
+      hold.count(m->get_entity())) {
+    dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> dne or on hold" << endl;
+    waiting[m->get_entity()].push_back(m);   // replayed from handle_started()
+    return;
+  }
+
+  // look it up!
+  MNSLookupReply *reply = new MNSLookupReply(m);
+
+  int rank = dir[m->get_entity()];
+  reply->entity_map[m->get_entity()] = rank;
+  reply->rank_addr[rank] = rank_addr[rank];
+
+  dout(2) << MSG_ADDR_NICE(m->get_source()) << " lookup '" << MSG_ADDR_NICE(m->get_entity()) << "' -> rank " << rank << endl;
+
+  messenger->send_message(reply,
+                          m->get_source(), m->get_source_port());
+  delete m;
+}
diff --git a/branches/sage/cephmds2/msg/TCPDirectory.h b/branches/sage/cephmds2/msg/TCPDirectory.h
new file mode 100644
index 0000000000000..1b54bb010e906
--- /dev/null
+++ b/branches/sage/cephmds2/msg/TCPDirectory.h
@@ -0,0 +1,123 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef __TCPDIRECTORY_H
+#define __TCPDIRECTORY_H
+
+/*
+ * rank -- a process (listening on some host:port)
+ * entity -- a logical entity (osd123, mds3, client3245, etc.)
+ *
+ * multiple entities can coexist on a single rank.
+ */
+
+#include "Dispatcher.h"
+#include "TCPMessenger.h"
+
+#include <list>
+#include <map>
+#include <set>
+using namespace std;
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+/*
+ * TCPDirectory -- the nameserver, living on rank 0.  hands out rank numbers
+ * and entity addresses and answers lookups; see dispatch() for the messages.
+ */
+class TCPDirectory : public Dispatcher {
+ protected:
+  // how i communicate
+  TCPMessenger *messenger;
+
+  // directory
+  hash_map<msg_addr_t, int> dir;        // entity -> rank
+  hash_map<int, tcpaddr_t>  rank_addr;  // rank -> ADDR (e.g. host:port)
+
+  __uint64_t version;                       // bumped on every registration
+  map<__uint64_t, msg_addr_t> update_log;   // version -> entity registered at that version
+
+  int nrank;                 // next rank number to hand out
+  int nclient, nmds, nosd;   // next id to hand out per entity type
+
+  set<msg_addr_t> hold;                       // registered, but STARTED not yet seen
+  map<msg_addr_t, list<Message*> > waiting;   // lookups parked per entity
+
+  // messages
+  void handle_connect(class MNSConnect*);
+  void handle_register(class MNSRegister *m);
+  void handle_started(Message *m);
+  void handle_lookup(class MNSLookup *m);
+  void handle_unregister(Message *m);
+
+ public:
+  TCPDirectory(TCPMessenger *m) :
+    messenger(m),
+    version(0),
+    nrank(0), nclient(0), nmds(0), nosd(0) {
+    messenger->set_dispatcher(this);
+
+    // i am rank 0!
+    dir[MSG_ADDR_DIRECTORY] = 0;
+    rank_addr[0] = m->get_tcpaddr();
+    ++nrank;
+
+    // announce nameserver
+    cout << "export CEPH_NAMESERVER=" << m->get_tcpaddr() << endl;
+
+    // O_CREAT needs an explicit mode argument; without one the new file's
+    // permission bits are whatever garbage open() happens to read for it.
+    int fd = ::open(".ceph_ns", O_WRONLY|O_CREAT, 0755);
+    if (fd < 0) {
+      cerr << "nameserver: can't create .ceph_ns" << endl;
+    } else {
+      ::write(fd, (void*)&m->get_tcpaddr(), sizeof(tcpaddr_t));
+      ::fchmod(fd, 0755);   // open()'s mode is masked by umask; keep the final bits as before
+      ::close(fd);
+    }
+  }
+  ~TCPDirectory() {
+    ::unlink(".ceph_ns");
+  }
+
+  void dispatch(Message *m) {
+    switch (m->get_type()) {
+    case MSG_NS_CONNECT:
+      handle_connect((class MNSConnect*)m);
+      break;
+    case MSG_NS_REGISTER:
+      handle_register((class MNSRegister*)m);
+      break;
+    case MSG_NS_STARTED:
+      handle_started(m);
+      break;
+    case MSG_NS_UNREGISTER:
+      handle_unregister(m);
+      break;
+    case MSG_NS_LOOKUP:
+      handle_lookup((class MNSLookup*)m);
+      break;
+
+    default:
+      assert(0);   // not a nameserver message
+    }
+  }
+};
+
+#endif
diff --git a/branches/sage/cephmds2/msg/TCPMessenger.cc b/branches/sage/cephmds2/msg/TCPMessenger.cc
new file mode 100644
index 0000000000000..2c594bb528df6
--- /dev/null
+++ b/branches/sage/cephmds2/msg/TCPMessenger.cc
@@ -0,0 +1,1454 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ * + */ + + + +#include "config.h" +#include "include/error.h" + +#include "common/Timer.h" +#include "common/Mutex.h" + +#include "TCPMessenger.h" +#include "Message.h" + +#include +#include +using namespace std; +#include +using namespace __gnu_cxx; + +#include +# include +# include +# include +# include +#include +#include +#include +#include + +#include + +#include "messages/MGenericMessage.h" +#include "messages/MNSConnect.h" +#include "messages/MNSConnectAck.h" +#include "messages/MNSRegister.h" +#include "messages/MNSRegisterAck.h" +#include "messages/MNSLookup.h" +#include "messages/MNSLookupReply.h" + +#include "TCPDirectory.h" + +#include "common/Logger.h" + +#define DBL 18 + +//#define TCP_SERIALMARSHALL // do NOT turn this off until you check messages/* encode_payload methods +//#define TCP_SERIALOUT // be paranoid/annoying and send messages in same thread + + +TCPMessenger *rankmessenger = 0; // + +TCPDirectory *nameserver = 0; // only defined on rank 0 +TCPMessenger *nsmessenger = 0; + + +/***************************/ +LogType rank_logtype; +Logger *logger; + +int stat_num = 0; +off_t stat_inq = 0, stat_inqb = 0; +off_t stat_disq = 0, stat_disqb = 0; +off_t stat_outq = 0, stat_outqb = 0; +/***************************/ + + +// local directory +hash_map directory; // local +hash_set directory_ready; +Mutex directory_lock; + +// connecting +struct sockaddr_in listen_addr; // my listen addr +int listen_sd = 0; +int my_rank = -1; +Cond waiting_for_rank; + +// register +long regid = 0; +map waiting_for_register_cond; +map waiting_for_register_result; + +// incoming messages +list incoming; +Mutex incoming_lock; +Cond incoming_cond; + +// outgoing messages +/* +list outgoing; +Mutex outgoing_lock; +Cond outgoing_cond; +*/ + +class OutThread : public Thread { +public: + Mutex lock; + Cond cond; + list q; + bool done; + + OutThread() : done(false) {} + virtual ~OutThread() {} + + void *entry(); + + void stop() { + lock.Lock(); + done = true; + cond.Signal(); 
+ lock.Unlock(); + join(); + } + + void send(Message *m) { + lock.Lock(); + q.push_back(m); + cond.Signal(); + lock.Unlock(); + } +} single_out_thread; + +Mutex lookup_lock; // +hash_map entity_rank; // entity -> rank +hash_map rank_sd; // outgoing sockets, rank -> sd +hash_map rank_out; +hash_map rank_addr; // rank -> tcpaddr +map > waiting_for_lookup; + + +/* this process */ +bool tcp_done = false; // set this flag to stop the event loop + + +// threads +pthread_t dispatch_thread_id = 0; // thread id of the event loop. init value == nobody +pthread_t out_thread_id = 0; // thread id of the event loop. init value == nobody +pthread_t listen_thread_id = 0; +map in_threads; // sd -> threadid + +//bool pending_timer = false; + +// per-rank fun + + +// debug +#undef dout +#define dout(l) if (l<=g_conf.debug) cout << "[TCP " << my_rank /*(<< " " << getpid() << "." << pthread_self() */ << "] " + + +#include "tcp.cc" + +// some declarations +void tcp_open(int rank); +int tcp_send(Message *m); +void tcpmessenger_kick_dispatch_loop(); +OutThread *tcp_lookup(Message *m); + +int tcpmessenger_get_rank() +{ + return my_rank; +} + + +int tcpmessenger_findns(tcpaddr_t &nsa) +{ + char *nsaddr = 0; + bool have_nsa = false; + + // env var? + /*int e_len = 0; + for (int i=0; envp[i]; i++) + e_len += strlen(envp[i]) + 1; + */ + nsaddr = getenv("CEPH_NAMESERVER");////envz_entry(*envp, e_len, "CEPH_NAMESERVER"); + if (nsaddr) { + while (nsaddr[0] != '=') nsaddr++; + nsaddr++; + } + + else { + // file? + int fd = ::open(".ceph_ns",O_RDONLY); + if (fd > 0) { + ::read(fd, (void*)&nsa, sizeof(nsa)); + ::close(fd); + have_nsa = true; + nsaddr = "from .ceph_ns"; + } + } + + if (!nsaddr && !have_nsa) { + cerr << "i need ceph ns addr.. either CEPH_NAMESERVER env var or --ns blah" << endl; + return -1; + //exit(-1); + } + + // look up nsaddr? 
+ if (!have_nsa && tcpmessenger_lookup(nsaddr, nsa) < 0) { + return -1; + } + + dout(2) << "ceph ns is " << nsaddr << " or " << nsa << endl; + return 0; +} + + + +/** rankserver + * + * one per rank. handles entity->rank lookup replies. + */ + +class RankServer : public Dispatcher { +public: + void dispatch(Message *m) { + lookup_lock.Lock(); + + dout(DBL) << "rankserver dispatching " << *m << endl; + + switch (m->get_type()) { + case MSG_NS_CONNECTACK: + handle_connect_ack((MNSConnectAck*)m); + break; + + case MSG_NS_REGISTERACK: + handle_register_ack((MNSRegisterAck*)m); + break; + + case MSG_NS_LOOKUPREPLY: + handle_lookup_reply((MNSLookupReply*)m); + break; + + default: + assert(0); + } + + lookup_lock.Unlock(); + } + + void handle_connect_ack(MNSConnectAck *m) { + dout(DBL) << "my rank is " << m->get_rank(); + my_rank = m->get_rank(); + + // now that i know my rank, + entity_rank[MSG_ADDR_RANK(my_rank)] = my_rank; + rank_addr[my_rank] = listen_addr; + + waiting_for_rank.SignalAll(); + + delete m; + + // logger! 
+ dout(DBL) << "logger" << endl; + char names[100]; + sprintf(names, "rank%d", my_rank); + string name = names; + + if (g_conf.tcp_log) { + logger = new Logger(name, (LogType*)&rank_logtype); + rank_logtype.add_set("num"); + rank_logtype.add_inc("in"); + rank_logtype.add_inc("inb"); + rank_logtype.add_inc("dis"); + rank_logtype.add_set("inq"); + rank_logtype.add_set("inqb"); + rank_logtype.add_set("outq"); + rank_logtype.add_set("outqb"); + } + + } + + void handle_register_ack(MNSRegisterAck *m) { + long tid = m->get_tid(); + waiting_for_register_result[tid] = m->get_entity(); + waiting_for_register_cond[tid]->Signal(); + delete m; + } + + void handle_lookup_reply(MNSLookupReply *m) { + list waiting; + dout(DBL) << "got lookup reply" << endl; + + for (map::iterator it = m->entity_rank.begin(); + it != m->entity_rank.end(); + it++) { + dout(DBL) << "lookup got " << MSG_ADDR_NICE(it->first) << " on rank " << it->second << endl; + entity_rank[it->first] = it->second; + + if (it->second == my_rank) { + // deliver locally + dout(-DBL) << "delivering lookup results locally" << endl; + incoming_lock.Lock(); + + for (list::iterator i = waiting_for_lookup[it->first].begin(); + i != waiting_for_lookup[it->first].end(); + i++) { + stat_inq++; + stat_inqb += (*i)->get_payload().length(); + (*i)->decode_payload(); + incoming.push_back(*i); + } + incoming_cond.Signal(); + incoming_lock.Unlock(); + } else { + // take waiters + waiting.splice(waiting.begin(), waiting_for_lookup[it->first]); + } + waiting_for_lookup.erase(it->first); + + } + + for (map::iterator it = m->rank_addr.begin(); + it != m->rank_addr.end(); + it++) { + dout(DBL) << "lookup got rank " << it->first << " addr " << it->second << endl; + rank_addr[it->first] = it->second; + + // open it now + if (rank_sd.count(it->first) == 0) + tcp_open(it->first); + } + + // send waiting messages +#ifdef TCP_SERIALOUT + for (list::iterator it = waiting.begin(); + it != waiting.end(); + it++) { + OutThread *outt = 
tcp_lookup(*it); + assert(outt); + tcp_send(*it); + } +#else + for (list::iterator it = waiting.begin(); + it != waiting.end(); + it++) { + OutThread *outt = tcp_lookup(*it); + assert(outt); + outt->send(*it); +// dout(0) << "lookup done, splicing in " << *it << endl; + } +#endif + + delete m; + } + +} rankserver; + + +class C_TCPKicker : public Context { + void finish(int r) { + dout(DBL) << "timer kick" << endl; + tcpmessenger_kick_dispatch_loop(); + } +}; + +void TCPMessenger::callback_kick() +{ + tcpmessenger_kick_dispatch_loop(); +} + + +extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta) +{ + char *host = str; + char *port = 0; + + for (int i=0; str[i]; i++) { + if (str[i] == ':') { + port = str+i+1; + str[i] = 0; + break; + } + } + if (!port) { + cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl; + return -1; + } + //cout << "host '" << host << "' port '" << port << "'" << endl; + + int iport = atoi(port); + + struct hostent *myhostname = gethostbyname( host ); + if (!myhostname) { + cerr << "host " << host << " not found" << endl; + return -1; + } + + memset(&ta, 0, sizeof(ta)); + + //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << endl; + + ta.sin_family = myhostname->h_addrtype; + memcpy((char *)&ta.sin_addr, + myhostname->h_addr, + myhostname->h_length); + ta.sin_port = iport; + + cout << "lookup '" << host << ":" << port << "' -> " << ta << endl; + + return 0; +} + + + +/***** + * global methods for process-wide startup, shutdown. 
+ */ + +int tcpmessenger_init() +{ + // LISTEN + dout(DBL) << "binding to listen " << endl; + + /* socket creation */ + listen_sd = socket(AF_INET,SOCK_STREAM,0); + assert(listen_sd > 0); + + /* bind to port */ + memset((char*)&listen_addr, 0, sizeof(listen_addr)); + listen_addr.sin_family = AF_INET; + listen_addr.sin_addr.s_addr = htonl(INADDR_ANY); + listen_addr.sin_port = 0; + + int rc = bind(listen_sd, (struct sockaddr *) &listen_addr, sizeof(listen_addr)); + assert(rc >= 0); + + socklen_t llen = sizeof(listen_addr); + getsockname(listen_sd, (sockaddr*)&listen_addr, &llen); + + int myport = listen_addr.sin_port; + + // listen! + rc = ::listen(listen_sd, 1000); + assert(rc >= 0); + + dout(DBL) << "listening on " << myport << endl; + + // my address is... + char host[100]; + gethostname(host, 100); + dout(DBL) << "my hostname is " << host << endl; + + struct hostent *myhostname = gethostbyname( host ); + + struct sockaddr_in my_addr; + memset(&my_addr, 0, sizeof(my_addr)); + + my_addr.sin_family = myhostname->h_addrtype; + memcpy((char *) &my_addr.sin_addr.s_addr, + myhostname->h_addr_list[0], + myhostname->h_length); + my_addr.sin_port = myport; + + listen_addr = my_addr; + + dout(DBL) << "listen addr is " << listen_addr << endl; + + // register to execute timer events + //g_timer.set_messenger_kicker(new C_TCPKicker()); + + + dout(DBL) << "init done" << endl; + return 0; +} + + +// on first rank only +void tcpmessenger_start_nameserver(tcpaddr_t& diraddr) +{ + dout(DBL) << "starting nameserver on " << MSG_ADDR_NICE(MSG_ADDR_DIRECTORY) << endl; + + // i am rank 0. + nsmessenger = new TCPMessenger(MSG_ADDR_DIRECTORY); + + // start name server + nameserver = new TCPDirectory(nsmessenger); + + // diraddr is my addr! 
+ diraddr = rank_addr[0] = listen_addr; + my_rank = 0; + entity_rank[MSG_ADDR_DIRECTORY] = 0; +} +void tcpmessenger_stop_nameserver() +{ + if (nsmessenger) { + dout(DBL) << "shutting down nsmessenger" << endl; + TCPMessenger *m = nsmessenger; + nsmessenger = 0; + m->shutdown(); + delete m; + } +} + +// on all ranks +void tcpmessenger_start_rankserver(tcpaddr_t& ns) +{ + // connect to nameserver + entity_rank[MSG_ADDR_DIRECTORY] = 0; + rank_addr[0] = ns; + tcp_open(0); + + if (my_rank >= 0) { + // i know my rank + rankmessenger = new TCPMessenger(MSG_ADDR_RANK(my_rank)); + } else { + // start rank messenger, and discover my rank. + rankmessenger = new TCPMessenger(MSG_ADDR_RANK_NEW); + } +} +void tcpmessenger_stop_rankserver() +{ + if (rankmessenger) { + dout(DBL) << "shutting down rankmessenger" << endl; + rankmessenger->shutdown(); + delete rankmessenger; + rankmessenger = 0; + } +} + + + + + + +int tcpmessenger_shutdown() +{ + dout(DBL) << "tcpmessenger_shutdown barrier" << endl; + + + dout(2) << "tcpmessenger_shutdown closing all sockets etc" << endl; + + // bleh + for (hash_map::iterator it = rank_sd.begin(); + it != rank_sd.end(); + it++) { + ::close(it->second); + } + + return 0; +} + + + + +/*** + * internal send/recv + */ + + + + +/* + * recv a Message* + */ + + + +Message *tcp_recv(int sd) +{ + // envelope + dout(DBL) << "tcp_recv receiving message from sd " << sd << endl; + + msg_envelope_t env; + if (!tcp_read( sd, (char*)&env, sizeof(env) )) + return 0; + + if (env.type == 0) { + dout(DBL) << "got dummy env, bailing" << endl; + return 0; + } + + dout(DBL) << "tcp_recv got envelope type=" << env.type << " src " << MSG_ADDR_NICE(env.source) << " dst " << MSG_ADDR_NICE(env.dest) << " nchunks=" << env.nchunks << endl; + + // payload + bufferlist blist; + for (int i=0; iinc("in"); + logger->inc("inb", s+sizeof(env)); + } + + dout(DBL) << "tcp_recv got " << s << " byte message from " << MSG_ADDR_NICE(m->get_source()) << endl; + + return m; +} + + + + +void 
tcp_open(int rank) +{ + dout(DBL) << "tcp_open to rank " << rank << " at " << rank_addr[rank] << endl; + + // create socket? + int sd = socket(AF_INET,SOCK_STREAM,0); + assert(sd > 0); + + // bind any port + struct sockaddr_in myAddr; + myAddr.sin_family = AF_INET; + myAddr.sin_addr.s_addr = htonl(INADDR_ANY); + myAddr.sin_port = htons( 0 ); + + int rc = bind(sd, (struct sockaddr *) &myAddr, sizeof(myAddr)); + assert(rc>=0); + + // connect! + int r = connect(sd, (sockaddr*)&rank_addr[rank], sizeof(myAddr)); + assert(r >= 0); + + //dout(DBL) << "tcp_open connected to " << who << endl; + assert(rank_sd.count(rank) == 0); + rank_sd[rank] = sd; + + if (g_conf.tcp_multi_out) { + rank_out[rank] = new OutThread(); + rank_out[rank]->create(); + } else { + rank_out[rank] = &single_out_thread; + if (!single_out_thread.is_started()) + single_out_thread.create(); + } +} + + +void tcp_marshall(Message *m) +{ + // marshall + if (m->empty_payload()) + m->encode_payload(); +} + +OutThread *tcp_lookup(Message *m) +{ + msg_addr_t addr = m->get_dest(); + + if (!entity_rank.count(m->get_dest())) { + // lookup and wait. + if (waiting_for_lookup.count(addr)) { + dout(DBL) << "already looking up " << MSG_ADDR_NICE(addr) << endl; + } else { + dout(DBL) << "lookup on " << MSG_ADDR_NICE(addr) << " for " << m << endl; + MNSLookup *r = new MNSLookup(addr); + rankmessenger->send_message(r, MSG_ADDR_DIRECTORY); + } + + // add waiter + waiting_for_lookup[addr].push_back(m); + return 0; + } + + int rank = entity_rank[m->get_dest()]; + + if (rank_sd.count(rank) == 0) { // should only happen on rank0? + tcp_open(rank); + } + assert(rank_sd.count(rank)); + m->set_tcp_sd( rank_sd[rank] ); + return rank_out[rank]; +} + + +/* + * send a Message* over the wire. ** do not block **. 
+ */ +int tcp_send(Message *m) +{ + /*int rank = entity_rank[m->get_dest()]; + //if (rank_sd.count(rank) == 0) tcp_open(rank); + assert(rank_sd.count(rank)); + + int sd = rank_sd[rank]; + assert(sd); + */ + int sd = m->get_tcp_sd(); + assert(sd); + + // get envelope, buffers + msg_envelope_t *env = &m->get_envelope(); + bufferlist blist; + blist.claim( m->get_payload() ); + +#ifdef TCP_KEEP_CHUNKS + env->nchunks = blist.buffers().size(); +#else + env->nchunks = 1; +#endif + + // HACK osd -> client only + //if (m->get_source() >= MSG_ADDR_OSD(0) && m->get_source() < MSG_ADDR_CLIENT(0) && + // m->get_dest() >= MSG_ADDR_CLIENT(0)) + dout(DBL) << g_clock.now() << " sending " << m << " " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) + //<< " rank " << rank + << " sd " << sd << endl; + + // send envelope + int r = tcp_write( sd, (char*)env, sizeof(*env) ); + if (r < 0) { cerr << "error sending envelope for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } + + // payload +#ifdef TCP_KEEP_CHUNKS + // send chunk-wise + int i = 0; + for (list::iterator it = blist.buffers().begin(); + it != blist.buffers().end(); + it++) { + dout(DBL) << "tcp_sending frag " << i << " len " << (*it).length() << endl; + int size = (*it).length(); + r = tcp_write( sd, (char*)&size, sizeof(size) ); + if (r < 0) { cerr << "error sending chunk len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } + r = tcp_write( sd, (*it).c_str(), size ); + if (r < 0) { cerr << "error sending data chunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } + i++; + } +#else + // one big chunk + int size = blist.length(); + r = tcp_write( sd, (char*)&size, sizeof(size) ); + if (r < 0) { cerr << "error sending data len for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << endl; assert(0); } + for (list::iterator it = blist.buffers().begin(); + it != blist.buffers().end(); + it++) { + r = tcp_write( sd, (*it).c_str(), (*it).length() ); + if (r 
< 0) { cerr << "error sending data megachunk for " << *m << " to " << MSG_ADDR_NICE(m->get_dest()) << " : len " << (*it).length() << endl; assert(0); } + } +#endif + + // hose message + delete m; + return 0; +} + + + + + +/** tcp_outthread + * this thread watching the outgoing queue, and encodes+sends any queued messages + */ + +void* OutThread::entry() +{ + lock.Lock(); + while (!q.empty() || !done) { + + if (!q.empty()) { + dout(DBL) << "outthread grabbing message(s)" << endl; + + // grab outgoing list + list out; + out.splice(out.begin(), q); + + // drop lock while i send these + lock.Unlock(); + + while (!out.empty()) { + Message *m = out.front(); + out.pop_front(); + + dout(DBL) << "outthread sending " << m << endl; + + if (!g_conf.tcp_serial_marshall) + tcp_marshall(m); + + tcp_send(m); + } + + lock.Lock(); + continue; + } + + // wait + dout(DBL) << "outthread sleeping" << endl; + cond.Wait(lock); + } + dout(DBL) << "outthread done" << endl; + + lock.Unlock(); + return 0; +} + + + +/** tcp_inthread + * read incoming messages from a given peer. + * give received and decoded messages to dispatch loop. 
+ */ +void *tcp_inthread(void *r) +{ + int sd = (int)r; + + dout(DBL) << "tcp_inthread reading on sd " << sd << endl; + + while (!tcp_done) { + Message *m = tcp_recv(sd); + if (!m) break; + msg_addr_t who = m->get_source(); + + dout(20) << g_clock.now() << " inthread got " << m << " from sd " << sd << " who is " << who << endl; + + // give to dispatch loop + size_t sz = m->get_payload().length(); + + if (g_conf.tcp_multi_dispatch) { + const msg_addr_t dest = m->get_dest(); + directory_lock.Lock(); + TCPMessenger *messenger = directory[ dest ]; + directory_lock.Unlock(); + + if (messenger) + messenger->dispatch_queue(m); + else + dout(0) << "dest " << dest << " dne" << endl; + + } else { + // single dispatch queue + incoming_lock.Lock(); + { + //dout(-20) << "in1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; + //assert(stat_inq == incoming.size()); + incoming.push_back(m); + incoming_cond.Signal(); + + stat_inq++; + //assert(stat_inq == incoming.size()); + //dout(-20) << "in2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; + stat_inqb += sz; + } + incoming_lock.Unlock(); + } + + if (logger) { + //logger->inc("in"); + //logger->inc("inb", sz); + } + } + + dout(DBL) << "tcp_inthread closing " << sd << endl; + + //::close(sd); + return 0; +} + +/** tcp_accepthread + * accept incoming connections from peers. + * start a tcp_inthread for each. + */ +void *tcp_acceptthread(void *) +{ + dout(DBL) << "tcp_acceptthread starting" << endl; + + while (!tcp_done) { + //dout(DBL) << "accepting, left = " << left << endl; + + struct sockaddr_in addr; + socklen_t slen = sizeof(addr); + int sd = ::accept(listen_sd, (sockaddr*)&addr, &slen); + if (sd > 0) { + dout(DBL) << "accepted incoming on sd " << sd << endl; + + pthread_t th; + pthread_create(&th, + NULL, + tcp_inthread, + (void*)sd); + in_threads[sd] = th; + } else { + dout(DBL) << "no incoming connection?" 
<< endl; + break; + } + } + return 0; +} + + + + +/** TCPMessenger::dispatch_entry + * per-messenger dispatch loop: drain this messenger's incoming queue and + * dispatch() each message, sleeping on incoming_cond while the queue is empty. + * returns once the queue is empty and incoming_stop has been set. + * incoming_lock is held whenever the queue or the loop condition is examined, + * and is dropped only while a grabbed batch is being delivered. + */ +void TCPMessenger::dispatch_entry() +{ + incoming_lock.Lock(); + while (!incoming.empty() || !incoming_stop) { + if (!incoming.empty()) { + // grab incoming messages + list in; + in.splice(in.begin(), incoming); + + /* everything queued so far is now "being dispatched": move the queue counters over */ + assert(stat_disq == 0); + stat_disq = stat_inq; + stat_disqb = stat_inqb; + stat_inq = 0; + stat_inqb = 0; + + // drop lock while we deliver + //assert(stat_inq == incoming.size()); + incoming_lock.Unlock(); + + // dispatch! + while (!in.empty()) { + Message *m = in.front(); + in.pop_front(); + + stat_disq--; + stat_disqb -= m->get_payload().length(); + if (logger) { + logger->set("inq", stat_inq+stat_disq); + logger->set("inqb", stat_inqb+stat_disqb); /* byte gauge: add the byte counter, not the message count */ + logger->inc("dis"); + } + + dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << + "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << + " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " + << m + << endl; + + dispatch(m); + } + + /* retake the lock: the loop test, Wait() and the final Unlock() all need it held */ + incoming_lock.Lock(); + continue; + } + + // sleep + dout(DBL) << "dispatch: waiting for incoming messages" << endl; + incoming_cond.Wait(incoming_lock); + dout(DBL) << "dispatch: woke up" << endl; + } + incoming_lock.Unlock(); +} + + +void* tcp_dispatchthread(void*) +{ + dout(5) << "tcp_dispatchthread start pid " << getpid() << endl; + + while (1) { + // inq? + incoming_lock.Lock(); + + // done? + if (tcp_done && incoming.empty()) { + incoming_lock.Unlock(); + break; + } + + // wait?
+ if (incoming.empty()) { + // wait + dout(DBL) << "dispatch: incoming empty, waiting for incoming messages" << endl; + incoming_cond.Wait(incoming_lock); + dout(DBL) << "dispatch: woke up" << endl; + } + + // grab incoming messages + //dout(-20) << "dis1 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; + //assert(stat_inq == incoming.size()); + + list in; + in.splice(in.begin(), incoming); + + /* everything queued so far is now "being dispatched": move the queue counters over */ + assert(stat_disq == 0); + stat_disq = stat_inq; + stat_disqb = stat_inqb; + stat_inq = 0; + stat_inqb = 0; + //assert(stat_inq == incoming.size()); + //dout(-20) << "dis2 stat_inq " << stat_inq << ", incoming " << incoming.size() << endl; + + // drop lock while we deliver + incoming_lock.Unlock(); + + // dispatch! + while (!in.empty()) { + Message *m = in.front(); + in.pop_front(); + + stat_disq--; + stat_disqb -= m->get_payload().length(); + if (logger) { + logger->set("inq", stat_inq+stat_disq); + logger->set("inqb", stat_inqb+stat_disqb); /* byte gauge: add the byte counter, not the message count */ + logger->inc("dis"); + } + + dout(DBL) << "dispatch doing " << *m << endl; + + // for rankserver? + if (m->get_type() == MSG_NS_CONNECTACK || // i just connected + m->get_dest() == MSG_ADDR_RANK(my_rank)) { + dout(DBL) << " giving to rankserver" << endl; + rankserver.dispatch(m); + continue; + } + + // ok + msg_addr_t dest = m->get_dest(); + directory_lock.Lock(); + if (directory.count(dest)) { + Messenger *who = directory[ dest ]; + directory_lock.Unlock(); + + dout(4) << g_clock.now() << " ---- '" << m->get_type_name() << + "' from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() << + " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() << " ---- " + << *m + << endl; + + who->dispatch(m); + } else { + directory_lock.Unlock(); + dout (1) << "---- i don't know who " << MSG_ADDR_NICE(dest) << " " << dest << " is."
<< endl; + assert(0); + } + } + assert(stat_disq == 0); + + } + + + g_timer.shutdown(); + + dout(5) << "tcp_dispatchthread exiting loop" << endl; + return 0; +} + + +// start/stop mpi receiver thread (for unsolicited messages) +int tcpmessenger_start() +{ + dout(5) << "starting accept thread" << endl; + pthread_create(&listen_thread_id, + NULL, + tcp_acceptthread, + 0); + + dout(5) << "starting dispatch thread" << endl; + + // start a thread + pthread_create(&dispatch_thread_id, + NULL, + tcp_dispatchthread, + 0); + + + /* + dout(5) << "starting outgoing thread" << endl; + pthread_create(&out_thread_id, + NULL, + tcp_outthread, + 0); + */ + if (!g_conf.tcp_multi_out) + single_out_thread.create(); + return 0; +} + + +/* + * kick and wake up _loop (to pick up new outgoing message, or quit) + */ + +void tcpmessenger_kick_dispatch_loop() +{ + if (g_conf.tcp_multi_dispatch) { + assert(0); + // all of them + /*for (hash_map::iterator i = directory.begin(); + i != directory.end(); + i++) + i->second->dispatch_kick(); + */ + } else { + // just one + dout(DBL) << "kicking" << endl; + incoming_lock.Lock(); + dout(DBL) << "prekick" << endl; + incoming_cond.Signal(); + incoming_lock.Unlock(); + dout(DBL) << "kicked" << endl; + } +} + +/* +void tcpmessenger_kick_outgoing_loop() +{ + outgoing_lock.Lock(); + outgoing_cond.Signal(); + outgoing_lock.Unlock(); +} +*/ + + +// wait for thread to finish + +void tcpmessenger_wait() +{ + if (g_conf.tcp_multi_dispatch) { + // new way + incoming_lock.Lock(); + while (!tcp_done) + incoming_cond.Wait(incoming_lock); + incoming_lock.Unlock(); + } else { + // old way + dout(10) << "tcpmessenger_wait waking up dispatch loop" << endl; + tcpmessenger_kick_dispatch_loop(); + + void *returnval; + dout(10) << "tcpmessenger_wait waiting for thread to finished." << endl; + pthread_join(dispatch_thread_id, &returnval); + dout(10) << "tcpmessenger_wait thread finished." 
<< endl; + } +} + + + + +msg_addr_t register_entity(msg_addr_t addr) +{ + lookup_lock.Lock(); + + // prepare to wait + long id = ++regid; + Cond cond; + waiting_for_register_cond[id] = &cond; + + if (my_rank < 0) { + dout(DBL) << "register_entity don't know my rank, connecting" << endl; + + // connect to nameserver; discover my rank. + Message *m = new MNSConnect(listen_addr); + m->set_dest(MSG_ADDR_DIRECTORY, 0); + tcp_marshall(m); + OutThread *outt = tcp_lookup(m); + assert(outt); + tcp_send(m); + + // wait for reply + while (my_rank < 0) + waiting_for_rank.Wait(lookup_lock); + assert(my_rank > 0); + } + + // send req + dout(DBL) << "register_entity " << MSG_ADDR_NICE(addr) << endl; + Message *m = new MNSRegister(addr, my_rank, id); + m->set_dest(MSG_ADDR_DIRECTORY, 0); + tcp_marshall(m); + OutThread *outt = tcp_lookup(m); + assert(outt); + tcp_send(m); + + // wait? + while (!waiting_for_register_result.count(id)) + cond.Wait(lookup_lock); + + // get result, clean up + msg_addr_t entity = waiting_for_register_result[id]; + waiting_for_register_result.erase(id); + waiting_for_register_cond.erase(id); + + dout(DBL) << "register_entity got " << MSG_ADDR_NICE(entity) << endl; + + lookup_lock.Unlock(); + + // ok! + return entity; +} + + + +/*********** + * Tcpmessenger class implementation + */ + + +TCPMessenger::TCPMessenger(msg_addr_t myaddr) : + Messenger(myaddr), + dispatch_thread(this) +{ + if (myaddr != MSG_ADDR_DIRECTORY) { + // register! 
+ myaddr = register_entity(myaddr); + } + + + // my address + set_myaddr( myaddr ); + + // register myself in the messenger directory + directory_lock.Lock(); + { + directory[myaddr] = this; + + stat_num++; + if (logger) logger->set("num", stat_num); + } + directory_lock.Unlock(); + + // register to execute timer events + //g_timer.set_messenger_kicker(new C_TCPKicker()); + // g_timer.set_messenger(this); +} + + +void TCPMessenger::ready() +{ + directory_lock.Lock(); + directory_ready.insert(get_myaddr()); + directory_lock.Unlock(); + + if (get_myaddr() != MSG_ADDR_DIRECTORY) { + // started! tell namer we are up and running. + lookup_lock.Lock(); + { + Message *m = new MGenericMessage(MSG_NS_STARTED); + m->set_source(get_myaddr(), 0); + m->set_dest(MSG_ADDR_DIRECTORY, 0); + tcp_marshall(m); + OutThread *outt = tcp_lookup(m); + assert(outt); + tcp_send(m); + } + lookup_lock.Unlock(); + } +} + + +TCPMessenger::~TCPMessenger() +{ + //delete logger; +} + +tcpaddr_t& TCPMessenger::get_tcpaddr() +{ + return listen_addr; +} + +void TCPMessenger::map_entity_rank(msg_addr_t e, int r) +{ + lookup_lock.Lock(); + entity_rank[e] = r; + lookup_lock.Unlock(); +} + +void TCPMessenger::map_rank_addr(int r, tcpaddr_t a) +{ + lookup_lock.Lock(); + rank_addr[r] = a; + lookup_lock.Unlock(); +} + + +int TCPMessenger::get_dispatch_queue_len() +{ + return stat_inq+stat_disq; +} + + +int TCPMessenger::shutdown() +{ + dout(DBL) << "shutdown by " << MSG_ADDR_NICE(get_myaddr()) << endl; + + // dont' send unregistery from nsmessenger shutdown! + if (this != nsmessenger && + (my_rank > 0 || nsmessenger)) { + dout(DBL) << "sending unregister from " << MSG_ADDR_NICE(get_myaddr()) << " to ns" << endl; + send_message(new MGenericMessage(MSG_NS_UNREGISTER), + MSG_ADDR_DIRECTORY); + } + + // remove me from the directory + directory_lock.Lock(); + directory.erase(get_myaddr()); + + // last one? + bool lastone = directory.empty(); + //dout(1) << "lastone = " << lastone << " .. 
" << directory.size() << endl; + + + // or almost last one? + if (rankmessenger && directory.size() == 1) { + directory_lock.Unlock(); + tcpmessenger_stop_rankserver(); + directory_lock.Lock(); + } + + stat_num--; + if (logger) logger->set("num", stat_num); + + directory_lock.Unlock(); + + // last one? + if (lastone) { + dout(2) << "shutdown last tcpmessenger on rank " << my_rank << " shut down" << endl; + //pthread_t whoami = pthread_self(); + + // no more timer events + //g_timer.unset_messenger(); + + // close incoming sockets + //void *r; + for (map::iterator it = in_threads.begin(); + it != in_threads.end(); + it++) { + dout(DBL) << "closing reader on sd " << it->first << endl; + ::close(it->first); + //pthread_join(it->second, &r); + } + + if (g_conf.tcp_multi_dispatch) { + // kill off dispatch threads + dout(DBL) << "killing dispatch threads" << endl; + for (hash_map::iterator it = directory.begin(); + it != directory.end(); + it++) + it->second->dispatch_stop(); + } + + dout(DBL) << "setting tcp_done" << endl; + + // kick/kill incoming thread + incoming_lock.Lock(); + tcp_done = true; + incoming_cond.Signal(); + incoming_lock.Unlock(); + + // finish off outgoing thread + dout(10) << "waiting for outgoing to finish" << endl; + if (g_conf.tcp_multi_out) { + for (hash_map::iterator it = rank_out.begin(); + it != rank_out.end(); + it++) { + it->second->stop(); + delete it->second; + } + } else { + single_out_thread.stop(); + } + + + /* + + dout(15) << "whoami = " << whoami << ", thread = " << dispatch_thread_id << endl; + if (whoami == thread_id) { + // i am the event loop thread, just set flag! 
+ dout(15) << " set tcp_done=true" << endl; + tcp_done = true; + } + */ + } + return 0; +} + + + + +/*** + * public messaging interface + */ + + +/* note: send_message _MUST_ be non-blocking */ +int TCPMessenger::send_message(Message *m, msg_addr_t dest, int port, int fromport) +{ + // set envelope + m->set_source(get_myaddr(), fromport); + m->set_dest(dest, port); + m->set_lamport_send_stamp( get_lamport() ); + + dout(4) << "--> " << m->get_type_name() + << " from " << MSG_ADDR_NICE(m->get_source()) << ':' << m->get_source_port() + << " to " << MSG_ADDR_NICE(m->get_dest()) << ':' << m->get_dest_port() + << " ---- " << m + << endl; + + // local? + TCPMessenger *entity = 0; + directory_lock.Lock(); + if (directory.count(dest) && + directory_ready.count(dest)) entity = directory[dest]; + directory_lock.Unlock(); + + if (entity) { + // local! + ::incoming_lock.Lock(); + { + dout(20) << " queueing locally for " << dest << " " << m << endl; //", stat_inq " << stat_inq << ", incomign " << ::incoming.size() << endl; + //assert(stat_inq == ::incoming.size()); + ::incoming.push_back(m); + ::incoming_cond.Signal(); + stat_inq++; + //assert(stat_inq == ::incoming.size()); + //dout(-20) << " stat_inq " << stat_inq << ", incoming " << ::incoming.size() << endl; + stat_inqb += m->get_payload().length(); + } + ::incoming_lock.Unlock(); + } else { + // remote! 
+ + if (g_conf.tcp_serial_marshall) + tcp_marshall(m); + + if (g_conf.tcp_serial_out) { + lookup_lock.Lock(); + // send in this thread + if (tcp_lookup(m)) + tcp_send(m); + lookup_lock.Unlock(); + } else { + lookup_lock.Lock(); + OutThread *outt = tcp_lookup(m); + lookup_lock.Unlock(); + + if (outt) outt->send(m); + } + } + + return 0; +} + + + + diff --git a/branches/sage/cephmds2/msg/TCPMessenger.h b/branches/sage/cephmds2/msg/TCPMessenger.h new file mode 100644 index 0000000000000..5cafbe470214b --- /dev/null +++ b/branches/sage/cephmds2/msg/TCPMessenger.h @@ -0,0 +1,115 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __TCPMESSENGER_H +#define __TCPMESSENGER_H + +#include "Messenger.h" +#include "Dispatcher.h" +#include "common/Thread.h" + +#include "tcp.h" + +class Timer; + + +class TCPMessenger : public Messenger { + protected: + + //class Logger *logger; // for logging + + bool incoming_stop; + Mutex incoming_lock; + list incoming; + Cond incoming_cond; + + class DispatchThread : public Thread { + TCPMessenger *m; + public: + DispatchThread(TCPMessenger *_m) : m(_m) {} + void *entry() { + m->dispatch_entry(); + return 0; + } + } dispatch_thread; + + void dispatch_entry(); + +public: + void dispatch_start() { + incoming_stop = false; + dispatch_thread.create(); + } + /* void dispatch_kick() { + incoming_lock.Lock(); + incoming_cond.Signal(); + incoming_lock.Unlock(); + }*/ + void dispatch_stop() { + incoming_lock.Lock(); + incoming_stop = true; + incoming_cond.Signal(); + incoming_lock.Unlock(); + dispatch_thread.join(); + } + void dispatch_queue(Message *m) { + incoming_lock.Lock(); + incoming.push_back(m); + 
incoming_cond.Signal(); + incoming_lock.Unlock(); + } + + public: + TCPMessenger(msg_addr_t myaddr); + ~TCPMessenger(); + + void ready(); + + tcpaddr_t& get_tcpaddr(); + void map_entity_rank(msg_addr_t e, int r); + void map_rank_addr(int r, tcpaddr_t a); + + int get_dispatch_queue_len(); + + void callback_kick(); + + // init, shutdown MPI and associated event loop thread. + virtual int shutdown(); + + // message interface + virtual int send_message(Message *m, msg_addr_t dest, int port=0, int fromport=0); +}; + +/** + * these are all ONE per process. + */ + +extern int tcpmessenger_lookup(char *str, tcpaddr_t& ta); + +extern int tcpmessenger_findns(tcpaddr_t &nsa); + +extern int tcpmessenger_init(); +extern int tcpmessenger_start(); // start thread +extern void tcpmessenger_wait(); // wait for thread to finish. +extern int tcpmessenger_shutdown(); // finalize MPI + +extern void tcpmessenger_start_nameserver(tcpaddr_t& ta); // on rank 0 +extern void tcpmessenger_stop_nameserver(); // on rank 0 +extern void tcpmessenger_start_rankserver(tcpaddr_t& ta); // on all ranks +extern void tcpmessenger_stop_rankserver(); // on all ranks + +extern int tcpmessenger_get_rank(); + + +#endif diff --git a/branches/sage/cephmds2/msg/error.c b/branches/sage/cephmds2/msg/error.c new file mode 100644 index 0000000000000..15cd16a2ca9da --- /dev/null +++ b/branches/sage/cephmds2/msg/error.c @@ -0,0 +1,77 @@ +#include +#include +#include +#include + +#include "include/error.h" + +#define EXIT_USAGE_ERROR -1 /* error codes for program exit */ +#define EXIT_SYSTEM_ERROR -2 +#define EXIT_GENERIC_ERROR -3 +#define MSGSIZ 1024 /* maximum error message length */ + +/* print usage error message and exit */ +void userror(const char *use, const char *fmt, ...) 
+{ + char msg[MSGSIZ] = ""; /* start empty so a failed format still prints a valid string */ + int len; + + va_list ap; + va_start(ap, fmt); + + len = vsnprintf(msg, MSGSIZ, fmt, ap); /* format from offset 0; len used to be read before it was ever set */ + if (len < 0) len = 0; else if (len >= MSGSIZ) len = MSGSIZ-1; /* clamp on error or truncation so the append below stays in bounds */ + snprintf(msg+len, MSGSIZ-len, "\n%s", use); /* usage text is appended as data, never used as a format */ + fprintf(stderr, "%s\n", msg); + va_end(ap); /* release ap before exit(), which does not return */ + + exit(EXIT_USAGE_ERROR); +} + +/* print system error message (fmt output, then ": " and strerror of errno) and exit */ +void syserror(const char *fmt, ...) +{ + char msg[MSGSIZ] = ""; /* start empty so a failed format still prints a valid string */ + int len, saved_errno = errno; /* capture errno before any library call can overwrite it */ + + va_list ap; + va_start(ap, fmt); + + len = vsnprintf(msg, MSGSIZ, fmt, ap); /* format from offset 0; len used to be read before it was ever set */ + if (len < 0) len = 0; else if (len >= MSGSIZ) len = MSGSIZ-1; /* keep the append offset inside msg */ + snprintf(msg+len, MSGSIZ-len, ": %s\n", strerror(saved_errno)); + fprintf(stderr, "%s", msg); + va_end(ap); /* release ap before exit(), which does not return */ + exit(EXIT_SYSTEM_ERROR); +} + +/* print error message and exit */ +void exiterror(const char *fmt, ...) +{ + char msg[MSGSIZ] = ""; /* start empty so a failed format still prints a valid string */ + int len; + + va_list ap; + va_start(ap, fmt); + + len = vsnprintf(msg, MSGSIZ, fmt, ap); /* format from offset 0; len used to be read before it was ever set */ + fprintf(stderr, "%s\n", msg); + va_end(ap); /* release ap before exit(), which does not return */ + + exit(EXIT_GENERIC_ERROR); +} + +/* print error message (does not exit) */ +void error(const char *fmt, ...) +{ + char msg[MSGSIZ] = ""; /* start empty so a failed format still prints a valid string */ + int len; + + va_list ap; + va_start(ap, fmt); + + len = vsnprintf(msg, MSGSIZ, fmt, ap); /* format from offset 0; len used to be read before it was ever set */ + fprintf(stderr, "%s\n", msg); + + va_end(ap); +} diff --git a/branches/sage/cephmds2/msg/mpistarter.cc b/branches/sage/cephmds2/msg/mpistarter.cc new file mode 100644 index 0000000000000..79391f78210d2 --- /dev/null +++ b/branches/sage/cephmds2/msg/mpistarter.cc @@ -0,0 +1,62 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include + +#include "TCPMessenger.h" + +/* + * start up TCPMessenger via MPI.
+ */ + +pair mpi_bootstrap_tcp(int& argc, char**& argv) +{ + tcpmessenger_init(); + tcpmessenger_start(); + + // exchnage addresses with other nodes + MPI_Init(&argc, &argv); + + int mpi_world; + int mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + //dout(1) << "i am " << mpi_rank << " of " << mpi_world << endl; + + // start up directory? + tcpaddr_t ta; + if (mpi_rank == 0) { + dout(30) << "i am rank 0, starting ns directory" << endl; + tcpmessenger_start_nameserver(ta); + } else { + memset(&ta, 0, sizeof(ta)); + } + + // distribute tcpaddr + int r = MPI_Bcast(&ta, sizeof(ta), MPI_CHAR, + 0, MPI_COMM_WORLD); + + dout(30) << "r = " << r << " ns tcpaddr is " << ta << endl; + tcpmessenger_start_rankserver(ta); + + MPI_Barrier(MPI_COMM_WORLD); + //g_clock.tare(); + MPI_Finalize(); + + return pair(mpi_rank, mpi_world); +} + + diff --git a/branches/sage/cephmds2/msg/new_mpistarter.cc b/branches/sage/cephmds2/msg/new_mpistarter.cc new file mode 100644 index 0000000000000..fc9da720f19ee --- /dev/null +++ b/branches/sage/cephmds2/msg/new_mpistarter.cc @@ -0,0 +1,43 @@ +#include +#include "NewMessenger.h" + +/* + * start up NewMessenger via MPI. + */ + +pair mpi_bootstrap_new(int& argc, char**& argv) +{ + MPI_Init(&argc, &argv); + + int mpi_world; + int mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + tcpaddr_t nsaddr; + memset(&nsaddr, 0, sizeof(nsaddr)); + + if (mpi_rank == 0) { + // i am root. 
+ rank.my_rank = 0; + rank.start_rank(nsaddr); + nsaddr = rank.get_listen_addr(); + } + + int r = MPI_Bcast(&nsaddr, sizeof(nsaddr), MPI_CHAR, + 0, MPI_COMM_WORLD); + + dout(30) << "r = " << r << " ns tcpaddr is " << nsaddr << endl; + + if (mpi_rank != 0) { + rank.start_rank(nsaddr); + } + + MPI_Barrier(MPI_COMM_WORLD); + + //g_clock.tare(); + + MPI_Finalize(); + + return pair(mpi_rank, mpi_world); +} diff --git a/branches/sage/cephmds2/msg/tcp.cc b/branches/sage/cephmds2/msg/tcp.cc new file mode 100644 index 0000000000000..1a448a91cb2c6 --- /dev/null +++ b/branches/sage/cephmds2/msg/tcp.cc @@ -0,0 +1,87 @@ + +#include "tcp.h" + +/****************** + * tcp crap + */ + +bool tcp_read(int sd, char *buf, int len) +{ + while (len > 0) { + int got = ::recv( sd, buf, len, 0 ); + if (got == 0) { + dout(18) << "tcp_read socket " << sd << " closed" << endl; + return false; + } + if (got < 0) { + dout(18) << "tcp_read bailing with " << got << endl; + return false; + } + assert(got >= 0); + len -= got; + buf += got; + //dout(DBL) << "tcp_read got " << got << ", " << len << " left" << endl; + } + return true; +} + +int tcp_write(int sd, char *buf, int len) +{ + //dout(DBL) << "tcp_write writing " << len << endl; + assert(len > 0); + while (len > 0) { + int did = ::send( sd, buf, len, 0 ); + if (did < 0) { + dout(1) << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; + //cerr << "tcp_write error did = " << did << " errno " << errno << " " << strerror(errno) << endl; + } + //assert(did >= 0); + if (did < 0) return did; + len -= did; + buf += did; + //dout(DBL) << "tcp_write did " << did << ", " << len << " left" << endl; + } + return 0; +} + + +int tcp_hostlookup(char *str, tcpaddr_t& ta) +{ + char *host = str; + char *port = 0; + + for (int i=0; str[i]; i++) { + if (str[i] == ':') { + port = str+i+1; + str[i] = 0; + break; + } + } + if (!port) { + cerr << "addr '" << str << "' doesn't look like 'host:port'" << endl; + return -1; + } + 
//cout << "host '" << host << "' port '" << port << "'" << endl; + + int iport = atoi(port); + + struct hostent *myhostname = gethostbyname( host ); + if (!myhostname) { + cerr << "host " << host << " not found" << endl; + return -1; + } + + memset(&ta, 0, sizeof(ta)); + + //cout << "addrtype " << myhostname->h_addrtype << " len " << myhostname->h_length << endl; + + ta.sin_family = myhostname->h_addrtype; + memcpy((char *)&ta.sin_addr, + myhostname->h_addr, + myhostname->h_length); + ta.sin_port = iport; + + cout << "lookup '" << host << ":" << port << "' -> " << ta << endl; + + return 0; +} diff --git a/branches/sage/cephmds2/msg/tcp.h b/branches/sage/cephmds2/msg/tcp.h new file mode 100644 index 0000000000000..f38388d456a8c --- /dev/null +++ b/branches/sage/cephmds2/msg/tcp.h @@ -0,0 +1,37 @@ +#ifndef __TCP_H +#define __TCP_H + +#include +#include +#include +#include + +typedef struct sockaddr_in tcpaddr_t; + +using std::ostream; + +inline ostream& operator<<(ostream& out, const tcpaddr_t &a) +{ + unsigned char addr[4]; + memcpy((char*)addr, (char*)&a.sin_addr.s_addr, 4); + out << (unsigned)addr[0] << "." + << (unsigned)addr[1] << "." + << (unsigned)addr[2] << "." 
+ << (unsigned)addr[3] << ":" + << (int)a.sin_port; + return out; +} + +extern bool tcp_read(int sd, char *buf, int len); +extern int tcp_write(int sd, char *buf, int len); +extern int tcp_hostlookup(char *str, tcpaddr_t& ta); +// byte-wise (in)equality over the whole sockaddr_in; assumes unused bytes are zeroed, as tcp_hostlookup does with memset +inline bool operator==(const tcpaddr_t& a, const tcpaddr_t& b) { + return memcmp(&a, &b, sizeof(a)) == 0; // memcmp, not strncmp: the struct is binary and strncmp stops at the first zero byte +} +inline bool operator!=(const tcpaddr_t& a, const tcpaddr_t& b) { + return memcmp(&a, &b, sizeof(a)) != 0; // exact negation of operator== +} + + +#endif diff --git a/branches/sage/cephmds2/newsyn.cc b/branches/sage/cephmds2/newsyn.cc new file mode 100644 index 0000000000000..43fd1b2373391 --- /dev/null +++ b/branches/sage/cephmds2/newsyn.cc @@ -0,0 +1,420 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include +#include +#include +using namespace std; + +#include + +#include "config.h" + +#include "mds/MDS.h" +#include "osd/OSD.h" +#include "mon/Monitor.h" +#include "client/Client.h" +#include "client/SyntheticClient.h" + +#include "msg/NewerMessenger.h" + +#include "common/Timer.h" + +#define NUMMDS g_conf.num_mds +#define NUMOSD g_conf.num_osd +#define NUMCLIENT g_conf.num_client + +class C_Test : public Context { +public: + void finish(int r) { + cout << "C_Test->finish(" << r << ")" << endl; + } +}; + + +/* + * start up NewMessenger via MPI. + */ +#include + +pair mpi_bootstrap_new(int& argc, char**& argv, MonMap *monmap) +{ + MPI_Init(&argc, &argv); + + int mpi_world; + int mpi_rank; + MPI_Comm_size(MPI_COMM_WORLD, &mpi_world); + MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank); + + // first, synchronize clocks.
+ MPI_Barrier(MPI_COMM_WORLD); + //dout(-10) << "tare" << endl; + g_clock.tare(); + + // start up all monitors at known addresses. + entity_inst_t moninst[mpi_world]; // only care about first g_conf.num_mon of these. + + if (mpi_rank < g_conf.num_mon) { + rank.my_rank = mpi_rank; + rank.start_rank(); // bind and listen + + moninst[mpi_rank].rank = mpi_rank; + moninst[mpi_rank].addr = rank.get_listen_addr(); + + //cerr << mpi_rank << " at " << rank.get_listen_addr() << endl; + } + + MPI_Gather( &moninst[mpi_rank], sizeof(entity_inst_t), MPI_CHAR, + moninst, sizeof(entity_inst_t), MPI_CHAR, + 0, MPI_COMM_WORLD); + + if (mpi_rank == 0) { + rank.start_namer(); + + for (int i=0; imon_inst[i] = moninst[i]; + if (i) rank.namer->manual_insert_inst(monmap->get_inst(i)); + } + } + + + // distribute monmap + bufferlist bl; + if (mpi_rank == 0) { + monmap->encode(bl); + + int fd = ::open(".ceph_monmap", O_WRONLY|O_CREAT); + ::write(fd, (void*)bl.c_str(), bl.length()); + ::fchmod(fd, 0755); + ::close(fd); + + } else { + int l = g_conf.num_mon * 1000; // nice'n big. + bufferptr bp(l); + bl.append(bp); + } + + MPI_Bcast(bl.c_str(), bl.length(), MPI_CHAR, + 0, MPI_COMM_WORLD); + + if (mpi_rank > 0) { + monmap->decode(bl); + rank.set_namer(monmap->get_inst(0).addr); + } + + if (mpi_rank >= g_conf.num_mon) { + rank.start_rank(); + } + + // wait for everyone! 
+ MPI_Barrier(MPI_COMM_WORLD); + + return pair(mpi_rank, mpi_world); +} + +utime_t tick_start; +int tick_count = 0; + +class C_Tick : public Context { +public: + void finish(int) { + utime_t now = g_clock.now() - tick_start; + dout(0) << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << endl; + tick_count += g_conf.tick; + utime_t next = tick_start; + next.sec_ref() += tick_count; + g_timer.add_event_at(next, new C_Tick); + } +}; + +class C_Die : public Context { +public: + void finish(int) { + cerr << "die" << endl; + exit(1); + } +}; + +class C_Debug : public Context { + public: + void finish(int) { + int size = &g_conf.debug_after - &g_conf.debug; + memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); + dout(0) << "debug_after flipping debug settings" << endl; + } +}; + + +int main(int argc, char **argv) +{ + vector args; + argv_to_vec(argc, argv, args); + + map kill_osd_after; + if (1) { + vector nargs; + for (unsigned i=0; i nargs; + for (unsigned i=0; i mpiwho = mpi_bootstrap_new(argc, argv, monmap); + int myrank = mpiwho.first; + int world = mpiwho.second; + + int need = 0; + if (g_conf.ms_skip_rank0) need++; + need += NUMMDS; + if (g_conf.ms_stripe_osds) + need++; + else + need += NUMOSD; + if (NUMCLIENT) { + if (!g_conf.ms_overlay_clients) + need += 1; + } + assert(need <= world); + + if (myrank == 0) + cerr << "nummds " << NUMMDS << " numosd " << NUMOSD << " numclient " << NUMCLIENT << " .. need " << need << ", have " << world << endl; + + + char hostname[100]; + gethostname(hostname,100); + int pid = getpid(); + + int started = 0; + + //if (myrank == 0) g_conf.debug = 20; + + // create mon + if (myrank < g_conf.num_mon) { + Monitor *mon = new Monitor(myrank, rank.register_entity(MSG_ADDR_MON(myrank)), monmap); + mon->init(); + } + + + // wait for monitors to start. + MPI_Barrier(MPI_COMM_WORLD); + + // okay, home free! 
+ MPI_Finalize(); + + + // create mds + map mds; + map mdsosd; + for (int i=0; iinit(); + started++; + + if (g_conf.mds_local_osd) { + mdsosd[i] = new OSD(i+10000, rank.register_entity(MSG_ADDR_OSD(i+10000)), monmap); + mdsosd[i]->init(); + } + } + + // create osd + map osd; + int max_osd_nodes = world - NUMMDS - g_conf.ms_skip_rank0; // assumes 0 clients, if we stripe. + int osds_per_node = (NUMOSD-1)/max_osd_nodes + 1; + for (int i=0; iinit(); + started++; + } + + if (g_conf.ms_overlay_clients) sleep(5); + + // create client + int skip_osd = NUMOSD; + if (g_conf.ms_overlay_clients) + skip_osd = 0; // put clients with osds too! + int client_nodes = world - NUMMDS - skip_osd - g_conf.ms_skip_rank0; + int clients_per_node = 1; + if (NUMCLIENT && client_nodes > 0) clients_per_node = (NUMCLIENT-1) / client_nodes + 1; + set clientlist; + map client;//[NUMCLIENT]; + map syn;//[NUMCLIENT]; + for (int i=0; iinit(); + started++; + + syn[i] = new SyntheticClient(client[i]); + } + + if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl; + + int nclients = 0; + for (set::iterator it = clientlist.begin(); + it != clientlist.end(); + it++) { + int i = *it; + + //cerr << "starting synthetic client" << i << " on rank " << myrank << endl; + client[i]->mount(); + syn[i]->start_thread(); + + nclients++; + } + if (nclients) { + cerr << nclients << " clients on tcprank " << rank.my_rank << " " << hostname << "." << pid << endl; + } + + for (set::iterator it = clientlist.begin(); + it != clientlist.end(); + it++) { + int i = *it; + + // cout << "waiting for synthetic client" << i << " to finish" << endl; + syn[i]->join_thread(); + delete syn[i]; + + client[i]->unmount(); + //cout << "client" << i << " unmounted" << endl; + client[i]->shutdown(); + + delete client[i]; + } + + + if (myrank && !started) { + //dout(1) << "IDLE" << endl; + cerr << "idle on tcprank " << rank.my_rank << " " << hostname << "." 
<< pid << endl; + //rank.stop_rank(); + } + + // wait for everything to finish + rank.wait(); + + if (started) cerr << "newsyn finishing" << endl; + + return 0; // whatever, cleanup hangs sometimes (stopping ebofs threads?). + + + // cleanup + for (map::iterator i = mds.begin(); i != mds.end(); i++) + delete i->second; + for (map::iterator i = mdsosd.begin(); i != mdsosd.end(); i++) + delete i->second; + for (map::iterator i = osd.begin(); i != osd.end(); i++) + delete i->second; + /* + for (map::iterator i = client.begin(); i != client.end(); i++) + delete i->second; + for (map::iterator i = syn.begin(); i != syn.end(); i++) + delete i->second; + */ + /* + for (int i=0; i +#include +#include + + +int myrand() +{ + if (0) + return rand(); + else { + static int n = 0; + srand(n++); + return rand(); + } +} + + +object_t Ager::age_get_oid() { + if (!age_free_oids.empty()) { + object_t o = age_free_oids.front(); + age_free_oids.pop_front(); + return o; + } + object_t last = age_cur_oid; + ++age_cur_oid.bno; + return last; +} + +ssize_t Ager::age_pick_size() { + ssize_t max = file_size_distn.sample() * 1024; + return max/2 + (myrand() % 100) * max/200 + 1; +} + +bool start_debug = false; + +__uint64_t Ager::age_fill(float pc, utime_t until) { + int max = 1024*1024; + bufferptr bp(max); + bp.zero(); + bufferlist bl; + bl.push_back(bp); + __uint64_t wrote = 0; + while (1) { + if (g_clock.now() > until) break; + + struct statfs st; + store->statfs(&st); + float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); + float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to + //float a = (float)(st.f_bfree) / (float)st.f_blocks; + //dout(10) << "age_fill at " << a << " / " << pc << " .. " << st.f_blocks << " " << st.f_bavail << endl; + if (free >= pc) { + dout(2) << "age_fill at " << free << " / " << avail << " / " << " / " << pc << " stopping" << endl; + break; + } + + // make sure we can write to it.. 
+ if (avail > .98 || + avail - free > .02) + store->sync(); + + object_t oid = age_get_oid(); + + int b = myrand() % 10; + age_objects[b].push_back(oid); + + ssize_t s = age_pick_size(); + wrote += (s + 4095) / 4096; + + + + + dout(2) << "age_fill at " << free << " / " << avail << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl; + + + if (false && !g_conf.ebofs_verify && start_debug && wrote > 1000000ULL) { + /* + + + 1005700 +? +1005000 +1005700 + 1005710 + 1005725ULL + 1005750ULL + 1005800 + 1006000 + +// 99 1000500 ? 1000750 1006000 +*/ + g_conf.debug_ebofs = 30; + g_conf.ebofs_verify = true; + } + + off_t off = 0; + while (s) { + ssize_t t = MIN(s, max); + bufferlist sbl; + sbl.substr_of(bl, 0, t); + store->write(oid, off, t, sbl, false); + off += t; + s -= t; + } + oid.bno++; + } + + return wrote*4; // KB +} + +void Ager::age_empty(float pc) { + int nper = 20; + int n = nper; + + //g_conf.ebofs_verify = true; + + while (1) { + struct statfs st; + store->statfs(&st); + float free = 1.0 - ((float)(st.f_bfree) / (float)st.f_blocks); + float avail = 1.0 - ((float)(st.f_bavail) / (float)st.f_blocks); // to write to + dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << endl;//" stopping" << endl; + if (free <= pc) { + dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " stopping" << endl; + break; + } + + int b = myrand() % 10; + n--; + if (n == 0 || age_objects[b].empty()) { + dout(2) << "age_empty sync" << endl; + //sync(); + //sync(); + n = nper; + continue; + } + object_t oid = age_objects[b].front(); + age_objects[b].pop_front(); + + dout(2) << "age_empty at " << free << " / " << avail << " / " << pc << " removing " << hex << oid << dec << endl; + + store->remove(oid); + age_free_oids.push_back(oid); + } + + g_conf.ebofs_verify = false; +} + +void pfrag(__uint64_t written, ObjectStore::FragmentationStat &st) +{ + cout << "#gb wr\ttotal\tn x\tavg x\tavg per\tavg j\tfree\tn fr\tavg 
fr\tnum<2\tsum<2\tnum<4\tsum<4\t..." + << endl; + cout << written + << "\t" << st.total + << "\t" << st.num_extent + << "\t" << st.avg_extent + << "\t" << st.avg_extent_per_object + << "\t" << st.avg_extent_jump + << "\t" << st.total_free + << "\t" << st.num_free_extent + << "\t" << st.avg_free_extent; + + int n = st.num_extent; + for (__uint64_t i=1; i <= 30; i += 1) { + cout << "\t" << st.extent_dist[i]; + cout << "\t" << st.extent_dist_sum[i]; + //cout << "\ta " << (st.extent_dist[i] ? (st.extent_dist_sum[i] / st.extent_dist[i]):0); + n -= st.extent_dist[i]; + if (n == 0) break; + } + cout << endl; +} + + +void Ager::age(int time, + float high_water, // fill to this % + float low_water, // then empty to this % + int count, // this many times + float final_water, // and end here ( <= low_water) + int fake_size_mb) { + + store->_fake_writes(true); + srand(0); + + utime_t start = g_clock.now(); + utime_t until = start; + until.sec_ref() += time; + + int elapsed = 0; + int freelist_inc = 60; + utime_t nextfl = start; + nextfl.sec_ref() += freelist_inc; + + while (age_objects.size() < 10) age_objects.push_back( list() ); + + if (fake_size_mb) { + int fake_bl = fake_size_mb * 256; + struct statfs st; + store->statfs(&st); + float f = (float)fake_bl / (float)st.f_blocks; + high_water = (float)high_water * f; + low_water = (float)low_water * f; + final_water = (float)final_water * f; + dout(2) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl; + } + + // init size distn (once) + if (!did_distn) { + did_distn = true; + age_cur_oid = object_t(0,1); + file_size_distn.add(1, 19.0758125+0.65434375); + file_size_distn.add(512, 35.6566); + file_size_distn.add(1024, 27.7271875); + file_size_distn.add(2*1024, 16.63503125); + //file_size_distn.add(4*1024, 106.82384375); + //file_size_distn.add(8*1024, 81.493375); + //file_size_distn.add(16*1024, 14.13553125); + 
//file_size_distn.add(32*1024, 2.176); + //file_size_distn.add(256*1024, 0.655938); + //file_size_distn.add(512*1024, 0.1480625); + //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit + file_size_distn.normalize(); + } + + // clear + for (int i=0; i<10; i++) + age_objects[i].clear(); + + ObjectStore::FragmentationStat st; + + __uint64_t wrote = 0; + + for (int c=1; c<=count; c++) { + if (g_clock.now() > until) break; + + //if (c == 7) start_debug = true; + + dout(1) << "#age " << c << "/" << count << " filling to " << high_water << endl; + __uint64_t w = age_fill(high_water, until); + //dout(1) << "age wrote " << w << endl; + wrote += w; + //store->sync(); + //store->_get_frag_stat(st); + //pfrag(st); + + + if (c == count) { + dout(1) << "#age final empty to " << final_water << endl; + age_empty(final_water); + } else { + dout(1) << "#age " << c << "/" << count << " emptying to " << low_water << endl; + age_empty(low_water); + } + //store->sync(); + //store->sync(); + + // show frag state + store->_get_frag_stat(st); + pfrag(wrote / (1024ULL*1024ULL) , // GB + st); + + // dump freelist? + if (g_clock.now() > nextfl) { + elapsed += freelist_inc; + save_freelist(elapsed); + nextfl.sec_ref() += freelist_inc; + } + } + + // dump the freelist + save_freelist(0); + exit(0); // hack + + // ok! 
+ store->_fake_writes(false); + store->sync(); + store->sync(); + dout(1) << "age finished" << endl; +} + + +// read the dump in "ebofs.freelist", import it into the store (cast to Ebofs), then sync twice +void Ager::load_freelist() +{ + dout(1) << "load_freelist" << endl; + + struct stat st; + + int r = ::stat("ebofs.freelist", &st); + assert(r == 0); + + bufferptr bp(st.st_size); + bufferlist bl; + bl.push_back(bp); + int fd = ::open("ebofs.freelist", O_RDONLY); + assert(fd >= 0); // same policy as the stat() check above: an unreadable dump is fatal + ssize_t got = ::read(fd, bl.c_str(), st.st_size); + assert(got == (ssize_t)st.st_size); // never import a partially filled buffer + ::close(fd); + + ((Ebofs*)store)->_import_freelist(bl); + store->sync(); + store->sync(); +} + +// export the store's freelist (cast to Ebofs) into "ebofs.freelist.N" where N is el, replacing any earlier dump +void Ager::save_freelist(int el) +{ + dout(1) << "save_freelist " << el << endl; + char s[100]; + snprintf(s, sizeof(s), "ebofs.freelist.%d", el); // bounded write into s + bufferlist bl; + ((Ebofs*)store)->_export_freelist(bl); + ::unlink(s); + int fd = ::open(s, O_CREAT|O_WRONLY, 0644); // O_CREAT requires an explicit mode argument + if (fd < 0) { + dout(1) << "save_freelist couldn't open " << s << endl; + return; + } + ::fchmod(fd, 0644); // force 0644 regardless of umask + ::write(fd, bl.c_str(), bl.length()); + ::close(fd); +} diff --git a/branches/sage/cephmds2/osd/Ager.h b/branches/sage/cephmds2/osd/Ager.h new file mode 100644 index 0000000000000..864c23fce8e14 --- /dev/null +++ b/branches/sage/cephmds2/osd/Ager.h @@ -0,0 +1,42 @@ +#ifndef __AGER_H +#define __AGER_H + +#include "include/types.h" +#include "include/Distribution.h" +#include "ObjectStore.h" +#include "common/Clock.h" + +#include +#include +using namespace std; + +class Ager { + ObjectStore *store; + + private: + list age_free_oids; + object_t age_cur_oid; + vector< list > age_objects; + Distribution file_size_distn; //kb + bool did_distn; + + void age_empty(float pc); + __uint64_t age_fill(float pc, utime_t until); + ssize_t age_pick_size(); + object_t age_get_oid(); + + public: + Ager(ObjectStore *s) : store(s), did_distn(false) {} + + void age(int time, + float high_water, // fill to this % + float low_water, // then empty to this % + int count, // this many times + float final_water, // and end here ( <= low_water) + int fake_size_mb=0); + + void save_freelist(int); + void load_freelist(); +}; + +#endif diff --git a/branches/sage/cephmds2/osd/BDBMap.h b/branches/sage/cephmds2/osd/BDBMap.h new file
mode 100644 index 0000000000000..203a4ca9dce8f --- /dev/null +++ b/branches/sage/cephmds2/osd/BDBMap.h @@ -0,0 +1,136 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __BERKELEYDB_H +#define __BERKELEYDB_H + +#include +#include + +#include +using namespace std; + + +template +class BDBMap { + private: + DB *dbp; + + public: + BDBMap() : dbp(0) {} + ~BDBMap() { + close(); + } + + bool is_open() { return dbp ? true:false; } + + // open/close + int open(const char *fn) { + //cout << "open " << fn << endl; + + int r; + if ((r = db_create(&dbp, NULL, 0)) != 0) { + cerr << "db_create: " << db_strerror(r) << endl; + assert(0); + } + + dbp->set_errfile(dbp, stderr); + dbp->set_errpfx(dbp, "bdbmap"); + + r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644); + if (r != 0) { + dbp->err(dbp, r, "%s", fn); + } + assert(r == 0); + return 0; + } + void close() { + if (dbp) { + dbp->close(dbp,0); + dbp = 0; + } + } + void remove(const char *fn) { + if (!dbp) open(fn); + if (dbp) { + dbp->remove(dbp, fn, 0, 0); + dbp = 0; + } else { + ::unlink(fn); + } + } + + // accessors + int put(K key, + D data) { + DBT k; + memset(&k, 0, sizeof(k)); + k.data = &key; + k.size = sizeof(K); + DBT d; + memset(&d, 0, sizeof(d)); + d.data = &data; + d.size = sizeof(data); + return dbp->put(dbp, NULL, &k, &d, 0); + } + + int get(K key, + D& data) { + DBT k; + memset(&k, 0, sizeof(k)); + k.data = &key; + k.size = sizeof(key); + DBT d; + memset(&d, 0, sizeof(d)); + d.data = &data; + d.size = sizeof(data); + int r = dbp->get(dbp, NULL, &k, &d, 0); + return r; + } + + int del(K key) { + DBT k; + memset(&k, 0, sizeof(k)); + k.data = &key; + k.size 
= sizeof(key); + return dbp->del(dbp, NULL, &k, 0); + } + + int list_keys(list& ls) { + DBC *cursor = 0; + int r = dbp->cursor(dbp, NULL, &cursor, 0); + assert(r == 0); + + DBT k,d; + memset(&k, 0, sizeof(k)); + memset(&d, 0, sizeof(d)); + + while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) { + K key; + assert(k.size == sizeof(key)); + memcpy(&key, k.data, k.size); + ls.push_back(key); + } + if (r != DB_NOTFOUND) { + dbp->err(dbp, r, "DBcursor->get"); + assert(r == DB_NOTFOUND); + } + + cursor->c_close(cursor); + return 0; + } + +}; + +#endif diff --git a/branches/sage/cephmds2/osd/Fake.h b/branches/sage/cephmds2/osd/Fake.h new file mode 100644 index 0000000000000..01fa4afcf3cb8 --- /dev/null +++ b/branches/sage/cephmds2/osd/Fake.h @@ -0,0 +1,249 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef __FAKE_H +#define __FAKE_H + +#include "include/types.h" + +#include +#include +#include +using namespace std; +using namespace __gnu_cxx; + +class FakeStoreCollections { + private: + Mutex faker_lock; + ObjectStore *store; + hash_map > fakecollections; + + public: + FakeStoreCollections(ObjectStore *s) : store(s) {} + + // faked collections + int list_collections(list& ls) { + faker_lock.Lock(); + int r = 0; + for (hash_map< coll_t, set >::iterator p = fakecollections.begin(); + p != fakecollections.end(); + p++) { + r++; + ls.push_back(p->first); + } + faker_lock.Unlock(); + return r; + } + + int create_collection(coll_t c, + Context *onsafe=0) { + faker_lock.Lock(); + fakecollections[c].size(); + if (onsafe) store->sync(onsafe); + faker_lock.Unlock(); + return 0; + } + + int destroy_collection(coll_t c, + Context *onsafe=0) { + int r = 0; + faker_lock.Lock(); + if (fakecollections.count(c)) { + fakecollections.erase(c); + //fakecattr.erase(c); + if (onsafe) store->sync(onsafe); + } else + r = -1; + faker_lock.Unlock(); + return r; + } + + int collection_stat(coll_t c, struct stat *st) { + return collection_exists(c) ? 
0:-1; + } + + bool collection_exists(coll_t c) { + faker_lock.Lock(); + int r = fakecollections.count(c); + faker_lock.Unlock(); + return r; + } + + int collection_add(coll_t c, object_t o, + Context *onsafe=0) { + faker_lock.Lock(); + fakecollections[c].insert(o); + if (onsafe) store->sync(onsafe); + faker_lock.Unlock(); + return 0; + } + + int collection_remove(coll_t c, object_t o, + Context *onsafe=0) { + faker_lock.Lock(); + fakecollections[c].erase(o); + if (onsafe) store->sync(onsafe); + faker_lock.Unlock(); + return 0; + } + + int collection_list(coll_t c, list& o) { + faker_lock.Lock(); + int r = 0; + for (set::iterator p = fakecollections[c].begin(); + p != fakecollections[c].end(); + p++) { + o.push_back(*p); + r++; + } + faker_lock.Unlock(); + return r; + } + +}; + +class FakeStoreAttrs { + private: + + class FakeAttrSet { + public: + map attrs; + + int getattr(const char *name, void *value, size_t size) { + string n = name; + if (attrs.count(n)) { + size_t l = MIN( attrs[n].length(), size ); + bufferlist bl; + bl.append(attrs[n]); + bl.copy(0, l, (char*)value); + return l; + } + return -1; + } + int getattrs(map& aset) { + aset = attrs; + return 0; + } + int setattrs(map& aset) { + attrs = aset; + return 0; + } + + int setattr(const char *name, const void *value, size_t size) { + string n = name; + bufferptr bp = buffer::copy((char*)value, size); + attrs[n] = bp; + return 0; + } + + int listattr(char *attrs, size_t size) { + assert(0); + return 0; + } + + int rmattr(const char *name) { + string n = name; + attrs.erase(n); + return 0; + } + + bool empty() { return attrs.empty(); } + }; + + Mutex faker_lock; + ObjectStore *store; + hash_map fakeoattrs; + hash_map fakecattrs; + + public: + FakeStoreAttrs(ObjectStore *s) : store(s) {} + + int setattr(object_t oid, const char *name, + const void *value, size_t size, + Context *onsafe=0) { + faker_lock.Lock(); + int r = fakeoattrs[oid].setattr(name, value, size); + if (onsafe) store->sync(onsafe); + 
faker_lock.Unlock(); + return r; + } + int setattrs(object_t oid, map& aset) { + faker_lock.Lock(); + int r = fakeoattrs[oid].setattrs(aset); + faker_lock.Unlock(); + return r; + } + int getattr(object_t oid, const char *name, + void *value, size_t size) { + faker_lock.Lock(); + int r = fakeoattrs[oid].getattr(name, value, size); + faker_lock.Unlock(); + return r; + } + int getattrs(object_t oid, map& aset) { + faker_lock.Lock(); + int r = fakeoattrs[oid].getattrs(aset); + faker_lock.Unlock(); + return r; + } + int rmattr(object_t oid, const char *name, + Context *onsafe=0) { + faker_lock.Lock(); + int r = fakeoattrs[oid].rmattr(name); + if (onsafe) store->sync(onsafe); + faker_lock.Unlock(); + return r; + } + + int listattr(object_t oid, char *attrs, size_t size) { + faker_lock.Lock(); + int r = fakeoattrs[oid].listattr(attrs,size); + faker_lock.Unlock(); + return r; + } + + int collection_setattr(coll_t c, const char *name, + void *value, size_t size, + Context *onsafe=0) { + faker_lock.Lock(); + int r = fakecattrs[c].setattr(name, value, size); + if (onsafe) store->sync(onsafe); + faker_lock.Unlock(); + return r; + } + int collection_rmattr(coll_t c, const char *name, + Context *onsafe=0) { + faker_lock.Lock(); + int r = fakecattrs[c].rmattr(name); + if (onsafe) store->sync(onsafe); + faker_lock.Unlock(); + return r; + } + int collection_getattr(coll_t c, const char *name, + void *value, size_t size) { + faker_lock.Lock(); + int r = fakecattrs[c].getattr(name, value, size); + faker_lock.Unlock(); + return r; + } + int collection_listattr(coll_t c, char *attrs, size_t size) { + faker_lock.Lock(); + int r = fakecattrs[c].listattr(attrs,size); + faker_lock.Unlock(); + return r; + } + +}; + +#endif diff --git a/branches/sage/cephmds2/osd/FakeStore.cc b/branches/sage/cephmds2/osd/FakeStore.cc new file mode 100644 index 0000000000000..c2f573a81038f --- /dev/null +++ b/branches/sage/cephmds2/osd/FakeStore.cc @@ -0,0 +1,364 @@ +// -*- mode:C++; tab-width:8; 
c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "FakeStore.h" +#include "include/types.h" + +#include "common/Timer.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +//#include + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".fakestore " + +#include "include/buffer.h" + +#include +#include +using namespace __gnu_cxx; + +// crap-a-crap hash +#define HASH_DIRS 0x80 +#define HASH_MASK 0x7f +// end crap hash + + + + + + + +int FakeStore::mount() +{ + if (g_conf.fakestore_dev) { + dout(0) << "mounting" << endl; + char cmd[100]; + sprintf(cmd,"mount %s", g_conf.fakestore_dev); + system(cmd); + } + + string mydir; + get_dir(mydir); + + dout(5) << "init with basedir " << mydir << endl; + + // make sure global base dir exists + struct stat st; + int r = ::stat(basedir.c_str(), &st); + if (r != 0) { + dout(1) << "unable to stat basedir " << basedir << ", r = " << r << endl; + return r; + } + + // all okay. 
+ return 0; +} + +int FakeStore::umount() +{ + dout(5) << "finalize" << endl; + + if (g_conf.fakestore_dev) { + char cmd[100]; + dout(0) << "umounting" << endl; + sprintf(cmd,"umount %s", g_conf.fakestore_dev); + system(cmd); + } + + // nothing + return 0; +} + + +int FakeStore::statfs(struct statfs *buf) +{ + string mydir; + get_dir(mydir); + return ::statfs(mydir.c_str(), buf); +} + + + + +void FakeStore::get_dir(string& dir) { + char s[30]; + sprintf(s, "%d", whoami); + dir = basedir + "/" + s; +} +void FakeStore::get_oname(object_t oid, string& fn) { + char s[100]; + static hash H; + sprintf(s, "%d/%02x/%016llx.%08x.%d", whoami, H(oid) & HASH_MASK, oid.ino, oid.bno, oid.rev); + fn = basedir + "/" + s; + // dout(1) << "oname is " << fn << endl; +} + + + +void FakeStore::wipe_dir(string mydir) +{ + DIR *dir = ::opendir(mydir.c_str()); + if (dir) { + dout(10) << "wiping " << mydir << endl; + struct dirent *ent = 0; + + while ((ent = ::readdir(dir)) != 0) { + if (ent->d_name[0] == '.') continue; + dout(25) << "mkfs unlinking " << ent->d_name << endl; + string fn = mydir + "/" + ent->d_name; + ::unlink(fn.c_str()); + } + + ::closedir(dir); + } else { + dout(1) << "mkfs couldn't read dir " << mydir << endl; + } +} + +int FakeStore::mkfs() +{ + if (g_conf.fakestore_dev) { + dout(0) << "mounting" << endl; + char cmd[100]; + sprintf(cmd,"mount %s", g_conf.fakestore_dev); + system(cmd); + } + + + int r = 0; + struct stat st; + string mydir; + get_dir(mydir); + + dout(1) << "mkfs in " << mydir << endl; + + + // make sure my dir exists + r = ::stat(mydir.c_str(), &st); + if (r != 0) { + dout(10) << "creating " << mydir << endl; + mkdir(mydir.c_str(), 0755); + r = ::stat(mydir.c_str(), &st); + if (r != 0) { + dout(1) << "couldnt create dir, r = " << r << endl; + return r; + } + } + else wipe_dir(mydir); + + // hashed bits too + for (int i=0; i 0) bl.push_back( bptr ); // put it in the target bufferlist + } + ::flock(fd, LOCK_UN); + ::close(fd); + return got; +} + + +int 
FakeStore::write(object_t oid, + off_t offset, size_t len, + bufferlist& bl, + Context *onsafe) +{ + dout(20) << "write " << oid << " len " << len << " off " << offset << endl; + + string fn; + get_oname(oid,fn); + + ::mknod(fn.c_str(), 0644, 0); // in case it doesn't exist yet. + + int flags = O_WRONLY;//|O_CREAT; + int fd = ::open(fn.c_str(), flags); + if (fd < 0) { + dout(1) << "write couldn't open " << fn.c_str() << " flags " << flags << " errno " << errno << " " << strerror(errno) << endl; + return fd; + } + ::flock(fd, LOCK_EX); // lock for safety + //::fchmod(fd, 0664); + + // seek + off_t actual = lseek(fd, offset, SEEK_SET); + int did = 0; + assert(actual == offset); + + // write buffers + for (list::const_iterator it = bl.buffers().begin(); + it != bl.buffers().end(); + it++) { + int r = ::write(fd, (char*)(*it).c_str(), (*it).length()); + if (r > 0) + did += r; + else { + dout(1) << "couldn't write to " << fn.c_str() << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; + } + } + + if (did < 0) { + dout(1) << "couldn't write to " << fn.c_str() << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << endl; + } + + ::flock(fd, LOCK_UN); + + // schedule sync + if (onsafe) sync(onsafe); + + ::close(fd); + + return did; +} + + +class C_FakeSync : public Context { +public: + Context *c; + int *n; + C_FakeSync(Context *c_, int *n_) : c(c_), n(n_) { + ++*n; + } + void finish(int r) { + c->finish(r); + --(*n); + //cout << "sync, " << *n << " still unsync" << endl; + } +}; + +void FakeStore::sync(Context *onsafe) +{ + if (g_conf.fakestore_fake_sync) { + g_timer.add_event_after((float)g_conf.fakestore_fake_sync, + new C_FakeSync(onsafe, &unsync)); + + } else { + assert(0); // der..no implemented anymore + } +} + + + diff --git a/branches/sage/cephmds2/osd/FakeStore.h b/branches/sage/cephmds2/osd/FakeStore.h new file mode 100644 index 0000000000000..eaa4126e84e46 --- /dev/null +++ 
b/branches/sage/cephmds2/osd/FakeStore.h @@ -0,0 +1,87 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __FAKESTORE_H +#define __FAKESTORE_H + +#include "ObjectStore.h" +#include "common/ThreadPool.h" +#include "common/Mutex.h" + +#include "Fake.h" +//#include "FakeStoreBDBCollections.h" + + +#include +using namespace std; + +#include +using namespace __gnu_cxx; + + +// fake attributes in memory, if we need to. + + +class FakeStore : public ObjectStore, + public FakeStoreAttrs, + public FakeStoreCollections { + string basedir; + int whoami; + + int unsync; + + Mutex lock; + + // fns + void get_dir(string& dir); + void get_oname(object_t oid, string& fn); + void wipe_dir(string mydir); + + + public: + FakeStore(char *base, int whoami) : FakeStoreAttrs(this), FakeStoreCollections(this) + { + this->basedir = base; + this->whoami = whoami; + unsync = 0; + } + + + int mount(); + int umount(); + int mkfs(); + + int statfs(struct statfs *buf); + + // ------------------ + // objects + int pick_object_revision_lt(object_t& oid) { + return 0; + } + bool exists(object_t oid); + int stat(object_t oid, struct stat *st); + int remove(object_t oid, Context *onsafe); + int truncate(object_t oid, off_t size, Context *onsafe); + int read(object_t oid, + off_t offset, size_t len, + bufferlist& bl); + int write(object_t oid, + off_t offset, size_t len, + bufferlist& bl, + Context *onsafe); + + void sync(Context *onsafe); +}; + +#endif diff --git a/branches/sage/cephmds2/osd/FakeStoreBDBCollections.h b/branches/sage/cephmds2/osd/FakeStoreBDBCollections.h new file mode 100644 index 0000000000000..97316d2642674 --- /dev/null +++ 
b/branches/sage/cephmds2/osd/FakeStoreBDBCollections.h @@ -0,0 +1,168 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __FAKESTOREBDBCOLLECTIONS_H +#define __FAKESTOREBDBCOLLECTIONS_H + +#include "BDBMap.h" +#include "ObjectStore.h" +#include "common/Mutex.h" + +#define BDBHASH_DIRS 128LL +#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS) + +class FakeStoreBDBCollections { + private: + int whoami; + string basedir; + + Mutex bdblock; + + // collection dbs + BDBMap collections; + map*> collection_map; + + // dirs + void get_dir(string& dir) { + char s[30]; + sprintf(s, "%d", whoami); + dir = basedir + "/" + s; + } + void get_collfn(coll_t c, string &fn) { + char s[100]; + sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c); + fn = basedir + "/" + s; + } + + void open_collections() { + string cfn; + get_dir(cfn); + cfn += "/collections"; + collections.open(cfn.c_str()); + list ls; + collections.list_keys(ls); + } + void close_collections() { + if (collections.is_open()) + collections.close(); + + for (map*>::iterator it = collection_map.begin(); + it != collection_map.end(); + it++) { + it->second->close(); + } + collection_map.clear(); + } + + int open_collection(coll_t c) { + if (collection_map.count(c)) + return 0; // already open. 
+ + string fn; + get_collfn(c,fn); + collection_map[c] = new BDBMap; + int r = collection_map[c]->open(fn.c_str()); + if (r != 0) + collection_map.erase(c); // failed + return r; + } + + public: + FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {} + ~FakeStoreBDBCollections() { + close_collections(); + } + + int list_collections(list& ls) { + bdblock.Lock(); + if (!collections.is_open()) open_collections(); + + ls.clear(); + collections.list_keys(ls); + bdblock.Unlock(); + return 0; + } + int create_collection(coll_t c) { + bdblock.Lock(); + if (!collections.is_open()) open_collections(); + + collections.put(c, 1); + open_collection(c); + bdblock.Unlock(); + return 0; + } + int destroy_collection(coll_t c) { + bdblock.Lock(); + if (!collections.is_open()) open_collections(); + + collections.del(c); + + open_collection(c); + collection_map[c]->close(); + + string fn; + get_collfn(c,fn); + collection_map[c]->remove(fn.c_str()); + delete collection_map[c]; + collection_map.erase(c); + bdblock.Unlock(); + return 0; + } + int collection_stat(coll_t c, struct stat *st) { + bdblock.Lock(); + if (!collections.is_open()) open_collections(); + + string fn; + get_collfn(c,fn); + int r = ::stat(fn.c_str(), st); + bdblock.Unlock(); + return r; + } + bool collection_exists(coll_t c) { + bdblock.Lock(); + struct stat st; + int r = collection_stat(c, &st) == 0; + bdblock.Unlock(); + return r; + } + int collection_add(coll_t c, object_t o) { + bdblock.Lock(); + if (!collections.is_open()) open_collections(); + + open_collection(c); + collection_map[c]->put(o,1); + bdblock.Unlock(); + return 0; + } + int collection_remove(coll_t c, object_t o) { + bdblock.Lock(); + if (!collections.is_open()) open_collections(); + + open_collection(c); + collection_map[c]->del(o); + bdblock.Unlock(); + return 0; + } + int collection_list(coll_t c, list& o) { + bdblock.Lock(); + if (!collections.is_open()) open_collections(); + + open_collection(c); + 
collection_map[c]->list_keys(o); + bdblock.Unlock(); + return 0; + } +}; + +#endif diff --git a/branches/sage/cephmds2/osd/OBFSStore.cc b/branches/sage/cephmds2/osd/OBFSStore.cc new file mode 100644 index 0000000000000..e82c6f804721d --- /dev/null +++ b/branches/sage/cephmds2/osd/OBFSStore.cc @@ -0,0 +1,244 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "OBFSStore.h" + +extern "C" { +#include "../../uofs/uofs.h" +} + +#include "common/Timer.h" + +#include "include/types.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug) cout << "osd" << whoami << ".obfs " + +OBFSStore::OBFSStore(int whoami, char *param, char *dev) +{ + this->whoami = whoami; + this->mounted = -1; + this->bdev_id = -1; + this->param[0] = 0; + this->dev[0] = 0; + if (dev) + strcpy(this->dev, dev); + if (param) + strcpy(this->param, param); +} + +int OBFSStore::mount(void) +{ + dout(0) << "OBFS init!" << endl; + if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) { + dout(0) << "device open FAILED on " << this->dev << ", errno " << errno << endl; + return -1; + } + + this->mkfs(); + this->mounted = uofs_mount(this->bdev_id, + g_conf.uofs_cache_size, + g_conf.uofs_min_flush_pages, + this->whoami); + switch (this->mounted) { + case -1: + this->mkfs(); + //retry to mount + dout(0) << "remount the OBFS" << endl; + this->mounted = uofs_mount(this->bdev_id, + g_conf.uofs_cache_size, + g_conf.uofs_min_flush_pages, + this->whoami); + assert(this->mounted >= 0); + break; + case -2: + //fsck + dout(0) << "Need fsck! 
Simply formatted for now!" << endl; + this->mkfs(); + this->mounted = uofs_mount(this->bdev_id, + g_conf.uofs_cache_size, + g_conf.uofs_min_flush_pages, + this->whoami); + assert(this->mounted >= 0); + break; + case 0: + //success + break; + default: + break; + } + + if (this->mounted >= 0) + dout(0) << "successfully mounted!" << endl; + else + dout(0) << "error in mounting obfsstore!" << endl; + + return 0; +} + +int OBFSStore::mkfs(void) +{ + /*int donode_size_byte = 1024, + bd_ratio = 10, + reg_size_mb = 256, + sb_size_kb = 4, + lb_size_kb = 1024, + nr_hash_table_buckets = 1023, + delay_allocation = 1, + flush_interval = 5; + FILE *param; + */ + + + if (this->mounted >= 0) + return 0; + + dout(0) << "OBFS.mkfs!" << endl; + /* + if (strlen(this->param) > 0) { + param = fopen(this->param, "r"); + if (param) { + //fscanf(param, "Block Device: %s\n", this->dev); + fscanf(param, "Donode Size: %d\n", &donode_size_byte); + fscanf(param, "Block vs Donode Ratio: %d\n", &bd_ratio); + fscanf(param, "Region Size: %d MB\n", ®_size_mb); + fscanf(param, "Small Block Size: %d KB\n", &sb_size_kb); + fscanf(param, "Large Block Size: %d KB\n", &lb_size_kb); + fscanf(param, "Hash Table Buckets: %d\n", &nr_hash_table_buckets); + fscanf(param, "Delayed Allocation: %d\n", &delay_allocation); + } else { + dout(0) << "read open FAILED on "<< this->param <<", errno " << errno << endl; + dout(0) << "use default parameters" << endl; + } + } else + dout(0) << "use default parameters" << endl; + */ + + if (this->bdev_id <= 0) + if ((this->bdev_id = device_open(this->dev, O_RDWR)) < 0) { + dout(0) << "device open FAILED on "<< this->dev <<", errno " << errno << endl; + return -1; + } + + dout(0) << "start formating!" 
<< endl; + + uofs_format(this->bdev_id, + g_conf.uofs_onode_size, + g_conf.uofs_block_meta_ratio, + g_conf.uofs_segment_size, + g_conf.uofs_small_block_size, + g_conf.uofs_large_block_size, + g_conf.uofs_nr_hash_buckets, + g_conf.uofs_delay_allocation, + 0,//g_conf.uofs_dev_force_size, + g_conf.uofs_flush_interval, + 0); + + dout(0) << "formatting complete!" << endl; + return 0; +} + +int OBFSStore::umount(void) +{ + uofs_shutdown(); + close(this->bdev_id); + + return 0; +} + +int OBFSStore::statfs(struct statfs *sfs) +{ + return 0; +} + +bool OBFSStore::exists(object_t oid) +{ + //dout(0) << "calling function exists!" << endl; + return uofs_exist(oid); +} + +int OBFSStore::stat(object_t oid, struct stat *st) +{ + dout(0) << "calling function stat!" << endl; + if (uofs_exist(oid)) return 0; + return -1; +} + +int OBFSStore::remove(object_t oid) +{ + dout(0) << "calling remove function!" << endl; + return uofs_del(oid); +} + +int OBFSStore::truncate(object_t oid, off_t size) +{ + dout(0) << "calling truncate function!" << endl; + //return uofs_truncate(oid, size); + return -1; +} + +int OBFSStore::read(object_t oid, size_t len, + off_t offset, bufferlist &bl) +{ + //dout(0) << "calling read function!" << endl; + //dout(0) << oid << " 0 " << len << " " << offset << " 100" << endl; + + // FIXME: page-align this and we can avoid a memcpy... + bl.push_back(new buffer(len)); + return uofs_read(oid, bl.c_str(), offset, len); +} + +int OBFSStore::write(object_t oid, size_t len, + off_t offset, bufferlist& bl, bool fsync) +{ + int ret = 0; + + //dout(0) << "calling write function!" 
<< endl; + //if (whoami == 0) + // dout(0) << oid << " 0 " << len << " " << offset << " 101" << endl; + + for (list::iterator p = bl.buffers().begin(); + p != bl.buffers().end(); + p++) { + ret += uofs_write(oid, (*p).c_str(), offset, len, 0); + } + + if (fsync) + ret += uofs_sync(oid); + + return ret; +} + + +int OBFSStore::write(object_t oid, size_t len, + off_t offset, bufferlist& bl, Context *onflush) +{ + int r = write(oid, len, offset, bl, false); + g_timer.add_event_after((float)g_conf.uofs_fake_sync, onflush); + return r; +} diff --git a/branches/sage/cephmds2/osd/OBFSStore.h b/branches/sage/cephmds2/osd/OBFSStore.h new file mode 100644 index 0000000000000..cb4a6afc815d7 --- /dev/null +++ b/branches/sage/cephmds2/osd/OBFSStore.h @@ -0,0 +1,56 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + +#ifndef _OBFSSTORE_H_ +#define _OBFSSTORE_H_ + +#include "ObjectStore.h" +#include "Fake.h" + +class OBFSStore : public ObjectStore, + public FakeStoreAttrs, + public FakeStoreCollections { + int whoami; + int bdev_id; + int mounted; + char dev[128]; + char param[128]; + + public: + OBFSStore(int whoami, char *param, char *dev); + + int mount(void); + int umount(void); + int mkfs(void); + + int statfs(struct statfs *); + + bool exists(object_t oid); + int stat(object_t oid, struct stat *st); + + int remove(object_t oid); + int truncate(object_t oid, off_t size); + + int read(object_t oid, size_t len, + off_t offset, bufferlist& bl); + int write(object_t oid, size_t len, + off_t offset, bufferlist& bl, + bool fsync); + int write(object_t oid, size_t len, + off_t offset, bufferlist& bl, + Context *onflush); + +}; + +#endif diff --git a/branches/sage/cephmds2/osd/OSD.cc b/branches/sage/cephmds2/osd/OSD.cc new file mode 100644 index 0000000000000..67e84746229b0 --- /dev/null +++ b/branches/sage/cephmds2/osd/OSD.cc @@ -0,0 +1,3498 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + + + +#include "include/types.h" + +#include "OSD.h" +#include "OSDMap.h" + +#ifdef USE_OBFS +# include "OBFSStore.h" +#else +# include "FakeStore.h" +#endif + +#include "ebofs/Ebofs.h" + +#include "Ager.h" + + +#include "msg/Messenger.h" +#include "msg/Message.h" + +#include "messages/MGenericMessage.h" +#include "messages/MPing.h" +#include "messages/MPingAck.h" +#include "messages/MOSDPing.h" +#include "messages/MOSDFailure.h" +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDBoot.h" +#include "messages/MOSDIn.h" +#include "messages/MOSDOut.h" + +#include "messages/MOSDMap.h" +#include "messages/MOSDGetMap.h" +#include "messages/MOSDPGNotify.h" +#include "messages/MOSDPGQuery.h" +#include "messages/MOSDPGLog.h" +#include "messages/MOSDPGRemove.h" + +#include "common/Logger.h" +#include "common/LogType.h" +#include "common/Timer.h" +#include "common/ThreadPool.h" + +#include +#include +#include +#include + + +#include "config.h" +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << whoami << " " << (osdmap ? osdmap->get_epoch():0) << " " +#define derr(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cerr << g_clock.now() << " osd" << whoami << " " << (osdmap ? 
osdmap->get_epoch():0) << " " + +char *osd_base_path = "./osddata"; +char *ebofs_base_path = "./dev"; + + +object_t SUPERBLOCK_OBJECT(0,0); + + +// force remount hack for performance testing FakeStore +class C_Remount : public Context { + OSD *osd; +public: + C_Remount(OSD *o) : osd(o) {} + void finish(int) { + osd->force_remount(); + } +}; + +void OSD::force_remount() +{ + dout(0) << "forcing remount" << endl; + osd_lock.Lock(); + { + store->umount(); + store->mount(); + } + osd_lock.Unlock(); + dout(0) << "finished remount" << endl; +} +// + + +// cons/des + +LogType osd_logtype; + +OSD::OSD(int id, Messenger *m, MonMap *mm, char *dev) +{ + whoami = id; + messenger = m; + monmap = mm; + + osdmap = 0; + boot_epoch = 0; + + last_tid = 0; + num_pulling = 0; + + state = STATE_BOOTING; + + hb_stat_ops = 0; + hb_stat_qlen = 0; + + pending_ops = 0; + waiting_for_no_ops = false; + + if (g_conf.osd_remount_at) + g_timer.add_event_after(g_conf.osd_remount_at, new C_Remount(this)); + + + + // init object store + // try in this order: + // dev/osd$num + // dev/osd.$hostname + // dev/osd.all + + if (dev) { + strcpy(dev_path,dev); + } else { + char hostname[100]; + hostname[0] = 0; + gethostname(hostname,100); + + sprintf(dev_path, "%s/osd%d", ebofs_base_path, whoami); + + struct stat sta; + if (::lstat(dev_path, &sta) != 0) + sprintf(dev_path, "%s/osd.%s", ebofs_base_path, hostname); + + if (::lstat(dev_path, &sta) != 0) + sprintf(dev_path, "%s/osd.all", ebofs_base_path); + } + + if (g_conf.ebofs) { + store = new Ebofs(dev_path); + //store->_fake_writes(true); + } +#ifdef USE_OBFS + else if (g_conf.uofs) { + store = new OBFSStore(whoami, NULL, dev_path); + } +#endif + else { + store = new FakeStore(osd_base_path, whoami); + } + +} + +OSD::~OSD() +{ + if (threadpool) { delete threadpool; threadpool = 0; } + if (osdmap) { delete osdmap; osdmap = 0; } + //if (monitor) { delete monitor; monitor = 0; } + if (messenger) { delete messenger; messenger = 0; } + if (logger) { delete 
logger; logger = 0; } + if (store) { delete store; store = 0; } +} + +int OSD::init() +{ + osd_lock.Lock(); + { + // mkfs? + if (g_conf.osd_mkfs) { + dout(2) << "mkfs" << endl; + store->mkfs(); + + // make up a superblock + //superblock.fsid = ???; + superblock.whoami = whoami; + } + + // mount. + dout(2) << "mounting " << dev_path << endl; + int r = store->mount(); + assert(r>=0); + + if (g_conf.osd_mkfs) { + // age? + if (g_conf.osd_age_time != 0) { + dout(2) << "age" << endl; + Ager ager(store); + if (g_conf.osd_age_time < 0) + ager.load_freelist(); + else + ager.age(g_conf.osd_age_time, + g_conf.osd_age, + g_conf.osd_age - .05, + 50000, + g_conf.osd_age - .05); + } + } + else { + dout(2) << "boot" << endl; + + // read superblock + read_superblock(); + + // load up pgs (as they previously existed) + load_pgs(); + + dout(2) << "superblock: i am osd" << superblock.whoami << endl; + assert(whoami == superblock.whoami); + } + + + // log + char name[80]; + sprintf(name, "osd%02d", whoami); + logger = new Logger(name, (LogType*)&osd_logtype); + osd_logtype.add_set("opq"); + osd_logtype.add_inc("op"); + osd_logtype.add_inc("c_rd"); + osd_logtype.add_inc("c_rdb"); + osd_logtype.add_inc("c_wr"); + osd_logtype.add_inc("c_wrb"); + + osd_logtype.add_inc("r_push"); + osd_logtype.add_inc("r_pushb"); + osd_logtype.add_inc("r_wr"); + osd_logtype.add_inc("r_wrb"); + + osd_logtype.add_inc("rlnum"); + + osd_logtype.add_set("numpg"); + osd_logtype.add_set("pingset"); + + osd_logtype.add_set("buf"); + + osd_logtype.add_inc("map"); + osd_logtype.add_inc("mapi"); + osd_logtype.add_inc("mapidup"); + osd_logtype.add_inc("mapf"); + osd_logtype.add_inc("mapfdup"); + + // request thread pool + { + char name[80]; + sprintf(name,"osd%d.threadpool", whoami); + threadpool = new ThreadPool(name, g_conf.osd_maxthreads, + static_dequeueop, + this); + } + + // i'm ready! + messenger->set_dispatcher(this); + + // announce to monitor i exist and have booted. 
+ int mon = monmap->pick_mon(); + messenger->send_message(new MOSDBoot(superblock), MSG_ADDR_MON(mon), monmap->get_inst(mon)); + + // start the heart + next_heartbeat = new C_Heartbeat(this); + g_timer.add_event_after(g_conf.osd_heartbeat_interval, next_heartbeat); + } + osd_lock.Unlock(); + + //dout(0) << "osd_rep " << g_conf.osd_rep << endl; + + return 0; +} + +int OSD::shutdown() +{ + dout(1) << "shutdown, timer has " << g_timer.num_event << endl; + + if (next_heartbeat) g_timer.cancel_event(next_heartbeat); + + state = STATE_STOPPING; + + // finish ops + wait_for_no_ops(); + + // stop threads + delete threadpool; + threadpool = 0; + + // close pgs + for (hash_map::iterator p = pg_map.begin(); + p != pg_map.end(); + p++) { + delete p->second; + } + pg_map.clear(); + + // shut everything else down + //monitor->shutdown(); + messenger->shutdown(); + + osd_lock.Unlock(); + int r = store->umount(); + osd_lock.Lock(); + return r; +} + + + +void OSD::write_superblock(ObjectStore::Transaction& t) +{ + dout(10) << "write_superblock " << superblock << endl; + + bufferlist bl; + bl.append((char*)&superblock, sizeof(superblock)); + t.write(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); +} + +int OSD::read_superblock() +{ + bufferlist bl; + int r = store->read(SUPERBLOCK_OBJECT, 0, sizeof(superblock), bl); + if (bl.length() != sizeof(superblock)) { + dout(10) << "read_superblock failed, r = " << r << ", i got " << bl.length() << " bytes, not " << sizeof(superblock) << endl; + return -1; + } + + bl.copy(0, sizeof(superblock), (char*)&superblock); + + dout(10) << "read_superblock " << superblock << endl; + + // load up "current" osdmap + assert(!osdmap); + osdmap = new OSDMap; + bl.clear(); + get_map_bl(superblock.current_epoch, bl); + osdmap->decode(bl); + + assert(whoami == superblock.whoami); // fixme! 
+ return 0; +} + + +// object locks + +PG *OSD::lock_pg(pg_t pgid) +{ + osd_lock.Lock(); + PG *pg = _lock_pg(pgid); + osd_lock.Unlock(); + return pg; +} + +PG *OSD::_lock_pg(pg_t pgid) +{ + assert(pg_map.count(pgid)); + + if (pg_lock.count(pgid)) { + Cond c; + dout(15) << "lock_pg " << pgid << " waiting as " << &c << endl; + //cerr << "lock_pg " << pgid << " waiting as " << &c << endl; + + list& ls = pg_lock_waiters[pgid]; // this is commit, right? + ls.push_back(&c); + + while (pg_lock.count(pgid) || + ls.front() != &c) + c.Wait(osd_lock); + + assert(ls.front() == &c); + ls.pop_front(); + if (ls.empty()) + pg_lock_waiters.erase(pgid); + } + + dout(15) << "lock_pg " << pgid << endl; + pg_lock.insert(pgid); + + return pg_map[pgid]; +} + +void OSD::unlock_pg(pg_t pgid) +{ + osd_lock.Lock(); + _unlock_pg(pgid); + osd_lock.Unlock(); +} + +void OSD::_unlock_pg(pg_t pgid) +{ + // unlock + assert(pg_lock.count(pgid)); + pg_lock.erase(pgid); + + if (pg_lock_waiters.count(pgid)) { + // someone is in line + Cond *c = pg_lock_waiters[pgid].front(); + assert(c); + dout(15) << "unlock_pg " << pgid << " waking up next guy " << c << endl; + c->Signal(); + } else { + // nobody waiting + dout(15) << "unlock_pg " << pgid << endl; + } +} + +void OSD::_remove_pg(pg_t pgid) +{ + dout(10) << "_remove_pg " << pgid << endl; + + // remove from store + list olist; + store->collection_list(pgid, olist); + + ObjectStore::Transaction t; + { + for (list::iterator p = olist.begin(); + p != olist.end(); + p++) + t.remove(*p); + t.remove_collection(pgid); + t.remove(object_t(1,pgid)); // log too + } + store->apply_transaction(t); + + // hose from memory + delete pg_map[pgid]; + pg_map.erase(pgid); +} + + +void OSD::activate_pg(pg_t pgid, epoch_t epoch) +{ + osd_lock.Lock(); + { + if (pg_map.count(pgid)) { + PG *pg = _lock_pg(pgid); + if (pg->is_crashed() && + pg->is_replay() && + pg->get_role() == 0 && + pg->info.history.same_primary_since <= epoch) { + ObjectStore::Transaction t; + 
pg->activate(t); + store->apply_transaction(t); + } + _unlock_pg(pgid); + } + } + + // finishers? + if (finished.empty()) { + osd_lock.Unlock(); + } else { + list waiting; + waiting.splice(waiting.begin(), finished); + + osd_lock.Unlock(); + + for (list::iterator it = waiting.begin(); + it != waiting.end(); + it++) { + dispatch(*it); + } + } +} + + +// ------------------------------------- + +void OSD::heartbeat() +{ + osd_lock.Lock(); + + utime_t now = g_clock.now(); + utime_t since = now; + since.sec_ref() -= g_conf.osd_heartbeat_interval; + + // calc my stats + float avg_qlen = 0; + if (hb_stat_ops) avg_qlen = (float)hb_stat_qlen / (float)hb_stat_ops; + + dout(5) << "heartbeat " << now + << ": ops " << hb_stat_ops + << ", avg qlen " << avg_qlen + << endl; + + // reset until next time around + hb_stat_ops = 0; + hb_stat_qlen = 0; + + // send pings + set pingset; + for (hash_map::iterator i = pg_map.begin(); + i != pg_map.end(); + i++) { + PG *pg = i->second; + + // we want to ping the primary. + if (pg->get_role() <= 0) continue; + if (pg->acting.size() < 1) continue; + + if (pg->last_heartbeat < since) { + pg->last_heartbeat = now; + pingset.insert(pg->acting[0]); + } + } + for (set::iterator i = pingset.begin(); + i != pingset.end(); + i++) { + _share_map_outgoing( MSG_ADDR_OSD(*i), osdmap->get_inst(*i) ); + messenger->send_message(new MOSDPing(osdmap->get_epoch(), avg_qlen), + MSG_ADDR_OSD(*i), osdmap->get_inst(*i)); + } + + if (logger) logger->set("pingset", pingset.size()); + + // hack: fake reorg? 
+ if (osdmap && g_conf.fake_osdmap_updates) { + int mon = monmap->pick_mon(); + if ((rand() % g_conf.fake_osdmap_updates) == 0) { + //if ((rand() % (g_conf.num_osd / g_conf.fake_osdmap_updates)) == whoami / g_conf.fake_osdmap_updates) { + messenger->send_message(new MOSDIn(osdmap->get_epoch()), + MSG_ADDR_MON(mon), monmap->get_inst(mon)); + } + /* + if (osdmap->is_out(whoami)) { + messenger->send_message(new MOSDIn(osdmap->get_epoch()), + MSG_ADDR_MON(mon), monmap->get_inst(mon)); + } + else if ((rand() % g_conf.fake_osdmap_updates) == 0) { + //messenger->send_message(new MOSDOut(osdmap->get_epoch()), + //MSG_ADDR_MON(mon), monmap->get_inst(mon)); + } + } + */ + } + + // schedule next! randomly. + next_heartbeat = new C_Heartbeat(this); + float wait = .5 + ((float)(rand() % 10)/10.0) * (float)g_conf.osd_heartbeat_interval; + g_timer.add_event_after(wait, next_heartbeat); + + osd_lock.Unlock(); +} + + + +// -------------------------------------- +// dispatch + +bool OSD::_share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch) +{ + bool shared = false; + + // does client have old map? + if (who.is_client()) { + if (epoch < osdmap->get_epoch()) { + dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl; + send_incremental_map(epoch, who, inst, true); + shared = true; + } + } + + // does peer have old map? + if (who.is_osd()) { + // remember + if (peer_map_epoch[who] < epoch) + peer_map_epoch[who] = epoch; + + // older? + if (peer_map_epoch[who] < osdmap->get_epoch()) { + dout(10) << who << " has old map " << epoch << " < " << osdmap->get_epoch() << endl; + send_incremental_map(epoch, who, inst, true); + peer_map_epoch[who] = osdmap->get_epoch(); // so we don't send it again. + shared = true; + } + } + + return shared; +} + + +void OSD::_share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst) +{ + assert(dest.is_osd()); + + if (dest.is_osd()) { + // send map? 
+ if (peer_map_epoch.count(dest)) { + epoch_t pe = peer_map_epoch[dest]; + if (pe < osdmap->get_epoch()) { + send_incremental_map(pe, dest, inst, true); + peer_map_epoch[dest] = osdmap->get_epoch(); + } + } else { + // no idea about peer's epoch. + // ??? send recent ??? + // do nothing. + } + } +} + + + +void OSD::dispatch(Message *m) +{ + // lock! + osd_lock.Lock(); + + switch (m->get_type()) { + + // -- don't need lock -- + case MSG_PING: + dout(10) << "ping from " << m->get_source() << endl; + delete m; + break; + + // -- don't need OSDMap -- + + /* + // host monitor + case MSG_PING_ACK: + case MSG_FAILURE_ACK: + monitor->proc_message(m); + break; + */ + + // map and replication + case MSG_OSD_MAP: + handle_osd_map((MOSDMap*)m); + break; + + // osd + case MSG_SHUTDOWN: + shutdown(); + delete m; + break; + + + + // -- need OSDMap -- + + default: + { + // no map? starting up? + if (!osdmap) { + dout(7) << "no OSDMap, not booted" << endl; + waiting_for_osdmap.push_back(m); + break; + } + + // down? + if (osdmap->is_down(whoami)) { + dout(7) << "i am marked down, dropping " << *m << endl; + delete m; + break; + } + + + + + // need OSDMap + switch (m->get_type()) { + + case MSG_OSD_PING: + // take note. + handle_osd_ping((MOSDPing*)m); + break; + + case MSG_OSD_PG_NOTIFY: + handle_pg_notify((MOSDPGNotify*)m); + break; + case MSG_OSD_PG_QUERY: + handle_pg_query((MOSDPGQuery*)m); + break; + case MSG_OSD_PG_LOG: + handle_pg_log((MOSDPGLog*)m); + break; + case MSG_OSD_PG_REMOVE: + handle_pg_remove((MOSDPGRemove*)m); + break; + + case MSG_OSD_OP: + handle_op((MOSDOp*)m); + break; + + // for replication etc. + case MSG_OSD_OPREPLY: + handle_op_reply((MOSDOpReply*)m); + break; + + + default: + dout(1) << " got unknown message " << m->get_type() << endl; + assert(0); + } + } + } + + // finishers? 
+ if (!finished.empty()) { + list waiting; + waiting.splice(waiting.begin(), finished); + + osd_lock.Unlock(); + + for (list::iterator it = waiting.begin(); + it != waiting.end(); + it++) { + dispatch(*it); + } + return; + } + + osd_lock.Unlock(); +} + + +void OSD::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) +{ + if (g_conf.ms_die_on_failure) { + exit(0); + } + + if (dest.is_osd()) { + // failed osd. drop message, report to mon. + int mon = monmap->pick_mon(); + dout(0) << "ms_handle_failure " << dest << " inst " << inst + << ", dropping and reporting to mon" << mon + << endl; + messenger->send_message(new MOSDFailure(dest, inst, osdmap->get_epoch()), + MSG_ADDR_MON(mon), monmap->get_inst(mon)); + delete m; + } else if (dest.is_mon()) { + // resend to a different monitor. + int mon = monmap->pick_mon(true); + dout(0) << "ms_handle_failure " << dest << " inst " << inst + << ", resending to mon" << mon + << endl; + messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon)); + } + else { + // client? 
+ dout(0) << "ms_handle_failure " << dest << " inst " << inst + << ", dropping" << endl; + delete m; + } +} + +bool OSD::ms_lookup(msg_addr_t dest, entity_inst_t& inst) +{ + if (dest.is_osd()) { + assert(osdmap); + return osdmap->get_inst(dest.num(), inst); + } + + assert(0); + return false; +} + + + + +void OSD::handle_osd_ping(MOSDPing *m) +{ + dout(20) << "osdping from " << m->get_source() << endl; + _share_map_incoming(m->get_source(), m->get_source_inst(), ((MOSDPing*)m)->map_epoch); + + int from = m->get_source().num(); + peer_qlen[from] = m->avg_qlen; + + //if (!m->ack) + //messenger->send_message(new MOSDPing(osdmap->get_epoch(), true), + //m->get_source()); + + delete m; +} + + + + +// ===================================================== +// MAP + +void OSD::wait_for_new_map(Message *m) +{ + // ask + if (waiting_for_osdmap.empty()) { + int mon = monmap->pick_mon(); + messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), + MSG_ADDR_MON(mon), monmap->get_inst(mon)); + } + + waiting_for_osdmap.push_back(m); +} + + +/** update_map + * assimilate new OSDMap(s). scan pgs, etc. + */ +void OSD::handle_osd_map(MOSDMap *m) +{ + wait_for_no_ops(); + + assert(osd_lock.is_locked()); + + ObjectStore::Transaction t; + + if (osdmap) { + dout(3) << "handle_osd_map epochs [" + << m->get_first() << "," << m->get_last() + << "], i have " << osdmap->get_epoch() + << endl; + } else { + dout(3) << "handle_osd_map epochs [" + << m->get_first() << "," << m->get_last() + << "], i have none" + << endl; + osdmap = new OSDMap; + boot_epoch = m->get_last(); // hrm...? + } + + logger->inc("mapmsg"); + + // store them? + for (map::iterator p = m->maps.begin(); + p != m->maps.end(); + p++) { + object_t oid = get_osdmap_object_name(p->first); + if (store->exists(oid)) { + dout(10) << "handle_osd_map already had full map epoch " << p->first << endl; + logger->inc("mapfdup"); + bufferlist bl; + get_map_bl(p->first, bl); + dout(10) << " .. 
it is " << bl.length() << " bytes" << endl; + continue; + } + + dout(10) << "handle_osd_map got full map epoch " << p->first << endl; + //t.write(oid, 0, p->second.length(), p->second); + store->write(oid, 0, p->second.length(), p->second, 0); + + if (p->first > superblock.newest_map) + superblock.newest_map = p->first; + if (p->first < superblock.oldest_map || + superblock.oldest_map == 0) + superblock.oldest_map = p->first; + + logger->inc("mapf"); + } + for (map::iterator p = m->incremental_maps.begin(); + p != m->incremental_maps.end(); + p++) { + object_t oid = get_inc_osdmap_object_name(p->first); + if (store->exists(oid)) { + dout(10) << "handle_osd_map already had incremental map epoch " << p->first << endl; + logger->inc("mapidup"); + bufferlist bl; + get_inc_map_bl(p->first, bl); + dout(10) << " .. it is " << bl.length() << " bytes" << endl; + continue; + } + + dout(10) << "handle_osd_map got incremental map epoch " << p->first << endl; + //t.write(oid, 0, p->second.length(), p->second); + store->write(oid, 0, p->second.length(), p->second, 0); + + if (p->first > superblock.newest_map) + superblock.newest_map = p->first; + if (p->first < superblock.oldest_map || + superblock.oldest_map == 0) + superblock.oldest_map = p->first; + + logger->inc("mapi"); + } + + // advance if we can + bool advanced = false; + + if (m->get_source().is_mon() && is_booting()) + advanced = true; + + epoch_t cur = superblock.current_epoch; + while (cur < superblock.newest_map) { + bufferlist bl; + if (m->incremental_maps.count(cur+1) || + store->exists(get_inc_osdmap_object_name(cur+1))) { + dout(10) << "handle_osd_map decoding inc map epoch " << cur+1 << endl; + + bufferlist bl; + if (m->incremental_maps.count(cur+1)) + bl = m->incremental_maps[cur+1]; + else + get_inc_map_bl(cur+1, bl); + + OSDMap::Incremental inc; + int off = 0; + inc.decode(bl, off); + + osdmap->apply_incremental(inc); + + // archive the full map + bl.clear(); + osdmap->encode(bl); + t.write( 
get_osdmap_object_name(cur+1), 0, bl.length(), bl); + + // notify messenger + for (map::iterator i = inc.new_down.begin(); + i != inc.new_down.end(); + i++) { + int osd = i->first; + if (osd == whoami) continue; + messenger->mark_down(MSG_ADDR_OSD(osd), i->second); + peer_map_epoch.erase(MSG_ADDR_OSD(osd)); + + // kick any replica ops + for (hash_map::iterator it = pg_map.begin(); + it != pg_map.end(); + it++) { + PG *pg = it->second; + + _lock_pg(pg->info.pgid); + { + list ls; // do async; repop_ack() may modify pg->repop_gather + for (map::iterator p = pg->repop_gather.begin(); + p != pg->repop_gather.end(); + p++) { + //dout(-1) << "checking repop tid " << p->first << endl; + if (p->second->waitfor_ack.count(osd) || + p->second->waitfor_commit.count(osd)) + ls.push_back(p->second); + } + for (list::iterator p = ls.begin(); + p != ls.end(); + p++) + repop_ack(pg, *p, -1, true, osd); + } + _unlock_pg(pg->info.pgid); + } + } + for (map::iterator i = inc.new_up.begin(); + i != inc.new_up.end(); + i++) { + if (i->first == whoami) continue; + messenger->mark_up(MSG_ADDR_OSD(i->first), i->second); + peer_map_epoch.erase(MSG_ADDR_OSD(i->first)); + } + } + else if (m->maps.count(cur+1) || + store->exists(get_osdmap_object_name(cur+1))) { + dout(10) << "handle_osd_map decoding full map epoch " << cur+1 << endl; + bufferlist bl; + if (m->maps.count(cur+1)) + bl = m->maps[cur+1]; + else + get_map_bl(cur+1, bl); + osdmap->decode(bl); + + // FIXME BUG: need to notify messenger of ups/downs!! + } + else { + dout(10) << "handle_osd_map missing epoch " << cur+1 << endl; + int mon = monmap->pick_mon(); + messenger->send_message(new MOSDGetMap(cur), MSG_ADDR_MON(mon), monmap->get_inst(mon)); + break; + } + + cur++; + superblock.current_epoch = cur; + advance_map(t); + advanced = true; + } + + // all the way? + if (advanced && cur == superblock.newest_map) { + // yay! 
+ activate_map(t); + + // process waiters + take_waiters(waiting_for_osdmap); + } + + // write updated pg state to store + for (hash_map::iterator i = pg_map.begin(); + i != pg_map.end(); + i++) { + pg_t pgid = i->first; + PG *pg = i->second; + t.collection_setattr( pgid, "info", &pg->info, sizeof(pg->info)); + } + + // superblock and commit + write_superblock(t); + store->apply_transaction(t); + + //if (osdmap->get_epoch() == 1) store->sync(); // in case of early death, blah + + delete m; +} + + +/** + * scan placement groups, initiate any replication + * activities. + */ +void OSD::advance_map(ObjectStore::Transaction& t) +{ + dout(7) << "advance_map epoch " << osdmap->get_epoch() + << " " << pg_map.size() << " pgs" + << endl; + + if (osdmap->is_mkfs()) { + ps_t maxps = 1ULL << osdmap->get_pg_bits(); + ps_t maxlps = 1ULL << osdmap->get_localized_pg_bits(); + dout(1) << "mkfs on " << osdmap->get_pg_bits() << " bits, " << maxps << " pgs" << endl; + assert(osdmap->get_epoch() == 1); + + //cerr << "osdmap " << osdmap->get_ctime() << " logger start " << logger->get_start() << endl; + logger->set_start( osdmap->get_ctime() ); + + // create PGs + for (int nrep = 1; + nrep <= MIN(g_conf.num_osd, g_conf.osd_max_rep); // for low osd counts.. 
hackish bleh + nrep++) { + for (ps_t ps = 0; ps < maxps; ++ps) { + vector acting; + pg_t pgid = osdmap->ps_nrep_to_pg(ps, nrep); + int nrep = osdmap->pg_to_acting_osds(pgid, acting); + int role = osdmap->calc_pg_role(whoami, acting, nrep); + if (role < 0) continue; + + PG *pg = create_pg(pgid, t); + pg->set_role(role); + pg->acting.swap(acting); + pg->last_epoch_started_any = + pg->info.last_epoch_started = + pg->info.history.same_since = + pg->info.history.same_primary_since = + pg->info.history.same_acker_since = osdmap->get_epoch(); + pg->activate(t); + + dout(7) << "created " << *pg << endl; + } + + for (ps_t ps = 0; ps < maxlps; ++ps) { + // local PG too + vector acting; + pg_t pgid = osdmap->ps_osd_nrep_to_pg(ps, whoami, nrep); + int nrep = osdmap->pg_to_acting_osds(pgid, acting); + int role = osdmap->calc_pg_role(whoami, acting, nrep); + + PG *pg = create_pg(pgid, t); + pg->acting.swap(acting); + pg->set_role(role); + pg->last_epoch_started_any = + pg->info.last_epoch_started = + pg->info.history.same_primary_since = + pg->info.history.same_acker_since = + pg->info.history.same_since = osdmap->get_epoch(); + pg->activate(t); + + dout(7) << "created " << *pg << endl; + } + } + + dout(1) << "mkfs done, created " << pg_map.size() << " pgs" << endl; + + } else { + // scan existing pg's + for (hash_map::iterator it = pg_map.begin(); + it != pg_map.end(); + it++) { + pg_t pgid = it->first; + PG *pg = it->second; + + // did i finish this epoch? + if (pg->is_active()) { + pg->info.last_epoch_finished = osdmap->get_epoch()-1; + } + + // get new acting set + vector tacting; + int nrep = osdmap->pg_to_acting_osds(pgid, tacting); + int role = osdmap->calc_pg_role(whoami, tacting, nrep); + + // no change? + if (tacting == pg->acting) + continue; + + // -- there was a change! 
-- + _lock_pg(pgid); + + int oldrole = pg->get_role(); + int oldprimary = pg->get_primary(); + int oldacker = pg->get_acker(); + vector oldacting = pg->acting; + + // update PG + pg->acting.swap(tacting); + pg->set_role(role); + + // did primary|acker change? + pg->info.history.same_since = osdmap->get_epoch(); + if (oldprimary != pg->get_primary()) { + pg->info.history.same_primary_since = osdmap->get_epoch(); + pg->cancel_recovery(); + } + if (oldacker != pg->get_acker()) { + pg->info.history.same_acker_since = osdmap->get_epoch(); + } + + // deactivate. + pg->state_clear(PG::STATE_ACTIVE); + + // reset primary state? + if (oldrole == 0 || pg->get_role() == 0) + pg->clear_primary_state(); + + // apply any repops in progress. + if (oldacker == whoami) { + // apply repops + for (map::iterator p = pg->repop_gather.begin(); + p != pg->repop_gather.end(); + p++) { + if (!p->second->applied) + apply_repop(pg, p->second); + delete p->second->op; + delete p->second; + } + pg->repop_gather.clear(); + + // and repop waiters + for (map >::iterator p = pg->waiting_for_repop.begin(); + p != pg->waiting_for_repop.end(); + p++) + for (list::iterator pm = p->second.begin(); + pm != p->second.end(); + pm++) + delete *pm; + pg->waiting_for_repop.clear(); + } + + if (role != oldrole) { + // old primary? + if (oldrole == 0) { + pg->state_clear(PG::STATE_CLEAN); + + // take replay queue waiters + list ls; + for (map::iterator it = pg->replay_queue.begin(); + it != pg->replay_queue.end(); + it++) + ls.push_back(it->second); + pg->replay_queue.clear(); + take_waiters(ls); + + // take active waiters + take_waiters(pg->waiting_for_active); + + // take object waiters + for (hash_map >::iterator it = pg->waiting_for_missing_object.begin(); + it != pg->waiting_for_missing_object.end(); + it++) + take_waiters(it->second); + pg->waiting_for_missing_object.clear(); + } + + // new primary? 
+ if (role == 0) { + // i am new primary + pg->state_clear(PG::STATE_STRAY); + } else { + // i am now replica|stray. we need to send a notify. + pg->state_set(PG::STATE_STRAY); + + if (nrep == 0) { + pg->state_set(PG::STATE_CRASHED); + dout(1) << *pg << " is crashed" << endl; + } + } + + // my role changed. + dout(10) << *pg << " " << oldacting << " -> " << pg->acting + << ", role " << oldrole << " -> " << role << endl; + + } else { + // no role change. + // did primary change? + if (pg->get_primary() != oldprimary) { + // we need to announce + pg->state_set(PG::STATE_STRAY); + + dout(10) << *pg << " " << oldacting << " -> " << pg->acting + << ", acting primary " + << oldprimary << " -> " << pg->get_primary() + << endl; + } else { + // primary is the same. + if (role == 0) { + // i am (still) primary. but my replica set changed. + pg->state_clear(PG::STATE_CLEAN); + pg->state_clear(PG::STATE_REPLAY); + + dout(10) << *pg << " " << oldacting << " -> " << pg->acting + << ", replicas changed" << endl; + } + } + } + + + _unlock_pg(pgid); + } + } +} + +void OSD::activate_map(ObjectStore::Transaction& t) +{ + dout(7) << "activate_map version " << osdmap->get_epoch() << endl; + + map< int, list > notify_list; // primary -> list + map< int, map > query_map; // peer -> PG -> get_summary_since + + // scan pg's + for (hash_map::iterator it = pg_map.begin(); + it != pg_map.end(); + it++) { + //pg_t pgid = it->first; + PG *pg = it->second; + + if (pg->is_active()) { + // update started counter + pg->info.last_epoch_started = osdmap->get_epoch(); + } + else if (pg->get_role() == 0 && !pg->is_active()) { + // i am (inactive) primary + pg->build_prior(); + pg->peer(t, query_map); + } + else if (pg->is_stray() && + pg->get_primary() >= 0) { + // i am residual|replica + notify_list[pg->get_primary()].push_back(pg->info); + } + + } + + if (osdmap->is_mkfs()) // hack: skip the queries/summaries if it's a mkfs + return; + + // notify? 
(residual|replica) + do_notifies(notify_list); + + // do queries. + do_queries(query_map); + + logger->set("numpg", pg_map.size()); +} + + +void OSD::send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full) +{ + dout(10) << "send_incremental_map " << since << " -> " << osdmap->get_epoch() + << " to " << dest << endl; + + MOSDMap *m = new MOSDMap; + + for (epoch_t e = osdmap->get_epoch(); + e > since; + e--) { + bufferlist bl; + if (get_inc_map_bl(e,bl)) { + m->incremental_maps[e].claim(bl); + } else if (get_map_bl(e,bl)) { + m->maps[e].claim(bl); + if (!full) break; + } + else { + assert(0); // we should have all maps. + } + } + + messenger->send_message(m, dest, inst); +} + +bool OSD::get_map_bl(epoch_t e, bufferlist& bl) +{ + return store->read(get_osdmap_object_name(e), 0, 0, bl) >= 0; +} + +bool OSD::get_inc_map_bl(epoch_t e, bufferlist& bl) +{ + return store->read(get_inc_osdmap_object_name(e), 0, 0, bl) >= 0; +} + +void OSD::get_map(epoch_t epoch, OSDMap &m) +{ + // find a complete map + list incs; + epoch_t e; + for (e = epoch; e > 0; e--) { + bufferlist bl; + if (get_map_bl(e, bl)) { + //dout(10) << "get_map " << epoch << " full " << e << endl; + m.decode(bl); + break; + } else { + OSDMap::Incremental inc; + bool got = get_inc_map(e, inc); + assert(got); + incs.push_front(inc); + } + } + assert(e > 0); + + // apply incrementals + for (e++; e <= epoch; e++) { + //dout(10) << "get_map " << epoch << " inc " << e << endl; + m.apply_incremental( incs.front() ); + incs.pop_front(); + } +} + + +bool OSD::get_inc_map(epoch_t e, OSDMap::Incremental &inc) +{ + bufferlist bl; + if (!get_inc_map_bl(e, bl)) + return false; + int off = 0; + inc.decode(bl, off); + return true; +} + + + + + +bool OSD::require_current_map(Message *m, epoch_t ep) +{ + // older map? + if (ep < osdmap->get_epoch()) { + dout(7) << "require_current_map epoch " << ep << " < " << osdmap->get_epoch() << endl; + delete m; // discard and ignore. 
+ return false; + } + + // newer map? + if (ep > osdmap->get_epoch()) { + dout(7) << "require_current_map epoch " << ep << " > " << osdmap->get_epoch() << endl; + wait_for_new_map(m); + return false; + } + + assert(ep == osdmap->get_epoch()); + return true; +} + + +/* + * require that we have same (or newer) map, and that + * the source is the pg primary. + */ +bool OSD::require_same_or_newer_map(Message *m, epoch_t epoch) +{ + dout(10) << "require_same_or_newer_map " << epoch << " (i am " << osdmap->get_epoch() << ")" << endl; + + // newer map? + if (epoch > osdmap->get_epoch()) { + dout(7) << " from newer map epoch " << epoch << " > " << osdmap->get_epoch() << endl; + wait_for_new_map(m); + return false; + } + + if (epoch < boot_epoch) { + dout(7) << " from pre-boot epoch " << epoch << " < " << boot_epoch << endl; + delete m; + return false; + } + + return true; +} + + + + +// ====================================================== +// REPLICATION + +// PG + +bool OSD::pg_exists(pg_t pgid) +{ + return store->collection_exists(pgid); +} + +PG *OSD::create_pg(pg_t pgid, ObjectStore::Transaction& t) +{ + if (pg_map.count(pgid)) { + dout(0) << "create_pg on " << pgid << ", already have " << *pg_map[pgid] << endl; + } + assert(pg_map.count(pgid) == 0); + assert(!pg_exists(pgid)); + + PG *pg = new PG(this, pgid); + pg_map[pgid] = pg; + + t.create_collection(pgid); + + return pg; +} + + + + +PG *OSD::get_pg(pg_t pgid) +{ + if (pg_map.count(pgid)) + return pg_map[pgid]; + return 0; +} + +void OSD::load_pgs() +{ + dout(10) << "load_pgs" << endl; + assert(pg_map.empty()); + + list ls; + store->list_collections(ls); + + for (list::iterator it = ls.begin(); + it != ls.end(); + it++) { + pg_t pgid = *it; + + PG *pg = new PG(this, pgid); + pg_map[pgid] = pg; + + // read pg info + store->collection_getattr(pgid, "info", &pg->info, sizeof(pg->info)); + + // read pg log + pg->read_log(store); + + // generate state for current mapping + int nrep = osdmap->pg_to_acting_osds(pgid, 
pg->acting); + int role = osdmap->calc_pg_role(whoami, pg->acting, nrep); + pg->set_role(role); + + dout(10) << "load_pgs loaded " << *pg << " " << pg->log << endl; + } +} + +/** + * check epochs starting from start to verify the pg acting set hasn't changed + * up until now + */ +void OSD::project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from) +{ + dout(15) << "project_pg_history " << pgid + << " from " << from << " to " << osdmap->get_epoch() + << ", start " << h + << endl; + + vector last; + osdmap->pg_to_acting_osds(pgid, last); + + for (epoch_t e = osdmap->get_epoch()-1; + e >= from; + e--) { + // verify during intermediate epoch + OSDMap oldmap; + get_map(e, oldmap); + + vector acting; + oldmap.pg_to_acting_osds(pgid, acting); + + // acting set change? + if (acting != last && + e <= h.same_since) { + dout(15) << "project_pg_history " << pgid << " changed in " << e+1 + << " from " << acting << " -> " << last << endl; + h.same_since = e+1; + } + + // primary change? + if (!(!acting.empty() && !last.empty() && acting[0] == last[0]) && + e <= h.same_primary_since) { + dout(15) << "project_pg_history " << pgid << " primary changed in " << e+1 << endl; + h.same_primary_since = e+1; + + if (g_conf.osd_rep == OSD_REP_PRIMARY) + h.same_acker_since = h.same_primary_since; + } + + // acker change? + if (g_conf.osd_rep != OSD_REP_PRIMARY) { + if (!(!acting.empty() && !last.empty() && acting[acting.size()-1] == last[last.size()-1]) && + e <= h.same_acker_since) { + dout(15) << "project_pg_history " << pgid << " acker changed in " << e+1 << endl; + h.same_acker_since = e+1; + } + } + + if (h.same_since > e && + h.same_primary_since > e && + h.same_acker_since > e) break; + } + + dout(15) << "project_pg_history end " << h << endl; +} + + +/** do_notifies + * Send an MOSDPGNotify to a primary, with a list of PGs that I have + * content for, and they are primary for. 
+ */ + +void OSD::do_notifies(map< int, list >& notify_list) +{ + for (map< int, list >::iterator it = notify_list.begin(); + it != notify_list.end(); + it++) { + if (it->first == whoami) { + dout(7) << "do_notify osd" << it->first << " is self, skipping" << endl; + continue; + } + dout(7) << "do_notify osd" << it->first << " on " << it->second.size() << " PGs" << endl; + MOSDPGNotify *m = new MOSDPGNotify(osdmap->get_epoch(), it->second); + _share_map_outgoing(MSG_ADDR_OSD(it->first), osdmap->get_inst(it->first)); + messenger->send_message(m, MSG_ADDR_OSD(it->first), osdmap->get_inst(it->first)); + } +} + + +/** do_queries + * send out pending queries for info | summaries + */ +void OSD::do_queries(map< int, map >& query_map) +{ + for (map< int, map >::iterator pit = query_map.begin(); + pit != query_map.end(); + pit++) { + int who = pit->first; + dout(7) << "do_queries querying osd" << who + << " on " << pit->second.size() << " PGs" << endl; + + MOSDPGQuery *m = new MOSDPGQuery(osdmap->get_epoch(), + pit->second); + _share_map_outgoing(MSG_ADDR_OSD(who), osdmap->get_inst(who)); + messenger->send_message(m, MSG_ADDR_OSD(who), osdmap->get_inst(who)); + } +} + + + + +/** PGNotify + * from non-primary to primary + * includes PG::Info. + * NOTE: called with opqueue active. + */ +void OSD::handle_pg_notify(MOSDPGNotify *m) +{ + dout(7) << "handle_pg_notify from " << m->get_source() << endl; + int from = MSG_ADDR_NUM(m->get_source()); + + if (!require_same_or_newer_map(m, m->get_epoch())) return; + + ObjectStore::Transaction t; + + // look for unknown PGs i'm primary for + map< int, map > query_map; + + for (list::iterator it = m->get_pg_list().begin(); + it != m->get_pg_list().end(); + it++) { + pg_t pgid = it->pgid; + PG *pg; + + if (pg_map.count(pgid) == 0) { + // same primary? 
+ PG::Info::History history = it->history; + project_pg_history(pgid, history, m->get_epoch()); + + if (m->get_epoch() < history.same_primary_since) { + dout(10) << "handle_pg_notify pg " << pgid << " dne, and primary changed in " + << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl; + continue; + } + + // ok, create PG! + pg = create_pg(pgid, t); + osdmap->pg_to_acting_osds(pgid, pg->acting); + pg->set_role(0); + pg->info.history = history; + + pg->last_epoch_started_any = it->last_epoch_started; + pg->build_prior(); + + t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info)); + + dout(10) << *pg << " is new" << endl; + + // kick any waiters + if (waiting_for_pg.count(pgid)) { + take_waiters(waiting_for_pg[pgid]); + waiting_for_pg.erase(pgid); + } + + _lock_pg(pgid); + } else { + // already had it. am i (still) the primary? + pg = _lock_pg(pgid); + if (m->get_epoch() < pg->info.history.same_primary_since) { + dout(10) << *pg << " handle_pg_notify primary changed in " + << pg->info.history.same_primary_since + << " (msg from " << m->get_epoch() << ")" << endl; + _unlock_pg(pgid); + continue; + } + } + + // ok! + + // stray? + bool acting = pg->is_acting(from); + if (!acting && (*it).last_epoch_started > 0) { + dout(10) << *pg << " osd" << from << " has stray content: " << *it << endl; + pg->stray_set.insert(from); + pg->state_clear(PG::STATE_CLEAN); + } + + // save info. + bool had = pg->peer_info.count(from); + pg->peer_info[from] = *it; + + if (had) { + if (pg->is_active() && + (*it).is_clean() && acting) { + pg->clean_set.insert(from); + dout(10) << *pg << " osd" << from << " now clean (" << pg->clean_set + << "): " << *it << endl; + if (pg->is_all_clean()) { + dout(-10) << *pg << " now clean on all replicas" << endl; + pg->state_set(PG::STATE_CLEAN); + pg->clean_replicas(); + } + } else { + // hmm, maybe keep an eye out for cases where we see this, but peer should happen. 
+ dout(10) << *pg << " already had notify info from osd" << from << ": " << *it << endl; + } + } else { + // adjust prior? + if (it->last_epoch_started > pg->last_epoch_started_any) + pg->adjust_prior(); + + // peer + pg->peer(t, query_map); + } + + _unlock_pg(pgid); + } + + unsigned tr = store->apply_transaction(t); + assert(tr == 0); + + do_queries(query_map); + + delete m; +} + + + +/** PGLog + * from non-primary to primary + * includes log and info + * from primary to non-primary + * includes log for use in recovery + * NOTE: called with opqueue active. + */ + +void OSD::handle_pg_log(MOSDPGLog *m) +{ + int from = MSG_ADDR_NUM(m->get_source()); + const pg_t pgid = m->get_pgid(); + + if (!require_same_or_newer_map(m, m->get_epoch())) return; + if (pg_map.count(pgid) == 0) { + dout(10) << "handle_pg_log don't have pg " << pgid << ", dropping" << endl; + assert(m->get_epoch() < osdmap->get_epoch()); + delete m; + return; + } + + PG *pg = _lock_pg(pgid); + assert(pg); + + if (m->get_epoch() < pg->info.history.same_since) { + dout(10) << "handle_pg_log " << *pg + << " from " << m->get_source() + << " is old, discarding" + << endl; + delete m; + return; + } + + dout(7) << "handle_pg_log " << *pg + << " got " << m->log << " " << m->missing + << " from " << m->get_source() << endl; + + //m->log.print(cout); + + ObjectStore::Transaction t; + + if (pg->is_primary()) { + // i am PRIMARY + assert(pg->peer_log_requested.count(from) || + pg->peer_summary_requested.count(from)); + + pg->proc_replica_log(m->log, m->missing, from); + + // peer + map< int, map > query_map; + pg->peer(t, query_map); + do_queries(query_map); + + } else { + // i am REPLICA + dout(10) << *pg << " got " << m->log << " " << m->missing << endl; + + // merge log + pg->merge_log(m->log, m->missing, from); + pg->proc_missing(m->log, m->missing, from); + assert(pg->missing.num_lost() == 0); + + // ok activate! 
+ pg->activate(t); + } + + unsigned tr = store->apply_transaction(t); + assert(tr == 0); + + _unlock_pg(pgid); + + delete m; +} + + +/** PGQuery + * from primary to replica | stray + * NOTE: called with opqueue active. + */ +void OSD::handle_pg_query(MOSDPGQuery *m) +{ + dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << endl; + int from = MSG_ADDR_NUM(m->get_source()); + + if (!require_same_or_newer_map(m, m->get_epoch())) return; + + map< int, list > notify_list; + + for (map::iterator it = m->pg_list.begin(); + it != m->pg_list.end(); + it++) { + pg_t pgid = it->first; + PG *pg = 0; + + if (pg_map.count(pgid) == 0) { + // same primary? + PG::Info::History history = it->second.history; + project_pg_history(pgid, history, m->get_epoch()); + + if (m->get_epoch() < history.same_since) { + dout(10) << " pg " << pgid << " dne, and pg has changed in " + << history.same_primary_since << " (msg from " << m->get_epoch() << ")" << endl; + continue; + } + + // get active rush mapping + vector acting; + int nrep = osdmap->pg_to_acting_osds(pgid, acting); + int role = osdmap->calc_pg_role(whoami, acting, nrep); + + if (role < 0) { + dout(10) << " pg " << pgid << " dne, and i am not an active replica" << endl; + PG::Info empty(pgid); + notify_list[from].push_back(empty); + continue; + } + assert(role > 0); + + ObjectStore::Transaction t; + pg = create_pg(pgid, t); + pg->acting.swap( acting ); + pg->set_role(role); + pg->info.history = history; + + t.collection_setattr(pgid, "info", (char*)&pg->info, sizeof(pg->info)); + store->apply_transaction(t); + + dout(10) << *pg << " dne (before), but i am role " << role << endl; + _lock_pg(pgid); + } else { + pg = _lock_pg(pgid); + + // same primary? 
+ if (m->get_epoch() < pg->info.history.same_since) { + dout(10) << *pg << " handle_pg_query primary changed in " + << pg->info.history.same_since + << " (msg from " << m->get_epoch() << ")" << endl; + _unlock_pg(pgid); + continue; + } + } + + // ok, process query! + assert(!pg->acting.empty()); + assert(from == pg->acting[0]); + + if (it->second.type == PG::Query::INFO) { + // info + dout(10) << *pg << " sending info" << endl; + notify_list[from].push_back(pg->info); + } else { + MOSDPGLog *m = new MOSDPGLog(osdmap->get_epoch(), pg->get_pgid()); + m->info = pg->info; + m->missing = pg->missing; + + if (it->second.type == PG::Query::LOG) { + dout(10) << *pg << " sending info+missing+log since split " << it->second.split + << " from floor " << it->second.floor + << endl; + if (!m->log.copy_after_unless_divergent(pg->log, it->second.split, it->second.floor)) { + dout(10) << *pg << " divergent, sending backlog" << endl; + it->second.type = PG::Query::BACKLOG; + } + } + + if (it->second.type == PG::Query::BACKLOG) { + dout(10) << *pg << " sending info+missing+backlog" << endl; + if (pg->log.backlog) { + m->log = pg->log; + } else { + pg->generate_backlog(); + m->log = pg->log; + pg->drop_backlog(); + } + } + else if (it->second.type == PG::Query::FULLLOG) { + dout(10) << *pg << " sending info+missing+full log" << endl; + m->log.copy_non_backlog(pg->log); + } + + dout(10) << *pg << " sending " << m->log << " " << m->missing << endl; + //m->log.print(cout); + + _share_map_outgoing(MSG_ADDR_OSD(from), osdmap->get_inst(from)); + messenger->send_message(m, MSG_ADDR_OSD(from), osdmap->get_inst(from)); + } + + _unlock_pg(pgid); + } + + do_notifies(notify_list); + + delete m; +} + + +void OSD::handle_pg_remove(MOSDPGRemove *m) +{ + dout(7) << "handle_pg_remove from " << m->get_source() << endl; + + if (!require_same_or_newer_map(m, m->get_epoch())) return; + + for (set::iterator it = m->pg_list.begin(); + it != m->pg_list.end(); + it++) { + pg_t pgid = *it; + PG *pg; + + if 
(pg_map.count(pgid) == 0) { + dout(10) << " don't have pg " << pgid << endl; + continue; + } + + pg = _lock_pg(pgid); + + dout(10) << *pg << " removing." << endl; + assert(pg->get_role() == -1); + + _remove_pg(pgid); + + // unlock. there shouldn't be any waiters, since we're a stray, and pg is presumably clean0. + assert(pg_lock_waiters.count(pgid) == 0); + _unlock_pg(pgid); + } + + delete m; +} + + + + + + +/*** RECOVERY ***/ + +/** pull - request object from a peer + */ +void OSD::pull(PG *pg, object_t oid) +{ + assert(pg->missing.loc.count(oid)); + eversion_t v = pg->missing.missing[oid]; + int osd = pg->missing.loc[oid]; + + dout(7) << *pg << " pull " << oid + << " v " << v + << " from osd" << osd + << endl; + + // send op + tid_t tid = ++last_tid; + MOSDOp *op = new MOSDOp(tid, messenger->get_myaddr(), + oid, pg->get_pgid(), + osdmap->get_epoch(), + OSD_OP_PULL); + op->set_version(v); + messenger->send_message(op, MSG_ADDR_OSD(osd), osdmap->get_inst(osd)); + + // take note + assert(pg->objects_pulling.count(oid) == 0); + num_pulling++; + pg->objects_pulling[oid] = v; +} + + +/** push - send object to a peer + */ +void OSD::push(PG *pg, object_t oid, int dest) +{ + // read data+attrs + bufferlist bl; + eversion_t v; + int vlen = sizeof(v); + map attrset; + + ObjectStore::Transaction t; + t.read(oid, 0, 0, &bl); + t.getattr(oid, "version", &v, &vlen); + t.getattrs(oid, attrset); + unsigned tr = store->apply_transaction(t); + + assert(tr == 0); // !!! + + // ok + dout(7) << *pg << " push " << oid << " v " << v + << " size " << bl.length() + << " to osd" << dest + << endl; + + logger->inc("r_push"); + logger->inc("r_pushb", bl.length()); + + // send + MOSDOp *op = new MOSDOp(++last_tid, MSG_ADDR_OSD(whoami), + oid, pg->info.pgid, osdmap->get_epoch(), + OSD_OP_PUSH); + op->set_offset(0); + op->set_length(bl.length()); + op->set_data(bl); // note: claims bl, set length above here! 
+ op->set_version(v); + op->set_attrset(attrset); + + messenger->send_message(op, MSG_ADDR_OSD(dest), osdmap->get_inst(dest)); +} + + +/** op_pull + * process request to pull an entire object. + * NOTE: called from opqueue. + */ +void OSD::op_pull(MOSDOp *op, PG *pg) +{ + const object_t oid = op->get_oid(); + const eversion_t v = op->get_version(); + int from = op->get_source().num(); + + dout(7) << *pg << " op_pull " << oid << " v " << op->get_version() + << " from " << op->get_source() + << endl; + + // is a replica asking? are they missing it? + if (pg->is_primary()) { + // primary + assert(pg->peer_missing.count(from)); // we had better know this, from the peering process. + + if (!pg->peer_missing[from].is_missing(oid)) { + dout(7) << *pg << " op_pull replica isn't actually missing it, we must have already pushed to them" << endl; + delete op; + return; + } + + // do we have it yet? + if (waitfor_missing_object(op, pg)) + return; + } else { + // non-primary + if (pg->missing.is_missing(oid)) { + dout(7) << *pg << " op_pull not primary, and missing " << oid << ", ignoring" << endl; + delete op; + return; + } + } + + // push it back! + push(pg, oid, op->get_source().num()); +} + + +/** op_push + * NOTE: called from opqueue. + */ +void OSD::op_push(MOSDOp *op, PG *pg) +{ + object_t oid = op->get_oid(); + eversion_t v = op->get_version(); + + if (!pg->missing.is_missing(oid)) { + dout(7) << *pg << " op_push not missing " << oid << endl; + return; + } + + dout(7) << *pg << " op_push " + << oid + << " v " << v + << " size " << op->get_length() << " " << op->get_data().length() + << endl; + + assert(op->get_data().length() == op->get_length()); + + // write object and add it to the PG + ObjectStore::Transaction t; + t.remove(oid); // in case old version exists + t.write(oid, 0, op->get_length(), op->get_data()); + t.setattrs(oid, op->get_attrset()); + t.collection_add(pg->info.pgid, oid); + + // close out pull op? 
+ num_pulling--; + if (pg->objects_pulling.count(oid)) + pg->objects_pulling.erase(oid); + pg->missing.got(oid, v); + + + // raise last_complete? + assert(pg->log.complete_to != pg->log.log.end()); + while (pg->log.complete_to != pg->log.log.end()) { + if (pg->missing.missing.count(pg->log.complete_to->oid)) break; + if (pg->info.last_complete < pg->log.complete_to->version) + pg->info.last_complete = pg->log.complete_to->version; + pg->log.complete_to++; + } + dout(10) << *pg << " last_complete now " << pg->info.last_complete << endl; + + + // apply to disk! + t.collection_setattr(pg->info.pgid, "info", &pg->info, sizeof(pg->info)); + unsigned r = store->apply_transaction(t); + assert(r == 0); + + + + // am i primary? are others missing this too? + if (pg->is_primary()) { + for (unsigned i=1; iacting.size(); i++) { + int peer = pg->acting[i]; + assert(pg->peer_missing.count(peer)); + if (pg->peer_missing[peer].is_missing(oid)) { + // ok, push it, and they (will) have it now. + pg->peer_missing[peer].got(oid, v); + push(pg, oid, peer); + } + } + } + + // continue recovery + pg->do_recovery(); + + // kick waiters + if (pg->waiting_for_missing_object.count(oid)) + take_waiters(pg->waiting_for_missing_object[oid]); + + delete op; +} + + + + +// op_rep_modify + +// commit (to disk) callback +class C_OSD_RepModifyCommit : public Context { +public: + OSD *osd; + MOSDOp *op; + int destosd; + + eversion_t pg_last_complete; + + Mutex lock; + Cond cond; + bool acked; + bool waiting; + + C_OSD_RepModifyCommit(OSD *o, MOSDOp *oo, int dosd, eversion_t lc) : + osd(o), op(oo), destosd(dosd), pg_last_complete(lc), + acked(false), waiting(false) { } + void finish(int r) { + lock.Lock(); + assert(!waiting); + while (!acked) { + waiting = true; + cond.Wait(lock); + } + assert(acked); + lock.Unlock(); + osd->op_rep_modify_commit(op, destosd, pg_last_complete); + } + void ack() { + lock.Lock(); + assert(!acked); + acked = true; + if (waiting) cond.Signal(); + + // discard my reference 
to buffer + op->get_data().clear(); + + lock.Unlock(); + } +}; + +void OSD::op_rep_modify_commit(MOSDOp *op, int ackerosd, eversion_t last_complete) +{ + // send commit. + dout(10) << "rep_modify_commit on op " << *op + << ", sending commit to osd" << ackerosd + << endl; + MOSDOpReply *commit = new MOSDOpReply(op, 0, osdmap->get_epoch(), true); + commit->set_pg_complete_thru(last_complete); + messenger->send_message(commit, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd)); + delete op; +} + +// process a modification operation + +class C_OSD_WriteCommit : public Context { +public: + OSD *osd; + pg_t pgid; + tid_t rep_tid; + eversion_t pg_last_complete; + C_OSD_WriteCommit(OSD *o, pg_t p, tid_t rt, eversion_t lc) : osd(o), pgid(p), rep_tid(rt), pg_last_complete(lc) {} + void finish(int r) { + osd->op_modify_commit(pgid, rep_tid, pg_last_complete); + } +}; + + +/** op_rep_modify + * process a replicated modify. + * NOTE: called from opqueue. + */ +void OSD::op_rep_modify(MOSDOp *op, PG *pg) +{ + object_t oid = op->get_oid(); + eversion_t nv = op->get_version(); + + const char *opname = MOSDOp::get_opname(op->get_op()); + + // check crev + objectrev_t crev = 0; + store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); + + dout(10) << "op_rep_modify " << opname + << " " << oid + << " v " << nv + << " " << op->get_offset() << "~" << op->get_length() + << " in " << *pg + << endl; + + // we better not be missing this. + assert(!pg->missing.is_missing(oid)); + + // prepare our transaction + ObjectStore::Transaction t; + + // am i acker? + PG::RepOpGather *repop = 0; + int ackerosd = pg->acting[0]; + + if ((g_conf.osd_rep == OSD_REP_CHAIN || g_conf.osd_rep == OSD_REP_SPLAY)) { + ackerosd = pg->get_acker(); + + if (pg->is_acker()) { + // i am tail acker. 
+ if (pg->repop_gather.count(op->get_rep_tid())) { + repop = pg->repop_gather[ op->get_rep_tid() ]; + } else { + repop = new_repop_gather(pg, op); + } + + // infer ack from source + int fromosd = op->get_source().num(); + get_repop_gather(repop); + { + //assert(repop->waitfor_ack.count(fromosd)); // no, we may come thru here twice. + repop->waitfor_ack.erase(fromosd); + } + put_repop_gather(pg, repop); + + // prepare dest socket + //messenger->prepare_send_message(op->get_client()); + } + + // chain? forward? + if (g_conf.osd_rep == OSD_REP_CHAIN && !pg->is_acker()) { + // chain rep, not at the tail yet. + int myrank = osdmap->calc_pg_rank(whoami, pg->acting); + int next = myrank+1; + if (next == (int)pg->acting.size()) + next = 1; + issue_repop(pg, op, pg->acting[next]); + } + } + + // do op? + C_OSD_RepModifyCommit *oncommit = 0; + + logger->inc("r_wr"); + logger->inc("r_wrb", op->get_length()); + + if (repop) { + // acker. we'll apply later. + if (op->get_op() != OSD_OP_WRNOOP) { + prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to()); + prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg); + } + } else { + // middle|replica. + if (op->get_op() != OSD_OP_WRNOOP) { + prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, op->get_pg_trim_to()); + prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg); + } + + oncommit = new C_OSD_RepModifyCommit(this, op, ackerosd, pg->info.last_complete); + + // apply log update. and possibly update itself. + unsigned tr = store->apply_transaction(t, oncommit); + if (tr != 0 && // no errors + tr != 2) { // or error on collection_add + cerr << "error applying transaction: r = " << tr << endl; + assert(tr == 0); + } + } + + // ack? + if (repop) { + // (logical) local ack. this may induce the actual update. 
+ get_repop_gather(repop); + { + assert(repop->waitfor_ack.count(whoami)); + repop->waitfor_ack.erase(whoami); + } + put_repop_gather(pg, repop); + } + else { + // send ack to acker? + if (g_conf.osd_rep != OSD_REP_CHAIN) { + MOSDOpReply *ack = new MOSDOpReply(op, 0, osdmap->get_epoch(), false); + messenger->send_message(ack, MSG_ADDR_OSD(ackerosd), osdmap->get_inst(ackerosd)); + } + + // ack myself. + assert(oncommit); + oncommit->ack(); + } +} + + +// ========================================================= +// OPS + +void OSD::handle_op(MOSDOp *op) +{ + const pg_t pgid = op->get_pg(); + PG *pg = get_pg(pgid); + + + logger->set("buf", buffer_total_alloc); + + // update qlen stats + hb_stat_ops++; + hb_stat_qlen += pending_ops; + + + // require same or newer map + if (!require_same_or_newer_map(op, op->get_map_epoch())) return; + + // share our map with sender, if they're old + _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch()); + + // what kind of op? + bool read = op->get_op() < 10; // read, stat. but not pull. + + if (!op->get_source().is_osd()) { + // REGULAR OP (non-replication) + + // note original source + op->set_client_inst( op->get_source_inst() ); + op->clear_payload(); // and hose encoded payload (in case we forward) + + // have pg? + if (!pg) { + dout(7) << "hit non-existent pg " + << pgid + << ", waiting" << endl; + waiting_for_pg[pgid].push_back(op); + return; + } + + if (read) { + // read. am i the (same) acker? + if (//pg->get_acker() != whoami || + op->get_map_epoch() < pg->info.history.same_acker_since) { + dout(7) << "acting acker is osd" << pg->get_acker() + << " since " << pg->info.history.same_acker_since + << ", dropping" << endl; + assert(op->get_map_epoch() < osdmap->get_epoch()); + delete op; + return; + } + } else { + // write. am i the (same) primary? 
+ if (pg->get_primary() != whoami || + op->get_map_epoch() < pg->info.history.same_primary_since) { + dout(7) << "acting primary is osd" << pg->get_primary() + << " since " << pg->info.history.same_primary_since + << ", dropping" << endl; + assert(op->get_map_epoch() < osdmap->get_epoch()); + delete op; + return; + } + } + + // must be active. + if (!pg->is_active()) { + // replay? + if (op->get_version().version > 0) { + if (op->get_version() > pg->info.last_update) { + dout(7) << *pg << " queueing replay at " << op->get_version() + << " for " << *op << endl; + pg->replay_queue[op->get_version()] = op; + return; + } else { + dout(7) << *pg << " replay at " << op->get_version() << " <= " << pg->info.last_update + << " for " << *op + << ", will queue for WRNOOP" << endl; + } + } + + dout(7) << *pg << " not active (yet)" << endl; + pg->waiting_for_active.push_back(op); + return; + } + + // missing object? + if (read && op->get_oid().rev > 0) { + // versioned read. hrm. + // are we missing a revision that we might need? + object_t moid = op->get_oid(); + if (pick_missing_object_rev(moid, pg)) { + // is there a local revision we might use instead? + object_t loid = op->get_oid(); + if (store->pick_object_revision_lt(loid) && + moid <= loid) { + // we need moid. pull it. + dout(10) << "handle_op read on " << op->get_oid() + << ", have " << loid + << ", but need missing " << moid + << ", pulling" << endl; + pull(pg, moid); + pg->waiting_for_missing_object[moid].push_back(op); + return; + } + + dout(10) << "handle_op read on " << op->get_oid() + << ", have " << loid + << ", don't need missing " << moid + << endl; + } + } else { + // live revision. easy. + if (op->get_op() != OSD_OP_PUSH && + waitfor_missing_object(op, pg)) return; + } + + dout(7) << "handle_op " << *op << " in " << *pg << endl; + + + // balance reads? 
+ if (read && + g_conf.osd_balance_reads && + pg->get_acker() == whoami) { + // test + if (false) { + if (pg->acting.size() > 1) { + int peer = pg->acting[1]; + dout(-10) << "fwd client read op to osd" << peer << " for " << op->get_client() << " " << op->get_client_inst() << endl; + messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer)); + return; + } + } + + // am i above my average? + float my_avg = hb_stat_qlen / hb_stat_ops; + if (pending_ops > my_avg) { + // is there a peer who is below my average? + for (unsigned i=1; iacting.size(); ++i) { + int peer = pg->acting[i]; + if (peer_qlen.count(peer) && + peer_qlen[peer] < my_avg) { + // calculate a probability that we should redirect + float p = (my_avg - peer_qlen[peer]) / my_avg; // this is dumb. + + if (drand48() <= p) { + // take the first one + dout(-10) << "my qlen " << pending_ops << " > my_avg " << my_avg + << ", p=" << p + << ", fwd to peer w/ qlen " << peer_qlen[peer] + << " osd" << peer + << endl; + messenger->send_message(op, MSG_ADDR_OSD(peer), osdmap->get_inst(peer)); + return; + } + } + } + } + } + + } else { + // REPLICATION OP (it's from another OSD) + + // have pg? + if (!pg) { + derr(-7) << "handle_rep_op " << *op + << " pgid " << pgid << " dne" << endl; + delete op; + //assert(0); // wtf, shouldn't happen. + return; + } + + // check osd map: same set, or primary+acker? 
+ if (g_conf.osd_rep == OSD_REP_CHAIN && + op->get_map_epoch() < pg->info.history.same_since) { + dout(10) << "handle_rep_op pg changed " << pg->info.history + << " after " << op->get_map_epoch() + << ", dropping" << endl; + delete op; + return; + } + if (g_conf.osd_rep != OSD_REP_CHAIN && + (op->get_map_epoch() < pg->info.history.same_primary_since || + op->get_map_epoch() < pg->info.history.same_acker_since)) { + dout(10) << "handle_rep_op pg primary|acker changed " << pg->info.history + << " after " << op->get_map_epoch() + << ", dropping" << endl; + delete op; + return; + } + + assert(pg->get_role() >= 0); + dout(7) << "handle_rep_op " << op << " in " << *pg << endl; + } + + if (g_conf.osd_maxthreads < 1) { + _lock_pg(pgid); + do_op(op, pg); // do it now + _unlock_pg(pgid); + } else { + // queue for worker threads + if (read) + enqueue_op(0, op); // no locking needed for reads + else + enqueue_op(pgid, op); + } +} + +void OSD::handle_op_reply(MOSDOpReply *op) +{ + if (op->get_map_epoch() < boot_epoch) { + dout(3) << "replica op reply from before boot" << endl; + delete op; + return; + } + + // must be a rep op. + assert(op->get_source().is_osd()); + + // make sure we have the pg + const pg_t pgid = op->get_pg(); + PG *pg = get_pg(pgid); + + // require same or newer map + if (!require_same_or_newer_map(op, op->get_map_epoch())) return; + + // share our map with sender, if they're old + _share_map_incoming(op->get_source(), op->get_source_inst(), op->get_map_epoch()); + + if (!pg) { + // hmm. 
+ delete op; + } + + if (g_conf.osd_maxthreads < 1) { + _lock_pg(pgid); + do_op(op, pg); // do it now + _unlock_pg(pgid); + } else { + enqueue_op(pgid, op); // queue for worker threads + } +} + + +/* + * enqueue called with osd_lock held + */ +void OSD::enqueue_op(pg_t pgid, Message *op) +{ + while (pending_ops > g_conf.osd_max_opq) { + dout(10) << "enqueue_op waiting for pending_ops " << pending_ops << " to drop to " << g_conf.osd_max_opq << endl; + op_queue_cond.Wait(osd_lock); + } + + op_queue[pgid].push_back(op); + pending_ops++; + logger->set("opq", pending_ops); + + threadpool->put_op(pgid); +} + +/* + * NOTE: dequeue called in worker thread, without osd_lock + */ +void OSD::dequeue_op(pg_t pgid) +{ + Message *op = 0; + PG *pg = 0; + + osd_lock.Lock(); + { + if (pgid) { + // lock pg + pg = _lock_pg(pgid); + } + + // get pending op + list &ls = op_queue[pgid]; + assert(!ls.empty()); + op = ls.front(); + ls.pop_front(); + + if (pgid) { + dout(10) << "dequeue_op " << op << " write pg " << pgid + << ls.size() << " / " << (pending_ops-1) << " more pending" << endl; + } else { + dout(10) << "dequeue_op " << op << " read " + << ls.size() << " / " << (pending_ops-1) << " more pending" << endl; + } + + if (ls.empty()) + op_queue.erase(pgid); + } + osd_lock.Unlock(); + + // do it + do_op(op, pg); + + // finish + osd_lock.Lock(); + { + if (pgid) { + // unlock pg + _unlock_pg(pgid); + } + + dout(10) << "dequeue_op " << op << " finish" << endl; + assert(pending_ops > 0); + + if (pending_ops > g_conf.osd_max_opq) + op_queue_cond.Signal(); + + pending_ops--; + logger->set("opq", pending_ops); + if (pending_ops == 0 && waiting_for_no_ops) + no_pending_ops.Signal(); + } + osd_lock.Unlock(); +} + + + +/** do_op - do an op + * object lock will be held (if multithreaded) + * osd_lock NOT held. 
+ */ +void OSD::do_op(Message *m, PG *pg) +{ + //dout(15) << "do_op " << *m << endl; + + if (m->get_type() == MSG_OSD_OP) { + MOSDOp *op = (MOSDOp*)m; + + logger->inc("op"); + + switch (op->get_op()) { + + // reads + case OSD_OP_READ: + op_read(op);//, pg); + break; + case OSD_OP_STAT: + op_stat(op);//, pg); + break; + + // rep stuff + case OSD_OP_PULL: + op_pull(op, pg); + break; + case OSD_OP_PUSH: + op_push(op, pg); + break; + + // writes + case OSD_OP_WRNOOP: + case OSD_OP_WRITE: + case OSD_OP_ZERO: + case OSD_OP_DELETE: + case OSD_OP_TRUNCATE: + case OSD_OP_WRLOCK: + case OSD_OP_WRUNLOCK: + case OSD_OP_RDLOCK: + case OSD_OP_RDUNLOCK: + case OSD_OP_UPLOCK: + case OSD_OP_DNLOCK: + if (op->get_source().is_osd()) + op_rep_modify(op, pg); + else + op_modify(op, pg); + break; + + default: + assert(0); + } + } + else if (m->get_type() == MSG_OSD_OPREPLY) { + // must be replication. + MOSDOpReply *r = (MOSDOpReply*)m; + tid_t rep_tid = r->get_rep_tid(); + + if (pg->repop_gather.count(rep_tid)) { + // oh, good. + int fromosd = r->get_source().num(); + repop_ack(pg, pg->repop_gather[rep_tid], + r->get_result(), r->get_commit(), + fromosd, + r->get_pg_complete_thru()); + delete m; + } else { + // early ack. 
+ pg->waiting_for_repop[rep_tid].push_back(r); + } + + } else + assert(0); +} + + + +void OSD::wait_for_no_ops() +{ + if (pending_ops > 0) { + dout(7) << "wait_for_no_ops - waiting for " << pending_ops << endl; + waiting_for_no_ops = true; + while (pending_ops > 0) + no_pending_ops.Wait(osd_lock); + waiting_for_no_ops = false; + assert(pending_ops == 0); + } + dout(7) << "wait_for_no_ops - none" << endl; +} + + +// ============================== +// Object locking + +// +// If the target object of the operation op is locked for writing by another client, the function puts op to the waiting queue waiting_for_wr_unlock +// returns true if object was locked, otherwise returns false +// +bool OSD::block_if_wrlocked(MOSDOp* op) +{ + object_t oid = op->get_oid(); + + msg_addr_t source; + int len = store->getattr(oid, "wrlock", &source, sizeof(msg_addr_t)); + //cout << "getattr returns " << len << " on " << oid << endl; + + if (len == sizeof(source) && + source != op->get_client()) { + //the object is locked for writing by someone else -- add the op to the waiting queue + waiting_for_wr_unlock[oid].push_back(op); + return true; + } + + return false; //the object wasn't locked, so the operation can be handled right away +} + + + +// =============================== +// OPS + +/* +int OSD::list_missing_revs(object_t oid, set& revs, PG *pg) +{ + int c = 0; + oid.rev = 0; + + map::iterator p = pg->missing.missing.lower_bound(oid); + if (p == pg->missing.missing.end()) + return 0; // clearly not + + while (p->first.ino == oid.ino && + p->first.bno == oid.bno) { + revs.insert(p->first); + c++; + } + return c; +}*/ + +bool OSD::pick_missing_object_rev(object_t& oid, PG *pg) +{ + map::iterator p = pg->missing.missing.upper_bound(oid); + if (p == pg->missing.missing.end()) + return false; // clearly no candidate + + if (p->first.ino == oid.ino && p->first.bno == oid.bno) { + oid = p->first; // yes! it's an upper bound revision for me. 
+ return true; + } + return false; +} + +bool OSD::pick_object_rev(object_t& oid) +{ + object_t t = oid; + + if (!store->pick_object_revision_lt(t)) + return false; // we have no revisions of this object! + + objectrev_t crev; + int r = store->getattr(t, "crev", &crev, sizeof(crev)); + assert(r >= 0); + if (crev <= oid.rev) { + dout(10) << "pick_object_rev choosing " << t << " crev " << crev << " for " << oid << endl; + oid = t; + return true; + } + + return false; +} + +bool OSD::waitfor_missing_object(MOSDOp *op, PG *pg) +{ + const object_t oid = op->get_oid(); + + // are we missing the object? + if (pg->missing.missing.count(oid)) { + // we don't have it (yet). + eversion_t v = pg->missing.missing[oid]; + if (pg->objects_pulling.count(oid)) { + dout(7) << "missing " + << oid + << " v " << v + << " in " << *pg + << ", already pulling" + << endl; + } else { + dout(7) << "missing " + << oid + << " v " << v + << " in " << *pg + << ", pulling" + << endl; + pull(pg, oid); + } + pg->waiting_for_missing_object[oid].push_back(op); + return true; + } + + return false; +} + + + + +// READ OPS + +/** op_read + * client read op + * NOTE: called from opqueue. + */ +void OSD::op_read(MOSDOp *op)//, PG *pg) +{ + object_t oid = op->get_oid(); + + // if the target object is locked for writing by another client, put 'op' to the waiting queue + // for _any_ op type -- eg only the locker can unlock! + if (block_if_wrlocked(op)) return; // op will be handled later, after the object unlocks + + dout(10) << "op_read " << oid + << " " << op->get_offset() << "~" << op->get_length() + //<< " in " << *pg + << endl; + + long r = 0; + bufferlist bl; + + if (oid.rev && !pick_object_rev(oid)) { + // we have no revision for this request. 
+    r = -EEXIST;
+  } else {
+    // read into a buffer
+    r = store->read(oid,
+                    op->get_offset(), op->get_length(),
+                    bl);
+  }
+
+  // set up reply
+  MOSDOpReply *reply = new MOSDOpReply(op, 0, osdmap->get_epoch(), true);
+  if (r >= 0) {
+    // success: r is used as the returned length
+    reply->set_result(0);
+    reply->set_data(bl);
+    reply->set_length(r);
+
+    logger->inc("c_rd");
+    logger->inc("c_rdb", r);
+
+  } else {
+    reply->set_result(r);   // error
+    reply->set_length(0);
+  }
+
+  dout(10) << " read got " << r << " / " << op->get_length() << " bytes from obj " << oid << endl;
+
+  logger->inc("rd");
+  if (r >= 0) logger->inc("rdb", r);
+
+  // send it
+  messenger->send_message(reply, op->get_client(), op->get_client_inst());
+
+  delete op;
+}
+
+
+/** op_stat
+ * client stat
+ * NOTE: called from opqueue
+ *
+ * op - the client's OSD_OP_STAT request; freed once the reply is sent,
+ *      unless it is parked by block_if_wrlocked().
+ */
+void OSD::op_stat(MOSDOp *op)//, PG *pg)
+{
+  object_t oid = op->get_oid();
+
+  // if the target object is locked for writing by another client, put 'op' to the waiting queue
+  if (block_if_wrlocked(op)) return; //read will be handled later, after the object unlocks
+
+  // zero the result up front: st.st_size goes into the reply even when no
+  // revision is found and store->stat() is never called.
+  // (memset is (dest, value, length); the original had value and length swapped.)
+  struct stat st;
+  memset(&st, 0, sizeof(st));
+  int r = 0;
+
+  if (oid.rev && !pick_object_rev(oid)) {
+    // we have no revision for this request.
+ r = -EEXIST; + } else { + r = store->stat(oid, &st); + } + + dout(3) << "op_stat on " << oid + << " r = " << r + << " size = " << st.st_size + //<< " in " << *pg + << endl; + + MOSDOpReply *reply = new MOSDOpReply(op, r, osdmap->get_epoch(), true); + reply->set_object_size(st.st_size); + messenger->send_message(reply, op->get_client(), op->get_client_inst()); + + logger->inc("stat"); + + delete op; +} + + + +/********* + * new repops + */ + +void OSD::get_repop_gather(PG::RepOpGather *repop) +{ + //repop->lock.Lock(); + dout(10) << "get_repop " << *repop << endl; +} + +void OSD::apply_repop(PG *pg, PG::RepOpGather *repop) +{ + dout(10) << "apply_repop applying update on " << *repop << endl; + assert(!repop->applied); + + Context *oncommit = new C_OSD_WriteCommit(this, pg->info.pgid, repop->rep_tid, repop->pg_local_last_complete); + unsigned r = store->apply_transaction(repop->t, oncommit); + if (r) + dout(-10) << "apply_repop apply transaction return " << r << " on " << *repop << endl; + + // discard my reference to buffer + repop->op->get_data().clear(); + + repop->applied = true; +} + +void OSD::put_repop_gather(PG *pg, PG::RepOpGather *repop) +{ + dout(10) << "put_repop " << *repop << endl; + + // commit? + if (repop->can_send_commit() && + repop->op->wants_commit()) { + // send commit. + MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), true); + dout(10) << "put_repop sending commit on " << *repop << " " << reply << endl; + messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst()); + repop->sent_commit = true; + } + + // ack? 
+ else if (repop->can_send_ack() && + repop->op->wants_ack()) { + // apply + apply_repop(pg, repop); + + // send ack + MOSDOpReply *reply = new MOSDOpReply(repop->op, 0, osdmap->get_epoch(), false); + dout(10) << "put_repop sending ack on " << *repop << " " << reply << endl; + messenger->send_message(reply, repop->op->get_client(), repop->op->get_client_inst()); + repop->sent_ack = true; + + utime_t now = g_clock.now(); + now -= repop->start; + logger->finc("rlsum", now); + logger->inc("rlnum", 1); + } + + // done. + if (repop->can_delete()) { + // adjust peers_complete_thru + if (!repop->pg_complete_thru.empty()) { + eversion_t min = pg->info.last_complete; // hrm.... + for (unsigned i=0; iacting.size(); i++) { + if (repop->pg_complete_thru[pg->acting[i]] < min) // note: if we haven't heard, it'll be zero, which is what we want. + min = repop->pg_complete_thru[pg->acting[i]]; + } + + if (min > pg->peers_complete_thru) { + dout(10) << "put_repop peers_complete_thru " << pg->peers_complete_thru << " -> " << min << " in " << *pg << endl; + pg->peers_complete_thru = min; + } + } + + dout(10) << "put_repop deleting " << *repop << endl; + //repop->lock.Unlock(); + + assert(pg->repop_gather.count(repop->rep_tid)); + pg->repop_gather.erase(repop->rep_tid); + + delete repop->op; + delete repop; + + } else { + //repop->lock.Unlock(); + } +} + + +void OSD::issue_repop(PG *pg, MOSDOp *op, int osd) +{ + object_t oid = op->get_oid(); + + dout(7) << " issue_repop rep_tid " << op->get_rep_tid() + << " in " << *pg + << " o " << oid + << " to osd" << osd + << endl; + + // forward the write/update/whatever + MOSDOp *wr = new MOSDOp(op->get_tid(), + op->get_client(), + oid, + pg->get_pgid(), + osdmap->get_epoch(), + op->get_op()); + wr->get_data() = op->get_data(); // _copy_ bufferlist + wr->set_length(op->get_length()); + wr->set_offset(op->get_offset()); + wr->set_version(op->get_version()); + + wr->set_rep_tid(op->get_rep_tid()); + wr->set_pg_trim_to(pg->peers_complete_thru); + + 
messenger->send_message(wr, MSG_ADDR_OSD(osd), osdmap->get_inst(osd)); +} + +PG::RepOpGather *OSD::new_repop_gather(PG *pg, + MOSDOp *op) +{ + dout(10) << "new_repop_gather rep_tid " << op->get_rep_tid() << " on " << *op << " in " << *pg << endl; + + PG::RepOpGather *repop = new PG::RepOpGather(op, op->get_rep_tid(), + op->get_version(), + pg->info.last_complete); + + // osds. commits all come to me. + for (unsigned i=0; iacting.size(); i++) { + int osd = pg->acting[i]; + repop->osds.insert(osd); + repop->waitfor_commit.insert(osd); + } + + // acks vary: + if (g_conf.osd_rep == OSD_REP_CHAIN) { + // chain rep. + // there's my local ack... + repop->osds.insert(whoami); + repop->waitfor_ack.insert(whoami); + repop->waitfor_commit.insert(whoami); + + // also, the previous guy will ack to me + int myrank = osdmap->calc_pg_rank(whoami, pg->acting); + if (myrank > 0) { + int osd = pg->acting[ myrank-1 ]; + repop->osds.insert(osd); + repop->waitfor_ack.insert(osd); + repop->waitfor_commit.insert(osd); + } + } else { + // primary, splay. all osds ack to me. + for (unsigned i=0; iacting.size(); i++) { + int osd = pg->acting[i]; + repop->waitfor_ack.insert(osd); + } + } + + repop->start = g_clock.now(); + + pg->repop_gather[ repop->rep_tid ] = repop; + + // anyone waiting? 
(acks that got here before the op did) + if (pg->waiting_for_repop.count(repop->rep_tid)) { + take_waiters(pg->waiting_for_repop[repop->rep_tid]); + pg->waiting_for_repop.erase(repop->rep_tid); + } + + return repop; +} + + +void OSD::repop_ack(PG *pg, PG::RepOpGather *repop, + int result, bool commit, + int fromosd, eversion_t pg_complete_thru) +{ + MOSDOp *op = repop->op; + + dout(7) << "repop_ack rep_tid " << repop->rep_tid << " op " << *op + << " result " << result << " commit " << commit << " from osd" << fromosd + << " in " << *pg + << endl; + + get_repop_gather(repop); + { + if (commit) { + // commit + assert(repop->waitfor_commit.count(fromosd)); + repop->waitfor_commit.erase(fromosd); + repop->waitfor_ack.erase(fromosd); + repop->pg_complete_thru[fromosd] = pg_complete_thru; + } else { + // ack + repop->waitfor_ack.erase(fromosd); + } + } + put_repop_gather(pg, repop); +} + + + + + +/** op_modify_commit + * transaction commit on the acker. + */ +void OSD::op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru) +{ + PG *pg = lock_pg(pgid); + if (pg) { + if (pg->repop_gather.count(rep_tid)) { + PG::RepOpGather *repop = pg->repop_gather[rep_tid]; + + dout(10) << "op_modify_commit " << *repop->op << endl; + get_repop_gather(repop); + { + assert(repop->waitfor_commit.count(whoami)); + repop->waitfor_commit.erase(whoami); + repop->pg_complete_thru[whoami] = pg_complete_thru; + } + put_repop_gather(pg, repop); + dout(10) << "op_modify_commit done on " << repop << endl; + } else { + dout(10) << "op_modify_commit pg " << pgid << " rep_tid " << rep_tid << " dne" << endl; + } + + unlock_pg(pgid); + } else { + dout(10) << "op_modify_commit pg " << pgid << " dne" << endl; + } +} + + +/** op_modify + * process client modify op + * NOTE: called from opqueue. + */ +void OSD::op_modify(MOSDOp *op, PG *pg) +{ + object_t oid = op->get_oid(); + + const char *opname = MOSDOp::get_opname(op->get_op()); + + // are any peers missing this? 
+ for (unsigned i=1; iacting.size(); i++) { + int peer = pg->acting[i]; + if (pg->peer_missing.count(peer) && + pg->peer_missing[peer].is_missing(oid)) { + // push it before this update. + // FIXME, this is probably extra much work (eg if we're about to overwrite) + pg->peer_missing[peer].got(oid); + push(pg, oid, peer); + } + } + + // dup op? + reqid_t reqid(op->get_client(), op->get_tid()); + if (pg->log.logged_req(reqid)) { + dout(-3) << "op_modify " << opname << " dup op " << reqid + << ", doing WRNOOP" << endl; + op->set_op(OSD_OP_WRNOOP); + opname = MOSDOp::get_opname(op->get_op()); + } + + // locked by someone else? + // for _any_ op type -- eg only the locker can unlock! + if (op->get_op() != OSD_OP_WRNOOP && // except WRNOOP; we just want to flush + block_if_wrlocked(op)) + return; // op will be handled later, after the object unlocks + + + // check crev + objectrev_t crev = 0; + store->getattr(oid, "crev", (char*)&crev, sizeof(crev)); + + // assign version + eversion_t clone_version; + eversion_t nv = pg->log.top; + if (op->get_op() != OSD_OP_WRNOOP) { + nv.epoch = osdmap->get_epoch(); + nv.version++; + assert(nv > pg->info.last_update); + assert(nv > pg->log.top); + + // will clone? + if (crev && op->get_rev() && op->get_rev() > crev) { + clone_version = nv; + nv.version++; + } + + if (op->get_version().version) { + // replay! + if (nv.version < op->get_version().version) { + nv.version = op->get_version().version; + + // clone? 
+ if (crev && op->get_rev() && op->get_rev() > crev) { + // backstep clone + clone_version = nv; + clone_version.version--; + } + } + } + } + + // set version in op, for benefit of client and our eventual reply + op->set_version(nv); + + dout(10) << "op_modify " << opname + << " " << oid + << " v " << nv + << " crev " << crev + << " rev " << op->get_rev() + << " " << op->get_offset() << "~" << op->get_length() + << endl; + + if (op->get_op() == OSD_OP_WRITE) { + logger->inc("c_wr"); + logger->inc("c_wrb", op->get_length()); + } + + // share latest osd map? + osd_lock.Lock(); + { + for (unsigned i=1; iacting.size(); i++) { + int osd = pg->acting[i]; + _share_map_outgoing( MSG_ADDR_OSD(osd), osdmap->get_inst(osd) ); + } + } + osd_lock.Unlock(); + + // issue replica writes + PG::RepOpGather *repop = 0; + bool alone = (pg->acting.size() == 1); + tid_t rep_tid = ++last_tid; + op->set_rep_tid(rep_tid); + + if (g_conf.osd_rep == OSD_REP_CHAIN && !alone) { + // chain rep. send to #2 only. + int next = pg->acting[1]; + if (pg->acting.size() > 2) + next = pg->acting[2]; + issue_repop(pg, op, next); + } + else if (g_conf.osd_rep == OSD_REP_SPLAY && !alone) { + // splay rep. send to rest. + for (unsigned i=1; iacting.size(); ++i) + //for (unsigned i=pg->acting.size()-1; i>=1; --i) + issue_repop(pg, op, pg->acting[i]); + } else { + // primary rep, or alone. + repop = new_repop_gather(pg, op); + + // send to rest. + if (!alone) + for (unsigned i=1; iacting.size(); i++) + issue_repop(pg, op, pg->acting[i]); + } + + if (repop) { + // we are acker. + if (op->get_op() != OSD_OP_WRNOOP) { + // log and update later. + prepare_log_transaction(repop->t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru); + prepare_op_transaction(repop->t, op, nv, crev, op->get_rev(), pg); + } + + // (logical) local ack. + // (if alone, this will apply the update.) 
+ get_repop_gather(repop); + { + assert(repop->waitfor_ack.count(whoami)); + repop->waitfor_ack.erase(whoami); + } + put_repop_gather(pg, repop); + + } else { + // chain or splay. apply. + ObjectStore::Transaction t; + prepare_log_transaction(t, op, nv, crev, op->get_rev(), pg, pg->peers_complete_thru); + prepare_op_transaction(t, op, nv, crev, op->get_rev(), pg); + + C_OSD_RepModifyCommit *oncommit = new C_OSD_RepModifyCommit(this, op, pg->get_acker(), + pg->info.last_complete); + unsigned r = store->apply_transaction(t, oncommit); + if (r != 0 && // no errors + r != 2) { // or error on collection_add + cerr << "error applying transaction: r = " << r << endl; + assert(r == 0); + } + + oncommit->ack(); + } +} + + + +void OSD::prepare_log_transaction(ObjectStore::Transaction& t, + MOSDOp *op, eversion_t& version, + objectrev_t crev, objectrev_t rev, + PG *pg, + eversion_t trim_to) +{ + const object_t oid = op->get_oid(); + + // clone entry? + if (crev && rev && rev > crev) { + eversion_t cv = version; + cv.version--; + PG::Log::Entry cloneentry(PG::Log::Entry::CLONE, oid, cv, + op->get_client(), op->get_tid()); + pg->log.add(cloneentry); + + dout(10) << "prepare_log_transaction " << op->get_op() + << " " << cloneentry + << " in " << *pg << endl; + } + + // actual op + int opcode = PG::Log::Entry::MODIFY; + if (op->get_op() == OSD_OP_DELETE) opcode = PG::Log::Entry::DELETE; + PG::Log::Entry logentry(opcode, oid, version, + op->get_client(), op->get_tid()); + + dout(10) << "prepare_log_transaction " << op->get_op() + << " " << logentry + << " in " << *pg << endl; + + // append to log + assert(version > pg->log.top); + pg->log.add(logentry); + assert(pg->log.top == version); + dout(10) << "prepare_log_transaction appended to " << *pg << endl; + + // write to pg log on disk + pg->append_log(t, logentry, trim_to); +} + + +/** prepare_op_transaction + * apply an op to the store wrapped in a transaction. 
+ */ +void OSD::prepare_op_transaction(ObjectStore::Transaction& t, + MOSDOp *op, eversion_t& version, + objectrev_t crev, objectrev_t rev, + PG *pg) +{ + const object_t oid = op->get_oid(); + const pg_t pgid = op->get_pg(); + + bool did_clone = false; + + dout(10) << "prepare_op_transaction " << MOSDOp::get_opname( op->get_op() ) + << " " << oid + << " v " << version + << " crev " << crev + << " rev " << rev + << " in " << *pg << endl; + + // WRNOOP does nothing. + if (op->get_op() == OSD_OP_WRNOOP) + return; + + // raise last_complete? + if (pg->info.last_complete == pg->info.last_update) + pg->info.last_complete = version; + + // raise last_update. + assert(version > pg->info.last_update); + pg->info.last_update = version; + + // write pg info + t.collection_setattr(pgid, "info", &pg->info, sizeof(pg->info)); + + // clone? + if (crev && rev && rev > crev) { + object_t noid = oid; + noid.rev = rev; + dout(10) << "prepare_op_transaction cloning " << oid << " crev " << crev << " to " << noid << endl; + t.clone(oid, noid); + did_clone = true; + } + + // apply the op + switch (op->get_op()) { + case OSD_OP_WRLOCK: + { // lock object + //r = store->setattr(oid, "wrlock", &op->get_asker(), sizeof(msg_addr_t), oncommit); + t.setattr(oid, "wrlock", &op->get_client(), sizeof(msg_addr_t)); + } + break; + + case OSD_OP_WRUNLOCK: + { // unlock objects + //r = store->rmattr(oid, "wrlock", oncommit); + t.rmattr(oid, "wrlock"); + + // unblock all operations that were waiting for this object to become unlocked + if (waiting_for_wr_unlock.count(oid)) { + take_waiters(waiting_for_wr_unlock[oid]); + waiting_for_wr_unlock.erase(oid); + } + } + break; + + case OSD_OP_WRITE: + { // write + assert(op->get_data().length() == op->get_length()); + bufferlist bl; + bl.claim( op->get_data() ); // give buffers to store; we keep *op in memory for a long time! 
+ + //if (oid < 100000000000000ULL) // hack hack-- don't write client data + t.write( oid, op->get_offset(), op->get_length(), bl ); + } + break; + + case OSD_OP_ZERO: + { + assert(0); // are you sure this is what you want? + // zero, remove, or truncate? + struct stat st; + int r = store->stat(oid, &st); + if (r >= 0) { + if (op->get_offset() + op->get_length() >= st.st_size) { + if (op->get_offset()) + t.truncate(oid, op->get_length() + op->get_offset()); + else + t.remove(oid); + } else { + // zero. the dumb way. FIXME. + bufferptr bp(op->get_length()); + bp.zero(); + bufferlist bl; + bl.push_back(bp); + t.write(oid, op->get_offset(), op->get_length(), bl); + } + } else { + // noop? + dout(10) << "apply_transaction zero on " << oid << ", but dne? stat returns " << r << endl; + } + } + break; + + case OSD_OP_TRUNCATE: + { // truncate + //r = store->truncate(oid, op->get_offset()); + t.truncate(oid, op->get_length() ); + } + break; + + case OSD_OP_DELETE: + { // delete + //r = store->remove(oid); + t.remove(oid); + } + break; + + default: + assert(0); + } + + // object collection, version + if (op->get_op() == OSD_OP_DELETE) { + // remove object from c + t.collection_remove(pgid, oid); + } else { + // add object to c + t.collection_add(pgid, oid); + + // object version + t.setattr(oid, "version", &version, sizeof(version)); + + // set object crev + if (crev == 0 || // new object + did_clone) // we cloned + t.setattr(oid, "crev", &rev, sizeof(rev)); + } +} diff --git a/branches/sage/cephmds2/osd/OSD.h b/branches/sage/cephmds2/osd/OSD.h new file mode 100644 index 0000000000000..ae5dba7a8e01a --- /dev/null +++ b/branches/sage/cephmds2/osd/OSD.h @@ -0,0 +1,272 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 
2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __OSD_H +#define __OSD_H + +#include "msg/Dispatcher.h" + +#include "common/Mutex.h" +#include "common/ThreadPool.h" + +#include "mon/MonMap.h" + +#include "ObjectStore.h" +#include "PG.h" + +#include +using namespace std; +#include +#include +using namespace __gnu_cxx; + +#include "messages/MOSDOp.h" + +class Messenger; +class Message; + + + + +class OSD : public Dispatcher { +public: + + /** superblock + */ + OSDSuperblock superblock; + epoch_t boot_epoch; + + object_t get_osdmap_object_name(epoch_t epoch) { return object_t(0,epoch << 1); } + object_t get_inc_osdmap_object_name(epoch_t epoch) { return object_t(0, (epoch << 1) + 1); } + + void write_superblock(); + void write_superblock(ObjectStore::Transaction& t); + int read_superblock(); + + + /** OSD **/ + protected: + Messenger *messenger; + int whoami; + + static const int STATE_BOOTING = 1; + static const int STATE_ACTIVE = 2; + static const int STATE_STOPPING = 3; + + int state; + + bool is_booting() { return state == STATE_BOOTING; } + bool is_active() { return state == STATE_ACTIVE; } + bool is_stopping() { return state == STATE_STOPPING; } + + + MonMap *monmap; + + class Logger *logger; + + // local store + char dev_path[100]; + class ObjectStore *store; + + // heartbeat + void heartbeat(); + + class C_Heartbeat : public Context { + OSD *osd; + public: + C_Heartbeat(OSD *o) : osd(o) {} + void finish(int r) { + osd->heartbeat(); + } + } *next_heartbeat; + + // global lock + Mutex osd_lock; + + // -- stats -- + int hb_stat_ops; // ops since last heartbeat + int hb_stat_qlen; // cumulative queue length since last hb + + hash_map peer_qlen; + + // per-pg locking (serializing) + hash_set pg_lock; + hash_map > pg_lock_waiters; + PG *lock_pg(pg_t pgid); + PG *_lock_pg(pg_t pgid); + void unlock_pg(pg_t pgid); + void _unlock_pg(pg_t pgid); + + // finished waiting messages, that will go at tail of dispatch() + list 
finished; + void take_waiters(list& ls) { + finished.splice(finished.end(), ls); + } + + // object locking + hash_map > waiting_for_wr_unlock; /** list of operations for each object waiting for 'wrunlock' */ + + bool block_if_wrlocked(MOSDOp* op); + + // -- ops -- + class ThreadPool *threadpool; + hash_map > op_queue; + int pending_ops; + bool waiting_for_no_ops; + Cond no_pending_ops; + Cond op_queue_cond; + + void wait_for_no_ops(); + + void enqueue_op(pg_t pgid, Message *op); + void dequeue_op(pg_t pgid); + static void static_dequeueop(OSD *o, pg_t pgid) { + o->dequeue_op(pgid); + }; + + void do_op(Message *m, PG *pg); // actually do it + + void prepare_log_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version, + objectrev_t crev, objectrev_t rev, PG *pg, eversion_t trim_to); + void prepare_op_transaction(ObjectStore::Transaction& t, MOSDOp* op, eversion_t& version, + objectrev_t crev, objectrev_t rev, PG *pg); + + bool waitfor_missing_object(MOSDOp *op, PG *pg); + bool pick_missing_object_rev(object_t& oid, PG *pg); + bool pick_object_rev(object_t& oid); + + + + friend class PG; + + protected: + + // -- osd map -- + class OSDMap *osdmap; + list waiting_for_osdmap; + + hash_map peer_map_epoch; + bool _share_map_incoming(msg_addr_t who, const entity_inst_t& inst, epoch_t epoch); + void _share_map_outgoing(msg_addr_t dest, const entity_inst_t& inst); + + void wait_for_new_map(Message *m); + void handle_osd_map(class MOSDMap *m); + + void advance_map(ObjectStore::Transaction& t); + void activate_map(ObjectStore::Transaction& t); + + void get_map(epoch_t e, OSDMap &m); + bool get_map_bl(epoch_t e, bufferlist& bl); + bool get_inc_map_bl(epoch_t e, bufferlist& bl); + bool get_inc_map(epoch_t e, OSDMap::Incremental &inc); + + void send_incremental_map(epoch_t since, msg_addr_t dest, const entity_inst_t& inst, bool full); + + + + // -- replication -- + + // PG + hash_map pg_map; + void load_pgs(); + bool pg_exists(pg_t pg); + PG *create_pg(pg_t pg, 
ObjectStore::Transaction& t); // create new PG + PG *get_pg(pg_t pg); // return existing PG, or null + void _remove_pg(pg_t pg); // remove from store and memory + + void project_pg_history(pg_t pgid, PG::Info::History& h, epoch_t from); + + void activate_pg(pg_t pgid, epoch_t epoch); + + class C_Activate : public Context { + OSD *osd; + pg_t pgid; + epoch_t epoch; + public: + C_Activate(OSD *o, pg_t p, epoch_t e) : osd(o), pgid(p), epoch(e) {} + void finish(int r) { + osd->activate_pg(pgid, epoch); + } + }; + + + tid_t last_tid; + int num_pulling; + + hash_map > waiting_for_pg; + + // replica ops + void get_repop_gather(PG::RepOpGather*); + void apply_repop(PG *pg, PG::RepOpGather *repop); + void put_repop_gather(PG *pg, PG::RepOpGather*); + void issue_repop(PG *pg, MOSDOp *op, int osd); + PG::RepOpGather *new_repop_gather(PG *pg, MOSDOp *op); + void repop_ack(PG *pg, PG::RepOpGather *repop, + int result, bool commit, + int fromosd, eversion_t pg_complete_thru=0); + + void handle_rep_op_ack(MOSDOpReply *m); + + // recovery + void do_notifies(map< int, list >& notify_list); + void do_queries(map< int, map >& query_map); + void repeer(PG *pg, map< int, map >& query_map); + + void pull(PG *pg, object_t oid); + void push(PG *pg, object_t oid, int dest); + + bool require_current_map(Message *m, epoch_t v); + bool require_same_or_newer_map(Message *m, epoch_t e); + + void handle_pg_query(class MOSDPGQuery *m); + void handle_pg_notify(class MOSDPGNotify *m); + void handle_pg_log(class MOSDPGLog *m); + void handle_pg_remove(class MOSDPGRemove *m); + + void op_pull(class MOSDOp *op, PG *pg); + void op_push(class MOSDOp *op, PG *pg); + + void op_rep_modify(class MOSDOp *op, PG *pg); // write, trucnate, delete + void op_rep_modify_commit(class MOSDOp *op, int ackerosd, + eversion_t last_complete); + friend class C_OSD_RepModifyCommit; + + + public: + OSD(int id, Messenger *m, MonMap *mm, char *dev = 0); + ~OSD(); + + // startup/shutdown + int init(); + int shutdown(); + + // 
messages + virtual void dispatch(Message *m); + virtual void ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst); + virtual bool ms_lookup(msg_addr_t dest, entity_inst_t& inst); + + void handle_osd_ping(class MOSDPing *m); + void handle_op(class MOSDOp *m); + + void op_read(class MOSDOp *m);//, PG *pg); + void op_stat(class MOSDOp *m);//, PG *pg); + void op_modify(class MOSDOp *m, PG *pg); + void op_modify_commit(pg_t pgid, tid_t rep_tid, eversion_t pg_complete_thru); + + // for replication + void handle_op_reply(class MOSDOpReply *m); + + void force_remount(); +}; + +#endif diff --git a/branches/sage/cephmds2/osd/OSDMap.h b/branches/sage/cephmds2/osd/OSDMap.h new file mode 100644 index 0000000000000..2c00eea9cdbdc --- /dev/null +++ b/branches/sage/cephmds2/osd/OSDMap.h @@ -0,0 +1,515 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __OSDMAP_H +#define __OSDMAP_H + +/* + * describe properties of the OSD cluster. 
+ * disks, disk groups, total # osds, + * + */ +#include "config.h" +#include "include/types.h" +#include "msg/Message.h" +#include "common/Mutex.h" +#include "common/Clock.h" + +#include "crush/crush.h" +using namespace crush; + +#include +#include +#include +#include +using namespace std; + + +/* + * some system constants + */ + +// from LSB to MSB, +#define PG_PS_BITS 16 // max bits for placement seed/group portion of PG +#define PG_REP_BITS 6 // up to 64 replicas +#define PG_TYPE_BITS 2 +#define PG_PS_MASK ((1LL< new_up; + map new_down; + list new_in; + list new_out; + map new_overload; // updated overload value + list old_overload; // no longer overload + + void encode(bufferlist& bl) { + bl.append((char*)&epoch, sizeof(epoch)); + bl.append((char*)&mon_epoch, sizeof(mon_epoch)); + bl.append((char*)&ctime, sizeof(ctime)); + ::_encode(new_up, bl); + ::_encode(new_down, bl); + ::_encode(new_in, bl); + ::_encode(new_out, bl); + ::_encode(new_overload, bl); + } + void decode(bufferlist& bl, int& off) { + bl.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + bl.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); + off += sizeof(mon_epoch); + bl.copy(off, sizeof(ctime), (char*)&ctime); + off += sizeof(ctime); + ::_decode(new_up, bl, off); + ::_decode(new_down, bl, off); + ::_decode(new_in, bl, off); + ::_decode(new_out, bl, off); + ::_decode(new_overload, bl, off); + } + + Incremental(epoch_t e=0) : epoch(e), mon_epoch(0) {} + }; + +private: + epoch_t epoch; // what epoch of the osd cluster descriptor is this + epoch_t mon_epoch; // monitor epoch (election iteration) + utime_t ctime; // epoch start time + int pg_bits; // placement group bits + int localized_pg_bits; // bits for localized pgs + + set osds; // all osds + set down_osds; // list of down disks + set out_osds; // list of unmapped disks + map overload_osds; + map osd_inst; + + public: + Crush crush; // hierarchical map + + friend class OSDMonitor; + friend class MDS; + + public: + OSDMap() : 
epoch(0), mon_epoch(0), pg_bits(5), localized_pg_bits(3) {} + + // map info + epoch_t get_epoch() const { return epoch; } + void inc_epoch() { epoch++; } + + int get_pg_bits() const { return pg_bits; } + void set_pg_bits(int b) { pg_bits = b; } + int get_localized_pg_bits() const { return localized_pg_bits; } + + const utime_t& get_ctime() const { return ctime; } + + bool is_mkfs() const { return epoch == 1; } + //void set_mkfs() { assert(epoch == 1); } + + /***** cluster state *****/ + int num_osds() { return osds.size(); } + void get_all_osds(set& ls) { ls = osds; } + + const set& get_osds() { return osds; } + const set& get_down_osds() { return down_osds; } + const set& get_out_osds() { return out_osds; } + const map& get_overload_osds() { return overload_osds; } + + bool is_down(int osd) { return down_osds.count(osd); } + bool is_up(int osd) { return !is_down(osd); } + bool is_out(int osd) { return out_osds.count(osd); } + bool is_in(int osd) { return !is_out(osd); } + + const entity_inst_t& get_inst(int osd) { + assert(osd_inst.count(osd)); + return osd_inst[osd]; + } + bool get_inst(int osd, entity_inst_t& inst) { + if (osd_inst.count(osd)) { + inst = osd_inst[osd]; + return true; + } + return false; + } + + void mark_down(int o) { down_osds.insert(o); } + void mark_up(int o) { down_osds.erase(o); } + void mark_out(int o) { out_osds.insert(o); } + void mark_in(int o) { out_osds.erase(o); } + + + void apply_incremental(Incremental &inc) { + assert(inc.epoch == epoch+1); + epoch++; + mon_epoch = inc.mon_epoch; + ctime = inc.ctime; + + for (map::iterator i = inc.new_up.begin(); + i != inc.new_up.end(); + i++) { + assert(down_osds.count(i->first)); + down_osds.erase(i->first); + assert(osd_inst.count(i->first) == 0); + osd_inst[i->first] = i->second; + //cout << "epoch " << epoch << " up osd" << i->first << endl; + } + for (map::iterator i = inc.new_down.begin(); + i != inc.new_down.end(); + i++) { + assert(down_osds.count(i->first) == 0); + 
down_osds.insert(i->first); + assert(osd_inst.count(i->first) == 0 || + osd_inst[i->first] == i->second); + osd_inst.erase(i->first); + //cout << "epoch " << epoch << " down osd" << i->first << endl; + } + for (list::iterator i = inc.new_in.begin(); + i != inc.new_in.end(); + i++) { + assert(out_osds.count(*i)); + out_osds.erase(*i); + //cout << "epoch " << epoch << " in osd" << *i << endl; + } + for (list::iterator i = inc.new_out.begin(); + i != inc.new_out.end(); + i++) { + assert(out_osds.count(*i) == 0); + out_osds.insert(*i); + //cout << "epoch " << epoch << " out osd" << *i << endl; + } + for (map::iterator i = inc.new_overload.begin(); + i != inc.new_overload.end(); + i++) { + overload_osds[i->first] = i->second; + } + for (list::iterator i = inc.old_overload.begin(); + i != inc.old_overload.end(); + i++) { + assert(overload_osds.count(*i)); + overload_osds.erase(*i); + } + } + + // serialize, unserialize + void encode(bufferlist& blist) { + blist.append((char*)&epoch, sizeof(epoch)); + blist.append((char*)&mon_epoch, sizeof(mon_epoch)); + blist.append((char*)&ctime, sizeof(ctime)); + blist.append((char*)&pg_bits, sizeof(pg_bits)); + + _encode(osds, blist); + _encode(down_osds, blist); + _encode(out_osds, blist); + _encode(overload_osds, blist); + _encode(osd_inst, blist); + + crush._encode(blist); + } + + void decode(bufferlist& blist) { + int off = 0; + blist.copy(off, sizeof(epoch), (char*)&epoch); + off += sizeof(epoch); + blist.copy(off, sizeof(mon_epoch), (char*)&mon_epoch); + off += sizeof(mon_epoch); + blist.copy(off, sizeof(ctime), (char*)&ctime); + off += sizeof(ctime); + blist.copy(off, sizeof(pg_bits), (char*)&pg_bits); + off += sizeof(pg_bits); + + _decode(osds, blist, off); + _decode(down_osds, blist, off); + _decode(out_osds, blist, off); + _decode(overload_osds, blist, off); + _decode(osd_inst, blist, off); + + crush._decode(blist, off); + } + + + + + /**** mapping facilities ****/ + + // oid -> pg + pg_t object_to_pg(object_t oid, 
FileLayout& layout) { + static crush::Hash H(777); + + int policy = layout.object_layout; + if (policy == 0) + policy = g_conf.osd_object_layout; + + int type = PG_TYPE_RAND; + ps_t ps; + + switch (policy) { + case OBJECT_LAYOUT_LINEAR: + { + //const object_t ono = oid.bno; + //const inodeno_t ino = oid >> OID_ONO_BITS; + ps = (oid.bno + oid.ino) & PG_PS_MASK; + ps &= ((1ULL<> OID_ONO_BITS; + ps = (oid.bno + H(oid.ino)) & PG_PS_MASK; + ps &= ((1ULL<> 32) ) & PG_PS_MASK; + ps &= ((1ULL< pg + pg_t ps_nrep_to_pg(ps_t ps, int nrep) { + /*return ((pg_t)ps & ((1ULL< nrep + int pg_to_nrep(pg_t pg) { + return pg.u.fields.nrep; + //return (pg >> PG_PS_BITS) & ((1ULL << PG_REP_BITS)-1); + } + + // pg -> ps + int pg_to_ps(pg_t pg) { + //return pg & PG_PS_MASK; + return pg.u.fields.ps; + } + + // pg -> (osd list) + int pg_to_osds(pg_t pg, + vector& osds) { // list of osd addr's + pg_t ps = pg_to_ps(pg); + int num_rep = pg_to_nrep(pg); + assert(num_rep > 0); + + // map to osds[] + switch (g_conf.osd_pg_layout) { + case PG_LAYOUT_CRUSH: + { + int forcefeed = -1; + if (pg.u.fields.preferred > 0 && + out_osds.count(pg.u.fields.preferred-1) == 0) + forcefeed = pg.u.fields.preferred-1; + crush.do_rule(crush.rules[num_rep], // FIXME rule thing. + ps, + osds, + out_osds, overload_osds, + forcefeed); + } + break; + + case PG_LAYOUT_LINEAR: + for (int i=0; i 0 && + g_conf.osd_pg_layout != PG_LAYOUT_CRUSH) { + int osd = pg.u.fields.preferred-1; + + // already in there? + if (osds.empty()) { + osds.push_back(osd); + } else { + assert(num_rep > 0); + for (int i=1; i (up osd list) + int pg_to_acting_osds(pg_t pg, + vector& osds) { // list of osd addr's + // get rush list + vector raw; + pg_to_osds(pg, raw); + + osds.clear(); + for (unsigned i=0; i primary osd + int get_pg_primary(pg_t pg) { + vector group; + int nrep = pg_to_osds(pg, group); + if (nrep) + return group[0]; + return -1; // we fail! 
+ } + + // pg -> acting primary osd + int get_pg_acting_primary(pg_t pg) { + vector group; + int nrep = pg_to_acting_osds(pg, group); + if (nrep > 0) + return group[0]; + return -1; // we fail! + } + int get_pg_acting_tail(pg_t pg) { + vector group; + int nrep = pg_to_acting_osds(pg, group); + if (nrep > 0) + return group[group.size()-1]; + return -1; // we fail! + } + + + /* what replica # is a given osd? 0 primary, -1 for none. */ + int calc_pg_rank(int osd, vector& acting, int nrep=0) { + if (!nrep) nrep = acting.size(); + for (int i=0; i& acting, int nrep=0) { + if (!nrep) nrep = acting.size(); + int rank = calc_pg_rank(osd, acting, nrep); + + if (rank < 0) return PG_ROLE_STRAY; + else if (rank == 0) return PG_ROLE_HEAD; + else if (rank == 1) return PG_ROLE_ACKER; + else return PG_ROLE_MIDDLE; + } + + int get_pg_role(pg_t pg, int osd) { + vector group; + int nrep = pg_to_osds(pg, group); + return calc_pg_role(osd, group, nrep); + } + + /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ + int get_pg_acting_rank(pg_t pg, int osd) { + vector group; + int nrep = pg_to_acting_osds(pg, group); + return calc_pg_rank(osd, group, nrep); + } + /* role is -1 (stray), 0 (primary), 1 (replica) */ + int get_pg_acting_role(pg_t pg, int osd) { + vector group; + int nrep = pg_to_acting_osds(pg, group); + return calc_pg_role(osd, group, nrep); + } + + + + +}; + + +#endif diff --git a/branches/sage/cephmds2/osd/ObjectStore.cc b/branches/sage/cephmds2/osd/ObjectStore.cc new file mode 100644 index 0000000000000..82af869e93775 --- /dev/null +++ b/branches/sage/cephmds2/osd/ObjectStore.cc @@ -0,0 +1,149 @@ + +#include "ObjectStore.h" + +#include "config.h" +#include "common/Clock.h" + + +object_t ObjectStore::age_get_oid() { + if (!age_free_oids.empty()) { + object_t o = age_free_oids.front(); + age_free_oids.pop_front(); + return o; + } + return age_cur_oid++; + } + + ssize_t ObjectStore::age_pick_size() { + ssize_t max = file_size_distn.sample() * 1024; + return max/2 + 
(rand() % 100) * max/200 + 1; + } + + void ObjectStore::age_fill(float pc, utime_t until) { + bufferptr bp(1024*1024); + bp.zero(); + bufferlist bl; + bl.push_back(bp); + while (1) { + if (g_clock.now() > until) break; + + struct statfs st; + statfs(&st); + float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; + if (a >= pc) { + dout(10) << "age_fill at " << a << " / " << pc << " stopping" << endl; + break; + } + + object_t oid = age_get_oid(); + + int b = rand() % 10; + age_objects[b].push_back(oid); + + ssize_t s = age_pick_size(); + + dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << endl; + + off_t off = 0; + while (s) { + ssize_t t = MIN(s, 1024*1024); + write(oid, t, off, bl, false); + off += t; + s -= t; + } + oid++; + } + } + + void ObjectStore::age_empty(float pc) { + int nper = 20; + int n = nper; + while (1) { + struct statfs st; + statfs(&st); + float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks; + if (a <= pc) { + dout(10) << "age_empty at " << a << " / " << pc << " stopping" << endl; + break; + } + + int b = rand() % 10; + n--; + if (n == 0 || age_objects[b].empty()) { + dout(10) << "age_empty sync" << endl; + //sync(); + sync(); + n = nper; + continue; + } + object_t oid = age_objects[b].front(); + age_objects[b].pop_front(); + + dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << endl; + + remove(oid); + age_free_oids.push_back(oid); + } + } + + + void ObjectStore::age(int time, + float high_water, // fill to this % + float low_water, // then empty to this % + int count, // this many times + float final_water, // and end here ( <= low_water) + int fake_size_mb) { + utime_t until = g_clock.now(); + until.sec_ref() += time; + + while (age_objects.size() < 10) age_objects.push_back( list() ); + + if (fake_size_mb) { + int fake_bl = fake_size_mb * 256; + struct statfs st; + statfs(&st); + float f = (float)fake_bl / (float)st.f_blocks; + 
high_water = (float)high_water * f; + low_water = (float)low_water * f; + final_water = (float)final_water * f; + dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << endl; + } + + // init size distn (once) + if (!did_distn) { + did_distn = true; + age_cur_oid = 1; + file_size_distn.add(1, 19.0758125+0.65434375); + file_size_distn.add(512, 35.6566); + file_size_distn.add(1024, 27.7271875); + file_size_distn.add(2*1024, 16.63503125); + //file_size_distn.add(4*1024, 106.82384375); + //file_size_distn.add(8*1024, 81.493375); + //file_size_distn.add(16*1024, 14.13553125); + //file_size_distn.add(32*1024, 2.176); + //file_size_distn.add(256*1024, 0.655938); + //file_size_distn.add(512*1024, 0.1480625); + //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit + file_size_distn.normalize(); + } + + // clear + for (int i=0; i<10; i++) + age_objects[i].clear(); + + for (int c=1; c<=count; c++) { + if (g_clock.now() > until) break; + + dout(1) << "age " << c << "/" << count << " filling to " << high_water << endl; + age_fill(high_water, until); + if (c == count) { + dout(1) << "age final empty to " << final_water << endl; + age_empty(final_water); + } else { + dout(1) << "age " << c << "/" << count << " emptying to " << low_water << endl; + age_empty(low_water); + } + } + dout(1) << "age finished" << endl; + } + diff --git a/branches/sage/cephmds2/osd/ObjectStore.h b/branches/sage/cephmds2/osd/ObjectStore.h new file mode 100644 index 0000000000000..21fbd867974ed --- /dev/null +++ b/branches/sage/cephmds2/osd/ObjectStore.h @@ -0,0 +1,479 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by 
the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef __OBJECTSTORE_H +#define __OBJECTSTORE_H + +#include "include/types.h" +#include "include/Context.h" +#include "include/buffer.h" + +#include "include/Distribution.h" + +#include +#include /* or */ + +#include +using namespace std; + +#ifndef MIN +# define MIN(a,b) ((a) < (b) ? (a):(b)) +#endif + +/* + * low-level interface to the local OSD file system + */ + + + +class ObjectStore { +public: + + + class FragmentationStat { + public: + int total; + int num_extent; + int avg_extent; + map extent_dist; // powers of two + map extent_dist_sum; // powers of two + + float avg_extent_per_object; + int avg_extent_jump; // avg distance bweteen consecutive extents + + int total_free; + int num_free_extent; + int avg_free_extent; + map free_extent_dist; // powers of two + map free_extent_dist_sum; // powers of two + }; + + + + /********************************* + * transaction + */ + class Transaction { + public: + static const int OP_READ = 1; // oid, offset, len, pbl + static const int OP_STAT = 2; // oid, pstat + static const int OP_GETATTR = 3; // oid, attrname, pattrval + static const int OP_GETATTRS = 4; // oid, pattrset + + static const int OP_WRITE = 10; // oid, offset, len, bl + static const int OP_TRUNCATE = 11; // oid, len + static const int OP_REMOVE = 13; // oid + static const int OP_SETATTR = 14; // oid, attrname, attrval + static const int OP_SETATTRS = 15; // oid, attrset + static const int OP_RMATTR = 16; // oid, attrname + static const int OP_CLONE = 17; // oid, newoid + + static const int OP_MKCOLL = 20; // cid + static const int OP_RMCOLL = 21; // cid + static const int OP_COLL_ADD = 22; // cid, oid + static const int OP_COLL_REMOVE = 23; // cid, oid + static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval + static const int OP_COLL_RMATTR = 25; // cid, attrname + + list ops; + list bls; + list oids; + list cids; + list offsets; + list lengths; + list attrnames; + //list< 
pair > attrvals; + list attrbls; + + list pbls; + list psts; + list< pair > pattrvals; + list< map* > pattrsets; + + void read(object_t oid, off_t off, size_t len, bufferlist *pbl) { + int op = OP_READ; + ops.push_back(op); + oids.push_back(oid); + offsets.push_back(off); + lengths.push_back(len); + pbls.push_back(pbl); + } + void stat(object_t oid, struct stat *st) { + int op = OP_STAT; + ops.push_back(op); + oids.push_back(oid); + psts.push_back(st); + } + void getattr(object_t oid, const char* name, void* val, int *plen) { + int op = OP_GETATTR; + ops.push_back(op); + oids.push_back(oid); + attrnames.push_back(name); + pattrvals.push_back(pair(val,plen)); + } + void getattrs(object_t oid, map& aset) { + int op = OP_GETATTRS; + ops.push_back(op); + oids.push_back(oid); + pattrsets.push_back(&aset); + } + + void write(object_t oid, off_t off, size_t len, bufferlist& bl) { + int op = OP_WRITE; + ops.push_back(op); + oids.push_back(oid); + offsets.push_back(off); + lengths.push_back(len); + bls.push_back(bl); + } + void truncate(object_t oid, off_t off) { + int op = OP_TRUNCATE; + ops.push_back(op); + oids.push_back(oid); + offsets.push_back(off); + } + void remove(object_t oid) { + int op = OP_REMOVE; + ops.push_back(op); + oids.push_back(oid); + } + void setattr(object_t oid, const char* name, const void* val, int len) { + int op = OP_SETATTR; + ops.push_back(op); + oids.push_back(oid); + attrnames.push_back(name); + //attrvals.push_back(pair(val,len)); + bufferlist bl; + bl.append((char*)val,len); + attrbls.push_back(bl); + } + void setattrs(object_t oid, map& attrset) { + int op = OP_SETATTRS; + ops.push_back(op); + oids.push_back(oid); + pattrsets.push_back(&attrset); + } + void rmattr(object_t oid, const char* name) { + int op = OP_RMATTR; + ops.push_back(op); + oids.push_back(oid); + attrnames.push_back(name); + } + void clone(object_t oid, object_t noid) { + int op = OP_CLONE; + ops.push_back(op); + oids.push_back(oid); + oids.push_back(noid); + } + void 
create_collection(coll_t cid) { + int op = OP_MKCOLL; + ops.push_back(op); + cids.push_back(cid); + } + void remove_collection(coll_t cid) { + int op = OP_RMCOLL; + ops.push_back(op); + cids.push_back(cid); + } + void collection_add(coll_t cid, object_t oid) { + int op = OP_COLL_ADD; + ops.push_back(op); + cids.push_back(cid); + oids.push_back(oid); + } + void collection_remove(coll_t cid, object_t oid) { + int op = OP_COLL_REMOVE; + ops.push_back(op); + cids.push_back(cid); + oids.push_back(oid); + } + void collection_setattr(coll_t cid, const char* name, const void* val, int len) { + int op = OP_COLL_SETATTR; + ops.push_back(op); + cids.push_back(cid); + attrnames.push_back(name); + //attrvals.push_back(pair(val,len)); + bufferlist bl; + bl.append((char*)val, len); + attrbls.push_back(bl); + } + void collection_rmattr(coll_t cid, const char* name) { + int op = OP_COLL_RMATTR; + ops.push_back(op); + cids.push_back(cid); + attrnames.push_back(name); + } + + // etc. + }; + + + + /* this implementation is here only for naive ObjectStores that + * do not do atomic transactions natively. it is not atomic. 
+ */ + virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) { + // non-atomic implementation + for (list::iterator p = t.ops.begin(); + p != t.ops.end(); + p++) { + switch (*p) { + case Transaction::OP_READ: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + off_t offset = t.offsets.front(); t.offsets.pop_front(); + size_t len = t.lengths.front(); t.lengths.pop_front(); + bufferlist *pbl = t.pbls.front(); t.pbls.pop_front(); + read(oid, offset, len, *pbl); + } + break; + case Transaction::OP_STAT: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + struct stat *st = t.psts.front(); t.psts.pop_front(); + stat(oid, st); + } + break; + case Transaction::OP_GETATTR: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + pair pattrval = t.pattrvals.front(); t.pattrvals.pop_front(); + *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second); + } + break; + case Transaction::OP_GETATTRS: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + map *pset = t.pattrsets.front(); t.pattrsets.pop_front(); + getattrs(oid, *pset); + } + break; + + case Transaction::OP_WRITE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + off_t offset = t.offsets.front(); t.offsets.pop_front(); + size_t len = t.lengths.front(); t.lengths.pop_front(); + bufferlist bl = t.bls.front(); t.bls.pop_front(); + write(oid, offset, len, bl, 0); + } + break; + + case Transaction::OP_TRUNCATE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + off_t len = t.offsets.front(); t.offsets.pop_front(); + truncate(oid, len, 0); + } + break; + + case Transaction::OP_REMOVE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + remove(oid, 0); + } + break; + + case Transaction::OP_SETATTR: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + //pair attrval = t.attrvals.front(); 
t.attrvals.pop_front(); + bufferlist bl; + bl.claim( t.attrbls.front() ); + t.attrbls.pop_front(); + setattr(oid, attrname, bl.c_str(), bl.length(), 0); + } + break; + case Transaction::OP_SETATTRS: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + map *pattrset = t.pattrsets.front(); t.pattrsets.pop_front(); + setattrs(oid, *pattrset, 0); + } + break; + + case Transaction::OP_RMATTR: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + rmattr(oid, attrname, 0); + } + break; + + case Transaction::OP_CLONE: + { + object_t oid = t.oids.front(); t.oids.pop_front(); + object_t noid = t.oids.front(); t.oids.pop_front(); + clone(oid, noid); + } + break; + + case Transaction::OP_MKCOLL: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + create_collection(cid, 0); + } + break; + + case Transaction::OP_RMCOLL: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + destroy_collection(cid, 0); + } + break; + + case Transaction::OP_COLL_ADD: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + object_t oid = t.oids.front(); t.oids.pop_front(); + collection_add(cid, oid, 0); + } + break; + + case Transaction::OP_COLL_REMOVE: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + object_t oid = t.oids.front(); t.oids.pop_front(); + collection_remove(cid, oid, 0); + } + break; + + case Transaction::OP_COLL_SETATTR: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + //pair attrval = t.attrvals.front(); t.attrvals.pop_front(); + bufferlist bl; + bl.claim( t.attrbls.front() ); + t.attrbls.pop_front(); + collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0); + } + break; + + case Transaction::OP_COLL_RMATTR: + { + coll_t cid = t.cids.front(); t.cids.pop_front(); + const char *attrname = t.attrnames.front(); t.attrnames.pop_front(); + collection_rmattr(cid, attrname, 0); + } + break; + + + default: + 
cerr << "bad op " << *p << endl; + assert(0); + } + } + + if (onsafe) sync(onsafe); + + return 0; // FIXME count errors + } + + /*********************************************/ + + + + public: + ObjectStore() {} + virtual ~ObjectStore() {} + + // mgmt + virtual int mount() = 0; + virtual int umount() = 0; + virtual int mkfs() = 0; // wipe + + virtual int statfs(struct statfs *buf) = 0; + + // objects + virtual int pick_object_revision_lt(object_t& oid) = 0; + + virtual bool exists(object_t oid) = 0; // useful? + virtual int stat(object_t oid, struct stat *st) = 0; // struct stat? + + virtual int remove(object_t oid, + Context *onsafe=0) = 0; + + virtual int truncate(object_t oid, off_t size, + Context *onsafe=0) = 0; + + virtual int read(object_t oid, + off_t offset, size_t len, + bufferlist& bl) = 0; + + /*virtual int write(object_t oid, + off_t offset, size_t len, + bufferlist& bl, + bool fsync=true) = 0; + */ + virtual int write(object_t oid, + off_t offset, size_t len, + bufferlist& bl, + Context *onsafe) = 0;//{ return -1; } + + virtual int setattr(object_t oid, const char *name, + const void *value, size_t size, + Context *onsafe=0) {return 0;} //= 0; + virtual int setattrs(object_t oid, map& aset, + Context *onsafe=0) {return 0;} //= 0; + virtual int getattr(object_t oid, const char *name, + void *value, size_t size) {return 0;} //= 0; + virtual int getattrs(object_t oid, map& aset) {return 0;}; + + virtual int rmattr(object_t oid, const char *name, + Context *onsafe=0) {return 0;} + + virtual int clone(object_t oid, object_t noid) { + return -1; + } + + virtual int listattr(object_t oid, char *attrs, size_t size) {return 0;} //= 0; + + // collections + virtual int list_collections(list& ls) {return 0;}//= 0; + virtual int create_collection(coll_t c, + Context *onsafe=0) {return 0;}//= 0; + virtual int destroy_collection(coll_t c, + Context *onsafe=0) {return 0;}//= 0; + virtual bool collection_exists(coll_t c) {return 0;} + virtual int collection_stat(coll_t 
c, struct stat *st) {return 0;}//= 0; + virtual int collection_add(coll_t c, object_t o, + Context *onsafe=0) {return 0;}//= 0; + virtual int collection_remove(coll_t c, object_t o, + Context *onsafe=0) {return 0;}// = 0; + virtual int collection_list(coll_t c, list& o) {return 0;}//= 0; + + virtual int collection_setattr(coll_t cid, const char *name, + const void *value, size_t size, + Context *onsafe=0) {return 0;} //= 0; + virtual int collection_rmattr(coll_t cid, const char *name, + Context *onsafe=0) {return 0;} //= 0; + virtual int collection_getattr(coll_t cid, const char *name, + void *value, size_t size) {return 0;} //= 0; + virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0; + + virtual void sync(Context *onsync) {}; + virtual void sync() {}; + + + virtual void _fake_writes(bool b) {}; + + virtual void _get_frag_stat(FragmentationStat& st) {}; + +}; + + +#endif diff --git a/branches/sage/cephmds2/osd/PG.cc b/branches/sage/cephmds2/osd/PG.cc new file mode 100644 index 0000000000000..4dee6f03bd166 --- /dev/null +++ b/branches/sage/cephmds2/osd/PG.cc @@ -0,0 +1,1312 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include "PG.h" +#include "config.h" +#include "OSD.h" + +#include "common/Timer.h" + +#include "messages/MOSDPGNotify.h" +#include "messages/MOSDPGLog.h" +#include "messages/MOSDPGRemove.h" + +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_osd) cout << g_clock.now() << " osd" << osd->whoami << " " << (osd->osdmap ? 
osd->osdmap->get_epoch():0) << " " << *this << " " + + +/******* PGLog ********/ + +void PG::Log::copy_after(const Log &other, eversion_t v) +{ + assert(v >= other.bottom); + top = bottom = other.top; + for (list::const_reverse_iterator i = other.log.rbegin(); + i != other.log.rend(); + i++) { + if (i->version == v) break; + assert(i->version > v); + log.push_front(*i); + } + bottom = v; +} + +bool PG::Log::copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor) +{ + assert(split >= other.bottom); + assert(floor >= other.bottom); + assert(floor <= split); + top = bottom = other.top; + + /* runs on replica. split is primary's log.top. floor is how much they want. + split tell us if the primary is divergent.. e.g.: + -> i am A, B is primary, split is 2'6, floor is 2'2. +A B C +2'2 2'2 +2'3 2'3 2'3 +2'4 2'4 2'4 +3'5 | 2'5 2'5 +3'6 | 2'6 +3'7 | +3'8 | +3'9 | + -> i return full backlog. + */ + + for (list::const_reverse_iterator i = other.log.rbegin(); + i != other.log.rend(); + i++) { + // is primary divergent? + // e.g. my 3'6 vs their 2'6 split + if (i->version.version == split.version && i->version.epoch > split.epoch) { + clear(); + return false; // divergent! + } + if (i->version == floor) break; + assert(i->version > floor); + + // e.g. 
my 2'23 > '12 + log.push_front(*i); + } + bottom = floor; + return true; +} + +void PG::Log::copy_non_backlog(const Log &other) +{ + if (other.backlog) { + top = other.top; + bottom = other.bottom; + for (list::const_reverse_iterator i = other.log.rbegin(); + i != other.log.rend(); + i++) + if (i->version > bottom) + log.push_front(*i); + else + break; + } else { + *this = other; + } +} + + + +void PG::IndexedLog::trim(ObjectStore::Transaction& t, eversion_t s) +{ + if (backlog && s < bottom) + s = bottom; + + while (!log.empty()) { + Entry &e = *log.begin(); + + if (e.version > s) break; + + assert(complete_to != log.begin()); + assert(requested_to != log.begin()); + + // remove from index, + unindex(e); + + // from log + log.pop_front(); + } + + // raise bottom? + if (backlog) backlog = false; + if (bottom < s) bottom = s; +} + + +void PG::IndexedLog::trim_write_ahead(eversion_t last_update) +{ + while (!log.empty() && + log.rbegin()->version > last_update) { + // remove from index + unindex(*log.rbegin()); + + // remove + log.pop_back(); + } +} + +void PG::trim_write_ahead() +{ + if (info.last_update < log.top) { + dout(10) << "trim_write_ahead (" << info.last_update << "," << log.top << "]" << endl; + log.trim_write_ahead(info.last_update); + } else { + assert(info.last_update == log.top); + dout(10) << "trim_write_ahead last_update=top=" << info.last_update << endl; + } + +} + +void PG::proc_replica_log(Log &olog, Missing& omissing, int from) +{ + dout(10) << "proc_replica_log for osd" << from << ": " << olog << " " << omissing << endl; + assert(!is_active()); + + if (!have_master_log) { + // i'm building master log. + // note peer's missing. + peer_missing[from] = omissing; + + // merge log into our own log + merge_log(olog, omissing, from); + proc_missing(olog, omissing, from); + } else { + // i'm just building missing lists. + peer_missing[from] = omissing; + + // iterate over peer log. in reverse. 
+ list::reverse_iterator pp = olog.log.rbegin(); + eversion_t lu = peer_info[from].last_update; + while (pp != olog.log.rend()) { + if (!log.objects.count(pp->oid)) { + dout(10) << " divergent " << *pp << " not in our log, generating backlog" << endl; + generate_backlog(); + } + + if (!log.objects.count(pp->oid)) { + dout(10) << " divergent " << *pp << " dne, must have been new, ignoring" << endl; + ++pp; + continue; + } + + if (log.objects[pp->oid]->version == pp->version) { + break; // we're no longer divergent. + //++pp; + //continue; + } + + if (log.objects[pp->oid]->version > pp->version) { + dout(10) << " divergent " << *pp + << " superceded by " << log.objects[pp->oid] + << ", ignoring" << endl; + } else { + dout(10) << " divergent " << *pp << ", adding to missing" << endl; + peer_missing[from].add(pp->oid, pp->version); + } + + ++pp; + if (pp != olog.log.rend()) + lu = pp->version; + else + lu = olog.bottom; + } + + if (lu < peer_info[from].last_update) { + dout(10) << " peer osd" << from << " last_update now " << lu << endl; + peer_info[from].last_update = lu; + if (lu < oldest_update) { + dout(10) << " oldest_update now " << lu << endl; + oldest_update = lu; + } + } + + proc_missing(olog, peer_missing[from], from); + } +} + +void PG::merge_log(Log &olog, Missing &omissing, int fromosd) +{ + dout(10) << "merge_log " << olog << " from osd" << fromosd + << " into " << log << endl; + + //cout << "log" << endl; + //log.print(cout); + //cout << "olog" << endl; + //olog.print(cout); + + if (log.empty() || + (olog.bottom > log.top && olog.backlog)) { // e.g. log=(0,20] olog=(40,50]+backlog) + + // swap and index + log.log.swap(olog.log); + log.index(); + + // find split point (old log.top) in new log + // add new items to missing along the way. + for (list::reverse_iterator p = log.log.rbegin(); + p != log.log.rend(); + p++) { + if (p->version <= log.top) { + // ok, p is at split point. + + // was our old log divergent? 
+ if (log.top > p->version) { + dout(10) << "merge_log i was (possibly) divergent for (" << p->version << "," << log.top << "]" << endl; + if (p->version < oldest_update) + oldest_update = p->version; + + while (!olog.log.empty() && + olog.log.rbegin()->version > p->version) { + Log::Entry &oe = *olog.log.rbegin(); // old entry (possibly divergent) + if (log.objects.count(oe.oid)) { + if (log.objects[oe.oid]->version < oe.version) { + dout(10) << "merge_log divergent entry " << oe + << " not superceded by " << *log.objects[oe.oid] + << ", adding to missing" << endl; + missing.add(oe.oid, oe.version); + } else { + dout(10) << "merge_log divergent entry " << oe + << " superceded by " << *log.objects[oe.oid] + << ", ignoring" << endl; + } + } else { + dout(10) << "merge_log divergent entry " << oe << ", adding to missing" << endl; + missing.add(oe.oid, oe.version); + } + olog.log.pop_back(); // discard divergent entry + } + } + break; + } + + if (p->is_delete()) { + dout(10) << "merge_log merging " << *p << ", not missing" << endl; + missing.rm(p->oid, p->version); + } else { + dout(10) << "merge_log merging " << *p << ", now missing" << endl; + missing.add(p->oid, p->version); + } + } + + info.last_update = log.top = olog.top; + info.log_bottom = log.bottom = olog.bottom; + info.log_backlog = log.backlog = olog.backlog; + } + + else { + // i can merge the two logs! + + // extend on bottom? + // FIXME: what if we have backlog, but they have lower bottom? + if (olog.bottom < log.bottom && olog.top >= log.bottom && !log.backlog) { + dout(10) << "merge_log extending bottom to " << olog.bottom + << (olog.backlog ? " +backlog":"") + << endl; + + // ok + list::iterator from = olog.log.begin(); + list::iterator to; + for (to = from; + to != olog.log.end(); + to++) { + if (to->version > log.bottom) break; + + // update our index while we're here + log.index(*to); + + dout(15) << *to << endl; + + // new missing object? 
+ if (to->version > info.last_complete) { + if (to->is_update()) + missing.add(to->oid, to->version); + else + missing.rm(to->oid, to->version); + } + } + assert(to != olog.log.end()); + + // splice into our log. + log.log.splice(log.log.begin(), + olog.log, from, to); + + info.log_bottom = log.bottom = olog.bottom; + info.log_backlog = log.backlog = olog.backlog; + } + + // extend on top? + if (olog.top > log.top && + olog.bottom <= log.top) { + dout(10) << "merge_log extending top to " << olog.top << endl; + + list::iterator to = olog.log.end(); + list::iterator from = olog.log.end(); + while (1) { + if (from == olog.log.begin()) break; + from--; + //dout(0) << "? " << *from << endl; + if (from->version < log.top) { + from++; + break; + } + + log.index(*from); + dout(10) << "merge_log " << *from << endl; + + // add to missing + if (from->is_update()) { + missing.add(from->oid, from->version); + } else + missing.rm(from->oid, from->version); + } + + // remove divergent items + while (1) { + Log::Entry *oldtail = &(*log.log.rbegin()); + if (oldtail->version.version+1 == from->version.version) break; + + // divergent! + assert(oldtail->version.version >= from->version.version); + + if (log.objects[oldtail->oid]->version == oldtail->version) { + // and significant. + dout(10) << "merge_log had divergent " << *oldtail << ", adding to missing" << endl; + //missing.add(oldtail->oid); + assert(0); + } else { + dout(10) << "merge_log had divergent " << *oldtail << ", already missing" << endl; + assert(missing.is_missing(oldtail->oid)); + } + log.log.pop_back(); + } + + // splice + log.log.splice(log.log.end(), + olog.log, from, to); + + info.last_update = log.top = olog.top; + } + } + + dout(10) << "merge_log result " << log << " " << missing << endl; + //log.print(cout); + +} + +void PG::proc_missing(Log &olog, Missing &omissing, int fromosd) +{ + // found items? 
+ for (map::iterator p = missing.missing.begin(); + p != missing.missing.end(); + p++) { + if (omissing.is_missing(p->first)) { + assert(omissing.is_missing(p->first, p->second)); + if (omissing.loc.count(p->first)) { + dout(10) << "proc_missing missing " << p->first << " " << p->second + << " on osd" << omissing.loc[p->first] << endl; + missing.loc[p->first] = omissing.loc[p->first]; + } else { + dout(10) << "proc_missing missing " << p->first << " " << p->second + << " also LOST on source, osd" << fromosd << endl; + } + } + else if (p->second <= olog.top) { + dout(10) << "proc_missing missing " << p->first << " " << p->second + << " on source, osd" << fromosd << endl; + missing.loc[p->first] = fromosd; + } else { + dout(10) << "proc_missing " << p->first << " " << p->second + << " > olog.top " << olog.top << ", not found...." + << endl; + } + } + + dout(10) << "proc_missing missing " << missing.missing << endl; +} + + + +void PG::generate_backlog() +{ + dout(10) << "generate_backlog to " << log << endl; + assert(!log.backlog); + log.backlog = true; + + list olist; + osd->store->collection_list(info.pgid, olist); + + int local = 0; + map add; + for (list::iterator it = olist.begin(); + it != olist.end(); + it++) { + local++; + + if (log.logged_object(*it)) continue; // already have it logged. + + // add entry + Log::Entry e; + e.op = Log::Entry::MODIFY; // FIXME when we do smarter op codes! 
+ e.oid = *it; + osd->store->getattr(*it, + "version", + &e.version, sizeof(e.version)); + add[e.version] = e; + dout(10) << "generate_backlog found " << e << endl; + } + + for (map::reverse_iterator i = add.rbegin(); + i != add.rend(); + i++) { + log.log.push_front(i->second); + log.index( *log.log.begin() ); // index + } + + dout(10) << local << " local objects, " + << add.size() << " objects added to backlog, " + << log.objects.size() << " in pg" << endl; + + //log.print(cout); +} + +void PG::drop_backlog() +{ + dout(10) << "drop_backlog for " << log << endl; + //log.print(cout); + + assert(log.backlog); + log.backlog = false; + + while (!log.log.empty()) { + Log::Entry &e = *log.log.begin(); + if (e.version > log.bottom) break; + + dout(15) << "drop_backlog trimming " << e.version << endl; + log.unindex(e); + log.log.pop_front(); + } +} + + + + + +ostream& PG::Log::print(ostream& out) const +{ + out << *this << endl; + for (list::const_iterator p = log.begin(); + p != log.end(); + p++) + out << *p << endl; + return out; +} + + + + + +/******* PG ***********/ +void PG::build_prior() +{ + // build prior set. 
+ prior_set.clear(); + + // current + for (unsigned i=1; iosdmap->get_epoch(); + epoch++) { + OSDMap omap; + osd->get_map(epoch, omap); + + vector acting; + omap.pg_to_acting_osds(get_pgid(), acting); + + for (unsigned i=0; iosdmap->is_up(acting[i]) && // is up now + acting[i] != osd->whoami) // and is not me + prior_set.insert(acting[i]); + } + } + + dout(10) << "build_prior built " << prior_set << endl; +} + +void PG::adjust_prior() +{ + assert(!prior_set.empty()); + + // raise last_epoch_started_any + epoch_t max = 0; + for (map::iterator it = peer_info.begin(); + it != peer_info.end(); + it++) { + if (it->second.last_epoch_started > max) + max = it->second.last_epoch_started; + } + + dout(10) << "adjust_prior last_epoch_started_any " + << last_epoch_started_any << " -> " << max << endl; + assert(max > last_epoch_started_any); + last_epoch_started_any = max; + + // rebuild prior set + build_prior(); +} + + +void PG::clear_primary_state() +{ + dout(10) << "clear_primary_state" << endl; + + // clear peering state + have_master_log = false; + prior_set.clear(); + stray_set.clear(); + clean_set.clear(); + peer_info_requested.clear(); + peer_log_requested.clear(); + peer_info.clear(); + peer_missing.clear(); + + last_epoch_started_any = info.last_epoch_started; +} + +void PG::peer(ObjectStore::Transaction& t, + map< int, map >& query_map) +{ + dout(10) << "peer. acting is " << acting + << ", prior_set is " << prior_set << endl; + + + /** GET ALL PG::Info *********/ + + // -- query info from everyone in prior_set. 
+ bool missing_info = false; + for (set::iterator it = prior_set.begin(); + it != prior_set.end(); + it++) { + if (peer_info.count(*it)) { + dout(10) << " have info from osd" << *it + << ": " << peer_info[*it] + << endl; + continue; + } + missing_info = true; + + if (peer_info_requested.count(*it)) { + dout(10) << " waiting for osd" << *it << endl; + continue; + } + + dout(10) << " querying info from osd" << *it << endl; + query_map[*it][info.pgid] = Query(Query::INFO, info.history); + peer_info_requested.insert(*it); + } + if (missing_info) return; + + + // -- ok, we have all (prior_set) info. (and maybe others.) + + // did we crash? + dout(10) << " last_epoch_started_any " << last_epoch_started_any << endl; + if (last_epoch_started_any) { + OSDMap omap; + osd->get_map(last_epoch_started_any, omap); + + // start with the last active set of replicas + set last_started; + vector acting; + omap.pg_to_acting_osds(get_pgid(), acting); + for (unsigned i=0; iosdmap->get_epoch(); + e++) { + OSDMap omap; + osd->get_map(e, omap); + + set still_up; + + for (set::iterator i = last_started.begin(); + i != last_started.end(); + i++) { + //dout(10) << " down in epoch " << e << " is " << omap.get_down_osds() << endl; + if (omap.is_up(*i)) + still_up.insert(*i); + } + + last_started.swap(still_up); + //dout(10) << " still active as of epoch " << e << ": " << last_started << endl; + } + + if (last_started.empty()) { + dout(10) << " crashed since epoch " << last_epoch_started_any << endl; + state_set(STATE_CRASHED); + } else { + dout(10) << " still active from last started: " << last_started << endl; + } + } else if (osd->osdmap->get_epoch() > 1) { + dout(10) << " crashed since epoch " << last_epoch_started_any << endl; + state_set(STATE_CRASHED); + } + + dout(10) << " peers_complete_thru " << peers_complete_thru << endl; + + + + + /** CREATE THE MASTER PG::Log *********/ + + // who (of all priors and active) has the latest PG version? 
+ eversion_t newest_update = info.last_update; + int newest_update_osd = osd->whoami; + + oldest_update = info.last_update; // only of acting (current) osd set. + peers_complete_thru = info.last_complete; + + for (map::iterator it = peer_info.begin(); + it != peer_info.end(); + it++) { + if (it->second.last_update > newest_update) { + newest_update = it->second.last_update; + newest_update_osd = it->first; + } + if (is_acting(it->first)) { + if (it->second.last_update < oldest_update) + oldest_update = it->second.last_update; + if (it->second.last_complete < peers_complete_thru) + peers_complete_thru = it->second.last_complete; + } + } + + // gather log(+missing) from that person! + if (newest_update_osd != osd->whoami) { + if (peer_log_requested.count(newest_update_osd) || + peer_summary_requested.count(newest_update_osd)) { + dout(10) << " newest update on osd" << newest_update_osd + << " v " << newest_update + << ", already queried" + << endl; + } else { + // we'd like it back to oldest_update, but will settle for log_bottom + eversion_t since = MAX(peer_info[newest_update_osd].log_bottom, + oldest_update); + if (peer_info[newest_update_osd].log_bottom < log.top) { + dout(10) << " newest update on osd" << newest_update_osd + << " v " << newest_update + << ", querying since " << since + << endl; + query_map[newest_update_osd][info.pgid] = Query(Query::LOG, log.top, since, info.history); + peer_log_requested.insert(newest_update_osd); + } else { + dout(10) << " newest update on osd" << newest_update_osd + << " v " << newest_update + << ", querying entire summary/backlog" + << endl; + assert((peer_info[newest_update_osd].last_complete >= + peer_info[newest_update_osd].log_bottom) || + peer_info[newest_update_osd].log_backlog); // or else we're in trouble. 
+ query_map[newest_update_osd][info.pgid] = Query(Query::BACKLOG, info.history); + peer_summary_requested.insert(newest_update_osd); + } + } + return; + } else { + dout(10) << " newest_update " << info.last_update << " (me)" << endl; + } + + dout(10) << " oldest_update " << oldest_update << endl; + + have_master_log = true; + + + // -- do i need to generate backlog for any of my peers? + if (oldest_update < log.bottom && !log.backlog) { + dout(10) << "generating backlog for some peers, bottom " + << log.bottom << " > " << oldest_update + << endl; + generate_backlog(); + } + + + /** COLLECT MISSING+LOG FROM PEERS **********/ + /* + we also detect divergent replicas here by pulling the full log + from everyone. + */ + + // gather missing from peers + for (unsigned i=1; i 0) { + dout(10) << "there are still " << missing.num_lost() << " lost objects" << endl; + + // ***** + // FIXME: i don't think this actually accomplishes anything! + // ***** + + // ok, let's get more summaries! + bool waiting = false; + for (map::iterator it = peer_info.begin(); + it != peer_info.end(); + it++) { + int peer = it->first; + + if (peer_summary_requested.count(peer)) { + dout(10) << " already requested summary/backlog from osd" << peer << endl; + waiting = true; + continue; + } + + dout(10) << " requesting summary/backlog from osd" << peer << endl; + query_map[peer][info.pgid] = Query(Query::BACKLOG, info.history); + peer_summary_requested.insert(peer); + waiting = true; + } + + if (!waiting) { + dout(10) << missing.num_lost() << " objects are still lost, waiting+hoping for a notify from someone else!" << endl; + } + return; + } + + // sanity check + assert(missing.num_lost() == 0); + assert(info.last_complete >= log.bottom || log.backlog); + + + // -- crash recovery? 
+ if (is_crashed()) { + dout(10) << "crashed, allowing op replay for " << g_conf.osd_replay_window << endl; + state_set(STATE_REPLAY); + g_timer.add_event_after(g_conf.osd_replay_window, + new OSD::C_Activate(osd, info.pgid, osd->osdmap->get_epoch())); + } + else if (!is_active()) { + // -- ok, activate! + activate(t); + } +} + + +void PG::activate(ObjectStore::Transaction& t) +{ + assert(!is_active()); + + // twiddle pg state + state_set(STATE_ACTIVE); + state_clear(STATE_STRAY); + if (is_crashed()) { + assert(is_replay()); + state_clear(STATE_CRASHED); + state_clear(STATE_REPLAY); + } + info.last_epoch_started = osd->osdmap->get_epoch(); + + if (role == 0) { // primary state + peers_complete_thru = 0; // we don't know (yet)! + } + + assert(info.last_complete >= log.bottom || log.backlog); + + // write pg info + t.collection_setattr(info.pgid, "info", (char*)&info, sizeof(info)); + + // write log + write_log(t); + + // clean up stray objects + clean_up_local(t); + + // init complete pointer + if (info.last_complete == info.last_update) { + dout(10) << "activate - complete" << endl; + log.complete_to == log.log.end(); + log.requested_to = log.log.end(); + } + //else if (is_primary()) { + else if (true) { + dout(10) << "activate - not complete, " << missing << ", starting recovery" << endl; + + // init complete_to + log.complete_to = log.log.begin(); + while (log.complete_to->version < info.last_complete) { + log.complete_to++; + assert(log.complete_to != log.log.end()); + } + + // start recovery + log.requested_to = log.complete_to; + do_recovery(); + } else { + dout(10) << "activate - not complete, " << missing << endl; + } + + + // if primary.. + if (role == 0 && + osd->osdmap->get_epoch() > 1) { + // who is clean? 
+ clean_set.clear(); + if (info.is_clean()) + clean_set.insert(osd->whoami); + + // start up replicas + for (unsigned i=1; iosdmap->get_epoch(), + info.pgid); + m->info = info; + + if (peer_info[peer].last_update == info.last_update) { + // empty log + } + else if (peer_info[peer].last_update < log.bottom) { + // summary/backlog + assert(log.backlog); + m->log = log; + } + else { + // incremental log + assert(peer_info[peer].last_update < info.last_update); + m->log.copy_after(log, peer_info[peer].last_update); + } + + // update local version of peer's missing list! + { + eversion_t plu = peer_info[peer].last_update; + Missing& pm = peer_missing[peer]; + for (list::iterator p = m->log.log.begin(); + p != m->log.log.end(); + p++) + if (p->version > plu) + pm.add(p->oid, p->version); + } + + dout(10) << "activate sending " << m->log << " " << m->missing + << " to osd" << peer << endl; + //m->log.print(cout); + osd->messenger->send_message(m, MSG_ADDR_OSD(peer), osd->osdmap->get_inst(peer)); + + // update our missing + if (peer_missing[peer].num_missing() == 0) { + dout(10) << "activate peer osd" << peer << " already clean, " << peer_info[peer] << endl; + assert(peer_info[peer].last_complete == info.last_update); + clean_set.insert(peer); + } else { + dout(10) << "activate peer osd" << peer << " " << peer_info[peer] + << " missing " << peer_missing[peer] << endl; + } + + } + + // discard unneeded peering state + //peer_log.clear(); // actually, do this carefully, in case peer() is called again. + + // all clean? + if (is_all_clean()) { + state_set(STATE_CLEAN); + dout(10) << "activate all replicas clean" << endl; + clean_replicas(); + } + } + + + // replay (queue them _before_ other waiting ops!) 
+ if (!replay_queue.empty()) { + eversion_t c = info.last_update; + list replay; + for (map::iterator p = replay_queue.begin(); + p != replay_queue.end(); + p++) { + if (p->first <= info.last_update) { + dout(10) << "activate will WRNOOP " << p->first << " " << *p->second << endl; + replay.push_back(p->second); + continue; + } + if (p->first.version != c.version+1) { + dout(10) << "activate replay " << p->first + << " skipping " << c.version+1 - p->first.version + << " ops" + << endl; + } + dout(10) << "activate replay " << p->first << " " << *p->second << endl; + replay.push_back(p->second); + c = p->first; + } + replay_queue.clear(); + osd->take_waiters(replay); + } + + // waiters + osd->take_waiters(waiting_for_active); +} + +/** clean_up_local + * remove any objects that we're storing but shouldn't. + * as determined by log. + */ +void PG::clean_up_local(ObjectStore::Transaction& t) +{ + dout(10) << "clean_up_local" << endl; + + assert(info.last_update >= log.bottom); // otherwise we need some help! + + if (log.backlog) { + // be thorough. + list ls; + osd->store->collection_list(info.pgid, ls); + set s; + + for (list::iterator i = ls.begin(); + i != ls.end(); + i++) + s.insert(*i); + + set did; + for (list::reverse_iterator p = log.log.rbegin(); + p != log.log.rend(); + p++) { + if (did.count(p->oid)) continue; + did.insert(p->oid); + + if (p->is_delete()) { + if (s.count(p->oid)) { + dout(10) << " deleting " << p->oid + << " when " << p->version << endl; + t.remove(p->oid); + } + s.erase(p->oid); + } else { + // just leave old objects.. they're missing or whatever + s.erase(p->oid); + } + } + + for (set::iterator i = s.begin(); + i != s.end(); + i++) { + dout(10) << " deleting stray " << *i << endl; + t.remove(*i); + } + + } else { + // just scan the log. 
+    set did;
+    for (list::reverse_iterator p = log.log.rbegin();
+         p != log.log.rend();
+         p++) {
+      if (did.count(p->oid)) continue;
+      did.insert(p->oid);
+
+      if (p->is_delete()) {
+        dout(10) << " deleting " << p->oid
+                 << " when " << p->version << endl;
+        t.remove(p->oid);
+      } else {
+        // keep old(+missing) objects, just for kicks.
+      }
+    }
+  }
+}
+
+
+
+/**
+ * cancel_recovery - abandon every pull this pg has in flight.
+ *
+ * clears the recorded locations of missing objects, subtracts this pg's
+ * outstanding pulls from the osd-wide num_pulling counter, and empties
+ * objects_pulling.
+ */
+void PG::cancel_recovery()
+{
+  // forget about where missing items are, or anything we're pulling
+  missing.loc.clear();
+  osd->num_pulling -= objects_pulling.size();
+  objects_pulling.clear();
+}
+
+/**
+ * do one recovery op.
+ *
+ * return true if a pull was started, or if the osd is already at
+ * osd_max_pull while this pg still has pulls outstanding (i.e. call again
+ * later).  return false if there is nothing left to request: either we are
+ * only waiting for outstanding pulls, or recovery is complete.
+ */
+bool PG::do_recovery()
+{
+  dout(-10) << "do_recovery pulling " << objects_pulling.size() << " in pg, "
+            << osd->num_pulling << "/" << g_conf.osd_max_pull << " total"
+            << endl;
+  dout(10) << "do_recovery " << missing << endl;
+
+  // can we slow down on this PG?
+  if (osd->num_pulling >= g_conf.osd_max_pull && !objects_pulling.empty()) {
+    dout(-10) << "do_recovery already pulling max, waiting" << endl;
+    return true;
+  }
+
+  // look at log!
+  Log::Entry *latest = 0;
+
+  while (log.requested_to != log.log.end()) {
+    // the index holds the newest entry for each object; that is the
+    // version worth pulling, not necessarily the entry we are standing on.
+    assert(log.objects.count(log.requested_to->oid));
+    latest = log.objects[log.requested_to->oid];
+    assert(latest);
+
+    dout(10) << "do_recovery "
+             << *log.requested_to
+             << (objects_pulling.count(latest->oid) ? " (pulling)":"")
+             << endl;
+
+    if (latest->is_update() &&
+        !objects_pulling.count(latest->oid) &&
+        missing.is_missing(latest->oid)) {
+      osd->pull(this, latest->oid);
+      return true;
+    }
+
+    log.requested_to++;
+  }
+
+  if (!objects_pulling.empty()) {
+    dout(7) << "do_recovery requested everything, still waiting" << endl;
+    return false;
+  }
+
+  // done?
+  assert(missing.num_missing() == 0);
+  assert(info.last_complete == info.last_update);
+
+  if (is_primary()) {
+    // i am primary
+    dout(7) << "do_recovery complete, cleaning strays" << endl;
+    clean_set.insert(osd->whoami);
+    if (is_all_clean()) {
+      state_set(PG::STATE_CLEAN);
+      clean_replicas();
+    }
+  } else {
+    // tell primary
+    dout(7) << "do_recovery complete, telling primary" << endl;
+    list<Info> ls;   // element type restored: we push_back(info), an Info
+    ls.push_back(info);
+    osd->messenger->send_message(new MOSDPGNotify(osd->osdmap->get_epoch(),
+                                                  ls),
+                                 MSG_ADDR_OSD(get_primary()), osd->osdmap->get_inst(get_primary()));
+  }
+
+  return false;
+}
+
+// NOTE(review): both `for` headers in do_peer_recovery() are truncated in
+// this copy of the patch (everything between a '<' and the following '>' was
+// lost).  the body is left exactly as found; restore it from the original
+// PG.cc before building.
+void PG::do_peer_recovery()
+{
+  dout(10) << "do_peer_recovery" << endl;
+
+  for (unsigned i=0; isecond;
+    eversion_t v = peer_missing[peer].rmissing.begin()->first;
+
+    osd->push(this, oid, peer);
+
+    // do other peers need it too?
+    for (i++; ipush(this, oid, peer);
+    }
+
+    return;
+  }
+
+  // nothing to do!
+}
+
+
+
+/**
+ * clean_replicas - send an MOSDPGRemove for this pg to every osd in
+ * stray_set, then forget the strays.
+ */
+void PG::clean_replicas()
+{
+  dout(10) << "clean_replicas. strays are " << stray_set << endl;
+
+  // NOTE(review): iterator type restored as set<int>; presumably stray_set
+  // holds osd ids (its elements are fed to MSG_ADDR_OSD) -- confirm in PG.h.
+  for (set<int>::iterator p = stray_set.begin();
+       p != stray_set.end();
+       p++) {
+    dout(10) << "sending PGRemove to osd" << *p << endl;
+    set<pg_t> ls;   // element type restored: we insert info.pgid, a pg_t
+    ls.insert(info.pgid);
+    MOSDPGRemove *m = new MOSDPGRemove(osd->osdmap->get_epoch(), ls);
+    osd->messenger->send_message(m, MSG_ADDR_OSD(*p), osd->osdmap->get_inst(*p));
+  }
+
+  stray_set.clear();
+}
+
+
+
+/**
+ * write_log - rewrite the whole on-disk log from the in-memory log.
+ *
+ * rebuilds ondisklog.block_map (version of the entry found at each offset
+ * that is a multiple of 4096), replaces object (1,pgid) with the raw
+ * entries, and stores the ondisklog bounds and the pg info as collection
+ * attributes.  everything is queued on t.
+ */
+void PG::write_log(ObjectStore::Transaction& t)
+{
+  // assemble buffer
+  bufferlist bl;
+
+  // build buffer
+  ondisklog.bottom = 0;
+  ondisklog.block_map.clear();
+  for (list<Log::Entry>::iterator p = log.log.begin();
+       p != log.log.end();
+       p++) {
+    if (bl.length() % 4096 == 0)
+      ondisklog.block_map[bl.length()] = p->version;
+    bl.append((char*)&(*p), sizeof(*p));
+  }
+  ondisklog.top = bl.length();
+
+  // write it
+  t.remove( object_t(1,info.pgid) );
+  t.write( object_t(1,info.pgid) , 0, bl.length(), bl);
+  t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
+  t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+
+  t.collection_setattr(info.pgid, "info", &info, sizeof(info));
+}
+
+/**
+ * trim_ondisklog_to - advance ondisklog.bottom to the start of the last
+ * indexed block whose successor begins after v (or to the last block, if
+ * none does), and drop the block_map entries before it.
+ *
+ * does nothing if block_map is empty or if that block is already the first
+ * one.  the new bounds are queued on t as collection attributes.
+ */
+void PG::trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v)
+{
+  dout(15) << " trim_ondisk_log_to v " << v << endl;
+
+  // nothing indexed means nothing to trim.  bail out here: the scan below
+  // would otherwise leave p == end() and the debug line would dereference it.
+  if (ondisklog.block_map.empty())
+    return;
+
+  map<off_t,eversion_t>::iterator p = ondisklog.block_map.begin();
+  while (p != ondisklog.block_map.end()) {
+    dout(15) << " " << p->first << " -> " << p->second << endl;
+    p++;
+    if (p == ondisklog.block_map.end() ||
+        p->second > v) {  // too far!
+      p--;                // back up
+      break;
+    }
+  }
+  dout(15) << " * " << p->first << " -> " << p->second << endl;
+  if (p == ondisklog.block_map.begin())
+    return;  // can't trim anything!
+
+  // we can trim!
+  off_t trim = p->first;
+  ondisklog.bottom = trim;   // set first, so the message shows the new range
+  dout(10) << " trimming ondisklog to [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl;
+
+  // adjust block_map
+  while (p != ondisklog.block_map.begin())
+    ondisklog.block_map.erase(ondisklog.block_map.begin());
+
+  t.collection_setattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
+  t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+}
+
+
+/**
+ * append_log - append one entry to the on-disk log and optionally trim.
+ *
+ * writes logentry at ondisklog.top (padded to 4096 bytes when
+ * g_conf.osd_pad_pg_log is set), records it in block_map if it starts on a
+ * 4096-byte offset, and advances ondisklog.top.  if trim_to is past
+ * log.bottom, trims the in-memory log, copies the new bottom/backlog into
+ * info, and trims the on-disk log as well.
+ */
+void PG::append_log(ObjectStore::Transaction& t, PG::Log::Entry& logentry,
+                    eversion_t trim_to)
+{
+  // write entry on disk
+  bufferlist bl;
+  bl.append( (char*)&logentry, sizeof(logentry) );
+  if (g_conf.osd_pad_pg_log) {  // pad to 4k, until i fix ebofs reallocation crap. FIXME.
+    bufferptr bp(4096 - sizeof(logentry));
+    bl.push_back(bp);
+  }
+  t.write( object_t(1,info.pgid), ondisklog.top, bl.length(), bl );
+
+  // update block map?
+  if (ondisklog.top % 4096 == 0)
+    ondisklog.block_map[ondisklog.top] = logentry.version;
+
+  ondisklog.top += bl.length();
+  t.collection_setattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+
+  // trim?
+  if (trim_to > log.bottom) {
+    dout(10) << " trimming " << log << " to " << trim_to << endl;
+    log.trim(t, trim_to);
+    info.log_bottom = log.bottom;
+    info.log_backlog = log.backlog;
+    trim_ondisklog_to(t, trim_to);
+  }
+  dout(10) << " ondisklog [" << ondisklog.bottom << "," << ondisklog.top << ")" << endl;
+}
+
+/**
+ * read_log - load the on-disk log and rebuild the in-memory state.
+ *
+ * reads the ondisklog bounds from collection attributes, reads the entries
+ * in [bottom,top) from object (1,pgid), keeps those newer than log.bottom
+ * (or all of them if we have a backlog), re-indexes the log, and rebuilds
+ * `missing` from entries newer than info.last_complete whose stored
+ * "version" attribute is absent or older than the logged version.
+ */
+void PG::read_log(ObjectStore *store)
+{
+  // load bounds
+  ondisklog.bottom = ondisklog.top = 0;
+  store->collection_getattr(info.pgid, "ondisklog_bottom", &ondisklog.bottom, sizeof(ondisklog.bottom));
+  store->collection_getattr(info.pgid, "ondisklog_top", &ondisklog.top, sizeof(ondisklog.top));
+
+  log.backlog = info.log_backlog;
+  log.bottom = info.log_bottom;
+
+  if (ondisklog.top > 0) {
+    // read
+    bufferlist bl;
+    store->read(object_t(1,info.pgid), ondisklog.bottom, ondisklog.top-ondisklog.bottom, bl);
+
+    // NOTE(review): append_log() pads every entry to 4096 bytes when
+    // g_conf.osd_pad_pg_log is set, but this loop always steps by
+    // sizeof(e).  confirm that padded logs are read back correctly.
+    PG::Log::Entry e;
+    off_t pos = ondisklog.bottom;
+    while (pos < ondisklog.top) {
+      bl.copy(pos-ondisklog.bottom, sizeof(e), (char*)&e);
+      if (e.version > log.bottom || log.backlog) {  // ignore items below log.bottom
+        if (pos % 4096 == 0)
+          ondisklog.block_map[pos] = e.version;
+        log.log.push_back(e);
+      }
+
+      pos += sizeof(e);
+    }
+  }
+  log.top = info.last_update;
+  log.index();
+
+  // build missing
+  // walk newest to oldest so only the latest entry per object is considered.
+  set<object_t> did;
+  for (list<Log::Entry>::reverse_iterator i = log.log.rbegin();
+       i != log.log.rend();
+       i++) {
+    if (i->version <= info.last_complete) break;
+    if (did.count(i->oid)) continue;
+    did.insert(i->oid);
+
+    if (i->is_delete()) continue;
+
+    eversion_t v;
+    int r = osd->store->getattr(i->oid, "version", &v, sizeof(v));
+    if (r < 0 || v < i->version)
+      missing.add(i->oid, i->version);
+  }
+}
+
diff --git a/branches/sage/cephmds2/osd/PG.h b/branches/sage/cephmds2/osd/PG.h
new file mode 100644
index 0000000000000..f8a040346e88e
--- /dev/null
+++ b/branches/sage/cephmds2/osd/PG.h
@@ -0,0 +1,735 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+
* + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#ifndef __PG_H +#define __PG_H + + +#include "include/types.h" +#include "include/buffer.h" + +#include "OSDMap.h" +#include "ObjectStore.h" +#include "msg/Messenger.h" +#include "messages/MOSDOpReply.h" + +#include "include/types.h" + +#include +using namespace std; + +#include +using namespace __gnu_cxx; + + +class OSD; + +/* reqid_t - caller + tid to unique identify this request + */ +class reqid_t { +public: + msg_addr_t addr; + tid_t tid; + reqid_t() : tid(0) {} + reqid_t(const msg_addr_t& a, tid_t t) : addr(a), tid(t) {} +}; + +inline ostream& operator<<(ostream& out, const reqid_t& r) { + return out << r.addr << "." << r.tid; +} +inline bool operator==(const reqid_t& l, const reqid_t& r) { + return (l.addr == r.addr) && (l.tid == r.tid); +} +inline bool operator!=(const reqid_t& l, const reqid_t& r) { + return (l.addr != r.addr) || (l.tid != r.tid); +} + +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(const reqid_t &r) const { + static hash H; + static hash<__uint64_t> I; + return H(r.addr._addr) ^ I(r.tid); + } + }; +} + +/** PG - Replica Placement Group + * + */ + +class PG { +public: + + /* + * PG::Info - summary of PG statistics. + * + * some notes: + * - last_complete implies we have all objects that existed as of that + * stamp, OR a newer object, OR have already applied a later delete. + * - if last_complete >= log.bottom, then we know pg contents thru log.top. + * otherwise, we have no idea what the pg is supposed to contain. + */ + struct Info { + pg_t pgid; + eversion_t last_update; // last object version applied to store. + eversion_t last_complete; // last version pg was complete through. + + eversion_t log_bottom; // oldest log entry. + bool log_backlog; // do we store a complete log? 
+ + epoch_t last_epoch_started; // last epoch started. + epoch_t last_epoch_finished; // last epoch finished. + + struct History { + epoch_t same_since; // same acting set since + epoch_t same_primary_since; // same primary at least back through this epoch. + epoch_t same_acker_since; // same acker at least back through this epoch. + History() : same_since(0), same_primary_since(0), same_acker_since(0) {} + } history; + + Info(pg_t p=0) : pgid(p), + log_backlog(false), + last_epoch_started(0), last_epoch_finished(0) {} + bool is_clean() const { return last_update == last_complete; } + bool is_empty() const { return last_update.version == 0; } + }; + + + /** + * Query - used to ask a peer for information about a pg. + * + * note: if version=0, type=LOG, then we just provide our full log. + * only if type=BACKLOG do we generate a backlog and provide that too. + */ + struct Query { + const static int INFO = 0; + const static int LOG = 1; + const static int BACKLOG = 2; + const static int FULLLOG = 3; + + int type; + eversion_t split, floor; + Info::History history; + + Query() : type(-1) {} + Query(int t, Info::History& h) : + type(t), history(h) { assert(t != LOG); } + Query(int t, eversion_t s, eversion_t f, Info::History& h) : + type(t), split(s), floor(f), history(h) { assert(t == LOG); } + }; + + + /* + * Missing - summary of missing objects. + * kept in memory, as a supplement to Log. + * also used to pass missing info in messages. + */ + class Missing { + public: + map missing; // oid -> v + map rmissing; // v -> oid + + map loc; // where i think i can get them. 
+ + int num_lost() const { return missing.size() - loc.size(); } + int num_missing() const { return missing.size(); } + + bool is_missing(object_t oid) { + return missing.count(oid); + } + bool is_missing(object_t oid, eversion_t v) { + return missing.count(oid) && missing[oid] <= v; + } + void add(object_t oid) { + eversion_t z; + add(oid,z); + } + void add(object_t oid, eversion_t v) { + if (missing.count(oid)) { + if (missing[oid] > v) return; // already missing newer. + rmissing.erase(missing[oid]); + } + missing[oid] = v; + rmissing[v] = oid; + } + void rm(object_t oid, eversion_t when) { + if (missing.count(oid) && missing[oid] < when) { + rmissing.erase(missing[oid]); + missing.erase(oid); + loc.erase(oid); + } + } + void got(object_t oid, eversion_t v) { + assert(missing.count(oid)); + assert(missing[oid] <= v); + loc.erase(oid); + rmissing.erase(missing[oid]); + missing.erase(oid); + } + void got(object_t oid) { + assert(missing.count(oid)); + loc.erase(oid); + rmissing.erase(missing[oid]); + missing.erase(oid); + } + + void _encode(bufferlist& blist) { + ::_encode(missing, blist); + ::_encode(loc, blist); + } + void _decode(bufferlist& blist, int& off) { + ::_decode(missing, blist, off); + ::_decode(loc, blist, off); + + for (map::iterator it = missing.begin(); + it != missing.end(); + it++) + rmissing[it->second] = it->first; + } + }; + + + /* + * Log - incremental log of recent pg changes. + * also, serves as a recovery queue. + * + * when backlog is true, + * objects with versions <= bottom are in log. + * we do not have any deletion info before that time, however. + * log is a "summary" in that it contains all objects in the PG. + */ + class Log { + public: + /** top, bottom + * top - newest entry (update|delete) + * bottom - entry previous to oldest (update|delete) for which we have + * complete negative information. + * i.e. we can infer pg contents for any store whose last_update >= bottom. 
+ */ + eversion_t top; // newest entry (update|delete) + eversion_t bottom; // version prior to oldest (update|delete) + + /** backlog - true if log is a complete summary of pg contents. + * updated will include all items in pg, but deleted will not include + * negative entries for items deleted prior to 'bottom'. + */ + bool backlog; + + /** Entry + * mapped from the eversion_t, so don't include that. + */ + class Entry { + public: + const static int LOST = 0; + const static int MODIFY = 1; + const static int CLONE = 2; + const static int DELETE = 3; + + int op; // write, zero, trunc, remove + object_t oid; + eversion_t version; + objectrev_t rev; + + reqid_t reqid; // caller+tid to uniquely identify request + + Entry() : op(0) {} + Entry(int _op, object_t _oid, const eversion_t& v, + const msg_addr_t& a, tid_t t) : + op(_op), oid(_oid), version(v), reqid(a,t) {} + + bool is_delete() const { return op == DELETE; } + bool is_clone() const { return op == CLONE; } + bool is_modify() const { return op == MODIFY; } + bool is_update() const { return is_clone() || is_modify(); } + }; + + list log; // the actual log. 
+ + Log() : backlog(false) {} + + void clear() { + eversion_t z; + top = bottom = z; + backlog = false; + log.clear(); + } + bool empty() const { + return top.version == 0 && top.epoch == 0; + } + + void _encode(bufferlist& blist) const { + blist.append((char*)&top, sizeof(top)); + blist.append((char*)&bottom, sizeof(bottom)); + blist.append((char*)&backlog, sizeof(backlog)); + ::_encode(log, blist); + } + void _decode(bufferlist& blist, int& off) { + blist.copy(off, sizeof(top), (char*)&top); + off += sizeof(top); + blist.copy(off, sizeof(bottom), (char*)&bottom); + off += sizeof(bottom); + blist.copy(off, sizeof(backlog), (char*)&backlog); + off += sizeof(backlog); + + ::_decode(log, blist, off); + } + + void copy_after(const Log &other, eversion_t v); + bool copy_after_unless_divergent(const Log &other, eversion_t split, eversion_t floor); + void copy_non_backlog(const Log &other); + ostream& print(ostream& out) const; + }; + + /** + * IndexLog - adds in-memory index of the log, by oid. + * plus some methods to manipulate it all. + */ + class IndexedLog : public Log { + public: + hash_map objects; // ptrs into log. be careful! 
+ hash_set caller_ops; + + // recovery pointers + list::iterator requested_to; // not inclusive of referenced item + list::iterator complete_to; // not inclusive of referenced item + + /****/ + IndexedLog() {} + + void clear() { + assert(0); + unindex(); + Log::clear(); + } + + bool logged_object(object_t oid) { + return objects.count(oid); + } + bool logged_req(reqid_t &r) { + return caller_ops.count(r); + } + + void index() { + objects.clear(); + caller_ops.clear(); + for (list::iterator i = log.begin(); + i != log.end(); + i++) { + objects[i->oid] = &(*i); + caller_ops.insert(i->reqid); + } + } + + void index(Entry& e) { + if (objects.count(e.oid) == 0 || + objects[e.oid]->version < e.version) + objects[e.oid] = &e; + caller_ops.insert(e.reqid); + } + void unindex() { + objects.clear(); + caller_ops.clear(); + } + void unindex(Entry& e) { + // NOTE: this only works if we remove from the _bottom_ of the log! + assert(objects.count(e.oid)); + if (objects[e.oid]->version == e.version) + objects.erase(e.oid); + caller_ops.erase(e.reqid); + } + + + // accessors + Entry *is_updated(object_t oid) { + if (objects.count(oid) && objects[oid]->is_update()) return objects[oid]; + return 0; + } + Entry *is_deleted(object_t oid) { + if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid]; + return 0; + } + + // actors + void add(Entry& e) { + // add to log + log.push_back(e); + assert(e.version > top); + assert(top.version == 0 || e.version.version > top.version); + top = e.version; + + // to our index + objects[e.oid] = &(log.back()); + caller_ops.insert(e.reqid); + } + + void trim(ObjectStore::Transaction &t, eversion_t s); + void trim_write_ahead(eversion_t last_update); + }; + + + /** + * OndiskLog - some info about how we store the log on disk. + */ + class OndiskLog { + public: + // ok + off_t bottom; // first byte of log. + off_t top; // byte following end of log. 
+ map block_map; // block -> first stamp logged there + + OndiskLog() : bottom(0), top(0) {} + + bool trim_to(eversion_t v, ObjectStore::Transaction& t); + }; + + + /*** + */ + + class RepOpGather { + public: + class MOSDOp *op; + tid_t rep_tid; + + ObjectStore::Transaction t; + bool applied; + + set waitfor_ack; + set waitfor_commit; + + utime_t start; + + bool sent_ack, sent_commit; + + set osds; + eversion_t new_version; + + eversion_t pg_local_last_complete; + map pg_complete_thru; + + RepOpGather(MOSDOp *o, tid_t rt, eversion_t nv, eversion_t lc) : + op(o), rep_tid(rt), + applied(false), + sent_ack(false), sent_commit(false), + new_version(nv), + pg_local_last_complete(lc) { } + + bool can_send_ack() { + return !sent_ack && !sent_commit && + waitfor_ack.empty(); + } + bool can_send_commit() { + return !sent_commit && + waitfor_ack.empty() && waitfor_commit.empty(); + } + bool can_delete() { + return waitfor_ack.empty() && waitfor_commit.empty(); + } + }; + + + /*** PG ****/ +public: + // any + static const int STATE_ACTIVE = 1; // i am active. (primary: replicas too) + + // primary + static const int STATE_CLEAN = 2; // peers are complete, clean of stray replicas. + static const int STATE_CRASHED = 4; // all replicas went down. + static const int STATE_REPLAY = 8; // crashed, waiting for replay + + // non-primary + static const int STATE_STRAY = 16; // i must notify the primary i exist. + + + protected: + OSD *osd; + +public: + // pg state + Info info; + IndexedLog log; + OndiskLog ondisklog; + Missing missing; + utime_t last_heartbeat; // + +protected: + int role; // 0 = primary, 1 = replica, -1=none. + int state; // see bit defns above + + // primary state + public: + vector acting; + epoch_t last_epoch_started_any; + eversion_t last_complete_commit; + + // [primary only] content recovery state + eversion_t peers_complete_thru; + bool have_master_log; + protected: + set prior_set; // current+prior OSDs, as defined by last_epoch_started_any. 
+ set stray_set; // non-acting osds that have PG data. + set clean_set; // current OSDs that are clean + eversion_t oldest_update; // lowest (valid) last_update in active set + map peer_info; // info from peers (stray or prior) + set peer_info_requested; + map peer_missing; + set peer_log_requested; // logs i've requested (and start stamps) + set peer_summary_requested; + friend class OSD; + + + // [primary|tail] + // old way + map replica_ops; + map > replica_tids_by_osd; // osd -> (tid,...) + + // new way + map repop_gather; + map > waiting_for_repop; + + + // [primary|replica] + // pg waiters + list waiting_for_active; + hash_map > waiting_for_missing_object; + map replay_queue; + + // recovery + map objects_pulling; // which objects are currently being pulled + +public: + void clear_primary_state(); + + public: + bool is_acting(int osd) const { + for (unsigned i=0; i peers_complete_thru) { + peers_complete_thru = t; + return true; + } + return false; + } + + void proc_replica_log(Log &olog, Missing& omissing, int from); + void merge_log(Log &olog, Missing& omissing, int from); + void proc_missing(Log &olog, Missing &omissing, int fromosd); + + void generate_backlog(); + void drop_backlog(); + + void trim_write_ahead(); + + void peer(ObjectStore::Transaction& t, map< int, map >& query_map); + + void activate(ObjectStore::Transaction& t); + + void cancel_recovery(); + bool do_recovery(); + void do_peer_recovery(); + + void clean_replicas(); + + off_t get_log_write_pos() { + return 0; + } + + public: + PG(OSD *o, pg_t p) : + osd(o), + info(p), + role(0), + state(0), + last_epoch_started_any(0), + last_complete_commit(0), + peers_complete_thru(0), + have_master_log(true) + { } + + pg_t get_pgid() const { return info.pgid; } + int get_nrep() const { return acting.size(); } + + int get_primary() { return acting.empty() ? -1:acting[0]; } + //int get_tail() { return acting.empty() ? 
-1:acting[ acting.size()-1 ]; } + //int get_acker() { return g_conf.osd_rep == OSD_REP_PRIMARY ? get_primary():get_tail(); } + int get_acker() { + if (g_conf.osd_rep == OSD_REP_PRIMARY || + acting.size() <= 1) + return get_primary(); + return acting[1]; + } + + int get_role() const { return role; } + void set_role(int r) { role = r; } + + bool is_primary() const { return role == PG_ROLE_HEAD; } + bool is_acker() const { return role == PG_ROLE_ACKER; } + bool is_head() const { return role == PG_ROLE_HEAD; } + bool is_middle() const { return role == PG_ROLE_MIDDLE; } + bool is_residual() const { return role == PG_ROLE_STRAY; } + + //int get_state() const { return state; } + bool state_test(int m) const { return (state & m) != 0; } + void state_set(int m) { state |= m; } + void state_clear(int m) { state &= ~m; } + + bool is_complete() const { return info.last_complete == info.last_update; } + + bool is_active() const { return state_test(STATE_ACTIVE); } + bool is_crashed() const { return state_test(STATE_CRASHED); } + bool is_replay() const { return state_test(STATE_REPLAY); } + //bool is_complete() { return state_test(STATE_COMPLETE); } + bool is_clean() const { return state_test(STATE_CLEAN); } + bool is_stray() const { return state_test(STATE_STRAY); } + + bool is_empty() const { return info.last_update == 0; } + + int num_active_ops() const { + return objects_pulling.size(); + } + + + // pg on-disk content + void clean_up_local(ObjectStore::Transaction& t); + + // pg on-disk state + void write_log(ObjectStore::Transaction& t); + void append_log(ObjectStore::Transaction& t, + PG::Log::Entry& logentry, + eversion_t trim_to); + void read_log(ObjectStore *store); + void trim_ondisklog_to(ObjectStore::Transaction& t, eversion_t v); + + + +}; + + + +inline ostream& operator<<(ostream& out, const PG::Info::History& h) +{ + return out << h.same_since << "/" << h.same_primary_since << "/" << h.same_acker_since; +} + +inline ostream& operator<<(ostream& out, const 
PG::Info& pgi) +{ + out << "pginfo(" << hex << pgi.pgid << dec; + if (pgi.is_empty()) + out << " empty"; + else + out << " v " << pgi.last_update << "/" << pgi.last_complete + << " (" << pgi.log_bottom << "," << pgi.last_update << "]" + << (pgi.log_backlog ? "+backlog":""); + out << " e " << pgi.last_epoch_started << "/" << pgi.last_epoch_finished + << " " << pgi.history + << ")"; + return out; +} + +inline ostream& operator<<(ostream& out, const PG::Log::Entry& e) +{ + return out << " " << e.version + << (e.is_delete() ? " - ": + (e.is_clone() ? " c ": + (e.is_modify() ? " m ": + " ? "))) + << e.oid << " by " << e.reqid; +} + +inline ostream& operator<<(ostream& out, const PG::Log& log) +{ + out << "log(" << log.bottom << "," << log.top << "]"; + if (log.backlog) out << "+backlog"; + return out; +} + +inline ostream& operator<<(ostream& out, const PG::Missing& missing) +{ + out << "missing(" << missing.num_missing(); + if (missing.num_lost()) out << ", " << missing.num_lost() << " lost"; + out << ")"; + return out; +} + +inline ostream& operator<<(ostream& out, const PG& pg) +{ + out << "pg[" << pg.info + << " r=" << pg.get_role(); + + if (pg.log.bottom != pg.info.log_bottom) + out << " (info mismatch, " << pg.log << ")"; + + if (pg.log.log.empty()) { + // shoudl it be? 
+ if (pg.log.top.version - pg.log.bottom.version != 0) { + out << " (log bound mismatch, empty)"; + } + } else { + if (((pg.log.log.begin()->version.version - 1 != pg.log.bottom.version) && + !pg.log.backlog) || + (pg.log.log.rbegin()->version.version != pg.log.top.version)) { + out << " (log bound mismatch, actual=[" + << pg.log.log.begin()->version << "," + << pg.log.log.rbegin()->version << "])"; + } + } + + if (pg.get_role() == 0) out << " pct " << pg.peers_complete_thru; + if (!pg.have_master_log) out << " !hml"; + if (pg.is_active()) out << " active"; + if (pg.is_crashed()) out << " crashed"; + if (pg.is_replay()) out << " replay"; + if (pg.is_clean()) out << " clean"; + if (pg.is_stray()) out << " stray"; + //out << " (" << pg.log.bottom << "," << pg.log.top << "]"; + if (pg.missing.num_missing()) out << " m=" << pg.missing.num_missing(); + if (pg.missing.num_lost()) out << " l=" << pg.missing.num_lost(); + out << "]"; + + + return out; +} + + +inline ostream& operator<<(ostream& out, PG::RepOpGather& repop) +{ + out << "repop(" << &repop << " rep_tid=" << repop.rep_tid + << " wfack=" << repop.waitfor_ack + << " wfcommit=" << repop.waitfor_commit; + out << " pct=" << repop.pg_complete_thru; + out << " op=" << *(repop.op); + out << " repop=" << &repop; + out << ")"; + return out; +} + + +#endif diff --git a/branches/sage/cephmds2/osd/rush.cc b/branches/sage/cephmds2/osd/rush.cc new file mode 100644 index 0000000000000..aebca7ac1a351 --- /dev/null +++ b/branches/sage/cephmds2/osd/rush.cc @@ -0,0 +1,230 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+
+//
+//
+// rush.cc
+//
+// $Id$
+//
+
+#include <cassert>
+#include <cstdio>
+#include <vector>
+#include "rush.h"
+
+
+// Scramble a key before it is used to seed the RNG.
+// The multiplier equals 2654435761; it is written with unsigned literals
+// because the same expression in plain int overflows.
+static
+unsigned int
+myhash (unsigned int n)
+{
+  unsigned int v = (n ^ 0xdead1234) * (884811920u * 3u + 1u);
+  return (v);
+}
+
+Rush::Rush ()
+{
+  nClusters = 0;
+  totalServers = 0;
+}
+
+//----------------------------------------------------------------------
+//
+// Rush::AddCluster
+//
+//    Add a cluster.  The number of servers in the cluster and
+//    the weight of each server is passed.  The current number of
+//    clusters is returned.
+//
+//    At most RUSH_MAX_CLUSTERS clusters can be added; the per-cluster
+//    tables are fixed-size arrays of that length.
+//
+//----------------------------------------------------------------------
+int
+Rush::AddCluster (int nServers, double weight)
+{
+  // The tables below have RUSH_MAX_CLUSTERS slots; refuse to run off the end.
+  assert (nClusters < RUSH_MAX_CLUSTERS);
+
+  clusterSize[nClusters] = nServers;
+  clusterWeight[nClusters] = weight;
+  if (nClusters == 0) {
+    serversInPrevious[0] = 0;
+    totalWeightBefore[0] = 0.0;
+  } else {
+    // Running totals over all clusters added before this one.
+    serversInPrevious[nClusters] = serversInPrevious[nClusters-1] +
+      clusterSize[nClusters-1];
+    totalWeightBefore[nClusters] =
+      totalWeightBefore[nClusters-1] + (double)clusterSize[nClusters-1] *
+      clusterWeight[nClusters-1];
+  }
+  nClusters += 1;
+  totalServers += nServers;
+#if 0
+  for (int i = 0; i < nClusters; i++) {
+    fprintf (stderr, "size=%-3d prev=%-3d weight=%-6.2f prevWeight=%-8.2f\n",
+             clusterSize[i], serversInPrevious[i], clusterWeight[i],
+             totalWeightBefore[i]);
+  }
+#endif
+  return (nClusters);
+}
+
+
+//----------------------------------------------------------------------
+//
+// Rush::GetServersByKey
+//
+//    This function returns a list of servers on which an object
+//    should be placed.  The servers array must be large enough to
+//    contain the list.
+//
+//    Clusters are visited from the most recently added to the oldest.
+//    Each one takes however many replicas the older clusters cannot hold
+//    (mustAssign) plus a weighted random share of the rest.
+//
+//----------------------------------------------------------------------
+void
+Rush::GetServersByKey (int key, int nReplicas, int servers[])
+{
+  int replicasLeft = nReplicas;
+  int cluster;
+  int mustAssign, numberAssigned;
+  int i, toDraw;
+  int *srv = servers;
+  double myWeight;
+  RushRNG rng;
+
+  // There may not be more replicas than servers!
+  assert (nReplicas <= totalServers);
+
+  for (cluster = nClusters-1; (cluster >= 0) && (replicasLeft > 0); cluster--) {
+    if (serversInPrevious[cluster] < replicasLeft) {
+      mustAssign = replicasLeft - serversInPrevious[cluster];
+    } else {
+      mustAssign = 0;
+    }
+    toDraw = replicasLeft - mustAssign;
+    if (toDraw > (clusterSize[cluster] - mustAssign)) {
+      toDraw = clusterSize[cluster] - mustAssign;
+    }
+    myWeight = (double)clusterSize[cluster] * clusterWeight[cluster];
+    rng.Seed (myhash (key)^cluster, cluster^0xb90738);
+    numberAssigned = mustAssign +
+      rng.HyperGeometricWeighted (toDraw, myWeight,
+                                  totalWeightBefore[cluster] + myWeight,
+                                  clusterWeight[cluster]);
+    if (numberAssigned > 0) {
+      rng.Seed (myhash (key)^cluster ^ 11, cluster^0xfea937);
+      rng.DrawKofN (srv, numberAssigned, clusterSize[cluster]);
+      // DrawKofN yields indices within the cluster; shift to global ids.
+      for (i = 0; i < numberAssigned; i++) {
+        srv[i] += serversInPrevious[cluster];
+      }
+      replicasLeft -= numberAssigned;
+      srv += numberAssigned;
+    }
+  }
+}
+
+
+
+//----------------------------------------------------------------------
+//
+// RushRNG::HyperGeometricWeighted
+//
+//    Use an iterative method to generate a hypergeometric random
+//    variable.  This approach guarantees that, if the number of draws
+//    is reduced, the number of successes must be as well as long as
+//    the seed for the RNG is the same.
+//
+//----------------------------------------------------------------------
+int
+RushRNG::HyperGeometricWeighted (int nDraws, double yesWeighted,
+                                 double totalWeighted, double weightOne)
+{
+  int positives = 0, i;
+  double curRand;
+
+  // If the weight is too small (or is negative), choose zero objects.
+  if (weightOne <= 1e-9 || nDraws == 0) {
+    return (0);
+  }
+
+  // Draw nDraws items from the "bag".  For each positive, subtract off
+  // the weight of an object from the weight of positives remaining.  For
+  // each draw, subtract off the weight of an object from the total weight
+  // remaining.
+  for (i = 0; i < nDraws; i++) {
+    curRand = RandomDouble ();
+    if (curRand < (yesWeighted / totalWeighted)) {
+      positives += 1;
+      yesWeighted -= weightOne;
+    }
+    totalWeighted -= weightOne;
+  }
+  return (positives);
+}
+
+//----------------------------------------------------------------------
+//
+// RushRNG::DrawKofN
+//
+//    Fill vals[0..nToDraw) with distinct values drawn from
+//    0..setSize-1.  vals must have room for nToDraw entries.
+//
+//----------------------------------------------------------------------
+void
+RushRNG::DrawKofN (int vals[], int nToDraw, int setSize)
+{
+  int i, pick;
+
+  assert(nToDraw >= 0 && nToDraw <= setSize);
+
+  // Heap-backed deck: a variable-length array is not standard C++.
+  std::vector<int> deck(setSize);
+
+  for (i = 0; i < setSize; i++) {
+    deck[i] = i;
+  }
+
+  for (i = 0; i < nToDraw; i++) {
+    pick = (int)(RandomDouble () * (double)(setSize - i));
+    if (pick >= setSize-i) pick = setSize-i-1;  // in case
+    //    assert(i >= 0 && i < nToDraw);
+    //    assert(pick >= 0 && pick < setSize);
+    vals[i] = deck[pick];
+    // Move the last live card into the hole so it can still be drawn.
+    deck[pick] = deck[setSize-i-1];
+  }
+}
+
+#define SEED_X  521288629
+#define SEED_Y  362436069
+RushRNG::RushRNG ()
+{
+  Seed (0, 0);
+}
+
+// Set both state words; a zero seed is replaced by SEED_X / SEED_Y.
+void
+RushRNG::Seed (unsigned int seed1, unsigned int seed2)
+{
+  state1 = ((seed1 == 0) ? SEED_X : seed1);
+  state2 = ((seed2 == 0) ? SEED_Y : seed2);
+}
+
+// Advance both state words and combine them into one 32-bit value
+// (low half of state1 in the high bits, low half of state2 in the low bits).
+unsigned int
+RushRNG::RandomInt ()
+{
+  const unsigned int a = 18000;
+  const unsigned int b = 18879;
+  unsigned int rndValue;
+
+  state1 = a * (state1 & 0xffff) + (state1 >> 16);
+  state2 = b * (state2 & 0xffff) + (state2 >> 16);
+  rndValue = (state1 << 16) + (state2 & 0xffff);
+  return (rndValue);
+}
+
+// RandomInt() scaled by 2^-32.
+double
+RushRNG::RandomDouble ()
+{
+  double v;
+
+  v = (double)RandomInt() / (65536.0*65536.0);
+  return (v);
+}
diff --git a/branches/sage/cephmds2/osd/rush.h b/branches/sage/cephmds2/osd/rush.h
new file mode 100644
index 0000000000000..3d880a32415e0
--- /dev/null
+++ b/branches/sage/cephmds2/osd/rush.h
@@ -0,0 +1,60 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+//
+//
+// rush.h
+//
+// Classes and definitions for the RUSH algorithm.
+//
+// $Id$
+//
+//
+
+#ifndef _rush_h_
+#define _rush_h_
+
+// Capacity of the fixed-size per-cluster tables in class Rush.
+#define RUSH_MAX_CLUSTERS 100
+
+// Small seedable random number generator with two words of state, plus the
+// sampling helpers the placement code needs.
+class RushRNG {
+public:
+  unsigned int RandomInt ();
+  double RandomDouble ();
+  void Seed (unsigned int a, unsigned int b);
+  int HyperGeometricWeighted (int nDraws, double yesWeighted,
+                              double totalWeighted, double weightOne);
+  // Writes nToDraw values chosen from 0..setSize-1 into vals.
+  void DrawKofN (int vals[], int nToDraw, int setSize);
+  RushRNG();
+private:
+  unsigned int state1, state2;
+};
+
+// Placement map: clusters of equally weighted servers are added one at a
+// time, and GetServersByKey() picks the servers for a given key.
+class Rush {
+public:
+  // servers[] must have room for nReplicas entries.
+  void GetServersByKey (int key, int nReplicas, int servers[]);
+  // Returns the number of clusters after the addition.
+  int AddCluster (int nServers, double weight);
+  // Read-only accessors; const so they can be used on a const Rush.
+  int Clusters () const {return (nClusters);}
+  int Servers () const {return (totalServers);}
+  Rush ();
+private:
+  // NOTE(review): rush.cc defines RushRNG::DrawKofN but no Rush::DrawKofN;
+  // this declaration looks unused.
+  int DrawKofN (int *servers, int n, int clusterSize, RushRNG *g);
+  int nClusters;
+  int totalServers;
+  // Indexed by cluster number, in the order the clusters were added.
+  int clusterSize[RUSH_MAX_CLUSTERS];
+  int serversInPrevious[RUSH_MAX_CLUSTERS];
+  double clusterWeight[RUSH_MAX_CLUSTERS];
+  double totalWeightBefore[RUSH_MAX_CLUSTERS];
+};
+
+#endif /* _rush_h_ */
diff --git a/branches/sage/cephmds2/osd/tp.cc b/branches/sage/cephmds2/osd/tp.cc
new file mode 100644
index 0000000000000..c8171895beef0
--- /dev/null
+++ b/branches/sage/cephmds2/osd/tp.cc
@@ -0,0 +1,80 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+// NOTE(review): the two header names below were missing in this copy of the
+// patch; these are the ones the code visibly needs (cout; usleep/sleep).
+#include <iostream>
+#include <unistd.h>
+
+using namespace std;
+
+#include "common/Mutex.h"
+#include "common/ThreadPool.h"
+// #include
+
+// A unit of work for the pool: just carries an int.
+class Op {
+  int i;
+
+public:
+
+  Op(int i)
+  {
+    this->i = i;
+  }
+
+  int get()
+  {
+    return i;
+  }
+};
+
+void foop(class TP *t, class Op *o);
+
+// Small driver that pushes 100 ops through a ThreadPool.
+class TP {
+public:
+
+  // Per-op work: print the calling thread and the op's value.
+  void foo(Op *o)
+  {
+    cout << "Thread "<< pthread_self() << ": " << o->get() << "\n";
+    usleep(1);
+
+    //  sched_yield();
+  }
+
+  // Queue 100 ops, give the pool a second to drain, then tear it down.
+  // Always returns 0.
+  int main(int argc, char **argv)
+  {
+    // NOTE(review): ThreadPool's template arguments appear to have been
+    // lost in this copy of the patch; restore them from ThreadPool.h.
+    ThreadPool *t = new ThreadPool(10, (void (*)(TP*, Op*))foop, this);
+
+    for(int i = 0; i < 100; i++) {
+      Op *o = new Op(i);
+      t->put_op(o);
+    }
+
+    sleep(1);
+
+    delete(t);
+
+    return 0;
+  }
+};
+
+// Free-function trampoline handed to the pool; forwards to TP::foo.
+void foop(class TP *t, class Op *o) {
+  t->foo(o);
+}
+
+// argv must be char** for this to be a valid main(); the driver's result
+// becomes the exit status.
+int main(int argc, char **argv) {
+  TP t;
+
+  return t.main(argc,argv);
+}
+
diff --git a/branches/sage/cephmds2/osdc/Blinker.h b/branches/sage/cephmds2/osdc/Blinker.h
new file mode 100644
index 0000000000000..231fe47fb1e31
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Blinker.h
@@ -0,0 +1,91 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __BLINKER_H
+#define __BLINKER_H
+
+// Interface sketch for a key/value table stored on objects.
+// NOTE(review): this header has no includes of its own; bufferptr, bufferlist,
+// inode_t, Context, Objecter and list must already be declared by the includer.
+class Blinker {
+
+ public:
+
+  // Base class for table operations; 'op' holds one of the opcodes below.
+  // Members are public so the Op* subclasses can name the opcodes and call the
+  // constructor (they were private by default, which cannot compile).
+  class Op {
+  public:
+    int op;
+    static const int LOOKUP = 1;
+    static const int INSERT = 2;
+    static const int REMOVE = 3;
+    static const int CLEAR = 4;
+    Op(int o) : op(o) {}
+  };
+
+  class OpLookup : public Op {
+  public:
+    bufferptr key;
+    OpLookup(bufferptr& k) : Op(Op::LOOKUP), key(k) {}
+  };
+
+  class OpInsert : public Op {
+  public:   // was missing, unlike the sibling Op* classes
+    bufferptr key;
+    bufferlist val;
+    OpInsert(bufferptr& k, bufferlist& v) : Op(Op::INSERT), key(k), val(v) {}
+  };
+
+  class OpRemove : public Op {
+  public:
+    bufferptr key;
+    OpRemove(bufferptr& k) : Op(Op::REMOVE), key(k) {}
+  };
+
+  class OpClear : public Op {
+  public:
+    OpClear() : Op(Op::CLEAR) {}
+  };
+
+
+
+private:
+  Objecter *objecter;
+
+  // in-flight operations.
+
+
+  // cache information about tree structure.
+
+
+
+public:
+  // public interface
+
+  // simple accessors
+  void lookup(inode_t& inode, bufferptr& key, bufferlist *pval, Context *onfinish);
+
+  // simple modifiers
+  void insert(inode_t& inode, bufferptr& key, bufferlist& val, Context *onack, Context *onsafe);
+  void remove(inode_t& inode, bufferptr& key, Context *onack, Context *onsafe);
+  void clear(inode_t& inode, Context *onack, Context *onsafe);
+
+  // these are dangerous: the table may be large.
+  // NOTE(review): the list element types were lost in extraction; bufferptr for
+  // keys and bufferlist for values are assumed from the signatures above.
+  void listkeys(inode_t& inode, list<bufferptr>* pkeys, Context *onfinish);
+  void listvals(inode_t& inode, list<bufferptr>* pkeys, list<bufferlist>* pvals, Context *onfinish);
+
+  // fetch *at least* key, but also anything else that is convenient.
+  // include lexical bounds for which this is a complete result.
+  //  (if *start and *end are empty, it's the entire table)
+  void prefetch(inode_t& inode, bufferptr& key,
+                list<bufferptr>* pkeys, list<bufferlist>* pvals,
+                bufferptr *start, bufferptr *end,
+                Context *onfinish);
+
+
+};
+
+#endif
diff --git a/branches/sage/cephmds2/osdc/Filer.cc b/branches/sage/cephmds2/osdc/Filer.cc
new file mode 100644
index 0000000000000..47094a3056836
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Filer.cc
@@ -0,0 +1,235 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#include <assert.h>   // header name lost in extraction; assumed from the use of assert() below
+
+#include "Filer.h"
+#include "osd/OSDMap.h"
+
+//#include "messages/MOSDRead.h"
+//#include "messages/MOSDReadReply.h"
+//#include "messages/MOSDWrite.h"
+//#include "messages/MOSDWriteReply.h"
+#include "messages/MOSDOp.h"
+#include "messages/MOSDOpReply.h"
+#include "messages/MOSDMap.h"
+
+#include "msg/Messenger.h"
+
+#include "include/Context.h"
+
+#include "config.h"
+#undef dout
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_filer) cout << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".filer "
+
+
+// Completion for one objecter->stat() issued by _probe(): 'size' is filled in
+// through the pointer passed to stat(), then handed to Filer::_probed().
+class Filer::C_Probe : public Context {
+public:
+  Filer *filer;
+  Probe *probe;
+  object_t oid;
+  off_t size;
+  C_Probe(Filer *f, Probe *p, object_t o) : filer(f), probe(p), oid(o), size(0) {}
+  void finish(int r) {
+    filer->_probed(probe, oid, size);
+  }
+};
+
+/*
+ * Start an asynchronous forward probe for the end of a file.
+ *  inode      - file to probe
+ *  start_from - offset at which to begin probing
+ *  end        - receives the discovered end offset, or -1 on error
+ *  onfinish   - completed (and deleted) by _probed() once *end is set
+ * Always returns 0; the result arrives via onfinish.
+ */
+int Filer::probe_fwd(inode_t& inode,
+                     off_t start_from,
+                     off_t *end,
+                     Context *onfinish)
+{
+  dout(10) << "probe_fwd " << hex << inode.ino << dec << " starting from " << start_from << endl;
+
+  Probe *probe = new Probe(inode, start_from, end, onfinish);
+
+  // period (bytes before we jump unto a new set of object(s))
+  off_t period = inode.layout.period();
+
+  // start with 1+ periods: one full period, plus whatever is needed to reach
+  // the next period boundary when start_from is not aligned.
+  probe->probing_len = period;
+  if (start_from % period)
+    probe->probing_len += period - (start_from % period);
+
+  _probe(probe);
+  return 0;
+}
+
+/*
+ * Issue a stat for every object backing [probe->from, probe->from + probing_len).
+ */
+void Filer::_probe(Probe *probe)
+{
+  dout(10) << "_probe " << hex << probe->inode.ino << dec << " " << probe->from << "~" << probe->probing_len << endl;
+
+  // file_to_extents() only appends to the list it is given, so drop the extents
+  // of the previous round first.  Without this every "keep probing" round
+  // re-stat'ed and re-analysed all objects from all earlier rounds.
+  probe->probing.clear();
+
+  // map range onto objects
+  file_to_extents(probe->inode, probe->from, probe->probing_len, probe->probing);
+
+  for (list<ObjectExtent>::iterator p = probe->probing.begin();
+       p != probe->probing.end();
+       p++) {
+    dout(10) << "_probe  probing " << p->oid << endl;
+    C_Probe *c = new C_Probe(this, probe, p->oid);
+    probe->ops[p->oid] = objecter->stat(p->oid, &c->size, c);
+  }
+}
+
+/*
+ * One stat completed.  Record the size; once no stats remain outstanding,
+ * walk the probed extents in order looking for the first object that is
+ * shorter than the probed range requires.  Either probe another period,
+ * or report the result through *probe->end and probe->onfinish.
+ */
+void Filer::_probed(Probe *probe, object_t oid, off_t size)
+{
+  dout(10) << "_probed " << probe->inode.ino << " object " << hex << oid << dec << " has size " << size << endl;
+
+  probe->known[oid] = size;
+  assert(probe->ops.count(oid));
+  probe->ops.erase(oid);
+
+  if (!probe->ops.empty())
+    return;  // waiting for more!
+
+  // analyze!
+  off_t end = 0;   // 0: end not found yet, <0: error, >0: end offset in the file
+  for (list<ObjectExtent>::iterator p = probe->probing.begin();
+       p != probe->probing.end();
+       p++) {
+    off_t shouldbe = p->length+p->start;
+    dout(10) << "_probed  " << probe->inode.ino << " object " << hex << p->oid << dec
+             << " should be " << shouldbe
+             << ", actual is " << probe->known[p->oid]
+             << endl;
+
+    if (probe->known[p->oid] < 0) { end = -1; break; } // error!
+
+    assert(probe->known[p->oid] <= shouldbe);
+    if (shouldbe == probe->known[p->oid]) continue;  // keep going
+
+    // aha, we found the end!
+    // calc offset into buffer_extent to get distance from probe->from.
+    // NOTE(review): the map's template arguments were lost in extraction;
+    // <size_t, size_t> is assumed (offset -> length) - confirm against ObjectExtent.
+    off_t oleft = probe->known[p->oid] - p->start;
+    for (map<size_t, size_t>::iterator i = p->buffer_extents.begin();
+         i != p->buffer_extents.end();
+         i++) {
+      if (oleft <= (off_t)i->second) {
+        end = probe->from + i->first + oleft;
+        dout(10) << "_probed  end is in buffer_extent " << i->first << "~" << i->second << " off " << oleft
+                 << ", from was " << probe->from << ", end is " << end
+                 << endl;
+        break;
+      }
+      oleft -= i->second;
+    }
+    break;
+  }
+
+  if (end == 0) {
+    // keep probing!
+    dout(10) << "_probed didn't find end, probing further" << endl;
+    off_t period = probe->inode.layout.object_size * probe->inode.layout.stripe_count;
+    probe->from += probe->probing_len;
+    probe->probing_len = period;
+    _probe(probe);
+    return;
+  }
+
+  if (end < 0) {
+    dout(10) << "_probed encountered an error while probing" << endl;
+    *probe->end = -1;
+  } else {
+    // hooray!
+    dout(10) << "_probed found end at " << end << endl;
+    *probe->end = end;
+  }
+
+  // done!  finish and clean up.
+  probe->onfinish->finish(end > 0 ? 0:-1);
+  delete probe->onfinish;
+  delete probe;
+}
+
+
+/*
+ * Map the file byte range offset~len onto per-object extents and append them
+ * to 'extents' (existing entries are left untouched).  Each extent records, in
+ * buffer_extents, which pieces of the caller's buffer it covers.
+ */
+void Filer::file_to_extents(inode_t inode,
+                            off_t offset, size_t len,
+                            list<ObjectExtent>& extents,
+                            objectrev_t rev)
+{
+  dout(10) << "file_to_extents " << offset << "~" << len
+           << " on " << hex << inode.ino << dec
+           << endl;
+
+  /* we want only one extent per object!
+   * this means that each extent we read may map into different bits of the
+   * final read buffer.. hence ObjectExtent.buffer_extents
+   */
+  map< object_t, ObjectExtent > object_extents;
+
+  assert(inode.layout.object_size >= inode.layout.stripe_size);
+  off_t stripes_per_object = inode.layout.object_size / inode.layout.stripe_size;
+  dout(20) << " stripes_per_object " << stripes_per_object << endl;
+
+  off_t cur = offset;
+  off_t left = len;
+  while (left > 0) {
+    // layout into objects
+    off_t blockno = cur / inode.layout.stripe_size;
+    off_t stripeno = blockno / inode.layout.stripe_count;
+    off_t stripepos = blockno % inode.layout.stripe_count;
+    off_t objectsetno = stripeno / stripes_per_object;
+    off_t objectno = objectsetno * inode.layout.stripe_count + stripepos;
+
+    // find oid, extent
+    ObjectExtent *ex = 0;
+    object_t oid( inode.ino, objectno );
+    if (object_extents.count(oid))
+      ex = &object_extents[oid];
+    else {
+      // first time we touch this object: create and label its extent
+      ex = &object_extents[oid];
+      ex->oid = oid;
+      ex->rev = rev;
+      ex->pgid = objecter->osdmap->object_to_pg( oid, inode.layout );
+    }
+
+    // map range into object
+    off_t block_start = (stripeno % stripes_per_object)*inode.layout.stripe_size;
+    off_t block_off = cur % inode.layout.stripe_size;
+    off_t max = inode.layout.stripe_size - block_off;
+
+    off_t x_offset = block_start + block_off;
+    off_t x_len;
+    if (left > max)
+      x_len = max;
+    else
+      x_len = left;
+
+    if (ex->start + (off_t)ex->length == x_offset) {
+      // add to extent
+      ex->length += x_len;
+    } else {
+      // new extent
+      assert(ex->length == 0);
+      assert(ex->start == 0);
+      ex->start = x_offset;
+      ex->length = x_len;
+    }
+    // remember which piece of the caller's buffer (relative to 'offset') this is
+    ex->buffer_extents[cur-offset] = x_len;
+
+    dout(15) << "file_to_extents  " << *ex << " in " << ex->pgid << endl;
+    //cout << "map: ino " << ino << " oid " << ex.oid << " osd " << ex.osd << " offset " << ex.offset << " len " << ex.len << " ... left " << left << endl;
+
+    left -= x_len;
+    cur += x_len;
+  }
+
+  // make final list
+  for (map<object_t, ObjectExtent>::iterator it = object_extents.begin();
+       it != object_extents.end();
+       it++) {
+    extents.push_back(it->second);
+  }
+}
diff --git a/branches/sage/cephmds2/osdc/Filer.h b/branches/sage/cephmds2/osdc/Filer.h
new file mode 100644
index 0000000000000..161bfec304531
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Filer.h
@@ -0,0 +1,158 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+#ifndef __FILER_H
+#define __FILER_H
+
+/*** Filer
+ *
+ * stripe file ranges onto objects.
+ * build list for the objecter or objectcacher.
+ *
+ * also, provide convenience methods that call objecter for you.
+ *
+ * "files" are identified by ino.
+ */ + +#include +#include +using namespace std; + +#include +#include +using namespace __gnu_cxx; + +#include "include/types.h" + +#include "osd/OSDMap.h" +#include "Objecter.h" + +class Context; +class Messenger; +class OSDMap; + + +/**** Filer interface ***/ + +class Filer { + Objecter *objecter; + + // probes + struct Probe { + inode_t inode; + off_t from; + off_t *end; + Context *onfinish; + + list probing; + off_t probing_len; + + map known; + map ops; + + Probe(inode_t &i, off_t f, off_t *e, Context *c) : + inode(i), from(f), end(e), onfinish(c), probing_len(0) {} + }; + + class C_Probe; + //friend class C_Probe; + + void _probe(Probe *p); + void _probed(Probe *p, object_t oid, off_t size); + + public: + Filer(Objecter *o) : objecter(o) {} + ~Filer() {} + + bool is_active() { + return objecter->is_active(); // || (oc && oc->is_active()); + } + + /*** async file interface ***/ + int read(inode_t& inode, + off_t offset, + size_t len, + bufferlist *bl, // ptr to data + Context *onfinish) { + Objecter::OSDRead *rd = new Objecter::OSDRead(bl); + file_to_extents(inode, offset, len, rd->extents); + return objecter->readx(rd, onfinish) > 0 ? 0:-1; + } + + int write(inode_t& inode, + off_t offset, + size_t len, + bufferlist& bl, + int flags, + Context *onack, + Context *oncommit, + objectrev_t rev=0) { + Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); + file_to_extents(inode, offset, len, wr->extents, rev); + return objecter->modifyx(wr, onack, oncommit) > 0 ? 0:-1; + } + + int zero(inode_t& inode, + off_t offset, + size_t len, + Context *onack, + Context *oncommit) { + Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_ZERO); + file_to_extents(inode, offset, len, z->extents); + return objecter->modifyx(z, onack, oncommit) > 0 ? 
0:-1; + } + + int remove(inode_t& inode, + off_t offset, + size_t len, + Context *onack, + Context *oncommit) { + Objecter::OSDModify *z = new Objecter::OSDModify(OSD_OP_DELETE); + file_to_extents(inode, offset, len, z->extents); + return objecter->modifyx(z, onack, oncommit) > 0 ? 0:-1; + } + + int probe_fwd(inode_t& inode, + off_t start_from, + off_t *end, + Context *onfinish); + + + /***** mapping *****/ + + /* map (ino, ono) to an object name + (to be used on any osd in the proper replica group) */ + /*object_t file_to_object(inodeno_t ino, + size_t _ono) { + __uint64_t ono = _ono; + assert(ino < (1ULL<& extents, + objectrev_t rev=0); + +}; + + + +#endif diff --git a/branches/sage/cephmds2/osdc/Journaler.cc b/branches/sage/cephmds2/osdc/Journaler.cc new file mode 100644 index 0000000000000..1bee1542bf906 --- /dev/null +++ b/branches/sage/cephmds2/osdc/Journaler.cc @@ -0,0 +1,601 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ *
+ */
+
+#include "Journaler.h"
+
+#include "include/Context.h"
+#include "common/Logger.h"
+#include "msg/Messenger.h"
+
+#include "config.h"
+#undef dout
+#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".journaler "
+#define derr(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cerr << g_clock.now() << " " << objecter->messenger->get_myaddr() << ".journaler "
+
+
+
+/*
+ * Start a blank journal: mark ourselves active and set every pointer to
+ * layout.period().
+ */
+void Journaler::reset()
+{
+  dout(1) << "reset to blank journal" << endl;
+  state = STATE_ACTIVE;
+  write_pos = flush_pos = ack_pos =
+    read_pos = requested_pos = received_pos =
+    expire_pos = trimming_pos = trimmed_pos = inode.layout.period();
+}
+
+
+/***************** HEADER *******************/
+
+// Pretty-print a journal header for debug output.
+ostream& operator<<(ostream& out, Journaler::Header &h)
+{
+  return out << "loghead(trim " << h.trimmed_pos
+             << ", expire " << h.expire_pos
+             << ", read " << h.read_pos
+             << ", write " << h.write_pos
+             << ")";
+}
+
+// Completion for the header read issued by recover().
+class Journaler::C_ReadHead : public Context {
+  Journaler *ls;
+public:
+  bufferlist bl;
+  C_ReadHead(Journaler *l) : ls(l) {}
+  void finish(int r) {
+    ls->_finish_read_head(r, bl);
+  }
+};
+
+// Completion for the end-of-log probe; 'end' is filled in by Filer::probe_fwd.
+class Journaler::C_ProbeEnd : public Context {
+  Journaler *ls;
+public:
+  off_t end;
+  C_ProbeEnd(Journaler *l) : ls(l), end(-1) {}
+  void finish(int r) {
+    ls->_finish_probe_end(r, end);
+  }
+};
+
+/*
+ * Begin recovering journal state: read the header, then probe for the real
+ * end of the log.  'onread' (may be null) is queued and completed when
+ * recovery finishes.  Calling this again while recovery is in progress only
+ * queues the extra waiter.
+ */
+void Journaler::recover(Context *onread)
+{
+  assert(state != STATE_ACTIVE);
+
+  if (onread)
+    waitfor_recover.push_back(onread);
+
+  if (state != STATE_UNDEF) {
+    dout(1) << "recover - already recoverying" << endl;
+    return;
+  }
+
+  dout(1) << "read_head" << endl;
+  state = STATE_READHEAD;
+  C_ReadHead *fin = new C_ReadHead(this);
+  filer.read(inode, 0, sizeof(Header), &fin->bl, fin);
+}
+
+/*
+ * Header read completed.  An empty read is taken to mean an empty log;
+ * otherwise adopt the header's pointers and probe forward from its write_pos.
+ */
+void Journaler::_finish_read_head(int r, bufferlist& bl)
+{
+  assert(state == STATE_READHEAD);
+
+  if (bl.length() == 0) {
+    dout(1) << "_finish_read_head r=" << r << " read 0 bytes, assuming empty log" << endl;
+    state = STATE_ACTIVE;
+    list<Context*> ls;
+    ls.swap(waitfor_recover);
+    finish_contexts(ls, 0);
+    return;
+  }
+
+  // unpack header
+  Header h;
+  assert(bl.length() == sizeof(h));
+  bl.copy(0, sizeof(h), (char*)&h);
+
+  write_pos = flush_pos = ack_pos = h.write_pos;
+  read_pos = requested_pos = received_pos = h.read_pos;
+  expire_pos = h.expire_pos;
+  trimmed_pos = trimming_pos = h.trimmed_pos;
+
+  // log only after adopting the header, so "from" shows the position we are
+  // really probing from (it used to print the stale pre-recovery write_pos).
+  dout(1) << "_finish_read_head " << h << ".  probing for end of log (from " << write_pos << ")..." << endl;
+
+  // probe the log
+  state = STATE_PROBING;
+  C_ProbeEnd *fin = new C_ProbeEnd(this);
+  filer.probe_fwd(inode, h.write_pos, &fin->end, fin);
+}
+
+/*
+ * End-of-log probe completed: adopt the discovered end as the write position
+ * and wake everyone waiting on recovery.
+ */
+void Journaler::_finish_probe_end(int r, off_t end)
+{
+  assert(r >= 0);
+  assert(end >= write_pos);
+  assert(state == STATE_PROBING);
+
+  dout(1) << "_finish_probe_end write_pos = " << end
+          << " (header had " << write_pos << "). recovered."
+          << endl;
+
+  write_pos = flush_pos = ack_pos = end;
+
+  // recovery is complete.  state was never advanced here, so being "active"
+  // depended on STATE_PROBING and STATE_ACTIVE sharing a value.
+  state = STATE_ACTIVE;
+
+  // done.
+  list<Context*> ls;
+  ls.swap(waitfor_recover);
+  finish_contexts(ls, 0);
+}
+
+
+// WRITING
+
+// Completion for a header write; carries a copy of the header that was written.
+class Journaler::C_WriteHead : public Context {
+public:
+  Journaler *ls;
+  Header h;
+  Context *oncommit;
+  C_WriteHead(Journaler *l, Header& h_, Context *c) : ls(l), h(h_), oncommit(c) {}
+  void finish(int r) {
+    ls->_finish_write_head(h, oncommit);
+  }
+};
+
+/*
+ * Write the current pointers into the header at offset 0.  The recorded
+ * write_pos is ack_pos, i.e. only what has been acknowledged.
+ * 'oncommit' (may be null) is completed once the header write commits.
+ */
+void Journaler::write_head(Context *oncommit)
+{
+  assert(state == STATE_ACTIVE);
+  last_written.trimmed_pos = trimmed_pos;
+  last_written.expire_pos = expire_pos;
+  last_written.read_pos = read_pos;
+  last_written.write_pos = ack_pos; //write_pos;
+  dout(10) << "write_head " << last_written << endl;
+
+  last_wrote_head = g_clock.now();
+
+  bufferlist bl;
+  bl.append((char*)&last_written, sizeof(last_written));
+  filer.write(inode, 0, bl.length(), bl, 0,
+              0, new C_WriteHead(this, last_written, oncommit));
+}
+
+/*
+ * Header write committed: remember it, notify the caller, and see whether the
+ * newly committed expire_pos lets us trim.
+ */
+void Journaler::_finish_write_head(Header &wrote, Context *oncommit)
+{
+  dout(10) << "_finish_write_head " << wrote << endl;
+  last_committed = wrote;
+  if (oncommit) {
+    oncommit->finish(0);
+    delete oncommit;
+  }
+
+  trim();  // trim?
+}
+
+
+/***************** WRITING *******************/
+
+// Completion for one flush write; 'start' is the offset the flush began at.
+class Journaler::C_Flush : public Context {
+  Journaler *ls;
+  off_t start;
+public:
+  C_Flush(Journaler *l, off_t s) : ls(l), start(s) {}
+  void finish(int r) { ls->_finish_flush(r, start); }
+};
+
+/*
+ * A flush that started at 'start' was acked.  Advance ack_pos to the start of
+ * the oldest flush still pending (or to flush_pos if none) and complete any
+ * waiters at or below it.
+ */
+void Journaler::_finish_flush(int r, off_t start)
+{
+  assert(r>=0);
+
+  assert(start >= ack_pos);
+  assert(start < flush_pos);
+  assert(pending_flush.count(start));
+
+  // calc latency?
+  if (logger) {
+    utime_t lat = g_clock.now();
+    lat -= pending_flush[start];
+    logger->finc("lsum", lat);
+    logger->inc("lnum");
+  }
+
+  pending_flush.erase(start);
+
+  // adjust ack_pos
+  if (pending_flush.empty())
+    ack_pos = flush_pos;
+  else
+    ack_pos = pending_flush.begin()->first;
+
+  dout(10) << "_finish_flush from " << start
+           << ", pending_flush now " << pending_flush
+           << ", write positions now " << write_pos << "/" << flush_pos << "/" << ack_pos
+           << endl;
+
+  // kick waiters <= ack_pos
+  while (!waitfor_flush.empty()) {
+    if (waitfor_flush.begin()->first > ack_pos) break;
+    finish_contexts(waitfor_flush.begin()->second);
+    waitfor_flush.erase(waitfor_flush.begin());
+  }
+}
+
+
+/*
+ * Buffer one entry (a size_t length prefix followed by the payload) at
+ * write_pos.  If 'onsync' is given the buffer is flushed right away and onsync
+ * is completed once the entry has been acked.  Returns the new write_pos.
+ */
+off_t Journaler::append_entry(bufferlist& bl, Context *onsync)
+{
+  size_t s = bl.length();
+
+  if (!g_conf.journaler_allow_split_entries) {
+    // will we span a stripe boundary?
+    int p = inode.layout.stripe_size;
+    if (write_pos / p != (write_pos + bl.length() + sizeof(s)) / p) {
+      // yes.
+      // move write_pos forward.
+      off_t owp = write_pos;
+      write_pos += p;
+      write_pos -= (write_pos % p);
+
+      // pad with zeros.
+      bufferptr bp(write_pos - owp);
+      bp.zero();
+      assert(bp.length() >= 4);
+      write_buf.push_back(bp);
+
+      // now flush.
+      flush();
+
+      dout(12) << "append_entry skipped " << (write_pos-owp) << " bytes to " << write_pos << " to avoid spanning stripe boundary" << endl;
+    }
+  }
+
+  dout(10) << "append_entry len " << bl.length() << " to " << write_pos << "~" << (bl.length() + sizeof(size_t)) << endl;
+
+  // append
+  write_buf.append((char*)&s, sizeof(s));
+  write_buf.append(bl);
+  write_pos += sizeof(s) + s;
+
+  // flush now?
+  if (onsync)
+    flush(onsync);
+
+  return write_pos;
+}
+
+
+/*
+ * Submit everything buffered in write_buf.  'onsync' (may be null) is
+ * completed once ack_pos has reached the current write_pos.
+ */
+void Journaler::flush(Context *onsync)
+{
+  if (write_pos == flush_pos) {
+    assert(write_buf.length() == 0);
+    dout(10) << "flush nothing to flush, write pointers at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl;
+
+    if (onsync) {
+      if (ack_pos == flush_pos) {
+        // everything submitted so far is already acked
+        onsync->finish(0);
+        delete onsync;
+      } else {
+        // earlier flushes are still in flight; completing onsync now would
+        // report unacked data as synced.  wait for ack_pos like any other waiter.
+        waitfor_flush[flush_pos].push_back(onsync);
+      }
+    }
+    return;
+  }
+
+  unsigned len = write_pos - flush_pos;
+  assert(len == write_buf.length());
+  dout(10) << "flush flushing " << flush_pos << "~" << len << endl;
+
+  // submit write for anything pending
+  filer.write(inode, flush_pos, len, write_buf, 0,
+              new C_Flush(this, flush_pos), 0);  // flush _start_ pos to _finish_flush
+  pending_flush[flush_pos] = g_clock.now();
+
+  // adjust pointers
+  flush_pos = write_pos;
+  write_buf.clear();
+
+  dout(10) << "flush write pointers now at " << write_pos << "/" << flush_pos << "/" << ack_pos << endl;
+
+  // queue waiter (at _new_ write_pos; will go when reached by ack_pos)
+  if (onsync)
+    waitfor_flush[write_pos].push_back(onsync);
+
+  // write head?  (at most once every 30 seconds)
+  if (last_wrote_head.sec() + 30 < g_clock.now().sec()) {
+    write_head();
+  }
+}
+
+
+
+/***************** READING *******************/
+
+
+// Completion for a read issued by _issue_read().
+class Journaler::C_Read : public Context {
+  Journaler *ls;
+public:
+  C_Read(Journaler *l) : ls(l) {}
+  void finish(int r) { ls->_finish_read(r); }
+};
+
+// Re-checks readability once a flush we were waiting on has been acked.
+class Journaler::C_RetryRead : public Context {
+  Journaler *ls;
+public:
+  C_RetryRead(Journaler *l) : ls(l) {}
+  void finish(int r) { ls->is_readable(); }  // this'll kickstart.
+};
+
+/*
+ * A read completed: move the received bytes into read_buf, and if that made
+ * the next entry available, satisfy a pending wait_for_readable() and/or
+ * read_entry().
+ */
+void Journaler::_finish_read(int r)
+{
+  assert(r>=0);
+
+  dout(10) << "_finish_read got " << received_pos << "~" << reading_buf.length() << endl;
+  received_pos += reading_buf.length();
+  read_buf.claim_append(reading_buf);
+  assert(received_pos <= requested_pos);
+  dout(10) << "_finish_read read_buf now " << read_pos << "~" << read_buf.length()
+           << ", read pointers " << read_pos << "/" << received_pos << "/" << requested_pos
+           << endl;
+
+  if (is_readable()) { // NOTE: this check may read more
+    // readable!
+    dout(10) << "_finish_read now readable" << endl;
+    if (on_readable) {
+      Context *f = on_readable;
+      on_readable = 0;
+      f->finish(0);
+      delete f;
+    }
+
+    if (read_bl) {
+      // (named 'ok' so it no longer shadows the parameter r)
+      bool ok = try_read_entry(*read_bl);
+      assert(ok);  // this should have worked.
+
+      // clear state
+      Context *f = on_read_finish;
+      on_read_finish = 0;
+      read_bl = 0;
+
+      // do callback
+      f->finish(0);
+      delete f;
+    }
+  }
+
+  // prefetch?
+  _prefetch();
+}
+
+/* NOTE: this could be slightly smarter... we could allow
+ * multiple reads to be in progress.  e.g., if we prefetch, but
+ * then discover we need even more for an especially large entry.
+ * i don't think that circumstance will arise particularly often.
+ */
+void Journaler::_issue_read(off_t len)
+{
+  if (_is_reading()) {
+    dout(10) << "_issue_read " << len << " waiting, already reading "
+             << received_pos << "~" << (requested_pos-received_pos) << endl;
+    return;
+  }
+  assert(requested_pos == received_pos);
+
+  // stuck at ack_pos?
+  assert(requested_pos <= ack_pos);
+  if (requested_pos == ack_pos) {
+    dout(10) << "_issue_read requested_pos = ack_pos = " << ack_pos << ", waiting" << endl;
+    assert(write_pos > requested_pos);
+    if (flush_pos == ack_pos)
+      flush();
+    assert(flush_pos > ack_pos);
+    waitfor_flush[flush_pos].push_back(new C_RetryRead(this));
+    return;
+  }
+
+  // don't read too much
+  if (requested_pos + len > ack_pos) {
+    len = ack_pos - requested_pos;
+    dout(10) << "_issue_read reading only up to ack_pos " << ack_pos << endl;
+  }
+
+  // go.
+  dout(10) << "_issue_read reading " << requested_pos << "~" << len
+           << ", read pointers " << read_pos << "/" << received_pos << "/" << (requested_pos+len)
+           << endl;
+
+  filer.read(inode, requested_pos, len, &reading_buf,
+             new C_Read(this));
+  requested_pos += len;
+}
+
+// Read ahead when prefetch_from or fewer requested bytes remain unconsumed.
+void Journaler::_prefetch()
+{
+  // prefetch?
+  off_t left = requested_pos - read_pos;
+  if (left <= prefetch_from &&      // should read more,
+      !_is_reading() &&             // and not reading anything right now
+      write_pos > requested_pos) {  // there's something more to read...
+    dout(10) << "_prefetch only " << left << " < " << prefetch_from
+             << ", prefetching " << endl;
+    _issue_read(fetch_len);
+  }
+}
+
+
+/*
+ * Read the next entry into *bl and complete 'onfinish'; immediately if the
+ * entry is already buffered, otherwise when _finish_read() has enough data.
+ * Only one read_entry() may be outstanding at a time.
+ */
+void Journaler::read_entry(bufferlist *bl, Context *onfinish)
+{
+  // only one read at a time!
+  assert(read_bl == 0);
+  assert(on_read_finish == 0);
+
+  if (is_readable()) {
+    dout(10) << "read_entry at " << read_pos << ", read_buf is "
+             << read_pos << "~" << read_buf.length()
+             << ", readable now" << endl;
+
+    // nice, just do it now.
+    bool r = try_read_entry(*bl);
+    assert(r);
+
+    // callback
+    onfinish->finish(0);
+    delete onfinish;
+  } else {
+    dout(10) << "read_entry at " << read_pos << ", read_buf is "
+             << read_pos << "~" << read_buf.length()
+             << ", not readable now" << endl;
+
+    bl->clear();
+
+    // set it up
+    read_bl = bl;
+    on_read_finish = onfinish;
+
+    // is_readable() will have already initiated a read (if it was possible)
+  }
+}
+
+
+/* is_readable()
+ *  return true if next entry is ready.
+ *  kickstart read as necessary.
+ */
+bool Journaler::is_readable()
+{
+  // anything to read?
+  if (read_pos == write_pos) return false;
+
+  // have enough for entry size?
+  size_t s = 0;
+  if (read_buf.length() >= sizeof(s))
+    read_buf.copy(0, sizeof(s), (char*)&s);
+
+  // entry and payload?
+  if (read_buf.length() >= sizeof(s) &&
+      read_buf.length() >= sizeof(s) + s)
+    return true;  // yep, next entry is ready.
+
+  // darn it!
+
+  // partial fragment at the end?
+  if (received_pos == write_pos) {
+    dout(10) << "is_readable() detected partial entry at tail, adjusting write_pos to " << read_pos << endl;
+    write_pos = flush_pos = ack_pos = read_pos;
+    assert(write_buf.length() == 0);
+
+    // truncate?
+    // FIXME: how much?
+
+    return false;
+  }
+
+  // start reading some more?
+  if (!_is_reading()) {
+    // if we know the entry size, make sure one fetch can cover the rest of it
+    if (s)
+      fetch_len = MAX(fetch_len, sizeof(s)+s-read_buf.length());
+    _issue_read(fetch_len);
+  }
+
+  return false;
+}
+
+
+/* try_read_entry(bl)
+ *  read entry into bl if it's ready.
+ *  otherwise, do nothing.  (well, we'll start fetching it for good measure.)
+ */
+bool Journaler::try_read_entry(bufferlist& bl)
+{
+  if (!is_readable()) {  // this may start a read.
+    dout(10) << "try_read_entry at " << read_pos << " not readable" << endl;
+    return false;
+  }
+
+  size_t s;
+  assert(read_buf.length() >= sizeof(s));
+  read_buf.copy(0, sizeof(s), (char*)&s);
+  assert(read_buf.length() >= sizeof(s) + s);
+
+  dout(10) << "try_read_entry at " << read_pos << " reading "
+           << read_pos << "~" << (sizeof(s)+s) << endl;
+
+  // do it: drop the length prefix, then move the payload into bl
+  assert(bl.length() == 0);
+  read_buf.splice(0, sizeof(s));
+  read_buf.splice(0, s, &bl);
+  read_pos += sizeof(s) + s;
+
+  // prefetch?
+  _prefetch();
+  return true;
+}
+
+/*
+ * Register 'onreadable' to be completed by _finish_read() once the next entry
+ * is available.  Must not be called when an entry is already readable, and
+ * only one waiter may be registered at a time.
+ */
+void Journaler::wait_for_readable(Context *onreadable)
+{
+  dout(10) << "wait_for_readable at " << read_pos << " onreadable " << onreadable << endl;
+  assert(!is_readable());
+  assert(on_readable == 0);
+  on_readable = onreadable;
+}
+
+
+
+
+/***************** TRIMMING *******************/
+
+
+// Completion for a trim (remove) request; 'to' is the offset trimmed through.
+class Journaler::C_Trim : public Context {
+  Journaler *ls;
+  off_t to;
+public:
+  C_Trim(Journaler *l, off_t t) : ls(l), to(t) {}
+  void finish(int r) {
+    ls->_trim_finish(r, to);
+  }
+};
+
+/*
+ * Remove whole periods that lie below the last *committed* expire_pos.
+ */
+void Journaler::trim()
+{
+  // round down to a period boundary
+  off_t trim_to = last_committed.expire_pos;
+  trim_to -= trim_to % inode.layout.period();
+  dout(10) << "trim last_commited head was " << last_committed
+           << ", can trim to " << trim_to
+           << endl;
+  if (trim_to == 0 || trim_to == trimming_pos) {
+    dout(10) << "trim already trimmed/trimming to "
+             << trimmed_pos << "/" << trimming_pos << endl;
+    return;
+  }
+
+  // trim
+  assert(trim_to <= write_pos);
+  assert(trim_to > trimming_pos);
+  dout(10) << "trim trimming to " << trim_to
+           << ", trimmed/trimming/expire are "
+           << trimmed_pos << "/" << trimming_pos << "/" << expire_pos
+           << endl;
+
+  filer.remove(inode, trimming_pos, trim_to-trimming_pos,
+               0, new C_Trim(this, trim_to));
+  trimming_pos = trim_to;
+}
+
+/*
+ * A trim request committed: advance trimmed_pos and complete any waiters at
+ * or below it.
+ */
+void Journaler::_trim_finish(int r, off_t to)
+{
+  dout(10) << "_trim_finish trimmed_pos was " << trimmed_pos
+           << ", trimmed/trimming/expire now "
+           << to << "/" << trimming_pos << "/" << expire_pos
+           << endl;
+  assert(r >= 0);
+
+  assert(to <= trimming_pos);
+  assert(to > trimmed_pos);
+  trimmed_pos = to;
+
+  // finishers?
+  while (!waitfor_trim.empty() &&
+         waitfor_trim.begin()->first <= trimmed_pos) {
+    finish_contexts(waitfor_trim.begin()->second, 0);
+    waitfor_trim.erase(waitfor_trim.begin());
+  }
+}
+
+
+// eof.
diff --git a/branches/sage/cephmds2/osdc/Journaler.h b/branches/sage/cephmds2/osdc/Journaler.h
new file mode 100644
index 0000000000000..0b8d7061330e8
--- /dev/null
+++ b/branches/sage/cephmds2/osdc/Journaler.h
@@ -0,0 +1,218 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+/* Journaler
+ *
+ * This class stripes a serial log over objects on the store.  Four logical pointers:
+ *
+ *  write_pos - where we're writing new entries
+ *   read_pos - where we're reading old entries
+ * expire_pos - what is deemed "old" by user
+ *   trimmed_pos - where we're expiring old items
+ *
+ *  trimmed_pos <= expire_pos <= read_pos <= write_pos.
+ *
+ * Often, read_pos <= write_pos (as with MDS log).  During recovery, write_pos is undefined
+ * until the end of the log is discovered.
+ *
+ * A "head" struct at the beginning of the log is used to store metadata at
+ * regular intervals.  The basic invariants include:
+ *
+ *   head.read_pos   <= read_pos   -- the head may "lag", since it's updated lazily.
+ *   head.write_pos  <= write_pos
+ *   head.expire_pos <= expire_pos
+ *   head.trimmed_pos   <= trimmed_pos
+ *
+ * More significantly,
+ *
+ *   head.expire_pos >= trimmed_pos -- this ensures we can find the "beginning" of the log
+ *                                  as last recorded, before it is trimmed.  trimming will
+ *                                  block until a sufficiently current expire_pos is committed.
+ * + * To recover log state, we simply start at the last write_pos in the head, and probe the + * object sequence sizes until we read the end. + * + * Head struct is stored in the first object. Actual journal starts after layout.period() bytes. + * + */ + +#ifndef __JOURNALER_H +#define __JOURNALER_H + +#include "Objecter.h" +#include "Filer.h" + +#include +#include + +class Context; +class Logger; + +class Journaler { + + // this goes at the head of the log "file". + struct Header { + off_t trimmed_pos; + off_t expire_pos; + off_t read_pos; + off_t write_pos; + Header() : trimmed_pos(0), expire_pos(0), read_pos(0), write_pos(0) {} + } last_written, last_committed; + + friend ostream& operator<<(ostream& out, Header &h); + + + // me + inode_t inode; + Objecter *objecter; + Filer filer; + + Logger *logger; + + // my state + static const int STATE_UNDEF = 0; + static const int STATE_READHEAD = 1; + static const int STATE_PROBING = 2; + static const int STATE_ACTIVE = 2; + + int state; + + // header + utime_t last_wrote_head; + void _finish_write_head(Header &wrote, Context *oncommit); + class C_WriteHead; + friend class C_WriteHead; + + list waitfor_recover; + void _finish_read_head(int r, bufferlist& bl); + void _finish_probe_end(int r, off_t end); + class C_ReadHead; + friend class C_ReadHead; + class C_ProbeEnd; + friend class C_ProbeEnd; + + + + // writer + off_t write_pos; // logical write position, where next entry will go + off_t flush_pos; // where we will flush. if write_pos>flush_pos, we're buffering writes. + off_t ack_pos; // what has been acked. + bufferlist write_buf; // write buffer. flush_pos + write_buf.length() == write_pos. + + std::map pending_flush; // start offsets and times for pending flushes + std::map > waitfor_flush; // when flushed through given offset + + void _finish_flush(int r, off_t start); + class C_Flush; + friend class C_Flush; + + // reader + off_t read_pos; // logical read position, where next entry starts. 
+ off_t requested_pos; // what we've requested from OSD. + off_t received_pos; // what we've received from OSD. + bufferlist read_buf; // read buffer. read_pos + read_buf.length() == prefetch_pos. + bufferlist reading_buf; // what i'm reading into + + off_t fetch_len; // how much to read at a time + off_t prefetch_from; // how far from end do we read next chunk + + // for read_entry() in-progress read + bufferlist *read_bl; + Context *on_read_finish; + // for wait_for_readable() + Context *on_readable; + + bool _is_reading() { + return requested_pos > received_pos; + } + void _finish_read(int r); // we just read some (read completion callback) + void _issue_read(off_t len); // read some more + void _prefetch(); // maybe read ahead + class C_Read; + friend class C_Read; + class C_RetryRead; + friend class C_RetryRead; + + // trimmer + off_t expire_pos; // what we're allowed to trim to + off_t trimming_pos; // what we've requested to trim through + off_t trimmed_pos; // what has been trimmed + map > waitfor_trim; + + void _trim_finish(int r, off_t to); + class C_Trim; + friend class C_Trim; + +public: + Journaler(inode_t& inode_, Objecter *obj, Logger *l, off_t fl=0, off_t pff=0) : + inode(inode_), objecter(obj), filer(objecter), logger(l), + state(STATE_UNDEF), + write_pos(0), flush_pos(0), ack_pos(0), + read_pos(0), requested_pos(0), received_pos(0), + fetch_len(fl), prefetch_from(pff), + read_bl(0), on_read_finish(0), on_readable(0), + expire_pos(0), trimming_pos(0), trimmed_pos(0) + { + // prefetch intelligently. + // (watch out, this is big if you use big objects or weird striping) + if (!fetch_len) + fetch_len = inode.layout.object_size*inode.layout.stripe_count; + if (!prefetch_from) + prefetch_from = fetch_len / 2; + } + + // me + //void open(Context *onopen); + //void claim(Context *onclaim, msg_addr_t from); + + /* reset + * NOTE: we assume the caller knows/has ensured that any objects + * in our sequence do not exist.. e.g. after a MKFS. 
this is _not_ + * an "erase" method. + */ + void reset(); + void recover(Context *onfinish); + void write_head(Context *onsave=0); + + bool is_active() { return state == STATE_ACTIVE; } + + off_t get_write_pos() const { return write_pos; } + off_t get_read_pos() const { return read_pos; } + off_t get_expire_pos() const { return expire_pos; } + off_t get_trimmed_pos() const { return trimmed_pos; } + + // write + off_t append_entry(bufferlist& bl, Context *onsync = 0); + void flush(Context *onsync = 0); + + // read + void set_read_pos(off_t p) { + assert(requested_pos == received_pos); // we can't cope w/ in-progress read right now. + assert(read_bl == 0); // ... + read_pos = requested_pos = received_pos = p; + read_buf.clear(); + } + bool is_readable(); + bool try_read_entry(bufferlist& bl); + void wait_for_readable(Context *onfinish); + void read_entry(bufferlist* bl, Context *onfinish); + + // trim + void set_expire_pos(off_t ep) { expire_pos = ep; } + void trim(); + //bool is_trimmable() { return trimming_pos < expire_pos; } + //void trim(off_t trim_to=0, Context *c=0); +}; + + +#endif diff --git a/branches/sage/cephmds2/osdc/ObjectCacher.cc b/branches/sage/cephmds2/osdc/ObjectCacher.cc new file mode 100644 index 0000000000000..e2520f595096d --- /dev/null +++ b/branches/sage/cephmds2/osdc/ObjectCacher.cc @@ -0,0 +1,1472 @@ + +#include "msg/Messenger.h" +#include "ObjectCacher.h" +#include "Objecter.h" + + + +/*** ObjectCacher::BufferHead ***/ + + +/*** ObjectCacher::Object ***/ + +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << oc->objecter->messenger->get_myaddr() << ".objectcacher.object(" << oid << ") " + + +ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *bh, off_t off) +{ + dout(20) << "split " << *bh << " at " << off << endl; + + // split off right + ObjectCacher::BufferHead *right = new BufferHead(this); + right->last_write_tid = bh->last_write_tid; + right->set_state(bh->get_state()); + + off_t 
newleftlen = off - bh->start(); + right->set_start( off ); + right->set_length( bh->length() - newleftlen ); + + // shorten left + oc->bh_stat_sub(bh); + bh->set_length( newleftlen ); + oc->bh_stat_add(bh); + + // add right + oc->bh_add(this, right); + + // split buffers too + bufferlist bl; + bl.claim(bh->bl); + if (bl.length()) { + assert(bl.length() == (bh->length() + right->length())); + right->bl.substr_of(bl, bh->length(), right->length()); + bh->bl.substr_of(bl, 0, bh->length()); + } + + // move read waiters + if (!bh->waitfor_read.empty()) { + map >::iterator o, p = bh->waitfor_read.end(); + p--; + while (p != bh->waitfor_read.begin()) { + if (p->first < right->start()) break; + dout(0) << "split moving waiters at byte " << p->first << " to right bh" << endl; + right->waitfor_read[p->first].swap( p->second ); + o = p; + p--; + bh->waitfor_read.erase(o); + } + } + + dout(20) << "split left is " << *bh << endl; + dout(20) << "split right is " << *right << endl; + return right; +} + + +void ObjectCacher::Object::merge_left(BufferHead *left, BufferHead *right) +{ + assert(left->end() == right->start()); + assert(left->get_state() == right->get_state()); + + dout(10) << "merge_left " << *left << " + " << *right << endl; + oc->bh_remove(this, right); + oc->bh_stat_sub(left); + left->set_length( left->length() + right->length()); + oc->bh_stat_add(left); + + // data + left->bl.claim_append(right->bl); + + // version + // note: this is sorta busted, but should only be used for dirty buffers + left->last_write_tid = MAX( left->last_write_tid, right->last_write_tid ); + left->last_write = MAX( left->last_write, right->last_write ); + + // waiters + for (map >::iterator p = right->waitfor_read.begin(); + p != right->waitfor_read.end(); + p++) + left->waitfor_read[p->first].splice( left->waitfor_read[p->first].begin(), + p->second ); + + // hose right + delete right; + + dout(10) << "merge_left result " << *left << endl; +} + +/* buggy possibly, but more importnatly, 
unnecessary. +void ObjectCacher::Object::merge_right(BufferHead *left, BufferHead *right) +{ + assert(left->end() == right->start()); + assert(left->get_state() == right->get_state()); + + dout(10) << "merge_right " << *left << " + " << *right << endl; + oc->bh_remove(this, left); + oc->bh_stat_sub(right); + data.erase(right->start()); + right->set_start( left->start() ); + data[right->start()] = right; + right->set_length( left->length() + right->length()); + oc->bh_stat_add(right); + + // data + bufferlist nbl; + nbl.claim(left->bl); + nbl.claim_append(right->bl); + right->bl.claim(nbl); + + // version + // note: this is sorta busted, but should only be used for dirty buffers + right->last_write_tid = MAX( left->last_write_tid, right->last_write_tid ); + + // waiters + map > old; + old.swap(right->waitfor_read); + + // take left's waiters + right->waitfor_read.swap(left->waitfor_read); + + // shift old waiters + for (map >::iterator p = old.begin(); + p != old.end(); + p++) + right->waitfor_read[p->first + left->length()].swap( p->second ); + + // hose left + delete left; + + dout(10) << "merge_right result " << *right << endl; +} +*/ + +/* + * map a range of bytes into buffer_heads. + * - create missing buffer_heads as necessary. + */ +int ObjectCacher::Object::map_read(Objecter::OSDRead *rd, + map& hits, + map& missing, + map& rx) +{ + for (list::iterator ex_it = rd->extents.begin(); + ex_it != rd->extents.end(); + ex_it++) { + + if (ex_it->oid != oid) continue; + + dout(10) << "map_read " << ex_it->oid + << " " << ex_it->start << "~" << ex_it->length << endl; + + map::iterator p = data.lower_bound(ex_it->start); + // p->first >= start + + off_t cur = ex_it->start; + off_t left = ex_it->length; + + if (p != data.begin() && + (p == data.end() || p->first > cur)) { + p--; // might overlap! + if (p->first + p->second->length() <= cur) + p++; // doesn't overlap. + } + + while (left > 0) { + // at end? + if (p == data.end()) { + // rest is a miss. 
+ BufferHead *n = new BufferHead(this); + n->set_start( cur ); + n->set_length( left ); + oc->bh_add(this, n); + missing[cur] = n; + dout(20) << "map_read miss " << left << " left, " << *n << endl; + cur += left; + left -= left; + assert(left == 0); + assert(cur == ex_it->start + (off_t)ex_it->length); + break; // no more. + } + + if (p->first <= cur) { + // have it (or part of it) + BufferHead *e = p->second; + + if (e->is_clean() || + e->is_dirty() || + e->is_tx()) { + hits[cur] = e; // readable! + dout(20) << "map_read hit " << *e << endl; + } + else if (e->is_rx()) { + rx[cur] = e; // missing, not readable. + dout(20) << "map_read rx " << *e << endl; + } + else assert(0); + + off_t lenfromcur = MIN(e->end() - cur, left); + cur += lenfromcur; + left -= lenfromcur; + p++; + continue; // more? + + } else if (p->first > cur) { + // gap.. miss + off_t next = p->first; + BufferHead *n = new BufferHead(this); + n->set_start( cur ); + n->set_length( MIN(next - cur, left) ); + oc->bh_add(this,n); + missing[cur] = n; + cur += MIN(left, n->length()); + left -= MIN(left, n->length()); + dout(20) << "map_read gap " << *n << endl; + continue; // more? + } + else + assert(0); + } + } + return(0); +} + +/* + * map a range of extents on an object's buffer cache. + * - combine any bh's we're writing into one + * - break up bufferheads that don't fall completely within the range + * //no! - return a bh that includes the write. may also include other dirty data to left and/or right. 
+ */ +ObjectCacher::BufferHead *ObjectCacher::Object::map_write(Objecter::OSDWrite *wr) +{ + BufferHead *final = 0; + + for (list::iterator ex_it = wr->extents.begin(); + ex_it != wr->extents.end(); + ex_it++) { + + if (ex_it->oid != oid) continue; + + dout(10) << "map_write oex " << ex_it->oid + << " " << ex_it->start << "~" << ex_it->length << endl; + + map::iterator p = data.lower_bound(ex_it->start); + // p->first >= start + + off_t cur = ex_it->start; + off_t left = ex_it->length; + + if (p != data.begin() && + (p == data.end() || p->first > cur)) { + p--; // might overlap or butt up! + + /*// dirty and butts up? + if (p->first + p->second->length() == cur && + p->second->is_dirty()) { + dout(10) << "map_write will append to tail of " << *p->second << endl; + final = p->second; + } + */ + if (p->first + p->second->length() <= cur) + p++; // doesn't overlap. + } + + while (left > 0) { + off_t max = left; + + // at end ? + if (p == data.end()) { + if (final == NULL) { + final = new BufferHead(this); + final->set_start( cur ); + final->set_length( max ); + oc->bh_add(this, final); + dout(10) << "map_write adding trailing bh " << *final << endl; + } else { + final->set_length( final->length() + max ); + } + left -= max; + cur += max; + continue; + } + + dout(10) << "p is " << *p->second << endl; + + if (p->first <= cur) { + BufferHead *bh = p->second; + dout(10) << "map_write bh " << *bh << " intersected" << endl; + + /*if (bh->is_dirty()) { + // already dirty, let's use it. + final = bh; + } else { + */ + if (p->first < cur) { + assert(final == 0); + if (cur + max >= p->first + p->second->length()) { + // we want right bit (one splice) + final = split(bh, cur); // just split it, take right half. + p++; + assert(p->second == final); + } else { + // we want middle bit (two splices) + final = split(bh, cur); + p++; + assert(p->second == final); + split(final, cur+max); + } + } else if (p->first == cur) { + /*if (bh->is_dirty()) { + // already dirty, use it. 
+ } + else*/ + if (p->second->length() <= max) { + // whole bufferhead, piece of cake. + } else { + // we want left bit (one splice) + split(bh, cur + max); // just split + } + if (final) + merge_left(final,bh); + else + final = bh; + } + + // keep going. + off_t lenfromcur = final->end() - cur; + cur += lenfromcur; + left -= lenfromcur; + p++; + continue; + } else { + // gap! + off_t next = p->first; + off_t glen = MIN(next - cur, max); + dout(10) << "map_write gap " << cur << "~" << glen << endl; + if (final) { + final->set_length( final->length() + glen ); + } else { + final = new BufferHead(this); + final->set_start( cur ); + final->set_length( glen ); + oc->bh_add(this, final); + } + + cur += glen; + left -= glen; + continue; // more? + } + } + } + + // set versoin + assert(final); + dout(10) << "map_write final is " << *final << endl; + + return final; +} + + + +/*** ObjectCacher ***/ + +#undef dout +#define dout(l) if (l<=g_conf.debug || l<=g_conf.debug_objectcacher) cout << objecter->messenger->get_myaddr() << ".objectcacher " + + +/* private */ + +void ObjectCacher::bh_read(BufferHead *bh) +{ + dout(7) << "bh_read on " << *bh << endl; + + mark_rx(bh); + + // finisher + C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length()); + + // go + objecter->read(bh->ob->get_oid(), bh->start(), bh->length(), &onfinish->bl, + onfinish); +} + +void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl) +{ + //lock.Lock(); + dout(7) << "bh_read_finish " + << oid + << " " << start << "~" << length + << endl; + + if (objects.count(oid) == 0) { + dout(7) << "bh_read_finish no object cache" << endl; + } else { + Object *ob = objects[oid]; + + // apply to bh's! 
+ off_t opos = start; + map::iterator p = ob->data.lower_bound(opos); + + while (p != ob->data.end() && + opos < start+(off_t)length) { + BufferHead *bh = p->second; + + if (bh->start() > opos) { + dout(1) << "weirdness: gap when applying read results, " + << opos << "~" << bh->start() - opos + << endl; + opos = bh->start(); + continue; + } + + if (!bh->is_rx()) { + dout(10) << "bh_read_finish skipping non-rx " << *bh << endl; + opos = bh->end(); + p++; + continue; + } + + assert(opos >= bh->start()); + assert(bh->start() == opos); // we don't merge rx bh's... yet! + assert(bh->length() <= start+(off_t)length-opos); + + bh->bl.substr_of(bl, + opos-bh->start(), + bh->length()); + mark_clean(bh); + dout(10) << "bh_read_finish read " << *bh << endl; + + opos = bh->end(); + p++; + + // finishers? + // called with lock held. + list ls; + for (map >::iterator p = bh->waitfor_read.begin(); + p != bh->waitfor_read.end(); + p++) + ls.splice(ls.end(), p->second); + bh->waitfor_read.clear(); + finish_contexts(ls); + } + } + //lock.Unlock(); +} + + +void ObjectCacher::bh_write(BufferHead *bh) +{ + dout(7) << "bh_write " << *bh << endl; + + // finishers + C_WriteAck *onack = new C_WriteAck(this, bh->ob->get_oid(), bh->start(), bh->length()); + C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length()); + + // go + tid_t tid = objecter->write(bh->ob->get_oid(), bh->start(), bh->length(), bh->bl, + onack, oncommit); + + // set bh last_write_tid + onack->tid = tid; + oncommit->tid = tid; + bh->ob->last_write_tid = tid; + bh->last_write_tid = tid; + + mark_tx(bh); +} + +void ObjectCacher::lock_ack(list& oids, tid_t tid) +{ + for (list::iterator i = oids.begin(); + i != oids.end(); + i++) { + object_t oid = *i; + + if (objects.count(oid) == 0) { + dout(7) << "lock_ack no object cache" << endl; + assert(0); + } + + Object *ob = objects[oid]; + + list ls; + + assert(tid <= ob->last_write_tid); + if (ob->last_write_tid == tid) { + dout(10) << 
"lock_ack " << *ob + << " tid " << tid << endl; + + switch (ob->lock_state) { + case Object::LOCK_RDUNLOCKING: + case Object::LOCK_WRUNLOCKING: + ob->lock_state = Object::LOCK_NONE; + break; + case Object::LOCK_RDLOCKING: + case Object::LOCK_DOWNGRADING: + ob->lock_state = Object::LOCK_RDLOCK; + ls.splice(ls.begin(), ob->waitfor_rd); + break; + case Object::LOCK_UPGRADING: + case Object::LOCK_WRLOCKING: + ob->lock_state = Object::LOCK_WRLOCK; + ls.splice(ls.begin(), ob->waitfor_wr); + ls.splice(ls.begin(), ob->waitfor_rd); + break; + + default: + assert(0); + } + + ob->last_ack_tid = tid; + + if (ob->can_close()) + close_object(ob); + } else { + dout(10) << "lock_ack " << *ob + << " tid " << tid << " obsolete" << endl; + } + + // waiters? + if (ob->waitfor_ack.count(tid)) { + ls.splice(ls.end(), ob->waitfor_ack[tid]); + ob->waitfor_ack.erase(tid); + } + + finish_contexts(ls); + + } +} + +void ObjectCacher::bh_write_ack(object_t oid, off_t start, size_t length, tid_t tid) +{ + //lock.Lock(); + + dout(7) << "bh_write_ack " + << oid + << " tid " << tid + << " " << start << "~" << length + << endl; + if (objects.count(oid) == 0) { + dout(7) << "bh_write_ack no object cache" << endl; + assert(0); + } else { + Object *ob = objects[oid]; + + // apply to bh's! + for (map::iterator p = ob->data.lower_bound(start); + p != ob->data.end(); + p++) { + BufferHead *bh = p->second; + + if (bh->start() > start+(off_t)length) break; + + if (bh->start() < start && + bh->end() > start+(off_t)length) { + dout(20) << "bh_write_ack skipping " << *bh << endl; + continue; + } + + // make sure bh is tx + if (!bh->is_tx()) { + dout(10) << "bh_write_ack skipping non-tx " << *bh << endl; + continue; + } + + // make sure bh tid matches + if (bh->last_write_tid != tid) { + assert(bh->last_write_tid > tid); + dout(10) << "bh_write_ack newer tid on " << *bh << endl; + continue; + } + + // ok! mark bh clean. 
+ mark_clean(bh); + dout(10) << "bh_write_ack clean " << *bh << endl; + } + + // update object last_ack. + assert(ob->last_ack_tid < tid); + ob->last_ack_tid = tid; + + // waiters? + if (ob->waitfor_ack.count(tid)) { + list ls; + ls.splice(ls.begin(), ob->waitfor_ack[tid]); + ob->waitfor_ack.erase(tid); + finish_contexts(ls); + } + } + //lock.Unlock(); +} + +void ObjectCacher::bh_write_commit(object_t oid, off_t start, size_t length, tid_t tid) +{ + //lock.Lock(); + + // update object last_commit + dout(7) << "bh_write_commit " + << oid + << " tid " << tid + << " " << start << "~" << length + << endl; + if (objects.count(oid) == 0) { + dout(7) << "bh_write_commit no object cache" << endl; + //assert(0); + } else { + Object *ob = objects[oid]; + + // update last_commit. + ob->last_commit_tid = tid; + + // waiters? + if (ob->waitfor_commit.count(tid)) { + list ls; + ls.splice(ls.begin(), ob->waitfor_commit[tid]); + ob->waitfor_commit.erase(tid); + finish_contexts(ls); + } + } + + // lock.Unlock(); +} + + +void ObjectCacher::flush(off_t amount) +{ + utime_t cutoff = g_clock.now(); + //cutoff.sec_ref() -= g_conf.client_oc_max_dirty_age; + + dout(10) << "flush " << amount << endl; + + off_t did = 0; + while (amount == 0 || did < amount) { + BufferHead *bh = (BufferHead*) lru_dirty.lru_get_next_expire(); + if (!bh) break; + if (bh->last_write > cutoff) break; + + did += bh->length(); + bh_write(bh); + } +} + + +void ObjectCacher::trim(off_t max) +{ + if (max < 0) + max = g_conf.client_oc_size; + + dout(10) << "trim start: max " << max + << " clean " << get_stat_clean() + << endl; + + while (get_stat_clean() > max) { + BufferHead *bh = (BufferHead*) lru_rest.lru_expire(); + if (!bh) break; + + dout(10) << "trim trimming " << *bh << endl; + assert(bh->is_clean()); + + Object *ob = bh->ob; + bh_remove(ob, bh); + delete bh; + + if (ob->can_close()) { + dout(10) << "trim trimming " << *ob << endl; + close_object(ob); + } + } + + dout(10) << "trim finish: max " << max + << " 
clean " << get_stat_clean() + << endl; +} + + + +/* public */ + +/* + * returns # bytes read (if in cache). onfinish is untouched (caller must delete it) + * returns 0 if doing async read + */ +int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) +{ + bool success = true; + list hit_ls; + map stripe_map; // final buffer offset -> substring + + for (list::iterator ex_it = rd->extents.begin(); + ex_it != rd->extents.end(); + ex_it++) { + dout(10) << "readx " << *ex_it << endl; + + // get Object cache + Object *o = get_object(ex_it->oid, ino); + + // map extent into bufferheads + map hits, missing, rx; + o->map_read(rd, hits, missing, rx); + + if (!missing.empty() || !rx.empty()) { + // read missing + for (map::iterator bh_it = missing.begin(); + bh_it != missing.end(); + bh_it++) { + bh_read(bh_it->second); + if (success) { + dout(10) << "readx missed, waiting on " << *bh_it->second + << " off " << bh_it->first << endl; + success = false; + bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); + } + } + + // bump rx + for (map::iterator bh_it = rx.begin(); + bh_it != rx.end(); + bh_it++) { + touch_bh(bh_it->second); // bump in lru, so we don't lose it. + if (success) { + dout(10) << "readx missed, waiting on " << *bh_it->second + << " off " << bh_it->first << endl; + success = false; + bh_it->second->waitfor_read[bh_it->first].push_back( new C_RetryRead(this, rd, ino, onfinish) ); + } + } + } else { + assert(!hits.empty()); + + // make a plain list + for (map::iterator bh_it = hits.begin(); + bh_it != hits.end(); + bh_it++) { + dout(10) << "readx hit bh " << *bh_it->second << endl; + hit_ls.push_back(bh_it->second); + } + + // create reverse map of buffer offset -> object for the eventual result. 
+ // this is over a single ObjectExtent, so we know that + // - the bh's are contiguous + // - the buffer frags need not be (and almost certainly aren't) + off_t opos = ex_it->start; + map::iterator bh_it = hits.begin(); + assert(bh_it->second->start() <= opos); + size_t bhoff = opos - bh_it->second->start(); + map::iterator f_it = ex_it->buffer_extents.begin(); + size_t foff = 0; + while (1) { + BufferHead *bh = bh_it->second; + assert(opos == (off_t)(bh->start() + bhoff)); + + dout(10) << "readx rmap opos " << opos + << ": " << *bh << " +" << bhoff + << " frag " << f_it->first << "~" << f_it->second << " +" << foff + << endl; + + size_t len = MIN(f_it->second - foff, + bh->length() - bhoff); + stripe_map[f_it->first].substr_of(bh->bl, + opos - bh->start(), + len); + opos += len; + bhoff += len; + foff += len; + if (opos == bh->end()) { + bh_it++; + bhoff = 0; + } + if (foff == f_it->second) { + f_it++; + foff = 0; + } + if (bh_it == hits.end()) break; + if (f_it == ex_it->buffer_extents.end()) break; + } + assert(f_it == ex_it->buffer_extents.end()); + assert(opos == ex_it->start + (off_t)ex_it->length); + } + } + + // bump hits in lru + for (list::iterator bhit = hit_ls.begin(); + bhit != hit_ls.end(); + bhit++) + touch_bh(*bhit); + + if (!success) return 0; // wait! + + // no misses... success! do the read. + assert(!hit_ls.empty()); + dout(10) << "readx has all buffers" << endl; + + // ok, assemble into result buffer. 
+ rd->bl->clear(); + size_t pos = 0; + for (map::iterator i = stripe_map.begin(); + i != stripe_map.end(); + i++) { + assert(pos == i->first); + dout(10) << "readx adding buffer len " << i->second.length() << " at " << pos << endl; + pos += i->second.length(); + rd->bl->claim_append(i->second); + } + dout(10) << "readx result is " << rd->bl->length() << endl; + + trim(); + + return pos; +} + + +int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino) +{ + utime_t now = g_clock.now(); + + for (list::iterator ex_it = wr->extents.begin(); + ex_it != wr->extents.end(); + ex_it++) { + // get object cache + Object *o = get_object(ex_it->oid, ino); + + // map it all into a single bufferhead. + BufferHead *bh = o->map_write(wr); + + // adjust buffer pointers (ie "copy" data into my cache) + // this is over a single ObjectExtent, so we know that + // - there is one contiguous bh + // - the buffer frags need not be (and almost certainly aren't) + // note: i assume striping is monotonic... no jumps backwards, ever! + off_t opos = ex_it->start; + for (map::iterator f_it = ex_it->buffer_extents.begin(); + f_it != ex_it->buffer_extents.end(); + f_it++) { + dout(10) << "writex writing " << f_it->first << "~" << f_it->second << " into " << *bh << " at " << opos << endl; + size_t bhoff = bh->start() - opos; + assert(f_it->second <= bh->length() - bhoff); + + bufferlist frag; + frag.substr_of(wr->bl, + f_it->first, f_it->second); + + bh->bl.claim_append(frag); + opos += f_it->second; + } + + // it's dirty. + mark_dirty(bh); + touch_bh(bh); + bh->last_write = now; + + // recombine with left? + map::iterator p = o->data.find(bh->start()); + if (p != o->data.begin()) { + p--; + if (p->second->is_dirty()) { + o->merge_left(p->second,bh); + bh = p->second; + } + } + // right? + p = o->data.find(bh->start()); + p++; + if (p != o->data.end() && + p->second->is_dirty()) + o->merge_left(p->second,bh); + } + + delete wr; + + trim(); + return 0; +} + + +// blocking wait for write. 
+void ObjectCacher::wait_for_write(size_t len, Mutex& lock) +{ + while (get_stat_dirty() > g_conf.client_oc_max_dirty) { + dout(10) << "wait_for_write waiting" << endl; + flusher_cond.Signal(); + stat_waiter++; + stat_cond.Wait(lock); + stat_waiter--; + dout(10) << "wait_for_write woke up" << endl; + } +} + +void ObjectCacher::flusher_entry() +{ + dout(10) << "flusher start" << endl; + lock.Lock(); + while (!flusher_stop) { + while (!flusher_stop) { + off_t all = get_stat_tx() + get_stat_rx() + get_stat_clean() + get_stat_dirty(); + dout(11) << "flusher " + << all << " / " << g_conf.client_oc_size << ": " + << get_stat_tx() << " tx, " + << get_stat_rx() << " rx, " + << get_stat_clean() << " clean, " + << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty" + << endl; + if (get_stat_dirty() > g_conf.client_oc_max_dirty) { + // flush some dirty pages + dout(10) << "flusher " + << get_stat_dirty() << " / " << g_conf.client_oc_max_dirty << " dirty," + << " flushing some dirty bhs" << endl; + flush(get_stat_dirty() - g_conf.client_oc_max_dirty); + } + else { + // check tail of lru for old dirty items + utime_t cutoff = g_clock.now(); + cutoff.sec_ref()--; + BufferHead *bh = 0; + while ((bh = (BufferHead*)lru_dirty.lru_get_next_expire()) != 0 && + bh->last_write < cutoff) { + dout(10) << "flusher flushing aged dirty bh " << *bh << endl; + bh_write(bh); + } + break; + } + } + if (flusher_stop) break; + flusher_cond.WaitInterval(lock, utime_t(1,0)); + } + lock.Unlock(); + dout(10) << "flusher finish" << endl; +} + + + +// blocking. atomic+sync. +int ObjectCacher::atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock) +{ + dout(10) << "atomic_sync_readx " << rd + << " in " << ino + << endl; + + if (rd->extents.size() == 1) { + // single object. + // just write synchronously. 
+ Cond cond; + bool done = false; + objecter->readx(rd, new C_SafeCond(&lock, &cond, &done)); + + // block + while (!done) cond.Wait(lock); + } else { + // spans multiple objects, or is big. + + // sort by object... + map by_oid; + for (list::iterator ex_it = rd->extents.begin(); + ex_it != rd->extents.end(); + ex_it++) + by_oid[ex_it->oid] = *ex_it; + + // lock + for (map::iterator i = by_oid.begin(); + i != by_oid.end(); + i++) { + Object *o = get_object(i->first, ino); + rdlock(o); + } + + // readx will hose rd + list extents = rd->extents; + + // do the read, into our cache + Cond cond; + bool done = false; + readx(rd, ino, new C_SafeCond(&lock, &cond, &done)); + + // block + while (!done) cond.Wait(lock); + + // release the locks + for (list::iterator ex_it = extents.begin(); + ex_it != extents.end(); + ex_it++) { + assert(objects.count(ex_it->oid)); + Object *o = objects[ex_it->oid]; + rdunlock(o); + } + } + + return 0; +} + +int ObjectCacher::atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock) +{ + dout(10) << "atomic_sync_writex " << wr + << " in " << ino + << endl; + + if (wr->extents.size() == 1 && + wr->extents.front().length <= g_conf.client_oc_max_sync_write) { + // single object. + + // make sure we aren't already locking/locked... + object_t oid = wr->extents.front().oid; + Object *o = 0; + if (objects.count(oid)) o = get_object(oid, ino); + if (!o || + (o->lock_state != Object::LOCK_WRLOCK && + o->lock_state != Object::LOCK_WRLOCKING && + o->lock_state != Object::LOCK_UPGRADING)) { + // just write synchronously. + dout(10) << "atomic_sync_writex " << wr + << " in " << ino + << " doing sync write" + << endl; + + Cond cond; + bool done = false; + objecter->modifyx(wr, new C_SafeCond(&lock, &cond, &done), 0); + + // block + while (!done) cond.Wait(lock); + return 0; + } + } + + // spans multiple objects, or is big. + // sort by object... 
+ map by_oid; + for (list::iterator ex_it = wr->extents.begin(); + ex_it != wr->extents.end(); + ex_it++) + by_oid[ex_it->oid] = *ex_it; + + // wrlock + for (map::iterator i = by_oid.begin(); + i != by_oid.end(); + i++) { + Object *o = get_object(i->first, ino); + wrlock(o); + } + + // writex will hose wr + list extents = wr->extents; + + // do the write, into our cache + writex(wr, ino); + + // flush + // ...and release the locks? + for (list::iterator ex_it = extents.begin(); + ex_it != extents.end(); + ex_it++) { + assert(objects.count(ex_it->oid)); + Object *o = objects[ex_it->oid]; + + wrunlock(o); + } + + return 0; +} + + + +// locking ----------------------------- + +void ObjectCacher::rdlock(Object *o) +{ + // lock? + if (o->lock_state == Object::LOCK_NONE || + o->lock_state == Object::LOCK_RDUNLOCKING || + o->lock_state == Object::LOCK_WRUNLOCKING) { + dout(10) << "rdlock rdlock " << *o << endl; + + o->lock_state = Object::LOCK_RDLOCKING; + + C_LockAck *ack = new C_LockAck(this, o->get_oid()); + C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); + + commit->tid = + ack->tid = + o->last_write_tid = + objecter->lock(OSD_OP_RDLOCK, o->get_oid(), ack, commit); + } + + // stake our claim. + o->rdlock_ref++; + + // wait? + if (o->lock_state == Object::LOCK_RDLOCKING || + o->lock_state == Object::LOCK_WRLOCKING) { + dout(10) << "rdlock waiting for rdlock|wrlock on " << *o << endl; + Cond cond; + bool done = false; + o->waitfor_rd.push_back(new C_SafeCond(&lock, &cond, &done)); + while (!done) cond.Wait(lock); + } + assert(o->lock_state == Object::LOCK_RDLOCK || + o->lock_state == Object::LOCK_WRLOCK || + o->lock_state == Object::LOCK_UPGRADING || + o->lock_state == Object::LOCK_DOWNGRADING); +} + +void ObjectCacher::wrlock(Object *o) +{ + // lock? 
+ if (o->lock_state != Object::LOCK_WRLOCK && + o->lock_state != Object::LOCK_WRLOCKING && + o->lock_state != Object::LOCK_UPGRADING) { + dout(10) << "wrlock wrlock " << *o << endl; + + int op = 0; + if (o->lock_state == Object::LOCK_RDLOCK) { + o->lock_state = Object::LOCK_UPGRADING; + op = OSD_OP_UPLOCK; + } else { + o->lock_state = Object::LOCK_WRLOCKING; + op = OSD_OP_WRLOCK; + } + + C_LockAck *ack = new C_LockAck(this, o->get_oid()); + C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); + + commit->tid = + ack->tid = + o->last_write_tid = + objecter->lock(op, o->get_oid(), ack, commit); + } + + // stake our claim. + o->wrlock_ref++; + + // wait? + if (o->lock_state == Object::LOCK_WRLOCKING || + o->lock_state == Object::LOCK_UPGRADING) { + dout(10) << "wrlock waiting for wrlock on " << *o << endl; + Cond cond; + bool done = false; + o->waitfor_wr.push_back(new C_SafeCond(&lock, &cond, &done)); + while (!done) cond.Wait(lock); + } + assert(o->lock_state == Object::LOCK_WRLOCK); +} + + +void ObjectCacher::rdunlock(Object *o) +{ + dout(10) << "rdunlock " << *o << endl; + assert(o->lock_state == Object::LOCK_RDLOCK || + o->lock_state == Object::LOCK_WRLOCK || + o->lock_state == Object::LOCK_UPGRADING || + o->lock_state == Object::LOCK_DOWNGRADING); + + assert(o->rdlock_ref > 0); + o->rdlock_ref--; + if (o->rdlock_ref > 0 || + o->wrlock_ref > 0) { + dout(10) << "rdunlock " << *o << " still has rdlock|wrlock refs" << endl; + return; + } + + release(o); // release first + + o->lock_state = Object::LOCK_RDUNLOCKING; + + C_LockAck *lockack = new C_LockAck(this, o->get_oid()); + C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); + commit->tid = + lockack->tid = + o->last_write_tid = + objecter->lock(OSD_OP_RDUNLOCK, o->get_oid(), lockack, commit); +} + +void ObjectCacher::wrunlock(Object *o) +{ + dout(10) << "wrunlock " << *o << endl; + assert(o->lock_state == Object::LOCK_WRLOCK); + + assert(o->wrlock_ref > 0); + o->wrlock_ref--; + if 
(o->wrlock_ref > 0) { + dout(10) << "wrunlock " << *o << " still has wrlock refs" << endl; + return; + } + + flush(o); // flush first + + int op = 0; + if (o->rdlock_ref > 0) { + dout(10) << "wrunlock rdlock " << *o << endl; + op = OSD_OP_DNLOCK; + o->lock_state = Object::LOCK_DOWNGRADING; + } else { + dout(10) << "wrunlock wrunlock " << *o << endl; + op = OSD_OP_WRUNLOCK; + o->lock_state = Object::LOCK_WRUNLOCKING; + } + + C_LockAck *lockack = new C_LockAck(this, o->get_oid()); + C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); + commit->tid = + lockack->tid = + o->last_write_tid = + objecter->lock(op, o->get_oid(), lockack, commit); +} + + +// ------------------------------------------------- + + +bool ObjectCacher::set_is_cached(inodeno_t ino) +{ + if (objects_by_ino.count(ino) == 0) + return false; + + set& s = objects_by_ino[ino]; + for (set::iterator i = s.begin(); + i != s.end(); + i++) { + Object *ob = *i; + if (!ob->data.empty()) return true; + } + + return false; +} + +bool ObjectCacher::set_is_dirty_or_committing(inodeno_t ino) +{ + if (objects_by_ino.count(ino) == 0) + return false; + + set& s = objects_by_ino[ino]; + for (set::iterator i = s.begin(); + i != s.end(); + i++) { + Object *ob = *i; + + for (map::iterator p = ob->data.begin(); + p != ob->data.end(); + p++) { + BufferHead *bh = p->second; + if (bh->is_dirty() || bh->is_tx()) + return true; + } + } + + return false; +} + + +// flush. non-blocking. no callback. +// true if clean, already flushed. +// false if we wrote something. +bool ObjectCacher::flush(Object *ob) +{ + bool clean = true; + for (map::iterator p = ob->data.begin(); + p != ob->data.end(); + p++) { + BufferHead *bh = p->second; + if (bh->is_tx()) { + clean = false; + continue; + } + if (!bh->is_dirty()) continue; + + bh_write(bh); + clean = false; + } + return clean; +} + +// flush. non-blocking, takes callback. 
+// returns true if already flushed +bool ObjectCacher::flush_set(inodeno_t ino, Context *onfinish) +{ + if (objects_by_ino.count(ino) == 0) { + dout(10) << "flush_set on " << ino << " dne" << endl; + return true; + } + + dout(10) << "flush_set " << ino << endl; + + C_Gather *gather = 0; // we'll need to wait for all objects to flush! + + set& s = objects_by_ino[ino]; + bool safe = true; + for (set::iterator i = s.begin(); + i != s.end(); + i++) { + Object *ob = *i; + + if (!flush(ob)) { + // we'll need to gather... + if (!gather && onfinish) + gather = new C_Gather(onfinish); + safe = false; + + dout(10) << "flush_set " << ino << " will wait for ack tid " + << ob->last_write_tid + << " on " << *ob + << endl; + if (gather) + ob->waitfor_ack[ob->last_write_tid].push_back(gather->new_sub()); + } + } + + if (safe) { + dout(10) << "flush_set " << ino << " has no dirty|tx bhs" << endl; + return true; + } + return false; +} + + +// commit. non-blocking, takes callback. +// return true if already flushed. +bool ObjectCacher::commit_set(inodeno_t ino, Context *onfinish) +{ + assert(onfinish); // doesn't make any sense otherwise. + + if (objects_by_ino.count(ino) == 0) { + dout(10) << "commit_set on " << ino << " dne" << endl; + return true; + } + + dout(10) << "commit_set " << ino << endl; + + C_Gather *gather = 0; // we'll need to wait for all objects to commit + + set& s = objects_by_ino[ino]; + bool safe = true; + for (set::iterator i = s.begin(); + i != s.end(); + i++) { + Object *ob = *i; + + // make sure it's flushing. 
+ flush_set(ino); + + if (ob->last_write_tid > ob->last_commit_tid) { + dout(10) << "commit_set " << ino << " " << *ob + << " will finish on commit tid " << ob->last_write_tid + << endl; + if (!gather && onfinish) gather = new C_Gather(onfinish); + safe = false; + if (gather) + ob->waitfor_commit[ob->last_write_tid].push_back( gather->new_sub() ); + } + } + + if (safe) { + dout(10) << "commit_set " << ino << " all committed" << endl; + return true; + } + return false; +} + + +off_t ObjectCacher::release(Object *ob) +{ + list clean; + off_t o_unclean = 0; + + for (map::iterator p = ob->data.begin(); + p != ob->data.end(); + p++) { + BufferHead *bh = p->second; + if (bh->is_clean()) + clean.push_back(bh); + else + o_unclean += bh->length(); + } + + for (list::iterator p = clean.begin(); + p != clean.end(); + p++) + bh_remove(ob, *p); + + return o_unclean; +} + +off_t ObjectCacher::release_set(inodeno_t ino) +{ + // return # bytes not clean (and thus not released). + off_t unclean = 0; + + if (objects_by_ino.count(ino) == 0) { + dout(10) << "release_set on " << ino << " dne" << endl; + return 0; + } + + dout(10) << "release_set " << ino << endl; + + set& s = objects_by_ino[ino]; + for (set::iterator i = s.begin(); + i != s.end(); + i++) { + Object *ob = *i; + + off_t o_unclean = release(ob); + unclean += o_unclean; + + if (o_unclean) + dout(10) << "release_set " << ino << " " << *ob + << " has " << o_unclean << " bytes left" + << endl; + + } + + if (unclean) { + dout(10) << "release_set " << ino + << ", " << unclean << " bytes left" << endl; + } + + return unclean; +} + + +void ObjectCacher::kick_sync_writers(inodeno_t ino) +{ + if (objects_by_ino.count(ino) == 0) { + dout(10) << "kick_sync_writers on " << ino << " dne" << endl; + return; + } + + dout(10) << "kick_sync_writers on " << ino << endl; + + list ls; + + set& s = objects_by_ino[ino]; + for (set::iterator i = s.begin(); + i != s.end(); + i++) { + Object *ob = *i; + + ls.splice(ls.begin(), ob->waitfor_wr); + } 
+ + finish_contexts(ls); +} + +void ObjectCacher::kick_sync_readers(inodeno_t ino) +{ + if (objects_by_ino.count(ino) == 0) { + dout(10) << "kick_sync_readers on " << ino << " dne" << endl; + return; + } + + dout(10) << "kick_sync_readers on " << ino << endl; + + list ls; + + set& s = objects_by_ino[ino]; + for (set::iterator i = s.begin(); + i != s.end(); + i++) { + Object *ob = *i; + + ls.splice(ls.begin(), ob->waitfor_rd); + } + + finish_contexts(ls); +} + + + diff --git a/branches/sage/cephmds2/osdc/ObjectCacher.h b/branches/sage/cephmds2/osdc/ObjectCacher.h new file mode 100644 index 0000000000000..27b154023209d --- /dev/null +++ b/branches/sage/cephmds2/osdc/ObjectCacher.h @@ -0,0 +1,547 @@ +#ifndef __OBJECTCACHER_H_ +#define __OBJECTCACHER_H_ + +#include "include/types.h" +#include "include/lru.h" +#include "include/Context.h" + +#include "common/Cond.h" +#include "common/Thread.h" + +#include "Objecter.h" +#include "Filer.h" + +class Objecter; +class Objecter::OSDRead; +class Objecter::OSDWrite; + +class ObjectCacher { + public: + + class Object; + + // ******* BufferHead ********* + class BufferHead : public LRUObject { + public: + // states + static const int STATE_MISSING = 0; + static const int STATE_CLEAN = 1; + static const int STATE_DIRTY = 2; + static const int STATE_RX = 3; + static const int STATE_TX = 4; + + private: + // my fields + int state; + int ref; + struct { + off_t start, length; // bh extent in object + } ex; + + public: + Object *ob; + bufferlist bl; + tid_t last_write_tid; // version of bh (if non-zero) + utime_t last_write; + + map< off_t, list > waitfor_read; + + public: + // cons + BufferHead(Object *o) : + state(STATE_MISSING), + ref(0), + ob(o), + last_write_tid(0) {} + + // extent + off_t start() { return ex.start; } + void set_start(off_t s) { ex.start = s; } + off_t length() { return ex.length; } + void set_length(off_t l) { ex.length = l; } + off_t end() { return ex.start + ex.length; } + off_t last() { return end() - 1; } + 
+ // states + void set_state(int s) { + if (s == STATE_RX || s == STATE_TX) get(); + if (state == STATE_RX || state == STATE_TX) put(); + state = s; + } + int get_state() { return state; } + + bool is_missing() { return state == STATE_MISSING; } + bool is_dirty() { return state == STATE_DIRTY; } + bool is_clean() { return state == STATE_CLEAN; } + bool is_tx() { return state == STATE_TX; } + bool is_rx() { return state == STATE_RX; } + + // reference counting + int get() { + assert(ref >= 0); + if (ref == 0) lru_pin(); + return ++ref; + } + int put() { + assert(ref > 0); + if (ref == 1) lru_unpin(); + --ref; + return ref; + } + }; + + + // ******* Object ********* + class Object { + private: + // ObjectCacher::Object fields + ObjectCacher *oc; + object_t oid; // this _always_ is oid.rev=0 + inodeno_t ino; + objectrev_t rev; // last rev we're written + + public: + map data; + + tid_t last_write_tid; // version of bh (if non-zero) + tid_t last_ack_tid; // last update acked. + tid_t last_commit_tid; // last update commited. 
+ + map< tid_t, list > waitfor_ack; + map< tid_t, list > waitfor_commit; + list waitfor_rd; + list waitfor_wr; + + // lock + static const int LOCK_NONE = 0; + static const int LOCK_WRLOCKING = 1; + static const int LOCK_WRLOCK = 2; + static const int LOCK_WRUNLOCKING = 3; + static const int LOCK_RDLOCKING = 4; + static const int LOCK_RDLOCK = 5; + static const int LOCK_RDUNLOCKING = 6; + static const int LOCK_UPGRADING = 7; // rd -> wr + static const int LOCK_DOWNGRADING = 8; // wr -> rd + int lock_state; + int wrlock_ref; // how many ppl want or are using a WRITE lock + int rdlock_ref; // how many ppl want or are using a READ lock + + public: + Object(ObjectCacher *_oc, object_t o, inodeno_t i) : + oc(_oc), + oid(o), ino(i), + last_write_tid(0), last_ack_tid(0), last_commit_tid(0), + lock_state(LOCK_NONE), wrlock_ref(0), rdlock_ref(0) + {} + + object_t get_oid() { return oid; } + inodeno_t get_ino() { return ino; } + + bool can_close() { + return data.empty() && lock_state == LOCK_NONE && + waitfor_ack.empty() && waitfor_commit.empty() && + waitfor_rd.empty() && waitfor_wr.empty(); + } + + // bh + void add_bh(BufferHead *bh) { + // add to my map + assert(data.count(bh->start()) == 0); + + if (0) { // sanity check FIXME DEBUG + //cout << "add_bh " << bh->start() << "~" << bh->length() << endl; + map::iterator p = data.lower_bound(bh->start()); + if (p != data.end()) { + //cout << " after " << *p->second << endl; + //cout << " after starts at " << p->first << endl; + assert(p->first >= bh->end()); + } + if (p != data.begin()) { + p--; + //cout << " before starts at " << p->second->start() + //<< " and ends at " << p->second->end() << endl; + //cout << " before " << *p->second << endl; + assert(p->second->end() <= bh->start()); + } + } + + data[bh->start()] = bh; + } + void remove_bh(BufferHead *bh) { + assert(data.count(bh->start())); + data.erase(bh->start()); + } + bool is_empty() { return data.empty(); } + + // mid-level + BufferHead *split(BufferHead *bh, off_t 
off); + void merge_left(BufferHead *left, BufferHead *right); + void merge_right(BufferHead *left, BufferHead *right); + + int map_read(Objecter::OSDRead *rd, + map& hits, + map& missing, + map& rx); + BufferHead *map_write(Objecter::OSDWrite *wr); + + }; + + // ******* ObjectCacher ********* + // ObjectCacher fields + public: + Objecter *objecter; + Filer filer; + + private: + Mutex& lock; + + hash_map objects; + hash_map > objects_by_ino; + + set dirty_bh; + LRU lru_dirty, lru_rest; + + Cond flusher_cond; + bool flusher_stop; + void flusher_entry(); + class FlusherThread : public Thread { + ObjectCacher *oc; + public: + FlusherThread(ObjectCacher *o) : oc(o) {} + void *entry() { + oc->flusher_entry(); + return 0; + } + } flusher_thread; + + + // objects + Object *get_object(object_t oid, inodeno_t ino) { + // have it? + if (objects.count(oid)) + return objects[oid]; + + // create it. + Object *o = new Object(this, oid, ino); + objects[oid] = o; + objects_by_ino[ino].insert(o); + return o; + } + void close_object(Object *ob) { + assert(ob->can_close()); + + // ok! 
+ objects.erase(ob->get_oid()); + objects_by_ino[ob->get_ino()].erase(ob); + if (objects_by_ino[ob->get_ino()].empty()) + objects_by_ino.erase(ob->get_ino()); + delete ob; + } + + // bh stats + Cond stat_cond; + int stat_waiter; + + off_t stat_clean; + off_t stat_dirty; + off_t stat_rx; + off_t stat_tx; + off_t stat_missing; + + void bh_stat_add(BufferHead *bh) { + switch (bh->get_state()) { + case BufferHead::STATE_MISSING: stat_missing += bh->length(); break; + case BufferHead::STATE_CLEAN: stat_clean += bh->length(); break; + case BufferHead::STATE_DIRTY: stat_dirty += bh->length(); break; + case BufferHead::STATE_TX: stat_tx += bh->length(); break; + case BufferHead::STATE_RX: stat_rx += bh->length(); break; + } + if (stat_waiter) stat_cond.Signal(); + } + void bh_stat_sub(BufferHead *bh) { + switch (bh->get_state()) { + case BufferHead::STATE_MISSING: stat_missing -= bh->length(); break; + case BufferHead::STATE_CLEAN: stat_clean -= bh->length(); break; + case BufferHead::STATE_DIRTY: stat_dirty -= bh->length(); break; + case BufferHead::STATE_TX: stat_tx -= bh->length(); break; + case BufferHead::STATE_RX: stat_rx -= bh->length(); break; + } + } + off_t get_stat_tx() { return stat_tx; } + off_t get_stat_rx() { return stat_rx; } + off_t get_stat_dirty() { return stat_dirty; } + off_t get_stat_clean() { return stat_clean; } + + void touch_bh(BufferHead *bh) { + if (bh->is_dirty()) + lru_dirty.lru_touch(bh); + else + lru_rest.lru_touch(bh); + } + + // bh states + void bh_set_state(BufferHead *bh, int s) { + // move between lru lists? 
+ if (s == BufferHead::STATE_DIRTY && bh->get_state() != BufferHead::STATE_DIRTY) { + lru_rest.lru_remove(bh); + lru_dirty.lru_insert_top(bh); + dirty_bh.insert(bh); + } + if (s != BufferHead::STATE_DIRTY && bh->get_state() == BufferHead::STATE_DIRTY) { + lru_dirty.lru_remove(bh); + lru_rest.lru_insert_mid(bh); + dirty_bh.erase(bh); + } + + // set state + bh_stat_sub(bh); + bh->set_state(s); + bh_stat_add(bh); + } + + void copy_bh_state(BufferHead *bh1, BufferHead *bh2) { + bh_set_state(bh2, bh1->get_state()); + } + + void mark_missing(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_MISSING); }; + void mark_clean(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_CLEAN); }; + void mark_rx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_RX); }; + void mark_tx(BufferHead *bh) { bh_set_state(bh, BufferHead::STATE_TX); }; + void mark_dirty(BufferHead *bh) { + bh_set_state(bh, BufferHead::STATE_DIRTY); + lru_dirty.lru_touch(bh); + //bh->set_dirty_stamp(g_clock.now()); + }; + + void bh_add(Object *ob, BufferHead *bh) { + ob->add_bh(bh); + if (bh->is_dirty()) + lru_dirty.lru_insert_top(bh); + else + lru_rest.lru_insert_top(bh); + bh_stat_add(bh); + } + void bh_remove(Object *ob, BufferHead *bh) { + ob->remove_bh(bh); + if (bh->is_dirty()) + lru_dirty.lru_remove(bh); + else + lru_rest.lru_remove(bh); + bh_stat_sub(bh); + } + + // io + void bh_read(BufferHead *bh); + void bh_write(BufferHead *bh); + + void trim(off_t max=-1); + void flush(off_t amount=0); + + bool flush(Object *o); + off_t release(Object *o); + + void rdlock(Object *o); + void rdunlock(Object *o); + void wrlock(Object *o); + void wrunlock(Object *o); + + public: + void bh_read_finish(object_t oid, off_t offset, size_t length, bufferlist &bl); + void bh_write_ack(object_t oid, off_t offset, size_t length, tid_t t); + void bh_write_commit(object_t oid, off_t offset, size_t length, tid_t t); + void lock_ack(list& oids, tid_t tid); + + class C_ReadFinish : public Context { + ObjectCacher *oc; + 
object_t oid; + off_t start; + size_t length; + public: + bufferlist bl; + C_ReadFinish(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} + void finish(int r) { + oc->bh_read_finish(oid, start, length, bl); + } + }; + + class C_WriteAck : public Context { + ObjectCacher *oc; + object_t oid; + off_t start; + size_t length; + public: + tid_t tid; + C_WriteAck(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} + void finish(int r) { + oc->bh_write_ack(oid, start, length, tid); + } + }; + class C_WriteCommit : public Context { + ObjectCacher *oc; + object_t oid; + off_t start; + size_t length; + public: + tid_t tid; + C_WriteCommit(ObjectCacher *c, object_t o, off_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} + void finish(int r) { + oc->bh_write_commit(oid, start, length, tid); + } + }; + + class C_LockAck : public Context { + ObjectCacher *oc; + public: + list oids; + tid_t tid; + C_LockAck(ObjectCacher *c, object_t o) : oc(c) { + oids.push_back(o); + } + void finish(int r) { + oc->lock_ack(oids, tid); + } + }; + + + + public: + ObjectCacher(Objecter *o, Mutex& l) : + objecter(o), filer(o), lock(l), + flusher_stop(false), flusher_thread(this), + stat_waiter(0), + stat_clean(0), stat_dirty(0), stat_rx(0), stat_tx(0), stat_missing(0) { + flusher_thread.create(); + } + ~ObjectCacher() { + //lock.Lock(); // hmm.. watch out for deadlock! + flusher_stop = true; + flusher_cond.Signal(); + //lock.Unlock(); + flusher_thread.join(); + } + + + class C_RetryRead : public Context { + ObjectCacher *oc; + Objecter::OSDRead *rd; + inodeno_t ino; + Context *onfinish; + public: + C_RetryRead(ObjectCacher *_oc, Objecter::OSDRead *r, inodeno_t i, Context *c) : oc(_oc), rd(r), ino(i), onfinish(c) {} + void finish(int) { + int r = oc->readx(rd, ino, onfinish); + if (r > 0) { + onfinish->finish(r); + delete onfinish; + } + } + }; + + // non-blocking. async. 
+ int readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish); + int writex(Objecter::OSDWrite *wr, inodeno_t ino); + + // write blocking + void wait_for_write(size_t len, Mutex& lock); + + // blocking. atomic+sync. + int atomic_sync_readx(Objecter::OSDRead *rd, inodeno_t ino, Mutex& lock); + int atomic_sync_writex(Objecter::OSDWrite *wr, inodeno_t ino, Mutex& lock); + + bool set_is_cached(inodeno_t ino); + bool set_is_dirty_or_committing(inodeno_t ino); + + bool flush_set(inodeno_t ino, Context *onfinish=0); + void flush_all(Context *onfinish=0); + + bool commit_set(inodeno_t ino, Context *oncommit); + void commit_all(Context *oncommit=0); + + off_t release_set(inodeno_t ino); // returns # of bytes not released (ie non-clean) + + void kick_sync_writers(inodeno_t ino); + void kick_sync_readers(inodeno_t ino); + + + // file functions + + /*** async+caching (non-blocking) file interface ***/ + int file_read(inode_t& inode, + off_t offset, size_t len, + bufferlist *bl, + Context *onfinish) { + Objecter::OSDRead *rd = new Objecter::OSDRead(bl); + filer.file_to_extents(inode, offset, len, rd->extents); + return readx(rd, inode.ino, onfinish); + } + + int file_write(inode_t& inode, + off_t offset, size_t len, + bufferlist& bl, + objectrev_t rev=0) { + Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); + filer.file_to_extents(inode, offset, len, wr->extents); + return writex(wr, inode.ino); + } + + + + /*** sync+blocking file interface ***/ + + int file_atomic_sync_read(inode_t& inode, + off_t offset, size_t len, + bufferlist *bl, + Mutex &lock) { + Objecter::OSDRead *rd = new Objecter::OSDRead(bl); + filer.file_to_extents(inode, offset, len, rd->extents); + return atomic_sync_readx(rd, inode.ino, lock); + } + + int file_atomic_sync_write(inode_t& inode, + off_t offset, size_t len, + bufferlist& bl, + Mutex &lock, + objectrev_t rev=0) { + Objecter::OSDWrite *wr = new Objecter::OSDWrite(bl); + filer.file_to_extents(inode, offset, len, wr->extents); + return 
atomic_sync_writex(wr, inode.ino, lock); + } + +}; + + +inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh) +{ + out << "bh[" + << bh.start() << "~" << bh.length() + << " (" << bh.bl.length() << ")" + << " v " << bh.last_write_tid; + if (bh.is_tx()) out << " tx"; + if (bh.is_rx()) out << " rx"; + if (bh.is_dirty()) out << " dirty"; + if (bh.is_clean()) out << " clean"; + if (bh.is_missing()) out << " missing"; + out << "]"; + return out; +} + +inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob) +{ + out << "object[" + << hex << ob.get_oid() << " ino " << ob.get_ino() << dec + << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid; + + switch (ob.lock_state) { + case ObjectCacher::Object::LOCK_WRLOCKING: out << " wrlocking"; break; + case ObjectCacher::Object::LOCK_WRLOCK: out << " wrlock"; break; + case ObjectCacher::Object::LOCK_WRUNLOCKING: out << " wrunlocking"; break; + case ObjectCacher::Object::LOCK_RDLOCKING: out << " rdlocking"; break; + case ObjectCacher::Object::LOCK_RDLOCK: out << " rdlock"; break; + case ObjectCacher::Object::LOCK_RDUNLOCKING: out << " rdunlocking"; break; + } + + out << "]"; + return out; +} + +#endif diff --git a/branches/sage/cephmds2/osdc/Objecter.cc b/branches/sage/cephmds2/osdc/Objecter.cc new file mode 100644 index 0000000000000..5e56781a20569 --- /dev/null +++ b/branches/sage/cephmds2/osdc/Objecter.cc @@ -0,0 +1,831 @@ + +#include "Objecter.h" +#include "osd/OSDMap.h" +#include "mon/MonMap.h" + +#include "msg/Messenger.h" +#include "msg/Message.h" + +#include "messages/MOSDOp.h" +#include "messages/MOSDOpReply.h" +#include "messages/MOSDMap.h" +#include "messages/MOSDGetMap.h" + +#include "messages/MOSDFailure.h" + +#include + +#include "config.h" +#undef dout +#define dout(x) if (x <= g_conf.debug || x <= g_conf.debug_objecter) cout << g_clock.now() << " " << messenger->get_myaddr() << ".objecter " +#define derr(x) if (x <= g_conf.debug || x <= 
g_conf.debug_objecter) cerr << g_clock.now() << " " << messenger->get_myaddr() << ".objecter " + + +// messages ------------------------------ + +void Objecter::dispatch(Message *m) +{ + switch (m->get_type()) { + case MSG_OSD_OPREPLY: + handle_osd_op_reply((MOSDOpReply*)m); + break; + + case MSG_OSD_MAP: + handle_osd_map((MOSDMap*)m); + break; + + default: + dout(1) << "don't know message type " << m->get_type() << endl; + assert(0); + } +} + +void Objecter::handle_osd_map(MOSDMap *m) +{ + assert(osdmap); + + if (m->get_last() <= osdmap->get_epoch()) { + dout(3) << "handle_osd_map ignoring epochs [" + << m->get_first() << "," << m->get_last() + << "] <= " << osdmap->get_epoch() << endl; + } + else { + dout(3) << "handle_osd_map got epochs [" + << m->get_first() << "," << m->get_last() + << "] > " << osdmap->get_epoch() + << endl; + + set changed_pgs; + + for (epoch_t e = osdmap->get_epoch() + 1; + e <= m->get_last(); + e++) { + if (m->incremental_maps.count(e)) { + dout(3) << "handle_osd_map decoding incremental epoch " << e << endl; + OSDMap::Incremental inc; + int off = 0; + inc.decode(m->incremental_maps[e], off); + osdmap->apply_incremental(inc); + + // notify messenger + for (map::iterator i = inc.new_down.begin(); + i != inc.new_down.end(); + i++) + messenger->mark_down(MSG_ADDR_OSD(i->first), i->second); + for (map::iterator i = inc.new_up.begin(); + i != inc.new_up.end(); + i++) + messenger->mark_up(MSG_ADDR_OSD(i->first), i->second); + + } + else if (m->maps.count(e)) { + dout(3) << "handle_osd_map decoding full epoch " << e << endl; + osdmap->decode(m->maps[e]); + } + else { + dout(3) << "handle_osd_map requesting missing epoch " << osdmap->get_epoch()+1 << endl; + int mon = monmap->pick_mon(); + messenger->send_message(new MOSDGetMap(osdmap->get_epoch()), + MSG_ADDR_MON(mon), monmap->get_inst(mon)); + break; + } + + // scan pgs for changes + scan_pgs(changed_pgs); + + assert(e == osdmap->get_epoch()); + } + + // kick requests who might be timing out on 
the wrong osds + if (!changed_pgs.empty()) + kick_requests(changed_pgs); + } + + delete m; +} + +void Objecter::scan_pgs(set& changed_pgs) +{ + dout(10) << "scan_pgs" << endl; + + for (hash_map::iterator i = pg_map.begin(); + i != pg_map.end(); + i++) { + pg_t pgid = i->first; + PG& pg = i->second; + + // calc new. + vector other; + osdmap->pg_to_acting_osds(pgid, other); + + if (other == pg.acting) + continue; // no change. + + other.swap(pg.acting); + + if (g_conf.osd_rep == OSD_REP_PRIMARY) { + // same primary? + if (!other.empty() && + !pg.acting.empty() && + other[0] == pg.acting[0]) + continue; + } + else if (g_conf.osd_rep == OSD_REP_SPLAY) { + // same primary and acker? + if (!other.empty() && + !pg.acting.empty() && + other[0] == pg.acting[0] && + other[other.size() > 1 ? 1:0] == pg.acting[pg.acting.size() > 1 ? 1:0]) + continue; + } + else if (g_conf.osd_rep == OSD_REP_CHAIN) { + // any change is significant. + } + + // changed significantly. + dout(10) << "scan_pgs pg " << pgid + << " (" << pg.active_tids << ")" + << " " << other << " -> " << pg.acting + << endl; + changed_pgs.insert(pgid); + } +} + +void Objecter::kick_requests(set& changed_pgs) +{ + dout(10) << "kick_requests in pgs " << changed_pgs << endl; + + for (set::iterator i = changed_pgs.begin(); + i != changed_pgs.end(); + i++) { + pg_t pgid = *i; + PG& pg = pg_map[pgid]; + + // resubmit ops! 
+ set tids; + tids.swap( pg.active_tids ); + close_pg( pgid ); // will pbly reopen, unless it's just commits we're missing + + for (set::iterator p = tids.begin(); + p != tids.end(); + p++) { + tid_t tid = *p; + + if (op_modify.count(tid)) { + OSDModify *wr = op_modify[tid]; + op_modify.erase(tid); + + // WRITE + if (wr->tid_version.count(tid)) { + if (wr->op == OSD_OP_WRITE && + !g_conf.objecter_buffer_uncommitted) { + dout(0) << "kick_requests missing commit, cannot replay: objecter_buffer_uncommitted == FALSE" << endl; + } else { + dout(0) << "kick_requests missing commit, replay write " << tid + << " v " << wr->tid_version[tid] << endl; + modifyx_submit(wr, wr->waitfor_commit[tid], tid); + } + } + else if (wr->waitfor_ack.count(tid)) { + dout(0) << "kick_requests missing ack, resub write " << tid << endl; + modifyx_submit(wr, wr->waitfor_ack[tid], tid); + } + } + + else if (op_read.count(tid)) { + // READ + OSDRead *rd = op_read[tid]; + op_read.erase(tid); + dout(0) << "kick_requests resub read " << tid << endl; + + // resubmit + readx_submit(rd, rd->ops[tid]); + rd->ops.erase(tid); + } + + else if (op_stat.count(tid)) { + OSDStat *st = op_stat[tid]; + op_stat.erase(tid); + + dout(0) << "kick_requests resub stat " << tid << endl; + + // resubmit + stat_submit(st); + } + + else + assert(0); + } + } +} + + + +void Objecter::handle_osd_op_reply(MOSDOpReply *m) +{ + // read or modify? 
+ switch (m->get_op()) { + case OSD_OP_READ: + handle_osd_read_reply(m); + break; + + case OSD_OP_STAT: + handle_osd_stat_reply(m); + break; + + case OSD_OP_WRNOOP: + case OSD_OP_WRITE: + case OSD_OP_ZERO: + case OSD_OP_DELETE: + case OSD_OP_WRUNLOCK: + case OSD_OP_WRLOCK: + case OSD_OP_RDLOCK: + case OSD_OP_RDUNLOCK: + case OSD_OP_UPLOCK: + case OSD_OP_DNLOCK: + handle_osd_modify_reply(m); + break; + + default: + assert(0); + } +} + + + +// stat ----------------------------------- + +tid_t Objecter::stat(object_t oid, off_t *size, Context *onfinish, + objectrev_t rev) +{ + OSDStat *st = new OSDStat(size); + st->extents.push_back(ObjectExtent(oid, 0, 0)); + st->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); + st->extents.front().rev = rev; + st->onfinish = onfinish; + + return stat_submit(st); +} + +tid_t Objecter::stat_submit(OSDStat *st) +{ + // find OSD + ObjectExtent &ex = st->extents.front(); + PG &pg = get_pg( ex.pgid ); + + // send + last_tid++; + MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(), + ex.oid, ex.pgid, osdmap->get_epoch(), + OSD_OP_STAT); + dout(10) << "stat_submit " << st << " tid " << last_tid + << " oid " << ex.oid + << " pg " << ex.pgid + << " osd" << pg.acker() + << endl; + + if (pg.acker() >= 0) + messenger->send_message(m, MSG_ADDR_OSD(pg.acker()), osdmap->get_inst(pg.acker())); + + // add to gather set + st->tid = last_tid; + op_stat[last_tid] = st; + + pg.active_tids.insert(last_tid); + + return last_tid; +} + +void Objecter::handle_osd_stat_reply(MOSDOpReply *m) +{ + // get pio + tid_t tid = m->get_tid(); + + if (op_stat.count(tid) == 0) { + dout(7) << "handle_osd_stat_reply " << tid << " ... 
stray" << endl; + delete m; + return; + } + + dout(7) << "handle_osd_stat_reply " << tid + << " r=" << m->get_result() + << " size=" << m->get_object_size() + << endl; + OSDStat *st = op_stat[ tid ]; + op_stat.erase( tid ); + + // remove from osd/tid maps + PG& pg = get_pg( m->get_pg() ); + assert(pg.active_tids.count(tid)); + pg.active_tids.erase(tid); + if (pg.active_tids.empty()) close_pg( m->get_pg() ); + + // success? + if (m->get_result() == -EAGAIN) { + dout(7) << " got -EAGAIN, resubmitting" << endl; + stat_submit(st); + delete m; + return; + } + //assert(m->get_result() >= 0); + + // ok! + if (m->get_result() < 0) { + *st->size = -1; + } else { + *st->size = m->get_object_size(); + } + + // finish, clean up + Context *onfinish = st->onfinish; + + // done + delete st; + if (onfinish) { + onfinish->finish(m->get_result()); + delete onfinish; + } + + delete m; +} + + +// read ----------------------------------- + + +tid_t Objecter::read(object_t oid, off_t off, size_t len, bufferlist *bl, + Context *onfinish, + objectrev_t rev) +{ + OSDRead *rd = new OSDRead(bl); + rd->extents.push_back(ObjectExtent(oid, off, len)); + rd->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); + rd->extents.front().rev = rev; + readx(rd, onfinish); + return last_tid; +} + + +tid_t Objecter::readx(OSDRead *rd, Context *onfinish) +{ + rd->onfinish = onfinish; + + // issue reads + for (list::iterator it = rd->extents.begin(); + it != rd->extents.end(); + it++) + readx_submit(rd, *it); + + return last_tid; +} + +tid_t Objecter::readx_submit(OSDRead *rd, ObjectExtent &ex) +{ + // find OSD + PG &pg = get_pg( ex.pgid ); + + // send + last_tid++; + MOSDOp *m = new MOSDOp(last_tid, messenger->get_myaddr(), + ex.oid, ex.pgid, osdmap->get_epoch(), + OSD_OP_READ); + m->set_length(ex.length); + m->set_offset(ex.start); + dout(10) << "readx_submit " << rd << " tid " << last_tid + << " oid " << ex.oid << " " << ex.start << "~" << ex.length + << " (" << ex.buffer_extents.size() 
<< " buffer fragments)" + << " pg " << ex.pgid + << " osd" << pg.acker() + << endl; + + if (pg.acker() >= 0) + messenger->send_message(m, MSG_ADDR_OSD(pg.acker()), osdmap->get_inst(pg.acker())); + + // add to gather set + rd->ops[last_tid] = ex; + op_read[last_tid] = rd; + + pg.active_tids.insert(last_tid); + + return last_tid; +} + + +void Objecter::handle_osd_read_reply(MOSDOpReply *m) +{ + // get pio + tid_t tid = m->get_tid(); + + if (op_read.count(tid) == 0) { + dout(7) << "handle_osd_read_reply " << tid << " ... stray" << endl; + delete m; + return; + } + + dout(7) << "handle_osd_read_reply " << tid << endl; + OSDRead *rd = op_read[ tid ]; + op_read.erase( tid ); + + // remove from osd/tid maps + PG& pg = get_pg( m->get_pg() ); + assert(pg.active_tids.count(tid)); + pg.active_tids.erase(tid); + if (pg.active_tids.empty()) close_pg( m->get_pg() ); + + // our op finished + rd->ops.erase(tid); + + // success? + if (m->get_result() == -EAGAIN) { + dout(7) << " got -EAGAIN, resubmitting" << endl; + readx_submit(rd, rd->ops[tid]); + delete m; + return; + } + //assert(m->get_result() >= 0); + + // what buffer offset are we? + dout(7) << " got frag from " << m->get_oid() << " " + << m->get_offset() << "~" << m->get_length() + << ", still have " << rd->ops.size() << " more ops" << endl; + + if (rd->ops.empty()) { + // all done + size_t bytes_read = 0; + + if (rd->read_data.size()) { + dout(15) << " assembling frags" << endl; + + /** FIXME This doesn't handle holes efficiently. + * It allocates zero buffers to fill whole buffer, and + * then discards trailing ones at the end. + * + * Actually, this whole thing is pretty messy with temporary bufferlist*'s all over + * the heap. + */ + + // we have other fragments, assemble them all... blech! + rd->read_data[m->get_oid()] = new bufferlist; + rd->read_data[m->get_oid()]->claim( m->get_data() ); + + // map extents back into buffer + map by_off; // buffer offset -> bufferlist + + // for each object extent... 
+ for (list::iterator eit = rd->extents.begin(); + eit != rd->extents.end(); + eit++) { + bufferlist *ox_buf = rd->read_data[eit->oid]; + unsigned ox_len = ox_buf->length(); + unsigned ox_off = 0; + assert(ox_len <= eit->length); + + // for each buffer extent we're mapping into... + for (map::iterator bit = eit->buffer_extents.begin(); + bit != eit->buffer_extents.end(); + bit++) { + dout(21) << " object " << eit->oid << " extent " << eit->start << "~" << eit->length << " : ox offset " << ox_off << " -> buffer extent " << bit->first << "~" << bit->second << endl; + by_off[bit->first] = new bufferlist; + + if (ox_off + bit->second <= ox_len) { + // we got the whole bx + by_off[bit->first]->substr_of(*ox_buf, ox_off, bit->second); + if (bytes_read < bit->first + bit->second) + bytes_read = bit->first + bit->second; + } else if (ox_off + bit->second > ox_len && ox_off < ox_len) { + // we got part of this bx + by_off[bit->first]->substr_of(*ox_buf, ox_off, (ox_len-ox_off)); + if (bytes_read < bit->first + ox_len-ox_off) + bytes_read = bit->first + ox_len-ox_off; + + // zero end of bx + dout(21) << " adding some zeros to the end " << ox_off + bit->second-ox_len << endl; + bufferptr z(ox_off + bit->second - ox_len); + z.zero(); + by_off[bit->first]->append( z ); + } else { + // we got none of this bx. zero whole thing. 
+ assert(ox_off >= ox_len); + dout(21) << " adding all zeros for this bit " << bit->second << endl; + bufferptr z(bit->second); + z.zero(); + by_off[bit->first]->append( z ); + } + ox_off += bit->second; + } + assert(ox_off == eit->length); + } + + // sort and string bits together + for (map::iterator it = by_off.begin(); + it != by_off.end(); + it++) { + assert(it->second->length()); + if (it->first < (off_t)bytes_read) { + dout(21) << " concat buffer frag off " << it->first << " len " << it->second->length() << endl; + rd->bl->claim_append(*(it->second)); + } else { + dout(21) << " NO concat zero buffer frag off " << it->first << " len " << it->second->length() << endl; + } + delete it->second; + } + + // trim trailing zeros? + if (rd->bl->length() > bytes_read) { + dout(10) << " trimming off trailing zeros . bytes_read=" << bytes_read + << " len=" << rd->bl->length() << endl; + rd->bl->splice(bytes_read, rd->bl->length() - bytes_read); + assert(bytes_read == rd->bl->length()); + } + + // hose p->read_data bufferlist*'s + for (map::iterator it = rd->read_data.begin(); + it != rd->read_data.end(); + it++) { + delete it->second; + } + } else { + dout(15) << " only one frag" << endl; + + // only one fragment, easy + rd->bl->claim( m->get_data() ); + bytes_read = rd->bl->length(); + } + + // finish, clean up + Context *onfinish = rd->onfinish; + + dout(7) << " " << bytes_read << " bytes " + << rd->bl->length() + << endl; + + // done + delete rd; + if (onfinish) { + onfinish->finish(bytes_read);// > 0 ? 
bytes_read:m->get_result()); + delete onfinish; + } + } else { + // store my bufferlist for later assembling + rd->read_data[m->get_oid()] = new bufferlist; + rd->read_data[m->get_oid()]->claim( m->get_data() ); + } + + delete m; +} + + + +// write ------------------------------------ + +tid_t Objecter::write(object_t oid, off_t off, size_t len, bufferlist &bl, + Context *onack, Context *oncommit, + objectrev_t rev) +{ + OSDWrite *wr = new OSDWrite(bl); + wr->extents.push_back(ObjectExtent(oid, off, len)); + wr->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); + wr->extents.front().buffer_extents[0] = len; + wr->extents.front().rev = rev; + modifyx(wr, onack, oncommit); + return last_tid; +} + + +// zero + +tid_t Objecter::zero(object_t oid, off_t off, size_t len, + Context *onack, Context *oncommit, + objectrev_t rev) +{ + OSDModify *z = new OSDModify(OSD_OP_ZERO); + z->extents.push_back(ObjectExtent(oid, off, len)); + z->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); + z->extents.front().rev = rev; + modifyx(z, onack, oncommit); + return last_tid; +} + + +// lock ops + +tid_t Objecter::lock(int op, object_t oid, + Context *onack, Context *oncommit) +{ + OSDModify *l = new OSDModify(op); + l->extents.push_back(ObjectExtent(oid, 0, 0)); + l->extents.front().pgid = osdmap->object_to_pg( oid, g_OSD_FileLayout ); + modifyx(l, onack, oncommit); + return last_tid; +} + + + +// generic modify ----------------------------------- + +tid_t Objecter::modifyx(OSDModify *wr, Context *onack, Context *oncommit) +{ + wr->onack = onack; + wr->oncommit = oncommit; + + // issue writes/whatevers + for (list::iterator it = wr->extents.begin(); + it != wr->extents.end(); + it++) + modifyx_submit(wr, *it); + + return last_tid; +} + + +tid_t Objecter::modifyx_submit(OSDModify *wr, ObjectExtent &ex, tid_t usetid) +{ + // find + PG &pg = get_pg( ex.pgid ); + + // send + tid_t tid; + if (usetid > 0) + tid = usetid; + else + tid = ++last_tid; + + 
MOSDOp *m = new MOSDOp(tid, messenger->get_myaddr(), + ex.oid, ex.pgid, osdmap->get_epoch(), + wr->op); + m->set_length(ex.length); + m->set_offset(ex.start); + m->set_rev(ex.rev); + + if (wr->tid_version.count(tid)) + m->set_version(wr->tid_version[tid]); // we're replaying this op! + + // what type of op? + switch (wr->op) { + case OSD_OP_WRITE: + { + // map buffer segments into this extent + // (may be fragmented bc of striping) + bufferlist cur; + for (map::iterator bit = ex.buffer_extents.begin(); + bit != ex.buffer_extents.end(); + bit++) { + bufferlist thisbit; + thisbit.substr_of(((OSDWrite*)wr)->bl, bit->first, bit->second); + cur.claim_append(thisbit); + } + assert(cur.length() == ex.length); + m->set_data(cur);//.claim(cur); + } + break; + } + + // add to gather set + wr->waitfor_ack[tid] = ex; + wr->waitfor_commit[tid] = ex; + op_modify[tid] = wr; + pg.active_tids.insert(tid); + + ++num_unacked; + ++num_uncommitted; + + // send + dout(10) << "modifyx_submit " << MOSDOp::get_opname(wr->op) << " tid " << tid + << " oid " << ex.oid + << " " << ex.start << "~" << ex.length + << " pg " << ex.pgid + << " osd" << pg.primary() + << endl; + if (pg.primary() >= 0) + messenger->send_message(m, MSG_ADDR_OSD(pg.primary()), osdmap->get_inst(pg.primary())); + + dout(5) << num_unacked << " unacked, " << num_uncommitted << " uncommitted" << endl; + + return tid; +} + + + +void Objecter::handle_osd_modify_reply(MOSDOpReply *m) +{ + // get pio + tid_t tid = m->get_tid(); + + if (op_modify.count(tid) == 0) { + dout(7) << "handle_osd_modify_reply " << tid + << (m->get_commit() ? " commit":" ack") + << " ... stray" << endl; + delete m; + return; + } + + dout(7) << "handle_osd_modify_reply " << tid + << (m->get_commit() ? " commit":" ack") + << " v " << m->get_version() + << endl; + OSDModify *wr = op_modify[ tid ]; + + Context *onack = 0; + Context *oncommit = 0; + + PG &pg = get_pg( m->get_pg() ); + + // ignore? 
+ if (pg.acker() != m->get_source().num()) { + dout(7) << " ignoring ack|commit from non-acker" << endl; + delete m; + return; + } + + assert(m->get_result() >= 0); + + // ack or commit? + if (m->get_commit()) { + //dout(15) << " handle_osd_write_reply commit on " << tid << endl; + assert(wr->tid_version.count(tid) == 0 || + m->get_version() == wr->tid_version[tid]); + + // remove from tid/osd maps + assert(pg.active_tids.count(tid)); + pg.active_tids.erase(tid); + if (pg.active_tids.empty()) close_pg( m->get_pg() ); + + // commit. + op_modify.erase( tid ); + wr->waitfor_ack.erase(tid); + wr->waitfor_commit.erase(tid); + + num_uncommitted--; + + if (wr->waitfor_commit.empty()) { + onack = wr->onack; + oncommit = wr->oncommit; + delete wr; + } + } else { + // ack. + //dout(15) << " handle_osd_write_reply ack on " << tid << endl; + assert(wr->waitfor_ack.count(tid)); + wr->waitfor_ack.erase(tid); + + num_unacked--; + + if (wr->tid_version.count(tid) && + wr->tid_version[tid].version != m->get_version().version) { + dout(-10) << "handle_osd_modify_reply WARNING: replay of tid " << tid + << " did not achieve previous ordering" << endl; + } + wr->tid_version[tid] = m->get_version(); + + if (wr->waitfor_ack.empty()) { + onack = wr->onack; + wr->onack = 0; // only do callback once + + // buffer uncommitted? + if (!g_conf.objecter_buffer_uncommitted && + wr->op == OSD_OP_WRITE) { + // discard buffer! 
+ ((OSDWrite*)wr)->bl.clear(); + } + } + } + + // do callbacks + if (onack) { + onack->finish(0); + delete onack; + } + if (oncommit) { + oncommit->finish(0); + delete oncommit; + } + + delete m; +} + + + +void Objecter::ms_handle_failure(Message *m, msg_addr_t dest, const entity_inst_t& inst) +{ + if (dest.is_mon()) { + // try a new mon + int mon = monmap->pick_mon(true); + dout(0) << "ms_handle_failure " << dest << " inst " << inst + << ", resending to mon" << mon + << endl; + messenger->send_message(m, MSG_ADDR_MON(mon), monmap->get_inst(mon)); + } + else if (dest.is_osd()) { + int mon = monmap->pick_mon(); + dout(0) << "ms_handle_failure " << dest << " inst " << inst + << ", dropping and reporting to mon" << mon + << endl; + messenger->send_message(new MOSDFailure(dest, inst, osdmap->get_epoch()), + MSG_ADDR_MON(mon), monmap->get_inst(mon)); + delete m; + } else { + dout(0) << "ms_handle_failure " << dest << " inst " << inst + << ", dropping" << endl; + delete m; + } +} diff --git a/branches/sage/cephmds2/osdc/Objecter.h b/branches/sage/cephmds2/osdc/Objecter.h new file mode 100644 index 0000000000000..72e637789f988 --- /dev/null +++ b/branches/sage/cephmds2/osdc/Objecter.h @@ -0,0 +1,191 @@ +#ifndef __OBJECTER_H +#define __OBJECTER_H + +#include "include/types.h" +#include "include/buffer.h" + +#include "osd/OSDMap.h" +#include "messages/MOSDOp.h" + +#include +#include +#include +using namespace std; +using namespace __gnu_cxx; + +class Context; +class Messenger; +class OSDMap; +class MonMap; +class Message; + +class Objecter { + public: + Messenger *messenger; + MonMap *monmap; + OSDMap *osdmap; + + private: + tid_t last_tid; + int num_unacked; + int num_uncommitted; + + /*** track pending operations ***/ + // read + public: + class OSDOp { + public: + list extents; + virtual ~OSDOp() {} + }; + + class OSDRead : public OSDOp { + public: + bufferlist *bl; + Context *onfinish; + map ops; + map read_data; // bits of data as they come back + + OSDRead(bufferlist 
*b) : bl(b), onfinish(0) { + bl->clear(); + } + }; + + class OSDStat : public OSDOp { + public: + tid_t tid; + off_t *size; // where the size goes. + Context *onfinish; + OSDStat(off_t *s) : tid(0), size(s), onfinish(0) { } + }; + + // generic modify + class OSDModify : public OSDOp { + public: + int op; + list extents; + Context *onack; + Context *oncommit; + map waitfor_ack; + map tid_version; + map waitfor_commit; + + OSDModify(int o) : op(o), onack(0), oncommit(0) {} + }; + + // write (includes the bufferlist) + class OSDWrite : public OSDModify { + public: + bufferlist bl; + OSDWrite(bufferlist &b) : OSDModify(OSD_OP_WRITE), bl(b) {} + }; + + + + private: + // pending ops + hash_map op_stat; + hash_map op_read; + hash_map op_modify; + + /** + * track pending ops by pg + * ...so we can cope with failures, map changes + */ + class PG { + public: + vector acting; + set active_tids; // active ops + + PG() {} + + // primary - where i write + int primary() { + if (acting.empty()) return -1; + return acting[0]; + } + // acker - where i read, and receive acks from + int acker() { + if (acting.empty()) return -1; + if (g_conf.osd_rep == OSD_REP_PRIMARY) + return acting[0]; + else + return acting[acting.size() > 1 ? 
1:0]; + } + }; + + hash_map pg_map; + + + PG &get_pg(pg_t pgid) { + if (!pg_map.count(pgid)) + osdmap->pg_to_acting_osds(pgid, pg_map[pgid].acting); + return pg_map[pgid]; + } + void close_pg(pg_t pgid) { + assert(pg_map.count(pgid)); + assert(pg_map[pgid].active_tids.empty()); + pg_map.erase(pgid); + } + void scan_pgs(set& chnaged_pgs); + void kick_requests(set& changed_pgs); + + + public: + Objecter(Messenger *m, MonMap *mm, OSDMap *om) : + messenger(m), monmap(mm), osdmap(om), + last_tid(0), + num_unacked(0), num_uncommitted(0) + {} + ~Objecter() { + // clean up op_* + // *** + } + + // messages + public: + void dispatch(Message *m); + void handle_osd_op_reply(class MOSDOpReply *m); + void handle_osd_stat_reply(class MOSDOpReply *m); + void handle_osd_read_reply(class MOSDOpReply *m); + void handle_osd_modify_reply(class MOSDOpReply *m); + void handle_osd_lock_reply(class MOSDOpReply *m); + void handle_osd_map(class MOSDMap *m); + + private: + tid_t readx_submit(OSDRead *rd, ObjectExtent& ex); + tid_t modifyx_submit(OSDModify *wr, ObjectExtent& ex, tid_t tid=0); + tid_t stat_submit(OSDStat *st); + + // public interface + public: + bool is_active() { + return !(op_read.empty() && op_modify.empty()); + } + + // med level + tid_t readx(OSDRead *read, Context *onfinish); + tid_t modifyx(OSDModify *wr, Context *onack, Context *oncommit); + //tid_t lockx(OSDLock *l, Context *onack, Context *oncommit); + + // even lazier + tid_t read(object_t oid, off_t off, size_t len, bufferlist *bl, + Context *onfinish, + objectrev_t rev=0); + tid_t write(object_t oid, off_t off, size_t len, bufferlist &bl, + Context *onack, Context *oncommit, + objectrev_t rev=0); + tid_t zero(object_t oid, off_t off, size_t len, + Context *onack, Context *oncommit, + objectrev_t rev=0); + tid_t stat(object_t oid, off_t *size, Context *onfinish, + objectrev_t rev=0); + + tid_t lock(int op, object_t oid, Context *onack, Context *oncommit); + + + void ms_handle_failure(Message *m, msg_addr_t dest, 
const entity_inst_t& inst); + +}; + +#endif diff --git a/branches/sage/cephmds2/script/add_header.pl b/branches/sage/cephmds2/script/add_header.pl new file mode 100755 index 0000000000000..f5891cc668c45 --- /dev/null +++ b/branches/sage/cephmds2/script/add_header.pl @@ -0,0 +1,29 @@ +#!/usr/bin/perl + +use strict; +my $fn = shift @ARGV; +my $f = `cat $fn`; + +my $header = '// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +'; + +unless ($f =~ /Ceph - scalable distributed file system/) { + open(O, ">$fn.new"); + print O $header; + print O $f; + close O; + rename "$fn.new", $fn; +} + diff --git a/branches/sage/cephmds2/script/adjusttabs.pl b/branches/sage/cephmds2/script/adjusttabs.pl new file mode 100755 index 0000000000000..66edff2ac6c02 --- /dev/null +++ b/branches/sage/cephmds2/script/adjusttabs.pl @@ -0,0 +1,24 @@ +#!/usr/bin/perl + +my $tablen = shift @ARGV; +my $fn = shift @ARGV; + +my $tab = ' ' x $tablen; +open(I, $fn); +my $f; +my $oldtab = ' ' x 4; +while () { + if (my ($oldlen) = /\-\*\- .*tab-width:(\d)/) { + print "old length was $oldlen\n"; + $oldtab = ' ' x $oldlen; + s/tab-width:\d/tab-width:$tablen/; + } + s/\t/$oldtab/g; + $f .= $_; +} +close I; +open(O, ">$fn.new"); +print O $f; +close O; + +rename "$fn.new", $fn; diff --git a/branches/sage/cephmds2/script/clean_osd_cow.sh b/branches/sage/cephmds2/script/clean_osd_cow.sh new file mode 100755 index 0000000000000..1e443c95e7ebc --- /dev/null +++ b/branches/sage/cephmds2/script/clean_osd_cow.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +rm osddata/*/*\.* diff --git a/branches/sage/cephmds2/script/clean_trace.pl b/branches/sage/cephmds2/script/clean_trace.pl new file mode 100755 
index 0000000000000..cb02ff7abe7c2 --- /dev/null +++ b/branches/sage/cephmds2/script/clean_trace.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl + +my $n = 0; +while (<>) { + next unless /trace: /; + my $l = $'; $'; + print $l; +} diff --git a/branches/sage/cephmds2/script/comb.pl b/branches/sage/cephmds2/script/comb.pl new file mode 100755 index 0000000000000..88a4bb72a7970 --- /dev/null +++ b/branches/sage/cephmds2/script/comb.pl @@ -0,0 +1,113 @@ +#!/usr/bin/perl + +use strict; + +my $xaxis = shift @ARGV; +my @vars; +while (@ARGV) { + $_ = shift @ARGV; + last if ($_ eq '-'); + push(@vars, $_); +} +my @dirs; +while (@ARGV) { + $_ = shift @ARGV; + last if ($_ eq '-'); + push(@dirs, $_) if -d $_; +} +my @filt = @ARGV; +push( @filt, '.' ) unless @filt; + +print "#xaxis $xaxis +#vars @vars +#dirs @dirs +#filt @filt +"; + +sub load_sum { + my $fn = shift @_; + + open(I, "$fn"); + my $k = ; + chomp($k); + my @k = split(/\s+/,$k); + shift @k; + + my $s; + while () { + chomp; + s/^\#//; + next unless $_; + my @l = split(/\s+/,$_); + my $k = shift @l; + for my $f (@k) { + $s->{$k}->{$f} = shift @l; + } + + # clnode latency? 
+ if ($fn =~ /cl/) { + $s->{$k}->{'wrlat'} = $s->{$k}->{'wrlsum'} / $s->{$k}->{'wrlnum'} if $s->{$k}->{'wrlnum'} > 0; + $s->{$k}->{'rlat'} = $s->{$k}->{'rlsum'} / $s->{$k}->{'rlnum'} if $s->{$k}->{'rlnum'} > 0; + $s->{$k}->{'lat'} = $s->{$k}->{'lsum'} / $s->{$k}->{'lnum'} if $s->{$k}->{'lnum'} > 0; + $s->{$k}->{'latw'} = $s->{$k}->{'lwsum'} / $s->{$k}->{'lwnum'} if $s->{$k}->{'lwnum'} > 0; + $s->{$k}->{'latr'} = $s->{$k}->{'lrsum'} / $s->{$k}->{'lrnum'} if $s->{$k}->{'lrnum'} > 0; + $s->{$k}->{'statlat'} = $s->{$k}->{'lstatsum'} / $s->{$k}->{'lstatnum'} if $s->{$k}->{'lstatnum'} > 0; + $s->{$k}->{'dirlat'} = $s->{$k}->{'ldirsum'} / $s->{$k}->{'ldirnum'} if $s->{$k}->{'ldirnum'} > 0; + } + } + return $s; +} + + +my %res; +my @key; +my %didkey; +for my $f (@filt) { + my @reg = split(/,/, $f); + #print "reg @reg\n"; + for my $d (@dirs) { + if ($f ne '.') { + my $r = (split(/\//,$d))[-1]; + my @db = split(/,/, $r); + #print "db @db\n"; + my $ok = 1; + for my $r (@reg) { + + $ok = 0 unless grep {$_ eq $r} @db; + } + next unless $ok; + } + #next if ($f ne '.' && $d !~ /$reg/); + #print "$d\n"; + my ($x) = $d =~ /$xaxis=(\d+)/; + + for my $v (@vars) { + my ($what, $field) = $v =~ /^(.+)\.([^\.]+)$/; + #print "$what $field .. $v .. $f.$field\n"; + my $s = &load_sum("$d/sum.$what"); + + #print "\t$v"; + if ($field =~ /^sum=/) { + #warn "SUM field $field\n"; + push( @{$res{$x}}, $s->{'sum'}->{$'} ); #'}); + } else { + #warn "avg field $field\n"; + push( @{$res{$x}}, $s->{'avgval'}->{$field} ); + } + + push( @key, "$f.$field" ) unless $didkey{"$f.$field"}; + $didkey{"$f.$field"} = 1; + + if (0 && exists $s->{'avgvaldevt'}) { + push( @{$res{$x}}, $s->{'avgvaldevt'}->{$field} ); + push( @key, "$f.$field.dev" ) unless $didkey{"$f.$field.dev"}; + $didkey{"$f.$field.dev"} = 1; + } + } + } +} + +print join("\t", "#", @key) . "\n"; +for my $x (sort {$a <=> $b} keys %res) { + print join("\t", $x, @{$res{$x}}) . 
"\n"; +} diff --git a/branches/sage/cephmds2/script/find_auth_pins.pl b/branches/sage/cephmds2/script/find_auth_pins.pl new file mode 100755 index 0000000000000..c02c12922ed7b --- /dev/null +++ b/branches/sage/cephmds2/script/find_auth_pins.pl @@ -0,0 +1,46 @@ +#!/usr/bin/perl + +my %pin; +my %hist; +my $l = 1; +my @pins; +while (<>) { + + #cdir:adjust_nested_auth_pins on [dir 163 /foo/ rep@13 | child] count now 0 + 1 + + if (/adjust_nested_auth_pins/) { + my ($what) = /\[(\w+ \d+) /; + $hist{$what} .= "$l: $_" + if defined $pin{$what}; + } + + # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 0x89b7700] count now 1 + 0 + + if (/auth_pin /) { + my ($what) = /\[(\w+ \d+) /; +# print "add_waiter $c $what\n"; + $pin{$what}++; + $hist{$what} .= "$l: $_"; + push( @pins, $what ) unless grep {$_ eq $what} @pins; + } + + # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 + + if (/auth_unpin/) { + my ($what) = /\[(\w+ \d+) /;# / on (.*\])/; + $pin{$what}--; + $hist{$what} .= "$l: $_"; + unless ($pin{$what}) { + delete $hist{$what}; + delete $pin{$what}; + @pins = grep {$_ ne $what} @pins; + } + } + $l++; +} + +for my $what (@pins) { + print "---- count $pin{$what} on $what +$hist{$what} +"; +} diff --git a/branches/sage/cephmds2/script/find_bufferleaks.pl b/branches/sage/cephmds2/script/find_bufferleaks.pl new file mode 100755 index 0000000000000..152515d5e788e --- /dev/null +++ b/branches/sage/cephmds2/script/find_bufferleaks.pl @@ -0,0 +1,69 @@ +#!/usr/bin/perl + +use strict; +my %buffers; +my %bufferlists; +my %ref; +my %mal; +my $l = 1; +while (<>) { + #print "$l: $_"; + + # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0
+
+  if (/^buffer\.cons /) {                # buffer constructed: start tracking its address
+    my ($x) = /(0x\S+)/;
+    $buffers{$x} = 1;
+  }
+  if (/^buffer\.des /) {                 # buffer destroyed: must be known and unreferenced
+    my ($x) = /(0x\S+)/;
+    die "des without cons at $l: $_" unless $buffers{$x};
+    delete $buffers{$x};
+    die "des with ref>0 at $l: $_" unless $ref{$x} == 0;
+    delete $ref{$x};
+  }
+
+  if (/^bufferlist\.cons /) {            # bufferlist constructed
+    my ($x) = /(0x\S+)/;
+    $bufferlists{$x} = 1;
+  }
+  if (/^bufferlist\.des /) {             # bufferlist destroyed (only warns if never seen)
+    my ($x) = /(0x\S+)/;
+    warn "des without cons at $l: $_" unless $bufferlists{$x};
+    delete $bufferlists{$x};
+  }
+
+
+  if (/^buffer\.malloc /) {              # data pointer allocated
+    my ($x) = /(0x\S+)/;
+    $mal{$x} = 1;
+  }
+  if (/^buffer\.free /) {                # data pointer freed: must have been allocated
+    my ($x) = /(0x\S+)/;
+    die "free without malloc at $l: $_" unless $mal{$x};
+    delete $mal{$x};
+  }
+
+  if (/^buffer\.get /) {                 # reference taken
+    my ($x) = /(0x\S+)/;
+    $ref{$x}++;
+  }
+  if (/^buffer\.put /) {                 # reference dropped; assumes releases are logged as "buffer.put" - confirm against buffer.h
+    my ($x) = /(0x\S+)/;
+    $ref{$x}--;
+  }
+
+$l++;
+}
+
+for my $x (keys %bufferlists) {          # anything still tracked at EOF was never destroyed/freed
+  print "leaked bufferlist $x\n";
+}
+
+for my $x (keys %buffers) {
+  print "leaked buffer $x ref $ref{$x}\n";
+}
+
+for my $x (keys %mal) {
+  print "leaked buffer dataptr $x ref $ref{$x}\n";
+}
diff --git a/branches/sage/cephmds2/script/find_lost_bdev_ops.pl b/branches/sage/cephmds2/script/find_lost_bdev_ops.pl
new file mode 100755
index 0000000000000..ac1793b42dfac
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_lost_bdev_ops.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+
+use strict;
+my %op;
+
+my $line = 0;
+while (<>) {
+  #print $line . 
$_ if /0x8d4f6a0/;
+  chomp;
+  $line++;
+
+  #bdev(./ebofsdev/0)._submit_io bio(wr 269~1 usemap 0x4de33cc0)
+  if (my ($bio) = /_submit_io bio\(.*(0x\w+)\)/) {
+    $op{$bio} = $line;                   # remember the input line on which this bio was submitted
+  }
+
+  # cancel
+  #bdev(./ebofsdev/3)._cancel_io bio(wr 1525~1 bh_write 0x8a437b8)
+  if ((my ($bio) = /_cancel_io bio\(.*(0x\w+)\)/) &&   # parenthesized: '&&' binds tighter than '=', which would put the boolean in $bio
+      !(/FAILED/)) {
+    delete $op{$bio};                    # a cancel that did not FAIL retires the bio
+  }
+
+  # finish
+  #bdev(./ebofsdev/3).complete_thread finishing bio(wr 1131~1 write_cnode 0x832c1f8)
+  if (my ($bio) = /complete_thread finishing bio\(.*(0x\w+)\)/) {
+    delete $op{$bio};
+  }
+
+}
+
+for my $bio (keys %op) {                 # submitted but never cancelled or finished
+  print "---- lost bio $bio\n";
+}
diff --git a/branches/sage/cephmds2/script/find_lost_commit.pl b/branches/sage/cephmds2/script/find_lost_commit.pl
new file mode 100755
index 0000000000000..73934248ad5c0
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_lost_commit.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl
+
+use strict;
+my %op;
+
+my $line = 0;
+while (<>) {
+  #print "$line: $_";
+  $line++;
+
+  #osd3 do_op MOSDOp(client0.933 oid 100000000000008 0x84b4480) in pg[pginfo(4020000000d v 5662/0 e 2/1) r=0 active (0,5662]]
+  if (my ($from, $opno, $oid, $op) = /do_op MOSDOp\((\S+) op (\d+) oid (\d+) (\w+)\)/) {
+#    print "$op\n";
+    if ($opno == 2 || $opno == 11 || $opno == 12 || $opno == 14 || $opno == 15) {   # only these op numbers are tracked
+      $op{$op} = $from;
+    }
+  }
+
+  # commits
+  #osd1 op_modify_commit on op MOSDOp(client1.289 oid 100000100000002 0x51a2f788)
+  if (my ($op) = /op_modify_commit.* (\w+)\)/) {
+    delete $op{$op};
+  }
+  #osd4 rep_modify_commit on op MOSDOp(osd3.289 oid 100000000000008 0x84b0980)
+  if (my ($op) = /rep_modify_commit.* (\w+)\)/) {
+    delete $op{$op};
+  }
+
+  # forwarded? 
+  if (my ($op) = /sending (\w+) to osd/) {
+    delete $op{$op};
+  }
+
+}
+
+for my $op (keys %op) {
+  print "---- lost op $op $op{$op}\n";
+}
diff --git a/branches/sage/cephmds2/script/find_lost_objecter.pl b/branches/sage/cephmds2/script/find_lost_objecter.pl
new file mode 100755
index 0000000000000..a0c2089140e23
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_lost_objecter.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl
+
+use strict;
+my %ack;
+my %commit;
+
+my $line = 0;
+while (<>) {
+  #print "$line: $_";
+  $line++;
+  # NOTE(review): Objecter.cc in this patch logs "modifyx_submit" / "handle_osd_modify_reply" - confirm the patterns below still match
+  #client0.objecter writex_submit tid 21 osd0 oid 100000000000001 851424~100000
+  if (my ($who, $tid) = /(\S+)\.objecter writex_submit tid\D+(\d+)\D+osd/) {
+#    print "$who.$tid\n";
+    $ack{"$who.$tid"} = $line;           # every submitted write awaits both an ack...
+    $commit{"$who.$tid"} = $line;        # ...and a commit
+  }
+
+  #client1.objecter handle_osd_write_reply 304 commit 0
+  #client1.objecter handle_osd_write_reply 777 commit 1
+  if (my ($who, $tid, $commit) = /(\S+)\.objecter handle_osd_write_reply\D+(\d+)\D+commit\D+(\d)/) {
+#    print "$who.$tid\n";
+    delete $ack{"$who.$tid"};            # any reply counts as the ack
+    delete $commit{"$who.$tid"} if $commit;
+  }
+
+}
+
+for my $op (keys %commit) {
+  print "---- lost commit $op $commit{$op}\n";
+}
+for my $op (keys %ack) {
+  print "---- lost ack $op $ack{$op}\n";
+}
diff --git a/branches/sage/cephmds2/script/find_pathpins.pl b/branches/sage/cephmds2/script/find_pathpins.pl
new file mode 100755
index 0000000000000..e4a7d81dfb7b7
--- /dev/null
+++ b/branches/sage/cephmds2/script/find_pathpins.pl
@@ -0,0 +1,41 @@
+#!/usr/bin/perl
+
+my %pin;
+my %hist;
+my $l = 1;
+my @pins;
+while (<>) {
+
+  # cinode:auth_pin on inode [1000000002625 /gnu/blah_client_created. 
0x89b7700] count now 1 + 0 + + if (/path_pinned /) { + my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; + $what = "$dname $dir"; + #print "$l pin $what\n"; + $pin{$what}++; + $hist{$what} .= "$l: $_"; + push( @pins, $what ) unless grep {$_ eq $what} @pins; + } + + # cinode:auth_unpin on inode [1000000002625 (dangling) 0x89b7700] count now 0 + 0 + + if (/path_unpinned/) { + my ($dname, $dir) = /\[dentry (\S+) .* in \[dir (\d+) /; + $what = "$dname $dir"; + #print "$l unpin $what\n"; + $pin{$what}--; + $hist{$what} .= "$l: $_"; + unless ($pin{$what}) { + delete $hist{$what}; + delete $pin{$what}; + @pins = grep {$_ ne $what} @pins; + } + } + $l++; +} + +for my $what (@pins) { + print "---- count $pin{$what} on $what +$hist{$what} +"; +} diff --git a/branches/sage/cephmds2/script/find_requests.pl b/branches/sage/cephmds2/script/find_requests.pl new file mode 100755 index 0000000000000..5144896249413 --- /dev/null +++ b/branches/sage/cephmds2/script/find_requests.pl @@ -0,0 +1,42 @@ +#!/usr/bin/perl + +my %waiting; # context => what where what is "inode ..." or "dir ..." +my %hist; # context => history since waited +my @waiting; + +my $line = 0; +while (<>) { + + #print $line . 
$_ if /0x8d4f6a0/; + $line++; + if (/request_start/) { + my ($c) = /(0x\w+)/; + my ($what) = $'; #'; + chomp $what; + #print "$line add_waiter $c $what\n" if /0x8d4f6a0/; + $waiting{$c} = $what + if $what && !$waiting{$c}; + $hist{$c} .= "$line: $_"; + unless (grep {$_ eq $c} @waiting) { + push( @waiting, $c ); + } + } + #if (/finish_waiting/) { + # my ($c) = /(0x\w+)/; + # $hist{$c} .= "$line: $_"; + #} + if (/request_finish/ || + /request_forward/) { + my ($c) = /(0x\w+)/; + #print "took\n" if /0x8d4f6a0/; + delete $waiting{$c}; + delete $hist{$c}; + @waiting = grep {$_ ne $c} @waiting; + } +} + +for my $c (@waiting) { + print "---- lost request $c $waiting{$c} +$hist{$c} +"; +} diff --git a/branches/sage/cephmds2/script/find_waiters.pl b/branches/sage/cephmds2/script/find_waiters.pl new file mode 100755 index 0000000000000..c89d2b1a49db7 --- /dev/null +++ b/branches/sage/cephmds2/script/find_waiters.pl @@ -0,0 +1,46 @@ +#!/usr/bin/perl + +my %waiting; # context => what where what is "inode ..." or "dir ..." +my %hist; # context => history since waited +my @waiting; + +my $line = 0; +while (<>) { + #print $line . 
$_ if /0x8d4f6a0/;
+  $line++;
+  if (/add_waiter/) {
+    my ($c) = /(0x\w+)/;
+    my ($what) = / on (.*\])/;
+    #print "$line add_waiter $c $what\n" if /0x8d4f6a0/;
+    $waiting{$c} = $what
+      if $what && !$waiting{$c};
+    $hist{$c} .= "$line: $_";
+    unless (grep {$_ eq $c} @waiting) {
+      push( @waiting, $c );
+    }
+  }
+  #if (/finish_waiting/) {
+  #  my ($c) = /(0x\w+)/;
+  #  $hist{$c} .= "$line: $_";
+  #}
+  if (/take_waiting/) {
+    my ($c) = /(0x\w+)/;
+    if (/SKIPPING/) {
+      #print "skipping\n" if /0x8d4f6a0/;
+      $hist{$c} .= "$line: $_";
+    } elsif (/took/) {
+      #print "took\n" if /0x8d4f6a0/;
+      delete $waiting{$c};
+      delete $hist{$c};
+      @waiting = grep {$_ ne $c} @waiting;
+    } else {
+      die "i don't understand: $_";
+    }
+  }
+}
+
+for my $c (@waiting) {
+  print "---- lost waiter $c $waiting{$c}
+$hist{$c}
+";
+}
diff --git a/branches/sage/cephmds2/script/grepblock b/branches/sage/cephmds2/script/grepblock
new file mode 100755
index 0000000000000..f5acf95732abb
--- /dev/null
+++ b/branches/sage/cephmds2/script/grepblock
@@ -0,0 +1,15 @@
+#!/usr/bin/perl
+
+use strict;
+
+my $block = shift @ARGV;                 # block number to look for (first argument)
+die unless int $block;
+
+while (<>) {
+  my $yes = 0;
+  for my $x (/(\d+\~\d+)/g) {            # /g: test every start~length extent on the line, not just the first
+    my ($s,$l) = split(/\~/,$x);
+    $yes = 1 if ($block >= $s && $block < $s+$l);   # block lies inside [start, start+length)
+  }
+  print if $yes;
+}
diff --git a/branches/sage/cephmds2/script/merge_trace_rw.pl b/branches/sage/cephmds2/script/merge_trace_rw.pl
new file mode 100644
index 0000000000000..378d629ef43f6
--- /dev/null
+++ b/branches/sage/cephmds2/script/merge_trace_rw.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl
+
+use strict;
+
+my @file = <>;
+sub get_op {
+  my @op = shift @file;
+  while (@file &&
+         $file[0] !~ /^[a-z]+$/) {
+    push( @op, shift @file );
+  }
+  #print "op = ( @op )\n";
+  return @op;
+}
+
+my $n = 0;
+while (@file) {
+  my ($op, @args) = &get_op;
+  while ($op eq "read\n" ||
+         $op eq "write\n") {
+    die unless scalar(@args) == 3;
+    my ($nop, @nargs) = &get_op;
+    if ($nop eq $op
+        && ($args[0] == $nargs[0] )
+        && ($args[2] + $args[1] == 
$nargs[2]) + ) { + die unless scalar(@nargs) == 3; + $args[1] += $nargs[1]; + $args[1] .= "\n"; + die unless scalar(@args) == 3; + #print STDOUT "combining $n $op @args\n"; + $n++; + } else { +# print STDERR "not combinging\n"; + unshift( @file, $nop, @nargs ); + die unless scalar(@args) == 3; + last; + } + } + print $op; + print join('', @args); +} diff --git a/branches/sage/cephmds2/script/profonly.pl b/branches/sage/cephmds2/script/profonly.pl new file mode 100755 index 0000000000000..6a05dec473ca0 --- /dev/null +++ b/branches/sage/cephmds2/script/profonly.pl @@ -0,0 +1,12 @@ +#!/usr/bin/perl + +my $rank = shift @ARGV; +my $args = join(' ',@ARGV); +if ($rank == $ENV{MPD_JRANK}) { + $c = "LD_PRELOAD=$ENV{'HOME'}/csl/obsd/src/pmds/gprof-helper.so ./newsyn $args"; +} else { + $c = "./newsyn.nopg $args"; +} + +#print "$rank: $c\n"; +system $c; diff --git a/branches/sage/cephmds2/script/runset.pl b/branches/sage/cephmds2/script/runset.pl new file mode 100755 index 0000000000000..a1425862ceb42 --- /dev/null +++ b/branches/sage/cephmds2/script/runset.pl @@ -0,0 +1,380 @@ +#!/usr/bin/perl + +use strict; +use Data::Dumper; + +=item sample input file + +# hi there +{ + # startup + 'n' => 30, # mpi nodes + 'sleep' => 10, # seconds between runs + 'nummds' => 1, + 'numosd' => 8, + 'numclient' => 400,#[10, 50, 100, 200, 400], + + # parameters + 'fs' => [ 'ebofs', 'fakestore' ], + 'until' => 150, # --syn until $n ... when to stop clients + 'writefile' => 1, + 'writefile_size' => [ 4096, 65526, 256000, 1024000, 2560000 ], + 'writefile_mb' => 1000, + + 'custom' => '--tcp_skip_rank0 --osd_maxthreads 0'; + + # for final summation (script/sum.pl) + 'start' => 30, + 'end' => 120, + + '_psub' => 'alc.tp' # switch to psub mode! 
+}; + +=cut + +my $usage = "script/runset.pl [--clean] jobs/some/job blah\n"; + +my $clean; +my $use_srun; +my $nobg = '&'; +my $in = shift || die $usage; +if ($in eq '--clean') { + $clean = 1; + $in = shift || die $usage; +} +if ($in eq '--srun') { + $use_srun = 1; + $in = shift || die $usage; +} +if ($in eq '--nobg') { + $nobg = ''; + $in = shift || die $usage; +} +my $tag = shift || die $usage; +my $fake = shift; + + +my ($job) = $in =~ /^jobs\/(.*)/; +my ($jname) = $job =~ /\/(\w+)$/; +$jname ||= $job; +die "not jobs/?" unless defined $job; +my $out = "log/$job.$tag"; +my $relout = "$job.$tag"; + + +my $cwd = `/bin/pwd`; +chomp($cwd); + + + +print "# --- job $job, tag $tag ---\n"; + + +# get input +my $raw = `cat $in`; +my $sim = eval $raw; +unless (ref $sim) { + print "bad input: $in\n"; + system "perl -c $in"; + exit 1; +} + +# prep output +system "mkdir -p $out" unless -d "$out"; + +open(W, ">$out/in"); +print W $raw; +close W; + +my $comb = $sim->{'comb'}; +delete $sim->{'comb'}; +my %filters; +my @fulldirs; + + + +sub reset { + print "reset: restarting mpd in 3 seconds\n"; + system "sleep 3 && (mpiexec -l -n 32 killall newsyn ; restartmpd.sh)"; + print "reset: done\n"; +} + + +if (`hostname` =~ /alc/ && !$use_srun) { + print "# this looks like alc\n"; + $sim->{'_psub'} = 'jobs/alc.tp'; +} + + +sub iterate { + my $sim = shift @_; + my $fix = shift @_ || {}; + my $vary; + my @r; + + my $this; + for my $k (sort keys %$sim) { + next if $k =~ /^_/; + if (defined $fix->{$k}) { + $this->{$k} = $fix->{$k}; + } + elsif (ref $sim->{$k} eq 'HASH') { + # nothing + } + elsif (!(ref $sim->{$k})) { + $this->{$k} = $sim->{$k}; + } + else { + #print ref $sim->{$k}; + if (!(defined $vary)) { + $vary = $k; + } + } + } + + if ($vary) { + #print "vary $vary\n"; + for my $v (@{$sim->{$vary}}) { + $this->{$vary} = $v; + push(@r, &iterate($sim, $this)); + } + } else { + + if ($sim->{'_dep'}) { + my @s = @{$sim->{'_dep'}}; + while (@s) { + my $dv = shift @s; + my $eq = shift @s; + 
+ $eq =~ s/\$(\w+)/"\$this->{'$1'}"/eg; + $this->{$dv} = eval $eq; + #print "$dv : $eq -> $this->{$dv}\n"; + } + } + + push(@r, $this); + } + return @r; +} + + + +sub run { + my $h = shift @_; + + my @fn; + my @filt; + my @vals; + for my $k (sort keys %$sim) { + next if $k =~ /^_/; + next unless ref $sim->{$k} eq 'ARRAY'; + push(@fn, "$k=$h->{$k}"); + push(@vals, $h->{$k}); + next if $comb && $k eq $comb->{'x'}; + push(@filt, "$k=$h->{$k}"); + } + my $keys = join(",", @fn); + $keys =~ s/ /_/g; + my $fn = $out . '/' . $keys; + my $name = $jname . '_' . join('_',@vals); #$tag . '_' . $keys; + + push( @fulldirs, "" . $fn ); + + + # filters + $filters{ join(',', @filt) } = 1; + + + #system "sh $fn/sh.post" if -e "$fn/sh.post";# && !(-e "$fn/.post"); + if (-e "$fn/.done") { + print "already done.\n"; + return; + } + system "rm -r $fn" if $clean && -d "$fn"; + system "mkdir $fn" unless -d "$fn"; + + my $e = './newsyn'; + #$e = './tcpsynobfs' if $h->{'fs'} eq 'obfs'; + my $c = "$e"; + $c .= " --mkfs" unless $h->{'no_mkfs'}; + $c .= " --$h->{'fs'}"; + $c .= " --syn until $h->{'until'}" if $h->{'until'}; + + $c .= " --syn writefile $h->{'writefile_mb'} $h->{'writefile_size'}" if $h->{'writefile'}; + $c .= " --syn rw $h->{'rw_mb'} $h->{'rw_size'}" if $h->{'rw'}; + $c .= " --syn readfile $h->{'readfile_mb'} $h->{'readfile_size'}" if $h->{'readfile'}; + $c .= " --syn makedirs $h->{'makedirs_dirs'} $h->{'makedirs_files'} $h->{'makedirs_depth'}" if $h->{'makedirs'}; + + if ($h->{'ebofs_freelist'}) { + system "cp freelist/ebofs.freelist.$h->{'ebofs_freelist'} ebofs.freelist"; + $c .= " --osd_age_time -1"; + } + + for my $k ('nummds', 'numclient', 'numosd', 'kill_after', + 'osd_maxthreads', 'osd_object_layout', 'osd_pg_layout','osd_pg_bits', + 'mds_bal_rep', 'mds_bal_interval', 'mds_bal_max','mds_decay_halflife', + 'mds_bal_hash_rd','mds_bal_hash_wr','mds_bal_unhash_rd','mds_bal_unhash_wr', + 'mds_cache_size','mds_log_max_len', + 'mds_local_osd', + 'osd_age_time','osd_age', + 
'osd_rep', + 'osd_pad_pg_log','ebofs_realloc', + 'osd_balance_reads', + 'tcp_multi_out', + 'client_cache_stat_ttl','client_cache_readdir_ttl', + 'client_oc', + 'fake_osdmap_updates', + 'bdev_el_bidir', 'ebofs_idle_commit_ms', 'ebofs_commit_ms', + 'ebofs_oc_size','ebofs_cc_size','ebofs_bc_size','ebofs_bc_max_dirty','ebofs_abp_max_alloc', + 'file_layout_ssize','file_layout_scount','file_layout_osize','file_layout_num_rep', + 'meta_dir_layout_ssize','meta_dir_layout_scount','meta_dir_layout_osize','meta_dir_layout_num_rep', + 'meta_log_layout_ssize','meta_log_layout_scount','meta_log_layout_osize','meta_log_layout_num_rep') { + $c .= " --$k $h->{$k}" if defined $h->{$k}; + } + + $c .= ' ' . $h->{'custom'} if $h->{'custom'}; + + $c .= " --log_name $relout/$keys"; + + my $post = "#!/bin/sh +script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/osd\\* > $fn/sum.osd +script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds? $fn/mds?? > $fn/sum.mds +script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/mds*.log > $fn/sum.mds.log +script/sum.pl -start $h->{'start'} -end $h->{'end'} $fn/clnode* > $fn/sum.cl +touch $fn/.post +"; + open(O,">$fn/sh.post"); + print O $post; + close O; + + my $killmin = 1 + int ($h->{'kill_after'} / 60); + + $c = "bash -c \"ulimit -c 0 ; $c\""; + #$c = "bash -c \"$c\""; + + my $srun = "srun --wait=600 --exclude=jobs/ltest.ignore -l -t $killmin -N $h->{'n'} -p ltest"; + my $mpiexec = "mpiexec -l -n $h->{'n'}"; + my $launch; + if ($use_srun) { + $launch = $srun; + } else { + $launch = $mpiexec; + } + + if ($sim->{'_psub'}) { + # template! 
+ my $tp = `cat $sim->{'_psub'}`; + $tp =~ s/\$CWD/$cwd/g; + $tp =~ s/\$NAME/$name/g; + $tp =~ s/\$NUM/$h->{'n'}/g; + $tp =~ s/\$OUT/$fn\/o/g; + $tp =~ s/\$DONE/$fn\/.done/g; + $tp =~ s/\$CMD/$c/g; + open(O,">$out/$name"); + print O $tp; + close O; + print "\npsub $out/$name\n"; + return; + } else { + # run + my $cmd = "\n$launch $c > $fn/o && touch $fn/.done";# + #my $cmd = "\n$launch $c > $fn/o ; touch $fn/.done"; + print "$cmd $nobg\n"; + my $r = undef; + unless ($fake) { + if ($sim->{'_pre'}) { + print "pre: $launch $sim->{'_pre'}\n"; + system "$launch $sim->{'_pre'}"; + } + $r = system $cmd; + if ($sim->{'_post'}) { + print "post: $launch $sim->{'_post'}\n"; + system "$launch $sim->{'_post'}"; + } + if ($r) { + print "r = $r\n"; + #&reset; + } + system "sh $fn/sh.post"; + } + return $r; + } +} + + + +my @r = &iterate($sim); +my $n = scalar(@r); +my $c = 1; +my %r; +my $nfailed = 0; +for my $h (@r) { + my $d = `date`; + chomp($d); + $d =~ s/ P.T .*//; + print "# === $c/$n"; + print " ($nfailed failed)" if $nfailed; + print " $d: "; + my $r = &run($h); + + if (!(defined $r)) { + # already done + } else { + if ($r) { + $nfailed++; + } + print "sleep $h->{'sleep'}\n"; + sleep $h->{'sleep'}; + } + + $c++; +} +print "$nfailed failed\n"; + + +my @comb; +if ($comb) { + my $x = $comb->{'x'}; + my @vars = @{$comb->{'vars'}}; + + print "\n\n# post\n"; + for my $p (@fulldirs) { + print "sh $p/sh.post\n"; + } + + my @filters = sort keys %filters; + my $cmd = "script/comb.pl $x @vars - @fulldirs - @filters > $out/c"; + print "$cmd\n"; + open(O,">$out/comb"); + print O "$cmd\n"; + close O; + system $cmd; + + print "\n\n"; + + my $plot; + $plot .= "set data style linespoints;\n"; + my $s = 2; + for my $v (@vars) { + my $c = $s; + $s++; + my @p; + for my $f (@filters) { + my $t = $f; + if ($comb->{'maptitle'}) { + for my $a (keys %{$comb->{'maptitle'}}) { + my $b = $comb->{'maptitle'}->{$a}; + $t =~ s/$a/$b/; + } + } + push (@p, "\"$out/c\" u 1:$c t \"$t\"" ); + $c += 
scalar(@vars); + } + $plot .= "# $v\nplot " . join(", ", @p) . ";\n\n"; + } + print $plot; + open(O,">$out/plot"); + print O $plot; + close O; +} + diff --git a/branches/sage/cephmds2/script/sum.pl b/branches/sage/cephmds2/script/sum.pl new file mode 100755 index 0000000000000..92ef9a9b222a8 --- /dev/null +++ b/branches/sage/cephmds2/script/sum.pl @@ -0,0 +1,148 @@ +#!/usr/bin/perl + +use strict; +my $starttime = 1; +my $endtime = -1; + +my $avgrows = 0; + +while ($ARGV[0] =~ /^-/) { + $_ = shift @ARGV; + if ($_ eq '-avg') { + $avgrows = 1; + } + elsif ($_ eq '-start') { + $starttime = shift @ARGV; + } + elsif ($_ eq '-end') { + $endtime = shift @ARGV; + } + else { + die "i don't understand arg $_"; + } +} +my @files = @ARGV; + +if (scalar(@files) == 1 && $files[0] =~ /\*/) { + my ($dir, $pat) = $files[0] =~ /^(.*)\/([^\/]+)$/; + @files = (); + $pat =~ s/\*//; +# print "dir $dir pat $pat\n"; + opendir(D,"$dir"); + for my $f (readdir(D)) { + # print "$f\n"; + next unless $f =~ /^$pat/; + push(@files, "$dir/$f"); + } + closedir(D); + +# print "files = @files\n"; +} + +my @data; +for my $f (@files) { + open(I,$f); + push( @data, ); + close I; +} + +my %sum; # time -> name -> val +my %col; # colnum -> name .. colnums start at 0 (time doesn't count) +my %min; +my %max; +my %avg; +my %tcount; +my $files; +for (@data) { + chomp; + my @r = split(/\s+/,$_); + my $r = shift @r; + + # column headings? 
+ if ($r =~ /^\#/) { + my $num = 0; + while (my $name = shift @r) { + $col{$num} = $name; + $num++; + } + next; + } + + next unless int $r; + next if $r < $starttime; + next if $endtime > 0 && $r > $endtime; + + $tcount{$r}++; + $files = $tcount{$r} if $tcount{$r} > $files; + #print "$r: @r\n"; + my $i = 0; + while (@r) { + my $v = shift @r; + $sum{$r}->{$col{$i}} += $v; # if $v > 0; + + $min{$col{$i}} = $v + if ($min{$col{$i}} > $v || !(defined $min{$col{$i}})); + $max{$col{$i}} = $v + if ($max{$col{$i}} < $v); + + $avg{$col{$i}} += $v; + $i++; + } +} + +## dump +my @c = sort {$a <=> $b} keys %col; +# cols +print join("\t",'#', map { $col{$_} } @c) . "\n"; +my $n = 0; +for my $k (sort {$a <=> $b} keys %sum) { + if ($avgrows) { + print join("\t",$k, #map int, + map { $sum{$k}->{$col{$_}}/$tcount{$k} } @c ) . "\n"; + } else { + print join("\t",$k, map { $sum{$k}->{$col{$_}} } @c ) . "\n"; + } + $n++; +} + +my $rows = $n || 1; +#my $files = $tcount{$starttime}; +my %avgval; + +## devt +#warn "rows $rows, files $files\n"; +my %avgvalvart; # std dev of each col avg, over time +for my $k (keys %avg) { + my $av = $avgval{$k} = $avg{$k} / ($rows*$files); + + my $var = 0.0; + for my $t (sort {$a <=> $b} keys %sum) { + my $a = $sum{$t}->{$k} / $files; + $var += ($a - $av) * ($a - $av); + } + + $avgvalvart{$k} = $var / $rows; +} + + + + +print "\n"; +print join("\t",'#', map { $col{$_} } @c) . "\n"; +print join("\t", '#minval', map { $min{$col{$_}} } @c ) . "\n"; +print join("\t", '#maxval', map { $max{$col{$_}} } @c ) . "\n"; +print join("\t", '#rows', map { $rows } @c) . "\n"; +print join("\t", '#files', map { $files } @c) . "\n"; +print join("\t", '#sum', + map { $avg{$col{$_}} } @c ) . "\n"; +print join("\t", '#avgval', #map int, + map { $avgval{$col{$_}} } @c ) . "\n"; +# map { ($rows*$files) ? ($_ / ($rows*$files)):0 } map { $avg{$col{$_}} } @c ) . "\n"; + +print join("\t", '#avgvalvart', + map { $avgvalvart{$col{$_}} } @c ) . 
"\n"; +print join("\t", '#avgvaldevt', + map { sqrt($_) } map { $avgvalvart{$col{$_}} } @c ) . "\n"; + +print join("\t", '#avgsum', #map int, + map { $_ / $rows } map { $avg{$col{$_}} } @c ) . "\n"; diff --git a/branches/sage/cephmds2/tcpfuse.cc b/branches/sage/cephmds2/tcpfuse.cc new file mode 100644 index 0000000000000..3d7be50d377d6 --- /dev/null +++ b/branches/sage/cephmds2/tcpfuse.cc @@ -0,0 +1,80 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + + +#include +#include +#include +using namespace std; + +#include "config.h" + +#include "mds/MDCluster.h" +#include "mds/MDS.h" +#include "osd/OSD.h" +#include "client/Client.h" +#include "client/fuse.h" + +#include "msg/TCPMessenger.h" + +#include "common/Timer.h" + +#include + +#include +#include +#include + +int main(int argc, char **argv, char *envp[]) { + + //cerr << "tcpfuse starting " << myrank << "/" << world << endl; + vector args; + argv_to_vec(argc, argv, args); + parse_config_options(args); + + // args for fuse + vec_to_argv(args, argc, argv); + + // start up tcpmessenger + tcpaddr_t nsa; + if (tcpmessenger_findns(nsa) < 0) exit(1); + tcpmessenger_init(); + tcpmessenger_start(); + tcpmessenger_start_rankserver(nsa); + + Client *client = new Client(new TCPMessenger(MSG_ADDR_CLIENT_NEW)); + client->init(); + + // start up fuse + // use my argc, argv (make sure you pass a mount point!) 
+ cout << "mounting" << endl; + client->mount(); + + cerr << "starting fuse on pid " << getpid() << endl; + ceph_fuse_main(client, argc, argv); + cerr << "fuse finished on pid " << getpid() << endl; + + client->unmount(); + cout << "unmounted" << endl; + client->shutdown(); + + delete client; + + // wait for it to finish + tcpmessenger_wait(); + tcpmessenger_shutdown(); // shutdown MPI + + return 0; +} + diff --git a/branches/sage/cephmds2/tcpsyn.cc b/branches/sage/cephmds2/tcpsyn.cc new file mode 100644 index 0000000000000..cc9f470640c36 --- /dev/null +++ b/branches/sage/cephmds2/tcpsyn.cc @@ -0,0 +1,292 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include +#include +#include +using namespace std; + +#include "config.h" + +#include "mds/MDCluster.h" +#include "mds/MDS.h" +#include "osd/OSD.h" +#include "mon/Monitor.h" +#include "client/Client.h" +#include "client/SyntheticClient.h" + +#include "msg/TCPMessenger.h" + +#include "common/Timer.h" + +#define NUMMDS g_conf.num_mds +#define NUMOSD g_conf.num_osd +#define NUMCLIENT g_conf.num_client + +class C_Test : public Context { +public: + void finish(int r) { + cout << "C_Test->finish(" << r << ")" << endl; + } +}; + + +#include "msg/mpistarter.cc" + +utime_t tick_start; +int tick_count = 0; + +class C_Tick : public Context { +public: + void finish(int) { + utime_t now = g_clock.now() - tick_start; + dout(0) << "tick +" << g_conf.tick << " -> " << now << " (" << tick_count << ")" << endl; + tick_count += g_conf.tick; + utime_t next = tick_start; + next.sec_ref() += tick_count; + g_timer.add_event_at(next, new C_Tick); + } +}; + +class C_Die : public Context { +public: + 
void finish(int) { + cerr << "die" << endl; + exit(1); + } +}; + +class C_Debug : public Context { + public: + void finish(int) { + int size = &g_conf.debug_after - &g_conf.debug; + memcpy((char*)&g_conf.debug, (char*)&g_debug_after_conf.debug, size); + dout(0) << "debug_after flipping debug settings" << endl; + } +}; + + +int main(int argc, char **argv) +{ + vector args; + argv_to_vec(argc, argv, args); + + parse_config_options(args); + + parse_syn_options(args); + + if (g_conf.kill_after) + g_timer.add_event_after(g_conf.kill_after, new C_Die); + if (g_conf.debug_after) + g_timer.add_event_after(g_conf.debug_after, new C_Debug); + + if (g_conf.tick) { + tick_start = g_clock.now(); + g_timer.add_event_after(g_conf.tick, new C_Tick); + } + + vector nargs; + for (unsigned i=0; i mpiwho = mpi_bootstrap_tcp(argc, argv); + int myrank = mpiwho.first; + int world = mpiwho.second; + + int need = 0; + if (g_conf.tcp_skip_rank0) need++; + need += NUMMDS; + need += NUMOSD; + if (NUMCLIENT) { + if (!g_conf.tcp_overlay_clients) + need += 1; + } + assert(need <= world); + + if (myrank == 0) + cerr << "nummds " << NUMMDS << " numosd " << NUMOSD << " numclient " << NUMCLIENT << " .. 
need " << need << ", have " << world << endl; + + MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); + + + char hostname[100]; + gethostname(hostname,100); + int pid = getpid(); + + int started = 0; + + //if (myrank == 0) g_conf.debug = 20; + + // create mon + if (myrank == 0) { + Monitor *mon = new Monitor(0, new TCPMessenger(MSG_ADDR_MON(0))); + mon->init(); + } + + // create mds + MDS *mds[NUMMDS]; + OSD *mdsosd[NUMMDS]; + for (int i=0; iinit(); + started++; + + if (g_conf.mds_local_osd) { + mdsosd[i] = new OSD(i+10000, new TCPMessenger(MSG_ADDR_OSD(i+10000))); + mdsosd[i]->init(); + } + } + + // create osd + OSD *osd[NUMOSD]; + for (int i=0; iinit(); + started++; + } + + if (g_conf.tcp_overlay_clients) sleep(5); + + // create client + int skip_osd = NUMOSD; + if (g_conf.tcp_overlay_clients) + skip_osd = 0; // put clients with osds too! + int client_nodes = world - NUMMDS - skip_osd - g_conf.tcp_skip_rank0; + int clients_per_node = 1; + if (NUMCLIENT) clients_per_node = (NUMCLIENT-1) / client_nodes + 1; + set clientlist; + Client *client[NUMCLIENT]; + SyntheticClient *syn[NUMCLIENT]; + for (int i=0; iinit(); + started++; + + syn[i] = new SyntheticClient(client[i]); + } + + if (!clientlist.empty()) dout(2) << "i have " << clientlist << endl; + + int nclients = 0; + for (set::iterator it = clientlist.begin(); + it != clientlist.end(); + it++) { + int i = *it; + + //cerr << "starting synthetic client" << i << " on rank " << myrank << endl; + client[i]->mount(); + syn[i]->start_thread(); + + nclients++; + } + if (nclients) { + cerr << nclients << " clients on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." 
<< pid << endl; + } + + for (set::iterator it = clientlist.begin(); + it != clientlist.end(); + it++) { + int i = *it; + + // cout << "waiting for synthetic client" << i << " to finish" << endl; + syn[i]->join_thread(); + delete syn[i]; + + client[i]->unmount(); + //cout << "client" << i << " unmounted" << endl; + client[i]->shutdown(); + } + + + if (myrank && !started) { + //dout(1) << "IDLE" << endl; + cerr << "idle on tcprank " << tcpmessenger_get_rank() << " " << hostname << "." << pid << endl; + tcpmessenger_stop_rankserver(); + } + + // wait for everything to finish + tcpmessenger_wait(); + + if (started) cerr << "tcpsyn finishing" << endl; + + tcpmessenger_shutdown(); + + + /* + // cleanup + for (int i=0; i +#include +#include + +#include "mds/MDS.h" +#include "osd/OSD.h" +#include "fakeclient/FakeClient.h" + +#include "mds/MDCluster.h" +#include "mds/MDCache.h" +#include "mds/MDStore.h" + +#include "msg/FakeMessenger.h" + +#include "messages/MPing.h" + +using namespace std; + +__uint64_t ino = 1; + + + +#include "config.h" +#define NUMMDS g_conf.num_mds +#define NUMOSD g_conf.num_osd +#define NUMCLIENT g_conf.num_fakeclient + +// this parses find output +int play(); + +int main(int oargc, char **oargv) { + cerr << "hi there" << endl; + + int argc; + char **argv; + parse_config_options(oargc, oargv, + argc, argv); + + MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); + + // local config settings + g_conf.num_client = g_conf.num_fakeclient; // to fool mds, hack gross + + // create osds + OSD *osd[NUMOSD]; + for (int i=0; iinit(); + } + + // create mds + MDS *mds[NUMMDS]; + for (int i=0; iinit(); + } + + + // create clients + FakeClient *client[NUMCLIENT]; + for (int i=0; iinit(); + } + + // mount clients + for (int i=0; imount(); + + // loop + fakemessenger_do_loop(); + + //mds[0]->shutdown_start(); + //fakemessenger_do_loop(); + + // + if (argc > 1 && + strcmp(argv[1], "nocheck") == 0) { + cerr << "---- nocheck" << endl; + } else { + cout << "---- check ----" 
<< endl; + for (int i=0; imdcache->shutdown_pass(); + } + + // cleanup + cout << "cleanup" << endl; + for (int i=0; i + * Daniel Jönsson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the Do What The Fuck You Want To + * Public License as published by Banlu Kemiyatorn. See + * http://sam.zoy.org/projects/COPYING.WTFPL for more details. + * + * Compilation example: + * gcc -shared -fPIC gprof-helper.c -o gprof-helper.so -lpthread -ldl + * + * Usage example: + * LD_PRELOAD=./gprof-helper.so your_program + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +static void * wrapper_routine(void *); + +/* Original pthread function */ +static int (*pthread_create_orig)(pthread_t *__restrict, + __const pthread_attr_t *__restrict, + void *(*)(void *), + void *__restrict) = NULL; + +/* Library initialization function */ +void wooinit(void) __attribute__((constructor)); + +void wooinit(void) +{ + pthread_create_orig = dlsym(RTLD_NEXT, "pthread_create"); + fprintf(stderr, "pthreads: using profiling hooks for gprof\n"); + if(pthread_create_orig == NULL) + { + char *error = dlerror(); + if(error == NULL) + { + error = "pthread_create is NULL"; + } + fprintf(stderr, "%s\n", error); + exit(EXIT_FAILURE); + } +} + +/* Our data structure passed to the wrapper */ +typedef struct wrapper_s +{ + void * (*start_routine)(void *); + void * arg; + + pthread_mutex_t lock; + pthread_cond_t wait; + + struct itimerval itimer; + +} wrapper_t; + +/* The wrapper function in charge for setting the itimer value */ +static void * wrapper_routine(void * data) +{ + /* Put user data in thread-local variables */ + void * (*start_routine)(void *) = ((wrapper_t*)data)->start_routine; + void * arg = ((wrapper_t*)data)->arg; + + /* Set the profile timer value */ + setitimer(ITIMER_PROF, &((wrapper_t*)data)->itimer, NULL); + + /* Tell the calling thread that we don't need its data anymore */ + 
pthread_mutex_lock(&((wrapper_t*)data)->lock); + pthread_cond_signal(&((wrapper_t*)data)->wait); + pthread_mutex_unlock(&((wrapper_t*)data)->lock); + + /* Call the real function */ + return start_routine(arg); +} + +/* Our wrapper function for the real pthread_create() */ +int pthread_create(pthread_t *__restrict thread, + __const pthread_attr_t *__restrict attr, + void * (*start_routine)(void *), + void *__restrict arg) +{ + wrapper_t wrapper_data; + int i_return; + + /* Initialize the wrapper structure */ + wrapper_data.start_routine = start_routine; + wrapper_data.arg = arg; + getitimer(ITIMER_PROF, &wrapper_data.itimer); + pthread_cond_init(&wrapper_data.wait, NULL); + pthread_mutex_init(&wrapper_data.lock, NULL); + pthread_mutex_lock(&wrapper_data.lock); + + /* The real pthread_create call */ + i_return = pthread_create_orig(thread, + attr, + &wrapper_routine, + &wrapper_data); + + /* If the thread was successfully spawned, wait for the data + * to be released */ + if(i_return == 0) + { + pthread_cond_wait(&wrapper_data.wait, &wrapper_data.lock); + } + + pthread_mutex_unlock(&wrapper_data.lock); + pthread_mutex_destroy(&wrapper_data.lock); + pthread_cond_destroy(&wrapper_data.wait); + + return i_return; +} + diff --git a/branches/sage/cephmds2/test/makedirs.cc b/branches/sage/cephmds2/test/makedirs.cc new file mode 100644 index 0000000000000..8fd74d996ef9f --- /dev/null +++ b/branches/sage/cephmds2/test/makedirs.cc @@ -0,0 +1,38 @@ +#include +#include +using namespace std; + +int make_dirs(const char *basedir, int dirs, int files, int depth) +{ + //if (time_to_stop()) return 0; + + // make sure base dir exists + int r = mkdir(basedir, 0755); + if (r != 0) { + cout << "can't make base dir? 
" << basedir << endl; + return -1; + } + + // children + char d[500]; + cout << "make_dirs " << basedir << " dirs " << dirs << " files " << files << " depth " << depth << endl; + for (int i=0; i +#include +#include +using namespace std; + +#include "mds/MDCluster.h" +#include "mds/MDS.h" +#include "osd/OSD.h" +#include "fakeclient/FakeClient.h" + +#include "mds/MDCache.h" +#include "mds/MDStore.h" + +#include "msg/MPIMessenger.h" +//#include "msg/CheesySerializer.h" + +#include "messages/MPing.h" + + +__uint64_t ino = 1; + + + +#include "config.h" +#define NUMMDS g_conf.num_mds +#define NUMOSD g_conf.num_osd +#define NUMCLIENT g_conf.num_client + +// this parses find output +int play(); + +int main(int argc, char **argv) { + cout << "mpitest starting" << endl; + + int myrank = mpimessenger_init(argc, argv); + int world = mpimessenger_world(); + + + + MDCluster *mdc = new MDCluster(NUMMDS, NUMOSD); + + // create osds + OSD *osd[NUMOSD]; + for (int i=0; iinit(); + } + + // create mds + MDS *mds[NUMMDS]; + for (int i=0; iinit(); + } + + // create clients + FakeClient *client[NUMCLIENT]; + for (int i=0; iset_dispatcher(serializer); + + client[i] = new FakeClient(mdc, i, real, g_conf.fakeclient_requests); + client[i]->init(); + } + + // seed initial requests + for (int i=0; iissue_request(); + } + + mpimessenger_start(); // start message loop + mpimessenger_wait(); // wait for thread to finish + mpimessenger_shutdown(); // shutdown MPI + + // + /* + cout << "---- check ----" << endl; + for (int i=0; imdcache->shutdown_pass(); + } + */ + + // cleanup + //cout << "cleanup" << endl; + for (int i=0; i +#include "mpi.h" + +#include "messages/MClientRequest.h" +#include "msg/MTMessenger.h" +#include "include/error.h" + +#define SARG_SIZE 64 +#define SERVER_RANK 0 +#define NTHREADS 11 // number of threads per rank +#define NMESSAGES 31 // number of messages per thread + +static void server_loop(MTMessenger &msgr, int world_size) +{ + // we expect this many messages from 
clients, then we quit + // (world_size-1 since server is one of the processes). + int totmsg = NTHREADS * NMESSAGES * (world_size - 1); + int nmsg = 0; + + char buf[SARG_SIZE]; + + while(nmsg < totmsg) { + MClientRequest *req = (MClientRequest*)msgr.recvreq(); + ASSERT(req->get_type() == MSG_CLIENT_REQUEST); + + //cout << "Server acknowledging " << req->get_sarg() << endl; + + sprintf(buf, "%s reply", req->get_sarg().c_str()); + MClientRequest resp(0, 0); + resp.set_sarg(buf); + msgr.sendresp(req, &resp); + + delete req; + nmsg++; + } + + cout << "Server successful" << endl; +} + +// arguments for client thread start function (see pthread_create) +struct client_arg +{ + MTMessenger *msgr; + int rank; + int thread; +}; + +static void *client_session(void *_carg) +{ + client_arg *carg = (client_arg *)_carg; + + char buf[SARG_SIZE]; + + // repeat some number (arbitrary really) of rounds + for (int i = 0; i < NMESSAGES; i++) { + + // send the message, receive the reply and check reply is as + // expected + + MClientRequest request(0, 0); + sprintf(buf, "r%d:t%d:m%d", carg->rank, carg->thread, i); + request.set_sarg(buf); + + //cout << "Client sending " << request.get_sarg() << endl; + + MClientRequest *resp = + (MClientRequest*)carg->msgr->sendrecv(&request, SERVER_RANK); + + ASSERT(resp->get_type() == MSG_CLIENT_REQUEST); + sprintf(buf, "r%d:t%d:m%d reply", carg->rank, carg->thread, i); + ASSERT(strcmp(buf, resp->get_sarg().c_str()) == 0); + + //cout << "Client verified " << resp->get_sarg() << endl; + + delete resp; + } + + cout << "Client (" << carg->rank << "," << carg->thread + << ") successful" << endl; + + delete carg; + return NULL; +} + +static void launch_clients(MTMessenger &msgr, int rank) +{ + pthread_t tid[NTHREADS]; + + // launch some number (arbitrary really) of threads + for (int i = 0; i < NTHREADS; i++) { + + client_arg *carg = (client_arg*)malloc(sizeof(client_arg)); + ASSERT(carg); + carg->msgr = &msgr; + carg->rank = rank; + carg->thread = i; + + 
if (pthread_create(&tid[i], NULL, client_session, carg) < 0) + SYSERROR(); + } + + // we must wait for all the threads to exit before returning, + // otherwise we shutdown MPI before while the threads are + // chatting. + for (int i = 0; i < NTHREADS; i++) { + void *retval; + + if (pthread_join(tid[i], &retval) < 0) + SYSERROR(); + } +} + +int main(int argc, char **argv) +{ + MTMessenger msgr(argc, argv); + + int rank; + ASSERT(MPI_Comm_rank(MPI_COMM_WORLD, &rank) == MPI_SUCCESS); + int world_size; + ASSERT(MPI_Comm_size(MPI_COMM_WORLD, &world_size) == MPI_SUCCESS); + + if (rank == SERVER_RANK) + server_loop(msgr, world_size); + else + launch_clients(msgr, rank); + + return 0; +} diff --git a/branches/sage/cephmds2/test/rushconfig b/branches/sage/cephmds2/test/rushconfig new file mode 100644 index 0000000000000..40d82702ea0a5 --- /dev/null +++ b/branches/sage/cephmds2/test/rushconfig @@ -0,0 +1,7 @@ +6 +8 10.0 +4 20.0 +7 30.0 +9 10.0 +8 15.0 +5 11.0 diff --git a/branches/sage/cephmds2/test/rushtest.cc b/branches/sage/cephmds2/test/rushtest.cc new file mode 100644 index 0000000000000..ecff83523e0c6 --- /dev/null +++ b/branches/sage/cephmds2/test/rushtest.cc @@ -0,0 +1,49 @@ +// +// $Id$ +// + +#include +#include +#include "../osd/rush.h" + +main (int argc, char *argv[]) +{ + Rush rush; + char buf[200]; + int i, j, k, numClusters; + int numKeys = 5; + int numReplicas = 4; + int curSize; + double curWeight; + int servers[1000]; + + if (argc > 1) { + numKeys = atoi (argv[1]); + } + if (argc > 2) { + numReplicas = atoi (argv[2]); + } + + fgets (buf, sizeof (buf) - 2, stdin); + sscanf (buf, "%d", &numClusters); + for (i = 0; i < numClusters; i++) { + fgets (buf, sizeof (buf) - 2, stdin); + sscanf (buf, "%d %lf", &curSize, &curWeight); + rush.AddCluster (curSize, curWeight); + if (rush.Servers () < numReplicas) { + fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", + rush.Clusters ()); + exit (-1); + } + for (j = 0; j < numKeys; j++) { + 
rush.GetServersByKey (j, numReplicas, servers); +#if 0 + printf ("%-3d %-6d ", i, j); + for (k = 0; k < numReplicas; k++) { + printf ("%-5d ", servers[k]); + } + putchar ('\n'); +#endif + } + } +} diff --git a/branches/sage/cephmds2/test/rushtest.cc~ b/branches/sage/cephmds2/test/rushtest.cc~ new file mode 100644 index 0000000000000..0b9512ccd0c3d --- /dev/null +++ b/branches/sage/cephmds2/test/rushtest.cc~ @@ -0,0 +1,49 @@ +// +// $Id$ +// + +#include +#include +#include "rush.h" + +main (int argc, char *argv[]) +{ + Rush rush; + char buf[200]; + int i, j, k, numClusters; + int numKeys = 5; + int numReplicas = 4; + int curSize; + double curWeight; + int servers[1000]; + + if (argc > 1) { + numKeys = atoi (argv[1]); + } + if (argc > 2) { + numReplicas = atoi (argv[2]); + } + + fgets (buf, sizeof (buf) - 2, stdin); + sscanf (buf, "%d", &numClusters); + for (i = 0; i < numClusters; i++) { + fgets (buf, sizeof (buf) - 2, stdin); + sscanf (buf, "%d %lf", &curSize, &curWeight); + rush.AddCluster (curSize, curWeight); + if (rush.Servers () < numReplicas) { + fprintf (stderr, "ERROR: must have at least %d disks in the system!\n", + rush.Clusters ()); + exit (-1); + } + for (j = 0; j < numKeys; j++) { + rush.GetServersByKey (j, numReplicas, servers); +#if 0 + printf ("%-3d %-6d ", i, j); + for (k = 0; k < numReplicas; k++) { + printf ("%-5d ", servers[k]); + } + putchar ('\n'); +#endif + } + } +} diff --git a/branches/sage/cephmds2/test/testbucket.cc b/branches/sage/cephmds2/test/testbucket.cc new file mode 100644 index 0000000000000..d8676da18faba --- /dev/null +++ b/branches/sage/cephmds2/test/testbucket.cc @@ -0,0 +1,67 @@ + + +#include "../crush/Bucket.h" +using namespace crush; + +#include +#include +using namespace std; + + +ostream& operator<<(ostream& out, vector& v) +{ + out << "["; + for (int i=0; i disks; + for (int i=0; i<20; i++) + disks.push_back(i); + + + /* + UniformBucket ub(1, 1, 0, 10, disks); + ub.make_primes(h); + cout << "primes are " << ub.primes << 
endl; + */ + + MixedBucket mb(2, 1); + for (int i=0;i<20;i++) + mb.add_item(i, 10); + + /* + MixedBucket b(3, 1); + b.add_item(1, ub.get_weight()); + b.add_item(2, mb.get_weight()); + */ + MixedBucket b= mb; + + vector ocount(disks.size()); + int numrep = 3; + + vector v(numrep); + for (int x=1; x<1000000; x++) { + //cout << H(x) << "\t" << h(x) << endl; + for (int i=0; i +using namespace std; + +#include "include/bufferlist.h" + + +int main() +{ + + bufferptr p1 = new buffer("123456",6); + bufferptr p2 = p1; + + cout << "it is '" << p1.c_str() << "'" << endl; + + bufferptr p3 = new buffer("abcdef",6); + + cout << "p3 is " << p3 << endl; + + bufferlist bl; + bl.push_back(p2); + bl.push_back(p1); + bl.push_back(p3); + + cout << "bl is " << bl << endl; + + cout << "len is " << bl.length() << endl; + + bufferlist took; + bl.splice(10,4,&took); + + cout << "took out " << took << "leftover is " << bl << endl; + //cout << "len is " << bl.length() << endl; + + bufferlist bl2; + bl2.substr_of(bl, 3, 5); + cout << "bl2 is " << bl2 << endl; + + +} diff --git a/branches/sage/cephmds2/test/testcrush.cc b/branches/sage/cephmds2/test/testcrush.cc new file mode 100644 index 0000000000000..bd432b23ee95c --- /dev/null +++ b/branches/sage/cephmds2/test/testcrush.cc @@ -0,0 +1,266 @@ + + +#include "../crush/crush.h" +using namespace crush; + +#include + +#include +#include +using namespace std; + +/* +ostream& operator<<(ostream& out, vector& v) +{ + out << "["; + for (int i=0; i& d) +{ + d.clear(); + while (n) { + d.push_back(no); + no++; + n--; + } +} + + +Bucket *make_bucket(Crush& c, vector& wid, int h, int& ndisks, int& nbuckets) +{ + if (h == 0) { + // uniform + Hash hash(123); + vector disks; + for (int i=0; imake_primes(hash); + c.add_bucket(b); + //cout << h << " uniformbucket with " << wid[h] << " disks" << endl; + return b; + } else { + // mixed + MixedBucket *b = new MixedBucket(nbuckets--, h+1); + for (int i=0; iadd_item(n->get_id(), n->get_weight()); + } + 
c.add_bucket(b); + //cout << h << " mixedbucket with " << wid[h] << endl; + return b; + } +} + +int make_hierarchy(Crush& c, vector& wid, int& ndisks, int& nbuckets) +{ + Bucket *b = make_bucket(c, wid, wid.size()-1, ndisks, nbuckets); + return b->get_id(); +} + + + +int main() +{ + Hash h(73232313); + + // crush + Crush c; + + + // buckets + vector disks; + int root = -1; + int nbuckets = -1; + int ndisks = 0; + + if (0) { + make_disks(12, ndisks, disks); + UniformBucket ub1(-1, 1, 0, 30, disks); + ub1.make_primes(h); + cout << "ub1 primes are " << ub1.primes << endl; + c.add_bucket(&ub1); + + make_disks(17, ndisks, disks); + UniformBucket ub2(-2, 1, 0, 30, disks); + ub2.make_primes(h); + cout << "ub2 primes are " << ub2.primes << endl; + c.add_bucket(&ub2); + + make_disks(4, ndisks, disks); + UniformBucket ub3(-3, 1, 0, 30, disks); + ub3.make_primes(h); + cout << "ub3 primes are " << ub3.primes << endl; + c.add_bucket(&ub3); + + make_disks(20, ndisks, disks); + MixedBucket umb1(-4, 1); + for (int i=0; i<20; i++) + umb1.add_item(disks[i], 30); + c.add_bucket(&umb1); + + MixedBucket b(-100, 1); + //b.add_item(-2, ub1.get_weight()); + b.add_item(-4, umb1.get_weight()); + //b.add_item(-2, ub2.get_weight()); + //b.add_item(-3, ub3.get_weight()); + } + + if (0) { + int bucket = -1; + MixedBucket *root = new MixedBucket(bucket--, 2); + + for (int i=0; i<5; i++) { + MixedBucket *b = new MixedBucket(bucket--, 1); + + int n = 5; + + if (1) { + // add n buckets of n disks + for (int j=0; jadd_item(disks[k], 10); + + //b->add_item(disks[j], 10); + c.add_bucket(d); + b->add_item(d->get_id(), d->get_weight()); + } + + c.add_bucket(b); + root->add_item(b->get_id(), b->get_weight()); + } else { + // add n*n disks + make_disks(n*n, ndisks, disks); + for (int k=0; kadd_item(disks[k], 10); + + c.add_bucket(b); + root->add_item(b->get_id(), b->get_weight()); + } + } + + c.add_bucket(root); + } + + + if (1) { + vector wid; + for (int d=0; d<5; d++) + wid.push_back(10); + root = 
make_hierarchy(c, wid, ndisks, nbuckets); + } + + + + // rule + int numrep = 1; + + Rule rule; + if (0) { + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -100)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, numrep, 0)); + } + if (1) { + /* + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, -4)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 2, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + */ + rule.steps.push_back(RuleStep(CRUSH_RULE_TAKE, root)); + rule.steps.push_back(RuleStep(CRUSH_RULE_CHOOSE, 1, 0)); + rule.steps.push_back(RuleStep(CRUSH_RULE_EMIT)); + } + + //c.overload[10] = .1; + + + int pg_per = 100; + int numpg = pg_per*ndisks/numrep; + + vector ocount(ndisks); + cout << ndisks << " disks, " << 1-nbuckets << " buckets" << endl; + cout << pg_per << " pgs per disk" << endl; + cout << numpg << " logical pgs" << endl; + cout << "numrep is " << numrep << endl; + + + int place = 1000000; + int times = place / numpg; + if (!times) times = 1; + + cout << "looping " << times << " times" << endl; + + float tvar = 0; + int tvarnum = 0; + + int x = 0; + for (int t=0; t v(numrep); + + for (int z=0; z +using namespace std; + +int print(string s) { + filepath fp = s; + cout << "s = " << s << " filepath = " << fp << endl; + cout << " depth " << fp.depth() << endl; + for (int i=0; i +#include +#include +using namespace std; + +#include "config.h" +#include "messages/MPing.h" +#include "common/Mutex.h" + +#include "msg/MPIMessenger.h" + +class Pinger : public Dispatcher { +public: + Messenger *messenger; + Pinger(Messenger *m) : messenger(m) { + m->set_dispatcher(this); + } + void dispatch(Message *m) { + //dout(1) << "got incoming " << m << endl; + delete m; + + } +}; + +int main(int argc, char **argv) { + int num = 1000; + + int myrank = mpimessenger_init(argc, argv); + int world = mpimessenger_world(); + + Pinger *p = new Pinger( new MPIMessenger(myrank) ); + + mpimessenger_start(); + + //while (1) { + for (int i=0; i<10000; i++) { + + // ping 
random nodes + int d = rand() % world; + if (d != myrank) { + //cout << "sending " << i << " to " << d << endl; + p->messenger->send_message(new MPing(), d); + } + + } + + + //cout << "shutting down" << endl; + //p->messenger->shutdown(); + + mpimessenger_wait(); + mpimessenger_shutdown(); // shutdown MPI +} diff --git a/branches/sage/cephmds2/test/testnewbuffers.cc b/branches/sage/cephmds2/test/testnewbuffers.cc new file mode 100644 index 0000000000000..0fea7571a4572 --- /dev/null +++ b/branches/sage/cephmds2/test/testnewbuffers.cc @@ -0,0 +1,91 @@ + +#include +#include +using namespace std; + + +#include "include/newbuffer.h" +//#include "include/bufferlist.h" + +#include "common/Thread.h" + + + class Th : public Thread { + public: + bufferlist bl; + Th(bufferlist& o) : bl(o) { } + + void *entry() { + //cout << "start" << endl; + // thrash it a bit. + for (int n=0; n<10000; n++) { + bufferlist bl2; + unsigned off = rand() % (bl.length() -1); + unsigned len = 1 + rand() % (bl.length() - off - 1); + bl2.substr_of(bl, off, len); + bufferlist bl3; + bl3.append(bl); + bl3.append(bl2); + //cout << bl3 << endl; + bl2.clear(); + bl3.clear(); + } + //cout << "end" << endl; + } + }; + +int main() +{ + + bufferptr p1 = buffer::copy("123456",7); + //bufferptr p1 = new buffer("123456",7); + bufferptr p2 = p1; + + cout << "p1 is '" << p1.c_str() << "'" << " " << p1 << endl; + cout << "p2 is '" << p2.c_str() << "'" << " " << p2 << endl; + + bufferptr p3 = buffer::copy("abcdef",7); + //bufferptr p3 = new buffer("abcdef",7); + + cout << "p3 is " << p3.c_str() << " " << p3 << endl; + + bufferlist bl; + bl.push_back(p2); + bl.push_back(p1); + bl.push_back(p3); + + cout << "bl is " << bl << endl; + + bufferlist took; + bl.splice(10,4,&took); + + cout << "took out " << took << ", leftover is " << bl << endl; + //cout << "len is " << bl.length() << endl; + + bufferlist bl2; + bl2.substr_of(bl, 3, 5); + cout << "bl2 is " << bl2 << endl; + + + cout << "bl before " << bl << endl; + + 
list ls; + for (int t=0; t<40; t++) { + Th *t = new Th(bl); + cout << "create" << endl; + t->create(); + ls.push_back(t); + } + + bl.clear(); + + while (!ls.empty()) { + cout << "join" << endl; + ls.front()->join(); + delete ls.front(); + ls.pop_front(); + } + + cout << "bl after " << bl << endl; + +} diff --git a/branches/sage/cephmds2/test/testtree.cc b/branches/sage/cephmds2/test/testtree.cc new file mode 100644 index 0000000000000..2c21bcbe52e25 --- /dev/null +++ b/branches/sage/cephmds2/test/testtree.cc @@ -0,0 +1,46 @@ + + +#include "../crush/BinaryTree.h" +using namespace crush; + +#include +#include +using namespace std; + +int main() +{ + BinaryTree t; + + vector nodes; + + for (int i=0; i<30; i++) { + cout << "adding " << i << endl; + int n = t.add_node(1); + nodes.push_back(n); + //cout << t << endl; + } + cout << t << endl; + + for (int k=0; k<10000; k++) { + if (rand() % 2) { + cout << "adding" << endl; + nodes.push_back( t.add_node(1) ); + } else { + if (!nodes.empty()) { + //for (int i=0; i +using namespace std; + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char**argv) +{ + int a = 1; + int b = 2; + + mknod("test", 0600, 0); + + cout << "setxattr " << setxattr("test", "asdf", &a, sizeof(a), 0) << endl; + cout << "errno " << errno << " " << strerror(errno) << endl; + cout << "getxattr " << getxattr("test", "asdf", &b, sizeof(b)) << endl; + cout << "errno " << errno << " " << strerror(errno) << endl; + cout << "a is " << a << " and b is " << b << endl; + return 0; +} -- 2.39.5